diff --git a/Cargo.lock b/Cargo.lock index 8a1e1f08c3..af10c55a23 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -934,6 +934,15 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" +[[package]] +name = "crc32c" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" +dependencies = [ + "rustc_version", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -1665,6 +1674,26 @@ dependencies = [ "vm_resource", ] +[[package]] +name = "disklayer_vhdx" +version = "0.0.0" +dependencies = [ + "anyhow", + "async-trait", + "blocking", + "disk_backend", + "disk_backend_resources", + "disk_layered", + "guestmem", + "inspect", + "pal_async", + "scsi_buffers", + "tempfile", + "thiserror 2.0.16", + "vhdx", + "vm_resource", +] + [[package]] name = "dissimilar" version = "1.0.10" @@ -1688,9 +1717,9 @@ checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] name = "elfcore" -version = "2.0.1" +version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e254a61387a9d5706e00576e8ddc08705a8ce3f2d306280459ece426378f94f2" +checksum = "0fdaa3d1c27119b3394513f4596894a40cd53cb4acec7fce636a9ca0c4abb171" dependencies = [ "libc", "nix 0.31.2", @@ -4768,9 +4797,9 @@ dependencies = [ [[package]] name = "ntapi" -version = "0.4.3" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" +checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" dependencies = [ "winapi", ] @@ -5474,6 +5503,7 @@ dependencies = [ "disk_backend_resources", "disk_vhd1", "disk_vhdmp", + "disklayer_vhdx", "fs-err", "get_resources", "hypervisor_resources", @@ -5532,6 +5562,7 @@ dependencies = [ "disk_vhdmp", "disklayer_ram", "disklayer_sqlite", + "disklayer_vhdx", "gdma", "guest_crash_device", "guest_emulation_device", @@ -8779,6 +8810,26 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "vhdx" +version = "0.0.0" +dependencies = [ + "bitfield-struct 0.11.0", + "bitvec", + "crc32c", + "event-listener", + "futures", + "guid", + "mesh", + "pal_async", + "parking_lot", + "tempfile", + "thiserror 2.0.16", + "tracing", + "windows 0.62.0", + "zerocopy", +] + [[package]] name = "vhost_user_backend" version = "0.0.0" diff --git a/Cargo.toml b/Cargo.toml index 6184da3a66..ab831f75d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -308,6 +308,7 @@ disk_delay = { path = "vm/devices/storage/disk_delay" } disk_prwrap = { path = "vm/devices/storage/disk_prwrap" } disk_striped = { path = "vm/devices/storage/disk_striped" } disk_vhd1 = { path = "vm/devices/storage/disk_vhd1" } +disklayer_vhdx = { path = "vm/devices/storage/disklayer_vhdx" } disk_vhdmp = { path = "vm/devices/storage/disk_vhdmp" } disklayer_ram = { path = "vm/devices/storage/disklayer_ram" } disklayer_sqlite = { path = "vm/devices/storage/disklayer_sqlite" } @@ -325,6 +326,7 @@ storvsc_driver = { path = "vm/devices/storage/storvsc_driver" } storvsp = { path = "vm/devices/storage/storvsp" } storvsp_protocol = { path = "vm/devices/storage/storvsp_protocol" } storvsp_resources = { path = "vm/devices/storage/storvsp_resources" } +vhdx = { path = "vm/devices/storage/vhdx" } device_emulators = { path = "vm/devices/support/device_emulators" } fuse = { path = "vm/devices/support/fs/fuse" } lx = { 
path = "vm/devices/support/fs/lx" } @@ -455,6 +457,7 @@ cc = "1.2.34" cfg-if = "1" clap = "4.2" constant_time_eq = "0.3" +crc32c = "0.6" crc32fast = { version = "1.3.2", default-features = false } criterion = { version = "0.7", default-features = false } crossterm = { version = "0.29.0", default-features = false } diff --git a/Guide/src/SUMMARY.md b/Guide/src/SUMMARY.md index b9533b7948..077def0349 100644 --- a/Guide/src/SUMMARY.md +++ b/Guide/src/SUMMARY.md @@ -123,6 +123,7 @@ - [Serial]() - [Graphics and Input]() - [Storage](./reference/backends/storage.md) + - [VHDX Parser](./reference/backends/vhdx.md) - [Networking](./reference/backends/networking.md) - [Consomme](./reference/backends/consomme.md) - [Architecture](./reference/architecture.md) diff --git a/Guide/src/reference/architecture/devices/storage.md b/Guide/src/reference/architecture/devices/storage.md index e71fabe898..2e3fc0cbc7 100644 --- a/Guide/src/reference/architecture/devices/storage.md +++ b/Guide/src/reference/architecture/devices/storage.md @@ -172,6 +172,7 @@ For the OpenHCL settings model (`StorageController`, `Lun`, `PhysicalDevice`), s | FileDisk | [`disk_file`](https://openvmm.dev/rustdoc/linux/disk_file/index.html) | Host file | Cross-platform | Simplest backend | | Vhd1Disk | [`disk_vhd1`](https://openvmm.dev/rustdoc/linux/disk_vhd1/index.html) | VHD1 fixed file | Cross-platform | Parses VHD footer | | VhdmpDisk | `disk_vhdmp` | Windows vhdmp driver | Windows | Dynamic/differencing VHD/VHDX | +| VhdxDisk | [`vhdx`](../../backends/vhdx.md) | VHDX file | Cross-platform | Pure-Rust VHDX parser | | BlobDisk | [`disk_blob`](https://openvmm.dev/rustdoc/linux/disk_blob/index.html) | HTTP / Azure Blob | Cross-platform | Read-only, HTTP range requests | | BlockDeviceDisk | [`disk_blockdevice`](https://openvmm.dev/rustdoc/linux/disk_blockdevice/index.html) | Linux block device | Linux | io_uring, resize via uevent, PR passthrough | | NvmeDisk | [`disk_nvme`](https://openvmm.dev/rustdoc/linux/disk_nvme/index.html) | Physical NVMe (VFIO) | Linux/Windows | User-mode NVMe driver, resize via AEN | diff --git a/Guide/src/reference/backends/storage.md b/Guide/src/reference/backends/storage.md index 96e60e4ce8..065cb0f1cb 100644 --- a/Guide/src/reference/backends/storage.md +++ b/Guide/src/reference/backends/storage.md @@ -16,6 +16,7 @@ blob, or a layered composition of multiple backends. | FileDisk | [`disk_file`](https://openvmm.dev/rustdoc/linux/disk_file/index.html) | Host file | Cross-platform | Simplest backend. Blocking I/O via `unblock()`. | | Vhd1Disk | [`disk_vhd1`](https://openvmm.dev/rustdoc/linux/disk_vhd1/index.html) | VHD1 fixed file | Cross-platform | Parses VHD footer for geometry. | | VhdmpDisk | `disk_vhdmp` | Windows vhdmp driver | Windows | Dynamic and differencing VHD/VHDX. | +| VhdxDisk | [`vhdx`](vhdx.md) | VHDX file | Cross-platform | Pure-Rust VHDX parser. Dynamic, fixed, and differencing. | | BlobDisk | [`disk_blob`](https://openvmm.dev/rustdoc/linux/disk_blob/index.html) | HTTP / Azure Blob | Cross-platform | Read-only. HTTP range requests. | | BlockDeviceDisk | [`disk_blockdevice`](https://openvmm.dev/rustdoc/linux/disk_blockdevice/index.html) | Linux block device or file | Linux | io_uring, resize via uevent, PR passthrough. Default for raw files on Linux in both OpenHCL and OpenVMM. | | NvmeDisk | [`disk_nvme`](https://openvmm.dev/rustdoc/linux/disk_nvme/index.html) | Physical NVMe (VFIO) | Linux/Windows | User-mode NVMe driver. Resize via AEN. 
| diff --git a/Guide/src/reference/backends/vhdx.md b/Guide/src/reference/backends/vhdx.md new file mode 100644 index 0000000000..2f0ebad334 --- /dev/null +++ b/Guide/src/reference/backends/vhdx.md @@ -0,0 +1,109 @@ +# VHDX parser + +The `vhdx` crate (`vm/devices/storage/vhdx/`) is a pure-Rust +implementation of the +[VHDX format specification](https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-vhdx/). +It supports dynamic, fixed, and differencing VHDX virtual hard disk +files on all platforms — no Windows APIs or kernel drivers required. + +## Features + +- **Create** and **open** VHDX files (read-only or writable) +- Dynamic block allocation with four-priority free space management +- Write-ahead log (WAL) for crash-consistent metadata updates +- Sector bitmap tracking for partially-present (differencing) blocks +- Block trim/unmap with multiple modes (file space, free space, zero, + transparent, soft-anchor removal) +- Concurrent flush coalescing +- Parent locator parsing for differencing disk chains + +## Architecture + +A VHDX file stores a virtual disk as a collection of fixed-size data +blocks (default 2 MiB) tracked by a Block Allocation Table (BAT). +The crate's write path uses a three-stage pipeline for crash +consistency: + +```text +┌───────────┐ commit ┌──────────┐ apply ┌────────────┐ +│ Cache │ ──────────►│ Log Task │ ─────────►│ Apply Task │ +│ (dirty │ dirty │ (WAL │ logged │ (final │ +│ pages) │ pages │ writer) │ pages │ offsets) │ +└───────────┘ └──────────┘ └────────────┘ +``` + +1. The **cache** accumulates dirty 4 KiB metadata pages (BAT entries, + sector bitmap bits). When the dirty count reaches a threshold or + `flush()` is called, pages are committed to the log task. +2. The **log task** writes WAL entries to the circular log region in + the VHDX file. On crash, `replay_log()` restores metadata from + the WAL. +3. The **apply task** writes logged pages to their final file offsets. + +Backpressure is managed by a permit semaphore that limits in-flight +pages. A flush sequencer coalesces concurrent flush requests so at +most one file flush is in progress at a time. + +## Lifecycle + +```rust,ignore +// Create a new empty VHDX file. +create::create(&file, &mut params).await?; + +// Open for writing. +let vhdx = VhdxFile::open(file) + .block_alignment(2 * 1024 * 1024) + .writable(&spawner) + .await?; + +// Resolve a read — returns file-level ranges. +let mut ranges = Vec::new(); +let guard = vhdx.resolve_read(offset, len, &mut ranges).await?; +// ... perform file I/O at the returned offsets ... +drop(guard); + +// Resolve a write — returns file-level ranges + I/O guard. +let mut ranges = Vec::new(); +let guard = vhdx.resolve_write(offset, len, &mut ranges).await?; +// ... write data at the returned offsets ... +guard.complete().await?; + +// Flush to stable storage. +vhdx.flush().await?; + +// Clean close (clears log GUID). +vhdx.close().await?; +``` + +## I/O model + +The crate separates **metadata I/O** from **payload I/O**. + +Metadata I/O (headers, BAT pages, sector bitmaps, WAL entries) is +handled internally through the `AsyncFile` trait — the caller provides +an `AsyncFile` implementation at open time and never thinks about +metadata again. + +Payload I/O (guest data reads and writes) is the caller's +responsibility. `resolve_read()` and `resolve_write()` translate +virtual disk offsets into file-level byte ranges (`ReadRange` / +`WriteRange`). 
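+
+As an illustration, here is a condensed sketch of how a caller might
+consume the resolved read ranges (this mirrors what the `disklayer_vhdx`
+layer introduced in this change does; `read_from_file` and
+`fill_with_zeros` are placeholders for the caller's own I/O path):
+
+```rust,ignore
+for range in &ranges {
+    match *range {
+        // Allocated data: read it from the VHDX file at the resolved offset.
+        ReadRange::Data { guest_offset, length, file_offset } => {
+            read_from_file(file_offset, length, guest_offset)?;
+        }
+        // Definitively zero: fill the destination buffer with zeros.
+        ReadRange::Zero { guest_offset, length } => {
+            fill_with_zeros(guest_offset, length)?;
+        }
+        // Not present in this file: the caller decides what to do; the
+        // layered-disk integration leaves these sectors for the next layer
+        // (the parent) to supply.
+        ReadRange::Unmapped { .. } => {}
+    }
+}
+drop(guard); // releases the per-block I/O references held during the read
+```
+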
The caller performs its own data I/O at those offsets +using whatever mechanism it prefers (io_uring, standard file I/O, +etc.), then finalizes metadata via the returned I/O guard. This +separation lets the caller use a different, potentially more +performant I/O path for bulk data without the crate imposing any +particular strategy. + +- The `vhdx` crate provides the low-level VHDX format implementation + and I/O resolution API. For OpenVMM integration, the `disklayer_vhdx` + crate supplies a `LayerIo`-compatible backend used in the layered + disk storage pipeline. +- For differencing disks, the `vhdx` crate parses parent locator + metadata, while `disklayer_vhdx::chain::open_vhdx_chain` walks and + opens parent chains automatically. + +## Related pages + +- [Storage backends](./storage.md) — catalog of all storage backends +- [Storage pipeline](../architecture/devices/storage.md) — how + frontends, backends, and layers connect diff --git a/Guide/src/reference/openvmm/management/cli.md b/Guide/src/reference/openvmm/management/cli.md index a042451178..a9bdb1a291 100644 --- a/Guide/src/reference/openvmm/management/cli.md +++ b/Guide/src/reference/openvmm/management/cli.md @@ -17,7 +17,7 @@ as well as the generated CLI help (via `cargo run -- --help`). pass `--hv`. The `DISK` argument can be: * A flat binary disk image * A VHD file with an extension of .vhd (Windows host only) - * A VHDX file with an extension of .vhdx (Windows host only) + * A VHDX file with an extension of .vhdx On Linux, raw files and block devices use the `disk_blockdevice` backend (io_uring-based async I/O) by default. Append `;direct` to the path to diff --git a/Guide/src/user_guide/openvmm/run.md b/Guide/src/user_guide/openvmm/run.md index 5d5883e26c..605ee5e39b 100644 --- a/Guide/src/user_guide/openvmm/run.md +++ b/Guide/src/user_guide/openvmm/run.md @@ -133,12 +133,14 @@ docs. The file `windows.vhdx` can be any format of VHD(X). -Note that OpenVMM does not currently support using dynamic VHD/VHDX files on -Linux hosts. Unless you have a fixed VHD1 image, you will need to convert the -image to raw format, using the following command: +VHDX files (dynamic, fixed, and differencing) are supported on non-Windows +platforms via the pure-Rust [`vhdx`](../../reference/backends/vhdx.md) +parser. On Windows, `.vhdx` files use the native kernel-mode VHD path +instead. Fixed VHD1 images work on all platforms. Dynamic and differencing VHD1 +files are **not** supported — convert them to VHDX first: -```shell -qemu-img convert -f vhdx -O raw windows.vhdx windows.img +```bash +qemu-img convert -f vpc -O vhdx dynamic.vhd converted.vhdx ``` Also, note the use of `memdiff`, which creates a memory-backed "differencing @@ -157,7 +159,7 @@ integration tests. First, build the test artifacts from Linux or WSL using `vmm-tests-run --build-only`. 
The IGVM must be built on Linux: -```shell +```bash cargo xflowey vmm-tests-run --build-only --dir --target windows-x64 ``` diff --git a/openvmm/openvmm_helpers/Cargo.toml b/openvmm/openvmm_helpers/Cargo.toml index 6ebefa738d..60c3a6e4c9 100644 --- a/openvmm/openvmm_helpers/Cargo.toml +++ b/openvmm/openvmm_helpers/Cargo.toml @@ -9,6 +9,7 @@ rust-version.workspace = true [dependencies] disk_backend_resources.workspace = true disk_vhd1.workspace = true +disklayer_vhdx.workspace = true get_resources.workspace = true hypervisor_resources.workspace = true openvmm_defs.workspace = true diff --git a/openvmm/openvmm_helpers/src/disk.rs b/openvmm/openvmm_helpers/src/disk.rs index b1d45f016a..dbaf852a7a 100644 --- a/openvmm/openvmm_helpers/src/disk.rs +++ b/openvmm/openvmm_helpers/src/disk.rs @@ -34,8 +34,12 @@ pub struct OpenDiskOptions { /// Opens the resources needed for using a disk from a file at `path`. /// /// If the file ends with .vhd and is a fixed VHD1, it will be opened using -/// the user-mode VHD parser. Otherwise, if the file ends with .vhd or -/// .vhdx, the file will be opened using the kernel-mode VHD parser. +/// the user-mode VHD parser. Otherwise, if the file ends with .vhd, the +/// file will be opened using the kernel-mode VHD parser (Windows only). +/// +/// If the file ends with .vhdx, the kernel-mode VHD parser is used on +/// Windows. On Linux, the pure-Rust VHDX parser is used, with automatic +/// parent-locator walking for differencing chains. pub async fn open_disk_type( path: &Path, options: OpenDiskOptions, @@ -91,7 +95,10 @@ pub async fn open_disk_type( )) } #[cfg(not(windows))] - anyhow::bail!("VHDX not supported on Linux"); + { + ensure_no_direct(".vhdx")?; + disklayer_vhdx::chain::open_vhdx_chain(path, read_only).await? + } } Some("iso") if !read_only => { anyhow::bail!("iso file cannot be opened as read/write") diff --git a/openvmm/openvmm_resources/Cargo.toml b/openvmm/openvmm_resources/Cargo.toml index 33aea95551..cb34d64783 100644 --- a/openvmm/openvmm_resources/Cargo.toml +++ b/openvmm/openvmm_resources/Cargo.toml @@ -44,6 +44,7 @@ disk_file.workspace = true disk_layered.workspace = true disk_prwrap.workspace = true disk_vhd1.workspace = true +disklayer_vhdx.workspace = true disklayer_ram.workspace = true disklayer_sqlite = { workspace = true, optional = true } diff --git a/openvmm/openvmm_resources/src/lib.rs b/openvmm/openvmm_resources/src/lib.rs index 81e40e4103..eabb8c873b 100644 --- a/openvmm/openvmm_resources/src/lib.rs +++ b/openvmm/openvmm_resources/src/lib.rs @@ -66,6 +66,7 @@ vm_resource::register_static_resolvers! { disklayer_ram::resolver::RamDiskLayerResolver, #[cfg(feature = "disklayer_sqlite")] disklayer_sqlite::resolver::SqliteDiskLayerResolver, + disklayer_vhdx::resolver::VhdxDiskLayerResolver, // PCI devices gdma::resolver::GdmaDeviceResolver, diff --git a/vm/devices/storage/disk_backend_resources/src/layer.rs b/vm/devices/storage/disk_backend_resources/src/layer.rs index 08aa58bc8a..eab80c6a71 100644 --- a/vm/devices/storage/disk_backend_resources/src/layer.rs +++ b/vm/devices/storage/disk_backend_resources/src/layer.rs @@ -74,3 +74,16 @@ pub struct SqliteAutoCacheDiskLayerHandle { impl ResourceId for SqliteAutoCacheDiskLayerHandle { const ID: &'static str = "sqlite-autocache"; } + +/// Handle for a VHDX disk layer. +#[derive(MeshPayload)] +pub struct VhdxDiskLayerHandle { + /// The open file handle for the VHDX file. + pub file: std::fs::File, + /// Whether to open the VHDX as read-only. 
+ pub read_only: bool, +} + +impl ResourceId for VhdxDiskLayerHandle { + const ID: &'static str = "vhdx"; +} diff --git a/vm/devices/storage/disklayer_vhdx/Cargo.toml b/vm/devices/storage/disklayer_vhdx/Cargo.toml new file mode 100644 index 0000000000..e93c77eed1 --- /dev/null +++ b/vm/devices/storage/disklayer_vhdx/Cargo.toml @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +[package] +name = "disklayer_vhdx" +edition.workspace = true +rust-version.workspace = true + +[dependencies] +anyhow.workspace = true +async-trait.workspace = true +blocking.workspace = true +disk_backend.workspace = true +disk_backend_resources.workspace = true +disk_layered.workspace = true +guestmem.workspace = true +inspect.workspace = true +scsi_buffers.workspace = true +thiserror.workspace = true +vhdx.workspace = true +vm_resource.workspace = true + +[dev-dependencies] +pal_async.workspace = true +tempfile.workspace = true + +[lints] +workspace = true diff --git a/vm/devices/storage/disklayer_vhdx/src/chain.rs b/vm/devices/storage/disklayer_vhdx/src/chain.rs new file mode 100644 index 0000000000..0c6597ca77 --- /dev/null +++ b/vm/devices/storage/disklayer_vhdx/src/chain.rs @@ -0,0 +1,314 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! VHDX chain helpers. +//! +//! Functions for opening one or more VHDX files as a +//! [`LayeredDiskHandle`](disk_backend_resources::LayeredDiskHandle) ready for +//! resource resolution. + +use anyhow::Context; +use disk_backend_resources::DiskLayerDescription; +use disk_backend_resources::LayeredDiskHandle; +use disk_backend_resources::layer::VhdxDiskLayerHandle; +use std::path::Path; +use vm_resource::IntoResource; +use vm_resource::Resource; +use vm_resource::kind::DiskHandleKind; + +/// Open a single VHDX file as a [`LayeredDiskHandle`] with one layer. +/// +/// Use this for base (non-differencing) VHDX files. For differencing chains, +/// use [`open_vhdx_chain_explicit`] or [`open_vhdx_chain`]. +/// +/// The file is opened for read+write unless `read_only` is true. +pub fn open_vhdx_single(path: &Path, read_only: bool) -> anyhow::Result> { + let file = std::fs::OpenOptions::new() + .read(true) + .write(!read_only) + .open(path)?; + + Ok(Resource::new(LayeredDiskHandle::single_layer( + VhdxDiskLayerHandle { file, read_only }, + ))) +} + +/// Open a VHDX differencing chain from an explicit list of file paths. +/// +/// `paths` must be ordered from **leaf** (child, index 0) to **base** +/// (parent, last index). The leaf is opened for read+write (unless +/// `read_only` is true); all parent files are opened read-only. +/// +/// Returns a [`LayeredDiskHandle`] with layers ordered top (leaf) to +/// bottom (base), matching the order expected by +/// [`LayeredDisk`](disk_layered::LayeredDisk). 
+/// +/// # Errors +/// +/// Returns an error if: +/// - `paths` is empty +/// - Any file cannot be opened +/// +/// # Example +/// +/// ```no_run +/// # use disklayer_vhdx::chain::open_vhdx_chain_explicit; +/// # use std::path::Path; +/// let resource = open_vhdx_chain_explicit( +/// &[Path::new("child.vhdx"), Path::new("base.vhdx")], +/// false, +/// ).unwrap(); +/// ``` +pub fn open_vhdx_chain_explicit( + paths: &[&Path], + read_only: bool, +) -> anyhow::Result> { + anyhow::ensure!(!paths.is_empty(), "vhdx chain must have at least one file"); + + let layers: Vec = paths + .iter() + .enumerate() + .map(|(i, path)| { + let is_leaf = i == 0; + let layer_read_only = !is_leaf || read_only; + + let file = std::fs::OpenOptions::new() + .read(true) + .write(!layer_read_only) + .open(path) + .with_context(|| format!("failed to open vhdx layer {}: {}", i, path.display()))?; + + let handle = VhdxDiskLayerHandle { + file, + read_only: layer_read_only, + }; + + Ok(DiskLayerDescription { + layer: handle.into_resource(), + read_cache: false, + write_through: false, + }) + }) + .collect::>>()?; + + Ok(Resource::new(LayeredDiskHandle { layers })) +} + +/// Open a VHDX differencing chain by auto-walking parent locators. +/// +/// Starting from the file at `path`, reads each VHDX file's parent locator +/// to discover the next parent in the chain, continuing until a base +/// (non-differencing) disk is found. +/// +/// The leaf file is opened for read+write (unless `read_only` is true); +/// all parent files are opened read-only. +/// +/// Parent path resolution order: +/// 1. `relative_path` — resolved relative to the child's directory +/// 2. `absolute_win32_path` — absolute path (platform-dependent) +/// 3. `volume_path` — volume GUID path (Windows-specific) +/// +/// # Errors +/// +/// Returns an error if: +/// - The leaf file cannot be opened or parsed +/// - A parent locator specifies no usable path +/// - A parent file cannot be found at any of the locator paths +/// - The chain exceeds a reasonable depth limit (detect cycles) +pub async fn open_vhdx_chain( + path: &Path, + read_only: bool, +) -> anyhow::Result> { + // Reasonable depth limit to detect cycles or absurdly long chains. + const MAX_CHAIN_DEPTH: usize = 256; + + let mut paths: Vec = vec![path.to_path_buf()]; + let mut current_path = path.to_path_buf(); + + loop { + if paths.len() > MAX_CHAIN_DEPTH { + anyhow::bail!( + "vhdx chain exceeds maximum depth of {} — possible cycle", + MAX_CHAIN_DEPTH + ); + } + + // Open the current file read-only just to read metadata. + // The actual read-write open happens later via open_vhdx_chain_explicit. + let bf = crate::io::BlockingFile::open(¤t_path, true) + .with_context(|| format!("failed to open vhdx file: {}", current_path.display()))?; + let vhdx = vhdx::VhdxFile::open(bf) + .read_only() + .await + .with_context(|| format!("failed to parse vhdx file: {}", current_path.display()))?; + + if !vhdx.has_parent() { + // Base disk — chain is complete. + break; + } + + // Read the parent locator. + let locator = vhdx + .parent_locator() + .await + .with_context(|| { + format!( + "failed to read parent locator from: {}", + current_path.display() + ) + })? + .context("differencing disk has no parent locator")?; + + let parent_paths = locator.parent_paths(); + let child_dir = current_path.parent().unwrap_or_else(|| Path::new(".")); + + // Try to resolve the parent path in order of preference. 
+ let parent_path = resolve_parent_path(child_dir, &parent_paths).with_context(|| { + format!( + "could not find parent for vhdx file: {}", + current_path.display() + ) + })?; + + paths.push(parent_path.clone()); + current_path = parent_path; + } + + // Convert PathBufs to Path references for open_vhdx_chain_explicit. + let path_refs: Vec<&Path> = paths.iter().map(|p| p.as_path()).collect(); + open_vhdx_chain_explicit(&path_refs, read_only) +} + +/// Try to resolve a parent path from the locator's well-known keys. +/// +/// Tries paths in order: relative_path, absolute_win32_path, volume_path. +/// Returns the first path that exists on disk, or an error if none work. +fn resolve_parent_path( + child_dir: &Path, + parent_paths: &vhdx::ParentPaths, +) -> anyhow::Result { + let mut candidates: Vec = Vec::new(); + + // 1. Relative path — resolve relative to the child's directory. + if let Some(ref rel) = parent_paths.relative_path { + // VHDX relative paths use Windows separators (backslash). + // Normalize to the platform's separator. + let normalized: String = rel + .chars() + .map(|c| { + if c == '\\' { + std::path::MAIN_SEPARATOR + } else { + c + } + }) + .collect(); + // Strip leading ".\" or "./" if present. + let stripped = normalized + .strip_prefix(&format!(".{}", std::path::MAIN_SEPARATOR)) + .unwrap_or(&normalized); + candidates.push(child_dir.join(stripped)); + } + + // 2. Absolute Win32 path (Windows-specific). + if cfg!(windows) { + if let Some(ref abs) = parent_paths.absolute_win32_path { + candidates.push(std::path::PathBuf::from(abs)); + } + } + + // 3. Volume path (Windows-specific). + if cfg!(windows) { + if let Some(ref vol) = parent_paths.volume_path { + candidates.push(std::path::PathBuf::from(vol)); + } + } + + for candidate in &candidates { + if candidate.exists() { + return Ok(candidate.clone()); + } + } + + if candidates.is_empty() { + anyhow::bail!("parent locator contains no path entries"); + } + + // None of the candidates exist. Report all attempted paths. 
+ let tried: Vec = candidates.iter().map(|p| p.display().to_string()).collect(); + anyhow::bail!("parent not found at any locator path: {}", tried.join(", ")) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn open_single_creates_one_layer() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.vhdx"); + + let path2 = path.clone(); + pal_async::DefaultPool::run_with(|_driver| async move { + let bf = crate::io::BlockingFile::open(&path2, false).unwrap(); + let mut params = vhdx::CreateParams { + disk_size: 1024 * 1024, + ..Default::default() + }; + vhdx::create(&bf, &mut params).await.unwrap(); + }); + + let resource = open_vhdx_single(&path, false).unwrap(); + let _ = resource; + } + + #[test] + fn explicit_chain_empty_errors() { + let result = open_vhdx_chain_explicit(&[], false); + assert!(result.is_err()); + } + + #[test] + fn explicit_chain_single_path() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("base.vhdx"); + + let path2 = path.clone(); + pal_async::DefaultPool::run_with(|_driver| async move { + let bf = crate::io::BlockingFile::open(&path2, false).unwrap(); + let mut params = vhdx::CreateParams { + disk_size: 1024 * 1024, + ..Default::default() + }; + vhdx::create(&bf, &mut params).await.unwrap(); + }); + + let resource = open_vhdx_chain_explicit(&[path.as_path()], false).unwrap(); + let _ = resource; + } + + #[test] + fn explicit_chain_missing_file_errors() { + let result = open_vhdx_chain_explicit(&[Path::new("nonexistent.vhdx")], false); + assert!(result.is_err()); + } + + #[pal_async::async_test] + async fn auto_walk_base_disk() { + // Create a base (non-differencing) VHDX, then auto-walk it. + // Should produce a single-layer chain. + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("base.vhdx"); + + let bf = crate::io::BlockingFile::open(&path, false).unwrap(); + let mut params = vhdx::CreateParams { + disk_size: 1024 * 1024, + ..Default::default() + }; + vhdx::create(&bf, &mut params).await.unwrap(); + drop(bf); + + let resource = open_vhdx_chain(&path, false).await.unwrap(); + let _ = resource; + } +} diff --git a/vm/devices/storage/disklayer_vhdx/src/io.rs b/vm/devices/storage/disklayer_vhdx/src/io.rs new file mode 100644 index 0000000000..a696bcd3ca --- /dev/null +++ b/vm/devices/storage/disklayer_vhdx/src/io.rs @@ -0,0 +1,217 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Async file I/O backends for the `vhdx` crate. +//! +//! [`BlockingFile`] implements [`vhdx::AsyncFile`] using `blocking::unblock` +//! and positional I/O (pread/pwrite on Unix, seek_read/seek_write on Windows). +//! No Mutex is needed — `Arc` with positional I/O is inherently safe +//! for concurrent access. + +use std::borrow::Borrow; +use std::fs; +use std::io; +use std::path::Path; +use std::sync::Arc; +use vhdx::AsyncFile; + +/// Platform-specific positional read. +#[cfg(unix)] +fn file_read_at(file: &fs::File, buf: &mut [u8], offset: u64) -> io::Result { + std::os::unix::fs::FileExt::read_at(file, buf, offset) +} + +/// Platform-specific positional read. +#[cfg(windows)] +fn file_read_at(file: &fs::File, buf: &mut [u8], offset: u64) -> io::Result { + std::os::windows::fs::FileExt::seek_read(file, buf, offset) +} + +/// Platform-specific positional write. +#[cfg(unix)] +fn file_write_at(file: &fs::File, buf: &[u8], offset: u64) -> io::Result { + std::os::unix::fs::FileExt::write_at(file, buf, offset) +} + +/// Platform-specific positional write. 
+#[cfg(windows)] +fn file_write_at(file: &fs::File, buf: &[u8], offset: u64) -> io::Result { + std::os::windows::fs::FileExt::seek_write(file, buf, offset) +} + +/// Read exactly `buf.len()` bytes at `offset`, looping on short reads. +fn read_exact_at(file: &fs::File, mut buf: &mut [u8], mut offset: u64) -> io::Result<()> { + while !buf.is_empty() { + let n = file_read_at(file, buf, offset)?; + if n == 0 { + return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "short read")); + } + offset += n as u64; + buf = &mut buf[n..]; + } + Ok(()) +} + +/// Write exactly `buf.len()` bytes at `offset`, looping on short writes. +fn write_exact_at(file: &fs::File, mut buf: &[u8], mut offset: u64) -> io::Result<()> { + while !buf.is_empty() { + let n = file_write_at(file, buf, offset)?; + if n == 0 { + return Err(io::Error::new( + io::ErrorKind::WriteZero, + "failed to write any bytes", + )); + } + offset += n as u64; + buf = &buf[n..]; + } + Ok(()) +} + +/// A concrete [`AsyncFile`] backed by `Arc`. +/// +/// Uses positional I/O so no seek state or Mutex is needed. Multiple I/Os +/// can be dispatched concurrently. Each operation runs on the `blocking` +/// crate's thread pool via `blocking::unblock`. +#[derive(Clone)] +pub struct BlockingFile { + file: Arc, +} + +impl BlockingFile { + /// Wrap an existing open file. + pub fn new(file: fs::File) -> Self { + Self { + file: Arc::new(file), + } + } + + /// Open a file at the given path. + /// + /// If `read_only`, the file is opened for reading only. + /// Otherwise, it is opened for reading, writing, and creation. + pub fn open(path: &Path, read_only: bool) -> io::Result { + let file = fs::OpenOptions::new() + .read(true) + .write(!read_only) + .create(!read_only) + .open(path)?; + Ok(Self::new(file)) + } + + /// Returns a clone of the inner `Arc`. + /// + /// Useful when the caller needs to perform additional file operations + /// (e.g., data I/O on resolved ranges in the LayerIo implementation). 
+ pub fn clone_arc(&self) -> Arc { + self.file.clone() + } +} + +impl AsyncFile for BlockingFile { + type Buffer = Vec; + + fn alloc_buffer(&self, len: usize) -> Vec { + vec![0u8; len] + } + + async fn read_into(&self, offset: u64, buf: Vec) -> Result, io::Error> { + let file = self.file.clone(); + blocking::unblock(move || { + let mut buf = buf; + read_exact_at(&file, &mut buf, offset)?; + Ok(buf) + }) + .await + } + + async fn write_from( + &self, + offset: u64, + buf: impl Borrow> + Send + 'static, + ) -> Result<(), io::Error> { + let file = self.file.clone(); + blocking::unblock(move || write_exact_at(&file, buf.borrow().as_ref(), offset)).await + } + + async fn flush(&self) -> Result<(), io::Error> { + let file = self.file.clone(); + blocking::unblock(move || file.sync_all()).await + } + + async fn file_size(&self) -> Result { + Ok(self.file.metadata()?.len()) + } + + async fn set_file_size(&self, size: u64) -> Result<(), io::Error> { + let file = self.file.clone(); + blocking::unblock(move || file.set_len(size)).await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pal_async::async_test; + + #[async_test] + async fn round_trip_read_write() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.bin"); + + // Create file, write data, read back + let bf = BlockingFile::open(&path, false).unwrap(); + // Set a size + bf.set_file_size(4096).await.unwrap(); + + let write_data = Arc::new(vec![0xAB_u8; 512]); + bf.write_from(0, write_data.clone()).await.unwrap(); + bf.write_from(1024, write_data.clone()).await.unwrap(); + + let read_buf = bf.read_into(0, vec![0u8; 512]).await.unwrap(); + assert_eq!(read_buf, *write_data); + + let read_buf = bf.read_into(1024, vec![0u8; 512]).await.unwrap(); + assert_eq!(read_buf, *write_data); + + // Verify gap is zeros + let read_buf = bf.read_into(512, vec![0u8; 512]).await.unwrap(); + assert_eq!(read_buf, vec![0u8; 512]); + + // Verify file_size + assert_eq!(bf.file_size().await.unwrap(), 4096); + } + + #[async_test] + async fn flush_succeeds() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.bin"); + let bf = BlockingFile::open(&path, false).unwrap(); + bf.set_file_size(4096).await.unwrap(); + bf.flush().await.unwrap(); + } + + #[async_test] + async fn open_with_vhdx() { + // Create a VHDX in memory, write to disk, open with BlockingFile, + // validate VhdxFile::open works + use vhdx::VhdxFile; + + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.vhdx"); + + // Step 1: Create via BlockingFile + let bf = BlockingFile::open(&path, false).unwrap(); + let mut params = vhdx::CreateParams { + disk_size: 1024 * 1024, // 1 MiB + ..Default::default() + }; + vhdx::create(&bf, &mut params).await.unwrap(); + + // Step 2: Re-open and validate + let bf = BlockingFile::open(&path, false).unwrap(); + let vhdx = VhdxFile::open(bf).read_only().await.unwrap(); + assert_eq!(vhdx.disk_size(), 1024 * 1024); + assert_eq!(vhdx.logical_sector_size(), 512); + } +} diff --git a/vm/devices/storage/disklayer_vhdx/src/lib.rs b/vm/devices/storage/disklayer_vhdx/src/lib.rs new file mode 100644 index 0000000000..e580785551 --- /dev/null +++ b/vm/devices/storage/disklayer_vhdx/src/lib.rs @@ -0,0 +1,491 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! VHDX disk layer for OpenVMM. +//! +//! Provides a cross-platform, pure-Rust VHDX backend for the layered disk +//! stack. Uses the `vhdx` crate for format parsing and the `disk_layered` +//! 
crate's `LayerIo` trait for integration. +//! +//! # Modules +//! +//! - [`io`] — `BlockingFile`: async file I/O via `blocking::unblock` + +#![forbid(unsafe_code)] + +pub mod chain; +pub mod io; +pub mod resolver; + +use disk_backend::DiskError; +use disk_backend::UnmapBehavior; +use disk_layered::LayerIo; +use disk_layered::SectorMarker; +use guestmem::MemoryRead; +use guestmem::MemoryWrite; +use inspect::Inspect; +use io::BlockingFile; +use scsi_buffers::RequestBuffers; +use vhdx::AsyncFile; +use vhdx::ReadRange; +use vhdx::VhdxFile; +use vhdx::WriteRange; + +/// A VHDX disk layer implementing [`LayerIo`]. +/// +/// Bridges the `vhdx` crate's range-based I/O model ([`ReadRange`]/[`WriteRange`]) +/// to the `disk_layered` crate's buffer-based I/O model ([`RequestBuffers`] + +/// [`SectorMarker`]). +#[derive(Inspect)] +pub struct VhdxLayer { + #[inspect(skip)] + vhdx: VhdxFile, + #[inspect(skip)] + file: BlockingFile, + sector_size: u32, + physical_sector_size: u32, + sector_count: u64, + block_size: u32, + has_parent: bool, + read_only: bool, +} + +impl VhdxLayer { + /// Create a `VhdxLayer` from an open `VhdxFile` and a clone of the + /// `BlockingFile` used to open it. + /// + /// `file` must be a clone of the `BlockingFile` that was passed to + /// `VhdxFile::open`. Both share the same `Arc`, so data I/O + /// on resolved ranges goes to the same underlying file descriptor. + pub fn new(vhdx: VhdxFile, file: BlockingFile, read_only: bool) -> Self { + let sector_size = vhdx.logical_sector_size(); + let physical_sector_size = vhdx.physical_sector_size(); + let sector_count = vhdx.disk_size() / sector_size as u64; + let block_size = vhdx.block_size(); + let has_parent = vhdx.has_parent(); + Self { + vhdx, + file, + sector_size, + physical_sector_size, + sector_count, + block_size, + has_parent, + read_only, + } + } +} + +/// Convert a [`vhdx::VhdxIoError`] to a [`DiskError`]. +fn vhdx_to_disk_error(e: vhdx::VhdxIoError) -> DiskError { + match e.kind() { + vhdx::VhdxIoErrorKind::ReadOnly => DiskError::ReadOnly, + vhdx::VhdxIoErrorKind::InvalidInput => DiskError::InvalidInput, + vhdx::VhdxIoErrorKind::InvalidSector => DiskError::IllegalBlock, + _ => DiskError::Io(std::io::Error::other(e)), + } +} + +impl LayerIo for VhdxLayer { + fn layer_type(&self) -> &str { + "vhdx" + } + + fn sector_count(&self) -> u64 { + self.sector_count + } + + fn sector_size(&self) -> u32 { + self.sector_size + } + + fn disk_id(&self) -> Option<[u8; 16]> { + Some(self.vhdx.page_83_data().into()) + } + + fn physical_sector_size(&self) -> u32 { + self.physical_sector_size + } + + fn is_fua_respected(&self) -> bool { + true + } + + fn is_logically_read_only(&self) -> bool { + self.read_only + } + + fn optimal_unmap_sectors(&self) -> u32 { + self.block_size / self.sector_size + } + + fn unmap_behavior(&self) -> UnmapBehavior { + if self.has_parent { + UnmapBehavior::Unspecified + } else { + UnmapBehavior::Zeroes + } + } + + async fn sync_cache(&self) -> Result<(), DiskError> { + self.vhdx.flush().await.map_err(vhdx_to_disk_error) + } + + async fn read( + &self, + buffers: &RequestBuffers<'_>, + sector: u64, + mut marker: SectorMarker<'_>, + ) -> Result<(), DiskError> { + let offset = sector * self.sector_size as u64; + let len = buffers.len() as u32; + + // Resolve the read into file-level ranges. + let mut ranges = Vec::new(); + let guard = self + .vhdx + .resolve_read(offset, len, &mut ranges) + .await + .map_err(vhdx_to_disk_error)?; + + // Process each range. 
+ for range in &ranges { + match *range { + ReadRange::Data { + guest_offset, + length, + file_offset, + } => { + let buf_offset = (guest_offset - offset) as usize; + + // Read from the VHDX file into an owned buffer (zero-copy I/O). + let buf = self.file.alloc_buffer(length as usize); + let buf = self + .file + .read_into(file_offset, buf) + .await + .map_err(DiskError::Io)?; + + // Write data into the request buffers at the correct position. + buffers + .subrange(buf_offset, length as usize) + .writer() + .write(buf.as_ref())?; + + // Mark these sectors as present. + let start_sector = guest_offset / self.sector_size as u64; + let sector_count = length as u64 / self.sector_size as u64; + marker.set_range(start_sector..start_sector + sector_count); + } + ReadRange::Zero { + guest_offset, + length, + } => { + let buf_offset = (guest_offset - offset) as usize; + + // Zero this portion of the request buffers. + buffers + .subrange(buf_offset, length as usize) + .writer() + .zero(length as usize)?; + + // Mark these sectors as present (they are definitively zero). + let start_sector = guest_offset / self.sector_size as u64; + let sector_count = length as u64 / self.sector_size as u64; + marker.set_range(start_sector..start_sector + sector_count); + } + ReadRange::Unmapped { .. } => { + // Do NOT mark these sectors. LayeredDisk will read from + // the next layer down in the stack. + } + } + } + + // Drop the guard (decrements per-block refcounts). + drop(guard); + + Ok(()) + } + + async fn write( + &self, + buffers: &RequestBuffers<'_>, + sector: u64, + fua: bool, + ) -> Result<(), DiskError> { + let offset = sector * self.sector_size as u64; + let len = buffers.len() as u32; + + // Resolve the write into file-level ranges. + let mut ranges = Vec::new(); + let guard = self + .vhdx + .resolve_write(offset, len, &mut ranges) + .await + .map_err(vhdx_to_disk_error)?; + + // Process each range. + for range in &ranges { + match *range { + WriteRange::Data { + guest_offset, + length, + file_offset, + } => { + let buf_offset = (guest_offset - offset) as usize; + + // Read data from the request buffers into an owned buffer. + let mut buf = self.file.alloc_buffer(length as usize); + buffers + .subrange(buf_offset, length as usize) + .reader() + .read(buf.as_mut())?; + + // Write to the VHDX file at the resolved offset (zero-copy I/O). + self.file + .write_from(file_offset, buf) + .await + .map_err(DiskError::Io)?; + } + WriteRange::Zero { + file_offset, + length, + } => { + // Write zeros to the file at the given offset + // (for newly-allocated block padding). + self.file + .zero_range(file_offset, length as u64) + .await + .map_err(DiskError::Io)?; + } + } + } + + // Complete the write (commits TFP blocks, updates sector bitmaps). + guard.complete().await.map_err(vhdx_to_disk_error)?; + + // If FUA, flush to stable storage. + if fua { + self.vhdx.flush().await.map_err(vhdx_to_disk_error)?; + } + + Ok(()) + } + + async fn unmap( + &self, + sector: u64, + count: u64, + _block_level_only: bool, + _next_is_zero: bool, + ) -> Result<(), DiskError> { + let offset = sector * self.sector_size as u64; + let length = count * self.sector_size as u64; + + // Use TrimMode::Zero for base disks (unmapped reads will return zero), + // TrimMode::MakeTransparent for diff disks (reads fall through to parent). 
+ let mode = if self.has_parent { + vhdx::TrimMode::MakeTransparent + } else { + vhdx::TrimMode::Zero + }; + + self.vhdx + .trim(vhdx::TrimRequest::new(mode, offset, length)) + .await + .map_err(vhdx_to_disk_error) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use disk_backend::DiskIo; + use disk_layered::DiskLayer; + use disk_layered::LayerConfiguration; + use disk_layered::LayeredDisk; + use guestmem::GuestMemory; + use pal_async::DefaultDriver; + use pal_async::async_test; + use scsi_buffers::OwnedRequestBuffers; + use vhdx::VhdxFile; + + /// Create a VHDX file at the given path and return a `VhdxLayer`. + async fn create_and_open_layer(path: &std::path::Path, driver: &DefaultDriver) -> VhdxLayer { + // Create a 1 MiB VHDX. + let bf = BlockingFile::open(path, false).unwrap(); + let mut params = vhdx::CreateParams { + disk_size: 1024 * 1024, + ..Default::default() + }; + vhdx::create(&bf, &mut params).await.unwrap(); + + // Re-open and wrap as VhdxLayer. + let bf = BlockingFile::open(path, false).unwrap(); + let bf2 = bf.clone(); + let vhdx = VhdxFile::open(bf).writable(&driver).await.unwrap(); + VhdxLayer::new(vhdx, bf2, false) + } + + /// Wrap a VhdxLayer in a single-layer LayeredDisk. + async fn wrap_in_layered_disk(layer: VhdxLayer) -> LayeredDisk { + LayeredDisk::new( + false, + vec![LayerConfiguration { + layer: DiskLayer::new(layer), + write_through: false, + read_cache: false, + }], + ) + .await + .unwrap() + } + + #[async_test] + async fn read_empty_disk_via_layer(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.vhdx"); + + let layer = create_and_open_layer(&path, &driver).await; + + // Verify metadata. + assert_eq!(layer.sector_size(), 512); + assert_eq!(layer.sector_count(), 1024 * 1024 / 512); + assert_eq!(layer.layer_type(), "vhdx"); + assert!(!layer.is_logically_read_only()); + + let disk = wrap_in_layered_disk(layer).await; + + // Read sector 0 — empty disk should return all zeros. + let mem = GuestMemory::allocate(512); + let owned = OwnedRequestBuffers::linear(0, 512, true); + disk.read_vectored(&owned.buffer(&mem), 0).await.unwrap(); + + let mut buf = vec![0u8; 512]; + mem.read_at(0, &mut buf).unwrap(); + assert_eq!(buf, vec![0u8; 512]); + } + + #[async_test] + async fn write_and_read_back(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.vhdx"); + + let layer = create_and_open_layer(&path, &driver).await; + let disk = wrap_in_layered_disk(layer).await; + + // Write a known pattern to sector 0. + let mem = GuestMemory::allocate(512); + let pattern: Vec = (0..512u16).map(|i| (i % 251) as u8).collect(); + mem.write_at(0, &pattern).unwrap(); + let owned = OwnedRequestBuffers::linear(0, 512, false); + disk.write_vectored(&owned.buffer(&mem), 0, false) + .await + .unwrap(); + + // Read back sector 0, verify data matches. + let owned = OwnedRequestBuffers::linear(0, 512, true); + disk.read_vectored(&owned.buffer(&mem), 0).await.unwrap(); + let mut buf = vec![0u8; 512]; + mem.read_at(0, &mut buf).unwrap(); + assert_eq!(buf, pattern); + + // Read sector 1 — should be zero. 
+ disk.read_vectored(&owned.buffer(&mem), 1).await.unwrap(); + mem.read_at(0, &mut buf).unwrap(); + assert_eq!(buf, vec![0u8; 512]); + } + + #[async_test] + async fn sync_cache_works(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.vhdx"); + + let layer = create_and_open_layer(&path, &driver).await; + let disk = wrap_in_layered_disk(layer).await; + + disk.sync_cache().await.unwrap(); + } + + #[async_test] + async fn write_close_reopen_read(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.vhdx"); + + // Create and write data + { + let layer = create_and_open_layer(&path, &driver).await; + let disk = wrap_in_layered_disk(layer).await; + + let mem = GuestMemory::allocate(512); + let pattern: Vec = (0..512u16).map(|i| (i % 251) as u8).collect(); + mem.write_at(0, &pattern).unwrap(); + let owned = OwnedRequestBuffers::linear(0, 512, false); + disk.write_vectored(&owned.buffer(&mem), 0, false) + .await + .unwrap(); + + // Flush to ensure data is on disk + disk.sync_cache().await.unwrap(); + } + + // Re-open and read back + { + let bf = BlockingFile::open(&path, true).unwrap(); + let bf2 = bf.clone(); + let vhdx = VhdxFile::open(bf) + .allow_replay(true) + .read_only() + .await + .unwrap(); + let layer = VhdxLayer::new(vhdx, bf2, true); + let disk = LayeredDisk::new( + true, + vec![LayerConfiguration { + layer: DiskLayer::new(layer), + write_through: false, + read_cache: false, + }], + ) + .await + .unwrap(); + + let mem = GuestMemory::allocate(512); + let owned = OwnedRequestBuffers::linear(0, 512, true); + disk.read_vectored(&owned.buffer(&mem), 0).await.unwrap(); + + let mut buf = vec![0u8; 512]; + mem.read_at(0, &mut buf).unwrap(); + let expected: Vec = (0..512u16).map(|i| (i % 251) as u8).collect(); + assert_eq!(buf, expected); + } + } + + #[async_test] + async fn multi_sector_write_and_read(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.vhdx"); + + let layer = create_and_open_layer(&path, &driver).await; + let disk = wrap_in_layered_disk(layer).await; + + // Write 4 KiB (8 sectors) starting at sector 0 + let len = 4096usize; + let mem = GuestMemory::allocate(len); + let pattern: Vec = (0..len).map(|i| (i % 137) as u8).collect(); + mem.write_at(0, &pattern).unwrap(); + + let owned = OwnedRequestBuffers::linear(0, len, false); + disk.write_vectored(&owned.buffer(&mem), 0, false) + .await + .unwrap(); + + // Read back and verify + let owned = OwnedRequestBuffers::linear(0, len, true); + disk.read_vectored(&owned.buffer(&mem), 0).await.unwrap(); + + let mut buf = vec![0u8; len]; + mem.read_at(0, &mut buf).unwrap(); + assert_eq!(buf, pattern); + } +} diff --git a/vm/devices/storage/disklayer_vhdx/src/resolver.rs b/vm/devices/storage/disklayer_vhdx/src/resolver.rs new file mode 100644 index 0000000000..3d30dbe377 --- /dev/null +++ b/vm/devices/storage/disklayer_vhdx/src/resolver.rs @@ -0,0 +1,65 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Resource resolver for VHDX disk layers. 
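+//!
+//! A rough usage sketch (illustrative only; it mirrors how this change wires
+//! things up in `openvmm_resources` and in `disklayer_vhdx::chain`):
+//!
+//! ```ignore
+//! // Register the resolver in the static resolver list.
+//! vm_resource::register_static_resolvers! {
+//!     disklayer_vhdx::resolver::VhdxDiskLayerResolver,
+//! }
+//!
+//! // Describe a single-layer VHDX disk. Resolving the handle opens the
+//! // file with the pure-Rust `vhdx` parser and yields a `ResolvedDiskLayer`.
+//! let file = std::fs::File::open("disk.vhdx")?;
+//! let disk = disk_backend_resources::LayeredDiskHandle::single_layer(
+//!     disk_backend_resources::layer::VhdxDiskLayerHandle { file, read_only: true },
+//! );
+//! ```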
+ +use crate::VhdxLayer; +use crate::io::BlockingFile; +use async_trait::async_trait; +use disk_backend_resources::layer::VhdxDiskLayerHandle; +use disk_layered::resolve::ResolveDiskLayerParameters; +use disk_layered::resolve::ResolvedDiskLayer; +use thiserror::Error; +use vhdx::VhdxFile; +use vm_resource::AsyncResolveResource; +use vm_resource::ResourceResolver; +use vm_resource::declare_static_async_resolver; +use vm_resource::kind::DiskLayerHandleKind; + +/// Resolver for [`VhdxDiskLayerHandle`]. +pub struct VhdxDiskLayerResolver; + +declare_static_async_resolver!( + VhdxDiskLayerResolver, + (DiskLayerHandleKind, VhdxDiskLayerHandle) +); + +/// Errors from resolving a VHDX disk layer. +#[derive(Debug, Error)] +pub enum ResolveVhdxError { + /// Failed to open the VHDX file. + #[error("failed to open vhdx")] + Open(#[source] vhdx::OpenError), +} + +#[async_trait] +impl AsyncResolveResource for VhdxDiskLayerResolver { + type Output = ResolvedDiskLayer; + type Error = ResolveVhdxError; + + async fn resolve( + &self, + _resolver: &ResourceResolver, + resource: VhdxDiskLayerHandle, + input: ResolveDiskLayerParameters<'_>, + ) -> Result { + let read_only = resource.read_only || input.read_only; + let file = BlockingFile::new(resource.file); + let file2 = file.clone(); + let vhdx = if read_only { + VhdxFile::open(file) + .read_only() + .await + .map_err(ResolveVhdxError::Open)? + } else { + let driver = input.driver_source.simple(); + VhdxFile::open(file) + .writable(&driver) + .await + .map_err(ResolveVhdxError::Open)? + }; + Ok(ResolvedDiskLayer::new(VhdxLayer::new( + vhdx, file2, read_only, + ))) + } +} diff --git a/vm/devices/storage/vhdx/Cargo.toml b/vm/devices/storage/vhdx/Cargo.toml new file mode 100644 index 0000000000..2cc7f05a97 --- /dev/null +++ b/vm/devices/storage/vhdx/Cargo.toml @@ -0,0 +1,34 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +[package] +name = "vhdx" +edition.workspace = true +rust-version.workspace = true + +[dependencies] +bitfield-struct.workspace = true +bitvec = { workspace = true, features = ["std"] } +crc32c.workspace = true +event-listener.workspace = true +futures.workspace = true +guid.workspace = true +mesh.workspace = true +pal_async.workspace = true +parking_lot = { workspace = true, features = ["arc_lock"] } +thiserror.workspace = true +tracing.workspace = true +zerocopy.workspace = true + +[target.'cfg(windows)'.dev-dependencies] +windows = { workspace = true, features = [ + "Win32_Storage_Vhd", + "Win32_Foundation", + "Win32_Storage_FileSystem", + "Win32_System_IO", + "Win32_System_Threading", +] } +tempfile.workspace = true + +[lints] +workspace = true diff --git a/vm/devices/storage/vhdx/src/apply_task.rs b/vm/devices/storage/vhdx/src/apply_task.rs new file mode 100644 index 0000000000..14966a1f5a --- /dev/null +++ b/vm/devices/storage/vhdx/src/apply_task.rs @@ -0,0 +1,95 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Apply task — writes logged pages to their final file offsets. +//! +//! The apply task receives [`ApplyBatch`] items from the +//! [log task](crate::log_task) via a mesh channel. For each batch, it +//! writes all pages to their final file offsets, **releases log permits** +//! (via [`LogPermits`](crate::log_permits::LogPermits)), and publishes +//! `applied_lsn` with the flush sequence number (FSN) needed to make +//! the writes durable. +//! +//! The apply task does **not** flush. Flushing is driven by consumers +//! who need durability: +//! 
- The log task flushes when it needs to advance the log tail +//! (on `LogFull` or graceful close). +//! - [`VhdxFile::flush()`](crate::open::VhdxFile::flush) flushes for crash safety. +//! +//! Both callers use +//! [`FlushSequencer::flush_through()`](crate::flush::FlushSequencer::flush_through) +//! with the FSN from the watermark, which coalesces naturally. + +use crate::AsyncFile; +use crate::flush::FlushSequencer; +use crate::log_permits::LogPermits; +use crate::log_task::LogData; +use crate::log_task::Lsn; +use crate::lsn_watermark::LsnWatermark; +use crate::open::FailureFlag; +use std::sync::Arc; + +/// A batch of page-aligned data that has been logged and needs to be applied +/// (written to their final file offsets). +pub(crate) struct ApplyBatch { + /// The data to write. + pub data: Vec>, + /// The LSN of the log entry that contains these pages. + pub lsn: Lsn, +} + +/// Run the apply task main loop. +/// +/// Receives batches from the log task, writes pages to their final +/// file offsets, releases log permits, and publishes `applied_lsn` +/// with the FSN needed for durability. +pub(crate) async fn run_apply_task( + mut rx: mesh::Receiver>, + file: Arc, + flush_sequencer: Arc, + applied_lsn: Arc, + log_permits: Arc, + failure_flag: Arc, +) { + loop { + let batch = match rx.recv().await { + Ok(batch) => batch, + Err(_) => { + // Channel closed — log task shut down. Exit. + break; + } + }; + + let lsn = batch.lsn; + let mut page_count = 0; + + // Write each range to its final file offset (zero-copy via Arc). + for entry in batch.data { + page_count += entry.page_count(); + let (file_offset, data) = entry.into_parts(); + if let Err(e) = file.write_from(file_offset, data).await { + tracing::error!( + "VHDX apply task: write error at offset {:#x}: {e}", + file_offset + ); + log_permits.fail(format!("apply write failed: {e}")); + applied_lsn.fail(format!("apply write failed: {e}")); + failure_flag.set(&e); + return; + } + } + + // The pages have been dropped by this point, so it's safe to release + // the permits for this batch. + log_permits.release(page_count); + + // Capture the FSN *after* the writes. Flushing through this FSN + // will make all the writes above durable. We don't flush here — + // the log task or VhdxFile::flush() will do it when needed. + let fsn = flush_sequencer.current_fsn(); + + // Publish (lsn, fsn): "pages through this LSN are at their final + // offsets; flush through this FSN to make them durable." + applied_lsn.advance(lsn, fsn); + } +} diff --git a/vm/devices/storage/vhdx/src/bat.rs b/vm/devices/storage/vhdx/src/bat.rs new file mode 100644 index 0000000000..1e5514745b --- /dev/null +++ b/vm/devices/storage/vhdx/src/bat.rs @@ -0,0 +1,1395 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! BAT (Block Allocation Table) lookup and management. +//! +//! Provides on-demand BAT entry lookup through the [`PageCache`], computing +//! the correct BAT page offset for any given block number. Handles the +//! interleaving of payload block entries with sector bitmap entries. 
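+//!
+//! For orientation, the interleaving places one sector bitmap entry after
+//! every `chunk_ratio` payload entries (per MS-VHDX). A sketch of the index
+//! arithmetic this implies (illustrative only; the actual lookup in this
+//! module goes through [`PageCache`] pages):
+//!
+//! ```ignore
+//! // `b` = payload block number, `c` = chunk number, `r` = chunk_ratio
+//! // (payload blocks per sector bitmap block).
+//! fn payload_bat_index(b: u64, r: u64) -> u64 {
+//!     b + b / r
+//! }
+//! fn sector_bitmap_bat_index(c: u64, r: u64) -> u64 {
+//!     c * (r + 1) + r
+//! }
+//! ```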
+ +use crate::AsyncFile; +use crate::cache::PageCache; +use crate::cache::PageKey; +use crate::cache::WriteMode; +use crate::create::ceil_div; +use crate::create::chunk_block_count; +use crate::error::CorruptionType; +use crate::error::OpenError; +use crate::error::OpenErrorInner; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::flush::Fsn; +use crate::format::BatEntry; +use crate::format::BatEntryState; +use crate::format::CACHE_PAGE_SIZE; +use crate::format::ENTRIES_PER_BAT_PAGE; +use crate::format::MB1; +use bitfield_struct::bitfield; +use std::sync::atomic::AtomicU16; +use std::sync::atomic::AtomicU32; +use std::sync::atomic::Ordering; +use zerocopy::IntoBytes; + +use crate::space::EofState; +use crate::space::FreeSpaceTracker; +use zerocopy::FromBytes; + +/// Cache tag for BAT region pages. +pub(crate) const BAT_TAG: u8 = 0; + +/// Size of a sector bitmap block in bytes (1 MiB). +pub(crate) const SECTOR_BITMAP_BLOCK_SIZE: u32 = 1024 * 1024; + +/// Per-block I/O refcount packed into a `u16`. +/// +/// Layout: +/// - Bit 15 (`TRIM_PENDING_BIT`): set by trim to block new I/O acquisitions. +/// - Bits 0-14: I/O reference count (0..32767). +/// +/// Valid states: +/// - `0x0000` — idle, no I/O, no trim. +/// - `0x0001..MAX_IO_REFCOUNT` — active I/O refcount. +/// - `TRIM_PENDING` (`0x8000`) — trim pending, I/Os drained, ready to claim. +/// - `0x8001..0xFFFE` — trim pending + draining I/Os. +/// - `TRIM_CLAIMED` (`0xFFFF`) — trim owns the block exclusively. +/// +/// The pending bit gives trim **writer priority**: once set, no new I/O +/// can increment the refcount, preventing livelock from a steady I/O stream. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +struct IoBlockRef(u16); + +impl IoBlockRef { + /// High bit: trim is pending (blocks new I/O). + const TRIM_PENDING_BIT: u16 = 0x8000; + /// Maximum I/O refcount (bits 0-14 all set). + const MAX_IO_REFCOUNT: u16 = 0x7FFF; + const FREE: Self = Self(0); + /// Trim pending, all I/Os drained — ready to finish claiming. + const TRIM_PENDING: Self = Self(Self::TRIM_PENDING_BIT); + /// Trim has exclusively claimed the block. + const TRIM_CLAIMED: Self = Self(u16::MAX); + + /// The I/O refcount (bits 0-14), ignoring the trim-pending bit. + fn io_count(self) -> u16 { + self.0 & Self::MAX_IO_REFCOUNT + } + + /// Whether the trim-pending bit is set. + fn trim_pending(self) -> bool { + self.0 & Self::TRIM_PENDING_BIT != 0 + } + + /// Whether new I/O acquisitions should be blocked. + /// + /// True when the trim-pending bit is set OR the I/O count is at + /// the maximum (would overflow into the pending bit). + fn blocks_new_io(self) -> bool { + self.0 >= Self::MAX_IO_REFCOUNT + } +} + +pub(crate) struct Bat { + /// Number of data blocks (payload blocks) in the disk. + pub data_block_count: u32, + /// Number of sector bitmap blocks (chunks). Zero if no parent. + pub sector_bitmap_block_count: u32, + /// Chunk ratio: number of data blocks per sector bitmap entry. + pub chunk_ratio: u32, + /// Block size in bytes. + pub block_size: u32, + /// Whether the disk has a parent (differencing). + pub has_parent: bool, + + /// One `AtomicU32` per payload block (indexed by block number). + /// Each stores a [`BlockMapping`] bitfield. Lock-free: individual + /// entries are read/written atomically without a shared lock. + payload_mappings: Vec, + /// One `AtomicU32` per sector bitmap block (indexed by chunk number). + sector_bitmap_mappings: Vec, + + /// Per-payload-block I/O refcounts (see [`IoBlockRef`] for layout). 
+ io_refcounts: Vec, + + /// Notified whenever a block's refcount changes in a way that could + /// unblock a waiter: I/O count reaching zero (unblocks trim), + /// trim releasing a claim (unblocks I/O), or I/O count dropping + /// below the overflow threshold (unblocks I/O). + refcount_event: event_listener::Event, +} + +/// In-memory BAT entry. Compact 32-bit representation used in the in-memory +/// BAT array (not on disk). +/// +/// Layout: state (3 bits) | transitioning_to_fully_present (1 bit) | file_megabyte (28 bits) +/// +/// The 28-bit `file_megabyte` field supports files up to 2^28 MB = 256 TB. +#[bitfield(u32)] +#[derive(PartialEq, Eq)] +pub(crate) struct BlockMapping { + /// Block state (same values as BatEntryState). + #[bits(3)] + state: u8, + /// Set during allocation: space has been allocated but data I/O may still + /// be in flight. Other writers to this block must wait. + #[bits(1)] + pub transitioning_to_fully_present: bool, + /// File offset in megabytes. + #[bits(28)] + pub file_megabyte: u32, +} + +impl BlockMapping { + fn supported_bat_entry_bits() -> u64 { + u64::from( + BatEntry::new() + .with_state(0b111) + .with_file_offset_mb((1_u64 << 44) - 1), + ) + } + + /// File byte offset (converts the megabyte field to bytes). + pub fn file_offset(self) -> u64 { + self.file_megabyte() as u64 * MB1 + } + + /// Parse the block state. + /// + /// Panics if the raw state is invalid — this is an internal invariant + /// since states are validated at BAT load time and only set to known + /// values at runtime. + pub fn bat_state(self) -> BatEntryState { + BatEntryState::from_raw(self.state()).expect("InternalBlockMapping has invalid state") + } + + pub fn with_bat_state(self, state: BatEntryState) -> Self { + self.with_state(state as u8) + } + + /// Whether this mapping is soft-anchored: unmapped or undefined + /// with a non-zero file offset retained for potential reuse. + pub fn is_soft_anchored(self) -> bool { + let state = self.bat_state(); + matches!(state, BatEntryState::Unmapped | BatEntryState::Undefined) + && self.file_megabyte() != 0 + } + + /// Create a [`BlockMapping`] from an on-disk [`BatEntry`]. + /// + /// Validates the entry state and file offset. For non-differencing + /// disks (`has_parent == false`), normalizes `PartiallyPresent` to + /// `FullyPresent` at load time. + pub fn from_bat_entry(entry: BatEntry, has_parent: bool) -> Result { + if u64::from(entry) & !Self::supported_bat_entry_bits() != 0 { + return Err(CorruptionType::ReservedBatEntryFieldNonzero.into()); + } + let raw_state = entry.state(); + let mut bat_state = + BatEntryState::from_raw(raw_state).ok_or(CorruptionType::InvalidBlockState)?; + // Normalize PartiallyPresent → FullyPresent for non-diff disks. + if !has_parent && bat_state == BatEntryState::PartiallyPresent { + bat_state = BatEntryState::FullyPresent; + } + let file_mb = entry.file_offset_mb(); + if file_mb > 0x0FFF_FFFF { + return Err((CorruptionType::InvalidBlockState).into()); + } + Ok(BlockMapping::new() + .with_bat_state(bat_state) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(file_mb as u32)) + } + + /// Create a [`BlockMapping`] from an on-disk SBM [`BatEntry`]. + /// + /// Validates the entry state and file offset. Normalizes + /// `PartiallyPresent` to `FullyPresent` (compatibility). 
+ pub fn from_sbm_bat_entry(entry: BatEntry) -> Result { + if u64::from(entry) & !Self::supported_bat_entry_bits() != 0 { + return Err(CorruptionType::ReservedBatEntryFieldNonzero.into()); + } + let raw_state = entry.state(); + + let mut bat_state = + BatEntryState::from_raw(raw_state).ok_or(CorruptionType::InvalidBlockState)?; + if bat_state == BatEntryState::PartiallyPresent { + bat_state = BatEntryState::FullyPresent; + } + let file_mb = entry.file_offset_mb(); + if file_mb > 0x0FFF_FFFF { + return Err((CorruptionType::InvalidBlockState).into()); + } + Ok(BlockMapping::new() + .with_bat_state(bat_state) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(file_mb as u32)) + } +} + +/// Block type discriminator for BAT entries. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum BlockType { + /// A data payload block. + Payload, + /// A sector bitmap block (differencing disks only). + SectorBitmap, +} + +impl Bat { + /// Create a new BAT manager from parsed metadata. + /// + /// Computes chunk ratio, data block count, and sector bitmap block count. + pub fn new( + disk_size: u64, + block_size: u32, + logical_sector_size: u32, + has_parent: bool, + bat_length: u32, + ) -> Result { + let chunk_ratio = chunk_block_count(block_size, logical_sector_size); + if chunk_ratio == 0 { + return Err((CorruptionType::InvalidBlockSize).into()); + } + + let data_block_count = ceil_div(disk_size, block_size as u64) as u32; + let sector_bitmap_block_count = if has_parent { + ceil_div(data_block_count as u64, chunk_ratio as u64) as u32 + } else { + 0 + }; + + let entry_count = if has_parent { + sector_bitmap_block_count as u64 * (chunk_ratio as u64 + 1) + } else { + data_block_count as u64 + + (data_block_count.saturating_sub(1) as u64 / chunk_ratio as u64) + }; + + let required_bytes = entry_count * size_of::() as u64; + if required_bytes > bat_length as u64 { + return Err((CorruptionType::BatTooSmall).into()); + } + + let payload_mappings = (0..data_block_count).map(|_| AtomicU32::new(0)).collect(); + let sector_bitmap_mappings = (0..sector_bitmap_block_count) + .map(|_| AtomicU32::new(0)) + .collect(); + let io_refcounts = (0..data_block_count).map(|_| AtomicU16::new(0)).collect(); + + Ok(Bat { + data_block_count, + sector_bitmap_block_count, + chunk_ratio, + block_size, + has_parent, + payload_mappings, + sector_bitmap_mappings, + io_refcounts, + refcount_event: event_listener::Event::new(), + }) + } + + /// Try to atomically increment the I/O refcount for a block. + /// + /// Returns `true` if the increment succeeded, `false` if new I/O is + /// blocked. New I/O is blocked when: + /// - The trim-pending bit is set (trim has writer priority). + /// - The I/O count is at `MAX_IO_REFCOUNT` (would overflow). + /// - The block is trim-claimed (`TRIM_CLAIMED`). + fn try_increment_io_refcount(&self, block_number: u32) -> bool { + let rc = &self.io_refcounts[block_number as usize]; + loop { + let old = IoBlockRef(rc.load(Ordering::Acquire)); + if old.blocks_new_io() { + return false; + } + let new = old.0 + 1; + match rc.compare_exchange_weak(old.0, new, Ordering::AcqRel, Ordering::Acquire) { + Ok(_) => return true, + Err(_) => continue, + } + } + } + + /// Atomically decrement the I/O refcount. + /// + /// The trim-pending bit is preserved — only the I/O count in + /// bits 0-14 is decremented. Panics on underflow or if the block + /// is trim-claimed. 
+ /// + /// Returns `true` if callers should notify `refcount_event`: + /// when the I/O count hits zero (trim may be waiting), or when + /// the count drops from the overflow threshold. + #[must_use] + fn decrement_io_refcount(&self, block_number: u32) -> bool { + let prev = + IoBlockRef(self.io_refcounts[block_number as usize].fetch_sub(1, Ordering::AcqRel)); + assert!( + prev.io_count() > 0 && prev != IoBlockRef::TRIM_CLAIMED, + "io_refcount underflow or trim claimed on block {block_number} (was {:#06x})", + prev.0, + ); + prev.io_count() == 1 || prev.io_count() == IoBlockRef::MAX_IO_REFCOUNT + } + + /// Claim a block for trim, with writer priority. + pub(crate) async fn claim_for_trim(&self, block_number: u32) -> TrimGuard<'_> { + let rc = &self.io_refcounts[block_number as usize]; + loop { + let listener = self.refcount_event.listen(); + let result = rc.fetch_update(Ordering::AcqRel, Ordering::Acquire, |raw| { + let old = IoBlockRef(raw); + match old { + // Idle — claim directly. + IoBlockRef::FREE => Some(IoBlockRef::TRIM_CLAIMED.0), + // Pending bit set, I/Os drained — finish claiming. + IoBlockRef::TRIM_PENDING => Some(IoBlockRef::TRIM_CLAIMED.0), + // Already claimed — wait for release. + IoBlockRef::TRIM_CLAIMED => None, + // I/Os active, no pending bit — set it. + _ if !old.trim_pending() => Some(old.0 | IoBlockRef::TRIM_PENDING_BIT), + // Pending bit set, I/Os still draining — wait. + _ => None, + } + }); + break match result.map(IoBlockRef) { + Ok(IoBlockRef::FREE | IoBlockRef::TRIM_PENDING) => TrimGuard { + bat: self, + block_number, + }, + _ => { + // Wait for the I/O count to reach 0 or for the trim claim to be released. + listener.await; + continue; + } + }; + } + } + + /// Release a trim claim on a block (store 0), waking blocked I/O paths. + fn release_trim_claim(&self, block_number: u32) { + let prev = IoBlockRef(self.io_refcounts[block_number as usize].swap(0, Ordering::Release)); + assert_eq!( + prev, + IoBlockRef::TRIM_CLAIMED, + "release_trim_claim on block {block_number} that wasn't claimed (was {:#06x})", + prev.0, + ); + self.refcount_event.notify(usize::MAX); + } + + /// Load the current raw I/O refcount for a block (for testing). + #[cfg(test)] + pub(crate) fn io_refcount(&self, block_number: u32) -> u16 { + self.io_refcounts[block_number as usize].load(Ordering::Acquire) + } + + /// Compute the BAT entry index for a given data block number. + /// + /// For every `chunk_ratio` payload entries, one sector bitmap entry is + /// interleaved. The entry index accounts for these interleaved entries. + pub fn payload_entry_index(&self, block_number: u32) -> u32 { + block_number + (block_number / self.chunk_ratio) + } + + /// Compute the BAT entry index for a given sector bitmap block (chunk number). + /// + /// The sector bitmap entry follows every `chunk_ratio` payload entries. + pub fn sector_bitmap_entry_index(&self, chunk_number: u32) -> u32 { + ((chunk_number + 1) * self.chunk_ratio) + chunk_number + } + + /// Reverse-map a flat BAT entry number to (block_type, block_number). + /// + /// Returns `None` if the entry is beyond the end of the disk. + fn entry_number_to_block_id(&self, entry_number: u32) -> Option<(BlockType, u32)> { + let group_size = self.chunk_ratio + 1; + let group = entry_number / group_size; + let position = entry_number % group_size; + + if position == self.chunk_ratio { + // This is a sector bitmap / padding entry. 
+ if self.has_parent && group < self.sector_bitmap_block_count { + Some((BlockType::SectorBitmap, group)) + } else { + None + } + } else { + // This is a payload entry. + let block_number = group * self.chunk_ratio + position; + if block_number < self.data_block_count { + Some((BlockType::Payload, block_number)) + } else { + None + } + } + } + + /// Convert a virtual disk byte offset to a block number. + pub fn offset_to_block(&self, offset: u64) -> u32 { + (offset / self.block_size as u64) as u32 + } + + /// Compute the byte offset within a block for a given virtual disk offset. + #[cfg(test)] + pub fn offset_within_block(&self, offset: u64) -> u32 { + (offset % self.block_size as u64) as u32 + } + + /// Iterate over the block spans touched by a virtual disk range. + /// + /// Given a guest `offset` and `len`, yields one [`BlockSpan`] per + /// block touched, with the block-relative offset and clamped length. + /// This is the single source of truth for the block-walk arithmetic + /// used by read, write, complete, and abort paths. + pub fn block_spans(&self, offset: u64, len: u32) -> BlockSpanIter { + BlockSpanIter { + block_size: self.block_size, + base_offset: offset, + total_len: len, + current_offset: 0, + } + } + + /// Serialize a BAT page from in-memory state. + /// + /// Produces all entries for the given page, with TFP blocks having + /// their `file_offset_mb` masked to zero (allocation not committed + /// yet). + fn produce_page(&self, page_index: usize, buf: &mut [u8; CACHE_PAGE_SIZE as usize]) { + let base_entry = page_index as u32 * ENTRIES_PER_BAT_PAGE as u32; + for i in 0..ENTRIES_PER_BAT_PAGE as u32 { + let entry_number = base_entry + i; + let bat_entry = match self.entry_number_to_block_id(entry_number) { + Some((BlockType::Payload, block_number)) => { + let mapping = self.get_block_mapping(block_number); + let file_mb = if mapping.transitioning_to_fully_present() { + 0 + } else { + mapping.file_megabyte() as u64 + }; + BatEntry::new() + .with_state(mapping.state()) + .with_file_offset_mb(file_mb) + } + Some((BlockType::SectorBitmap, chunk_number)) => { + let mapping = self.get_sector_bitmap_mapping(chunk_number); + BatEntry::new() + .with_state(mapping.state()) + .with_file_offset_mb(mapping.file_megabyte() as u64) + } + None => BatEntry::new(), + }; + let offset = i as usize * size_of::(); + buf[offset..offset + size_of::()].copy_from_slice(bat_entry.as_bytes()); + } + } + + /// Write a block mapping to the cache, converting from in-memory + /// representation to on-disk BAT entry format. + /// + /// Atomically updates the in-memory BAT and the cache page under + /// the page lock, ensuring no window where the in-memory state is + /// visible but the cache page hasn't been stamped with the FSN. + /// + /// Uses `Overwrite` mode to avoid unnecessary disk reads. If the + /// page is already cached, patches only the single entry. If not + /// cached, builds the full page from in-memory state (no disk read). 
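+    ///
+    /// Minimal call sketch (illustrative only; assumes `bat`, `cache`,
+    /// `block`, and a prepared `mapping` are in scope):
+    ///
+    /// ```ignore
+    /// bat.write_block_mapping(&cache, BlockType::Payload, block, mapping, None)
+    ///     .await?;
+    /// ```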
+ pub async fn write_block_mapping( + &self, + cache: &PageCache, + block_type: BlockType, + block_number: u32, + mapping: BlockMapping, + pre_log_fsn: Option, + ) -> Result<(), VhdxIoError> { + let entry_number = match block_type { + BlockType::Payload => self.payload_entry_index(block_number), + BlockType::SectorBitmap => self.sector_bitmap_entry_index(block_number), + }; + let page_number = entry_number as usize / ENTRIES_PER_BAT_PAGE as usize; + let page_offset = page_number as u64 * CACHE_PAGE_SIZE; + let entry_within_page = entry_number as usize % ENTRIES_PER_BAT_PAGE as usize; + + let mut guard = cache + .acquire_write( + PageKey { + tag: BAT_TAG, + offset: page_offset, + }, + WriteMode::Overwrite, + ) + .await + .map_err(VhdxIoErrorInner::BatCache)?; + + // Update in-memory BAT under the page lock. This ensures a + // concurrent trim on a block sharing the same page can't dirty + // the page (and get it flushed to WAL) between our in-memory + // update and the FSN stamp below. + match block_type { + BlockType::Payload => { + self.set_block_mapping(block_number, mapping); + } + BlockType::SectorBitmap => { + self.set_sector_bitmap_mapping(block_number, mapping); + } + } + + if guard.is_overwriting() { + // Slow path: page not cached — build from in-memory state. + self.produce_page(page_number, &mut guard); + } else { + // Fast path: page is cached — patch just the one entry. + let bat_entry = BatEntry::new() + .with_state(mapping.state()) + .with_file_offset_mb(mapping.file_megabyte() as u64); + let byte_offset = entry_within_page * size_of::(); + guard[byte_offset..byte_offset + size_of::()] + .copy_from_slice(bat_entry.as_bytes()); + } + + // Set pre-log FSN while the page lock is still held, so + // that the FSN is visible atomically with the dirty-mark. + if let Some(fsn) = pre_log_fsn { + guard.set_pre_log_fsn(fsn); + } + + // BAT pages are always rebuildable from in-memory BatState, + // so prefer evicting them over sector bitmap pages. + guard.demote(); + + Ok(()) + } + + /// Read chunk size for BAT loading (256 KiB = 32768 entries). + const BAT_READ_CHUNK: usize = 256 * 1024; + + /// Load the in-memory BAT state from disk. + /// + /// Reads the BAT region in fixed-size chunks and does a single + /// sequential pass over all entries, dispatching payload vs. SBM + /// entries via [`entry_number_to_block_id`]. This avoids both a + /// large peak allocation and redundant reads of the same region. + /// + /// During parse, marks allocated blocks in the FreeSpaceTracker + /// and records soft-anchored blocks. + pub(crate) async fn load_bat_state( + &mut self, + file: &F, + bat_offset: u64, + bat_length: u32, + free_space: &FreeSpaceTracker, + eof_state: &mut EofState, + ) -> Result<(), OpenError> { + let bat_len = bat_length as usize; + let total_entries = bat_len / size_of::(); + let chunk_size = std::cmp::min(bat_len, Self::BAT_READ_CHUNK); + let entries_per_chunk = chunk_size / size_of::(); + + let mut file_pos: usize = 0; + let mut entry_num = 0; + + // Allocate a single read buffer, reused across iterations. + // On the last iteration, we may read up to `chunk_size` bytes + // even if fewer remain — the excess is zero-initialized and + // ignored by the parser. + let mut buf = file.alloc_buffer(chunk_size); + + while entry_num < total_entries { + // Read the next chunk. 
+ buf = file + .read_into(bat_offset + file_pos as u64, buf) + .await + .map_err(OpenErrorInner::Io)?; + + let entries_in_chunk = std::cmp::min(entries_per_chunk, total_entries - entry_num); + for i in 0..entries_in_chunk { + let byte_offset = i * size_of::(); + let entry = BatEntry::read_from_bytes( + &buf.as_ref()[byte_offset..byte_offset + size_of::()], + ) + .map_err(|_| CorruptionType::InvalidBlockState)?; + + match self.entry_number_to_block_id((entry_num + i) as u32) { + Some((BlockType::Payload, block_number)) => { + let mapping = BlockMapping::from_bat_entry(entry, self.has_parent)?; + if mapping.bat_state().is_allocated() { + let file_offset = mapping.file_offset(); + if file_offset != 0 { + free_space.mark_range_in_use( + eof_state, + file_offset, + self.block_size, + )?; + } + } else if (mapping.bat_state() == BatEntryState::Unmapped + || mapping.bat_state() == BatEntryState::Undefined) + && mapping.file_megabyte() != 0 + { + let file_offset = mapping.file_offset(); + free_space.mark_range_in_use( + eof_state, + file_offset, + self.block_size, + )?; + free_space.mark_trimmed_block( + block_number, + file_offset, + self.block_size, + )?; + } + self.payload_mappings[block_number as usize] + .store(mapping.into(), Ordering::Relaxed); + } + Some((BlockType::SectorBitmap, chunk_number)) => { + let mapping = BlockMapping::from_sbm_bat_entry(entry)?; + if mapping.bat_state().is_allocated() { + let file_offset = mapping.file_offset(); + if file_offset != 0 { + free_space.mark_range_in_use( + eof_state, + file_offset, + SECTOR_BITMAP_BLOCK_SIZE, + )?; + } + } + self.sector_bitmap_mappings[chunk_number as usize] + .store(mapping.into(), Ordering::Relaxed); + } + None => { + // Entry beyond the disk — padding per the VHDX spec. + } + } + } + + entry_num += entries_in_chunk; + file_pos += chunk_size; + } + + // Cross-validate: every PartiallyPresent payload block must have + // a corresponding allocated (FullyPresent) SBM block. This is + // required by the VHDX spec and enforced at runtime by + // ensure_sbm_allocated, but a corrupt file could violate it. + if self.has_parent { + for block in 0..self.data_block_count { + let mapping = BlockMapping::from( + self.payload_mappings[block as usize].load(Ordering::Relaxed), + ); + if mapping.bat_state() == BatEntryState::PartiallyPresent { + let chunk = block / self.chunk_ratio; + let sbm = BlockMapping::from( + self.sector_bitmap_mappings[chunk as usize].load(Ordering::Relaxed), + ); + if sbm.bat_state() != BatEntryState::FullyPresent { + return Err(CorruptionType::PartiallyPresentWithoutSectorBitmap.into()); + } + } + } + } + + Ok(()) + } + + /// Atomically increment I/O refcounts for a contiguous range of + /// blocks, returning a [`BatGuard`] that releases them on drop. + /// + /// Blocks are acquired in ascending order. If a block is claimed + /// by trim, the caller holds previously-acquired blocks and waits + /// for the blocked block to become available. Deadlock-free because + /// both I/O and trim always acquire blocks in ascending order. 
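+    ///
+    /// Illustrative usage for a non-empty I/O of `len` bytes starting at
+    /// guest `offset` (names assumed in scope):
+    ///
+    /// ```ignore
+    /// let first = bat.offset_to_block(offset);
+    /// let last = bat.offset_to_block(offset + len as u64 - 1);
+    /// let _guard = bat.acquire_io_refcounts(first, last - first + 1).await;
+    /// // ... issue block I/O; refcounts are released when the guard drops.
+    /// ```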
+ pub async fn acquire_io_refcounts(&self, start_block: u32, block_count: u32) -> BatGuard<'_> { + let mut guard = BatGuard { + bat: Some(self), + start_block, + block_count: 0, + }; + for block in start_block..start_block + block_count { + while !self.try_increment_io_refcount(block) { + let listener = self.refcount_event.listen(); + if !self.try_increment_io_refcount(block) { + listener.await; + } + } + guard.block_count += 1; + } + guard + } + + /// Look up the payload block mapping for a given data block number. + /// + /// Returns a point-in-time snapshot. Callers that hold I/O + /// refcounts can rely on the following: + /// + /// - **`file_offset` is stable for allocated blocks.** If the + /// mapping shows `FullyPresent` or `PartiallyPresent`, the file + /// offset won't be reclaimed out from under you — trim must + /// drain I/O refcounts before it can claim the block. + /// - **State can only advance, not regress.** A block that is + /// `FullyPresent` won't revert to `NotPresent` while I/O + /// refcounts are held. (Trim sets the pending bit to block new + /// I/O, then waits for existing I/O to drain.) + /// - **TFP blocks are in flight.** If `transitioning_to_fully_present` + /// is set, another writer is mid-allocation. The write path waits + /// on `allocation_event` and retries; the read path ignores TFP + /// and uses the current state+offset directly (safe because the + /// file offset is valid and I/O refcounts prevent reclamation). + /// + /// Without I/O refcounts (or a trim claim), the mapping is purely + /// advisory — the block could be trimmed between the load and any + /// action on it. + pub(crate) fn get_block_mapping(&self, block_number: u32) -> BlockMapping { + BlockMapping::from(self.payload_mappings[block_number as usize].load(Ordering::Acquire)) + } + + /// Look up the sector bitmap block mapping for a given chunk number. + /// + /// SBM mappings are set once during allocation and never revert, + /// so any reader that sees `FullyPresent` can rely on the file + /// offset being stable indefinitely. A reader that sees + /// `NotPresent` must allocate the SBM block before proceeding + /// (see `ensure_sbm_allocated`). + pub(crate) fn get_sector_bitmap_mapping(&self, chunk_number: u32) -> BlockMapping { + BlockMapping::from( + self.sector_bitmap_mappings[chunk_number as usize].load(Ordering::Acquire), + ) + } + + /// Update the payload block mapping for a given data block number. + /// + /// In-memory only — does not persist to cache or disk. Use + /// [`write_block_mapping`](Self::write_block_mapping) to persist. + /// + /// Allowed transitions and their required guards: + /// + /// - Unallocated → same state + TFP + file offset: `allocation_lock`. + /// (Unallocated = NotPresent, Zero, Unmapped, or Undefined.) + /// - PartiallyPresent → same state + TFP: `allocation_lock`. + /// - Any + TFP → original mapping (revert): abort path — TFP + /// acts as an exclusive flag so no other guard is needed. + /// - Soft-anchored → same state + file_megabyte=0: `allocation_lock`. + /// + /// The TFP bit is the key invariant: once set on a block, no other + /// allocator will touch that block (they wait on `allocation_event`), + /// and trim cannot reach it because the allocator holds I/O + /// refcounts on TFP blocks. This makes the setter the exclusive + /// owner until TFP is cleared. 
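+    ///
+    /// Sketch of the allocation transition (illustrative only — the real
+    /// allocator holds `allocation_lock` and I/O refcounts as described
+    /// above; `new_offset_mb` is an assumed name):
+    ///
+    /// ```ignore
+    /// let old = bat.get_block_mapping(block);
+    /// bat.set_block_mapping(
+    ///     block,
+    ///     old.with_transitioning_to_fully_present(true)
+    ///         .with_file_megabyte(new_offset_mb),
+    /// );
+    /// ```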
+ pub(crate) fn set_block_mapping(&self, block_number: u32, mapping: BlockMapping) { + self.payload_mappings[block_number as usize].store(mapping.into(), Ordering::Release); + } + + /// Update the sector bitmap block mapping for a given chunk number. + /// + /// Only called from [`write_block_mapping`](Self::write_block_mapping) + /// under the page cache write lock. SBM mappings transition from + /// `NotPresent` to `FullyPresent` exactly once and never revert. + fn set_sector_bitmap_mapping(&self, chunk_number: u32, mapping: BlockMapping) { + self.sector_bitmap_mappings[chunk_number as usize].store(mapping.into(), Ordering::Release); + } + + /// Initialize payload mappings for testing. Replaces any existing + /// mappings with `data_block_count` entries set to `NotPresent`. + #[cfg(test)] + pub(crate) fn init_test_payload_mappings(&mut self) { + let not_present = BlockMapping::new().with_bat_state(BatEntryState::NotPresent); + for mapping in &self.payload_mappings { + mapping.store(not_present.into(), Ordering::Relaxed); + } + } +} + +#[must_use] +pub struct BatGuard<'a> { + bat: Option<&'a Bat>, + /// First payload block number with incremented refcount. + start_block: u32, + /// Number of consecutive payload blocks with incremented refcounts. + block_count: u32, +} + +impl<'a> BatGuard<'a> { + pub(crate) fn empty() -> Self { + Self { + bat: None, + start_block: 0, + block_count: 0, + } + } +} + +impl Drop for BatGuard<'_> { + fn drop(&mut self) { + let Some(bat) = self.bat else { return }; + let mut notify = false; + for block in self.start_block..self.start_block + self.block_count { + notify |= bat.decrement_io_refcount(block); + } + if notify { + bat.refcount_event.notify(usize::MAX); + } + } +} + +#[must_use] +pub struct TrimGuard<'a> { + bat: &'a Bat, + block_number: u32, +} + +impl Drop for TrimGuard<'_> { + fn drop(&mut self) { + self.bat.release_trim_claim(self.block_number); + } +} + +/// A single block's portion of a virtual disk I/O range. +/// +/// Produced by [`Bat::block_spans`]. Each span describes one block's +/// contribution to an `(offset, len)` range. +#[derive(Debug, Clone, Copy)] +pub struct BlockSpan { + /// Block number within the BAT. + pub block_number: u32, + /// Byte offset within the block where this span starts. + pub block_offset: u32, + /// Number of bytes this span covers within the block. + pub length: u32, + /// Absolute guest virtual disk byte offset for this span. + pub virtual_offset: u64, +} + +impl BlockSpan { + /// Whether this span covers the entire block. + pub fn is_full_block(&self, block_size: u32) -> bool { + self.block_offset == 0 && self.length >= block_size + } +} + +/// Iterator over [`BlockSpan`]s produced by [`Bat::block_spans`]. 
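+///
+/// Illustrative sketch: with 2 MiB blocks, a 4 KiB I/O straddling a block
+/// boundary yields two spans (assumes a suitable `bat` is in scope):
+///
+/// ```ignore
+/// let spans: Vec<_> = bat.block_spans(2 * 1024 * 1024 - 2048, 4096).collect();
+/// assert_eq!(spans.len(), 2);
+/// assert_eq!(spans[0].length, 2048);
+/// assert_eq!(spans[1].block_offset, 0);
+/// ```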
+pub struct BlockSpanIter { + block_size: u32, + base_offset: u64, + total_len: u32, + current_offset: u32, +} + +impl Iterator for BlockSpanIter { + type Item = BlockSpan; + + fn next(&mut self) -> Option { + if self.current_offset >= self.total_len { + return None; + } + let virtual_offset = self.base_offset + self.current_offset as u64; + let block_number = (virtual_offset / self.block_size as u64) as u32; + let block_offset = (virtual_offset % self.block_size as u64) as u32; + let length = std::cmp::min( + self.block_size - block_offset, + self.total_len - self.current_offset, + ); + self.current_offset += length; + Some(BlockSpan { + block_number, + block_offset, + length, + virtual_offset, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::format; + use pal_async::async_test; + use std::sync::Arc; + + #[test] + fn chunk_ratio_default_params() { + // 2 MiB blocks, 512-byte sectors → chunk_ratio = 2048 + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + false, + MB1 as u32, + ) + .unwrap(); + assert_eq!(bat.chunk_ratio, 2048); + } + + #[test] + fn chunk_ratio_various_sizes() { + // 1 MiB blocks, 512 sectors + let bat = Bat::new(format::GB1, MB1 as u32, 512, false, MB1 as u32).unwrap(); + assert_eq!(bat.chunk_ratio, 4096); + + // 4 MiB blocks, 512 sectors + let bat = Bat::new(format::GB1, 4 * MB1 as u32, 512, false, MB1 as u32).unwrap(); + assert_eq!(bat.chunk_ratio, 1024); + + // 32 MiB blocks, 512 sectors + let bat = Bat::new(format::GB1, 32 * MB1 as u32, 512, false, MB1 as u32).unwrap(); + assert_eq!(bat.chunk_ratio, 128); + + // 256 MiB blocks, 512 sectors + let bat = Bat::new(format::GB1, 256 * MB1 as u32, 512, false, MB1 as u32).unwrap(); + assert_eq!(bat.chunk_ratio, 16); + + // 2 MiB blocks, 4096 sectors: sectors_per_block = 512, chunk_ratio = 8388608 / 512 = 16384 + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 4096, + false, + MB1 as u32, + ) + .unwrap(); + assert_eq!(bat.chunk_ratio, 16384); + + // 1 MiB blocks, 4096 sectors: sectors_per_block = 256, chunk_ratio = 8388608 / 256 = 32768 + let bat = Bat::new(format::GB1, MB1 as u32, 4096, false, MB1 as u32).unwrap(); + assert_eq!(bat.chunk_ratio, 32768); + } + + #[test] + fn payload_entry_index_calculations() { + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + false, + MB1 as u32, + ) + .unwrap(); + // chunk_ratio = 2048 + assert_eq!(bat.payload_entry_index(0), 0); + assert_eq!(bat.payload_entry_index(1), 1); + assert_eq!( + bat.payload_entry_index(bat.chunk_ratio - 1), + bat.chunk_ratio - 1 + ); + // At chunk_ratio, we skip one SBM slot. + assert_eq!( + bat.payload_entry_index(bat.chunk_ratio), + bat.chunk_ratio + 1 + ); + assert_eq!( + bat.payload_entry_index(bat.chunk_ratio + 1), + bat.chunk_ratio + 2 + ); + // At 2 * chunk_ratio, skip another. + assert_eq!( + bat.payload_entry_index(2 * bat.chunk_ratio), + 2 * bat.chunk_ratio + 2 + ); + } + + #[test] + fn sector_bitmap_entry_index_calculations() { + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + true, + MB1 as u32, + ) + .unwrap(); + // SBM entry 0 is at position chunk_ratio. + assert_eq!(bat.sector_bitmap_entry_index(0), bat.chunk_ratio); + // SBM entry 1 is at position 2*chunk_ratio + 1. + assert_eq!(bat.sector_bitmap_entry_index(1), 2 * bat.chunk_ratio + 1); + } + + #[test] + fn validate_bat_size_ok() { + // For 1 GiB / 2 MiB = 512 data blocks, chunk_ratio = 2048. + // entries = 512 + ((512-1)/2048) = 512 + 0 = 512 + // 512 * 8 = 4096 bytes. 
Any bat_length >= 4096 is fine. + Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + false, + MB1 as u32, + ) + .unwrap(); + } + + #[test] + fn validate_bat_size_too_small() { + // 512 entries * 8 bytes = 4096 bytes needed. + let result = Bat::new(format::GB1, format::DEFAULT_BLOCK_SIZE, 512, false, 4095); + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::BatTooSmall + ))) + )); + } + + #[test] + fn offset_to_block_calculations() { + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + false, + MB1 as u32, + ) + .unwrap(); + assert_eq!(bat.offset_to_block(0), 0); + assert_eq!( + bat.offset_to_block(format::DEFAULT_BLOCK_SIZE as u64 - 1), + 0 + ); + assert_eq!(bat.offset_to_block(format::DEFAULT_BLOCK_SIZE as u64), 1); + assert_eq!( + bat.offset_to_block(format::DEFAULT_BLOCK_SIZE as u64 * 10 + 42), + 10 + ); + } + + #[test] + fn offset_within_block_calculations() { + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + false, + MB1 as u32, + ) + .unwrap(); + assert_eq!(bat.offset_within_block(0), 0); + assert_eq!(bat.offset_within_block(512), 512); + assert_eq!( + bat.offset_within_block(format::DEFAULT_BLOCK_SIZE as u64), + 0 + ); + assert_eq!( + bat.offset_within_block(format::DEFAULT_BLOCK_SIZE as u64 + 1024), + 1024 + ); + } + + #[test] + fn mapping_max_file_megabyte() { + let max_mb: u32 = (1 << 28) - 1; // 268435455 + let mapping = BlockMapping::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_megabyte(max_mb); + assert_eq!(mapping.file_megabyte(), max_mb); + assert_eq!(mapping.file_offset(), max_mb as u64 * MB1); + } + + #[test] + fn mapping_tfp_flag() { + let with_tfp = BlockMapping::new() + .with_state(BatEntryState::NotPresent as u8) + .with_transitioning_to_fully_present(true); + assert!(with_tfp.transitioning_to_fully_present()); + + let without_tfp = BlockMapping::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_transitioning_to_fully_present(false); + assert!(!without_tfp.transitioning_to_fully_present()); + + // TFP is independent of state. + assert_eq!(with_tfp.state(), BatEntryState::NotPresent as u8); + assert_eq!(without_tfp.state(), BatEntryState::FullyPresent as u8); + } + + #[test] + fn mapping_from_bat_entry() { + let entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(100); + let mapping = BlockMapping::from_bat_entry(entry, false).unwrap(); + assert_eq!(mapping.state(), BatEntryState::FullyPresent as u8); + assert_eq!(mapping.file_megabyte(), 100); + assert!(!mapping.transitioning_to_fully_present()); + } + + #[test] + fn mapping_rejects_reserved_bat_entry_bits() { + let entry = BatEntry::from( + u64::from(BatEntry::new().with_state(BatEntryState::FullyPresent as u8)) | (1 << 3), + ); + let result = BlockMapping::from_bat_entry(entry, false); + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::ReservedBatEntryFieldNonzero + ))) + )); + } + + #[test] + fn entry_number_to_block_id_payload() { + // Non-differencing: all entries are payload. 
+ let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + false, + MB1 as u32, + ) + .unwrap(); + // chunk_ratio = 2048, data_block_count = 512 + for i in 0..bat.data_block_count { + let entry_index = bat.payload_entry_index(i); + let result = bat.entry_number_to_block_id(entry_index); + assert_eq!(result, Some((BlockType::Payload, i)), "block {i}"); + } + } + + #[test] + fn entry_number_to_block_id_with_sbm() { + // Differencing disk with SBM entries. + // Use small chunk_ratio to exercise interleaving. + // 1 MiB blocks, 4096 sectors → chunk_ratio = 32768. + // Use 256 MiB blocks, 512 sectors → chunk_ratio = 16. + let bat = Bat::new(format::GB1, 256 * MB1 as u32, 512, true, MB1 as u32).unwrap(); + assert_eq!(bat.chunk_ratio, 16); + // data_block_count = 4, sector_bitmap_block_count = 1 + + // Payload entries for group 0: positions 0..15 → blocks 0..3 + for i in 0..bat.data_block_count { + let entry_index = bat.payload_entry_index(i); + let result = bat.entry_number_to_block_id(entry_index); + assert_eq!(result, Some((BlockType::Payload, i)), "payload block {i}"); + } + + // SBM entry for chunk 0 at position chunk_ratio = 16 + let sbm_index = bat.sector_bitmap_entry_index(0); + assert_eq!( + bat.entry_number_to_block_id(sbm_index), + Some((BlockType::SectorBitmap, 0)) + ); + } + + #[test] + fn entry_number_to_block_id_beyond_end() { + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + false, + MB1 as u32, + ) + .unwrap(); + // Entry beyond all data blocks should return None. + let beyond = bat.payload_entry_index(bat.data_block_count); + assert_eq!(bat.entry_number_to_block_id(beyond), None); + } + + /// Non-differencing disk with data_block_count > chunk_ratio. + /// + /// The BAT has padding entries at every chunk_ratio boundary. These + /// must NOT be misidentified as payload entries. + #[test] + fn entry_number_to_block_id_padding_not_payload() { + // Use 256 MiB blocks so chunk_ratio is small (16 with 512B sectors). + // 8 GiB disk → data_block_count = 32 (> chunk_ratio=16). + let bat = Bat::new( + 8 * format::GB1, + 256 * MB1 as u32, + 512, + false, + 4 * MB1 as u32, // BAT length large enough + ) + .unwrap(); + assert_eq!(bat.chunk_ratio, 16); + assert_eq!(bat.data_block_count, 32); + + // Entry 16 is the padding entry (position == chunk_ratio in group 0). + // It should NOT map to payload block 16. + let padding_entry = bat.chunk_ratio; // entry 16 + let result = bat.entry_number_to_block_id(padding_entry); + assert_eq!( + result, None, + "entry {} is a padding entry on non-diff disk and should return None, \ + but got {:?}", + padding_entry, result + ); + + // Payload block 16 should be at entry 17 (payload_entry_index(16) = 16 + 16/16 = 17). + let real_entry = bat.payload_entry_index(16); + assert_eq!(real_entry, 17); + let result = bat.entry_number_to_block_id(real_entry); + assert_eq!( + result, + Some((BlockType::Payload, 16)), + "entry {} should map to payload block 16", + real_entry + ); + } + + // ---- Refcount async behavior tests ---- + + fn make_test_bat() -> Bat { + Bat::new(4 * MB1, format::DEFAULT_BLOCK_SIZE, 512, false, MB1 as u32).unwrap() + } + + #[test] + fn decrement_preserves_trim_pending_bit() { + let bat = make_test_bat(); + // Simulate: trim-pending with 3 in-flight I/Os draining. + bat.io_refcounts[0].store(IoBlockRef::TRIM_PENDING_BIT | 3, Ordering::Release); + assert!(!bat.decrement_io_refcount(0), "3→2 should not need notify"); + // After decrement: pending bit preserved, count is 2. 
+ let cur = IoBlockRef(bat.io_refcount(0)); + assert!(cur.trim_pending()); + assert_eq!(cur.io_count(), 2); + } + + #[test] + #[should_panic(expected = "io_refcount underflow")] + fn decrement_panics_on_underflow() { + let _ = make_test_bat().decrement_io_refcount(0); + } + + #[test] + #[should_panic(expected = "trim claimed")] + fn decrement_panics_on_trim_claimed() { + let bat = make_test_bat(); + bat.io_refcounts[0].store(IoBlockRef::TRIM_CLAIMED.0, Ordering::Release); + let _ = bat.decrement_io_refcount(0); + } + + #[test] + #[should_panic(expected = "wasn't claimed")] + fn release_trim_claim_panics_if_not_claimed() { + make_test_bat().release_trim_claim(0); + } + + #[async_test] + async fn acquire_io_on_idle_block() { + let bat = make_test_bat(); + let guard = bat.acquire_io_refcounts(0, 1).await; + assert_eq!(bat.io_refcount(0), 1); + drop(guard); + assert_eq!(bat.io_refcount(0), 0); + } + + #[async_test] + async fn acquire_io_resumes_after_trim_releases() { + let bat = Arc::new(make_test_bat()); + bat.io_refcounts[0].store(IoBlockRef::TRIM_CLAIMED.0, Ordering::Release); + + let bat2 = bat.clone(); + let io_task = async move { + let guard = bat2.acquire_io_refcounts(0, 1).await; + assert_eq!(bat2.io_refcount(0), 1); + drop(guard); + }; + + let release_task = async { + bat.release_trim_claim(0); + }; + + futures::future::join(io_task, release_task).await; + assert_eq!(bat.io_refcount(0), 0); + } + + #[async_test] + async fn acquire_io_multi_block_rolls_back_on_partial_conflict() { + let bat = Arc::new(make_test_bat()); + bat.io_refcounts[1].store(IoBlockRef::TRIM_CLAIMED.0, Ordering::Release); + + let bat2 = bat.clone(); + let io_task = async move { + let guard = bat2.acquire_io_refcounts(0, 2).await; + assert_eq!(bat2.io_refcount(0), 1); + assert_eq!(bat2.io_refcount(1), 1); + drop(guard); + }; + + let release_task = async { + bat.release_trim_claim(1); + }; + + futures::future::join(io_task, release_task).await; + } + + #[async_test] + async fn claim_for_trim_on_idle_block() { + let bat = make_test_bat(); + let guard = bat.claim_for_trim(0).await; + assert_eq!(bat.io_refcount(0), IoBlockRef::TRIM_CLAIMED.0); + drop(guard); + assert_eq!(bat.io_refcount(0), 0); + } + + #[async_test] + async fn claim_for_trim_waits_for_io_drain() { + let bat = Arc::new(make_test_bat()); + + let io_guard = bat.acquire_io_refcounts(0, 1).await; + assert_eq!(bat.io_refcount(0), 1); + + let trim_task = async { + let guard = bat.claim_for_trim(0).await; + assert_eq!(bat.io_refcount(0), IoBlockRef::TRIM_CLAIMED.0); + guard + }; + + let drain_task = async { + // After trim_task's first poll, trim-pending is set. + assert!(IoBlockRef(bat.io_refcount(0)).trim_pending()); + assert!(!bat.try_increment_io_refcount(0)); + drop(io_guard); + }; + + let (trim_guard, ()) = futures::future::join(trim_task, drain_task).await; + drop(trim_guard); + assert_eq!(bat.io_refcount(0), 0); + } + + #[async_test] + async fn trim_has_writer_priority_over_new_io() { + let bat = Arc::new(make_test_bat()); + + // Block 0 has an in-flight I/O. + let io_guard = bat.acquire_io_refcounts(0, 1).await; + + // Trim claims — sets pending, waits for drain. + let trim_task = async { + let guard = bat.claim_for_trim(0).await; + assert_eq!(bat.io_refcount(0), IoBlockRef::TRIM_CLAIMED.0); + guard + }; + let drain_task = async { drop(io_guard) }; + + let (trim_guard, ()) = futures::future::join(trim_task, drain_task).await; + + // Trim owns the block. New I/O should be blocked. 
+ assert!(!bat.try_increment_io_refcount(0)); + + // Release trim, then new I/O should succeed. + drop(trim_guard); + let io_guard2 = bat.acquire_io_refcounts(0, 1).await; + assert_eq!(bat.io_refcount(0), 1); + drop(io_guard2); + assert_eq!(bat.io_refcount(0), 0); + } + + #[async_test] + async fn acquire_io_blocked_at_overflow_resumes() { + let bat = Arc::new(make_test_bat()); + bat.io_refcounts[0].store(IoBlockRef::MAX_IO_REFCOUNT, Ordering::Release); + + let bat2 = bat.clone(); + let io_task = async move { + let guard = bat2.acquire_io_refcounts(0, 1).await; + assert_eq!(bat2.io_refcount(0), IoBlockRef::MAX_IO_REFCOUNT); + drop(guard); + }; + + let unblock_task = async { + if bat.decrement_io_refcount(0) { + bat.refcount_event.notify(usize::MAX); + } + }; + + futures::future::join(io_task, unblock_task).await; + } +} diff --git a/vm/devices/storage/vhdx/src/cache.rs b/vm/devices/storage/vhdx/src/cache.rs new file mode 100644 index 0000000000..9aa2edb4fc --- /dev/null +++ b/vm/devices/storage/vhdx/src/cache.rs @@ -0,0 +1,1969 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Write-back page cache for VHDX metadata pages. +//! +//! Provides a hash-table-backed, page-granularity (4 KiB) caching layer over +//! an [`AsyncFile`](crate::AsyncFile). Pages are identified by a [`PageKey`] +//! consisting of a tag (u8) and an offset within a tagged region. Tags map +//! to base file offsets, allowing region relocation without invalidating +//! cached pages. +//! +//! Modified pages accumulate as **Dirty** in the cache. On [`commit()`](PageCache::commit), +//! dirty pages are sent to the [log task](crate::log_task) via a mesh channel +//! for WAL persistence. The log task applies them to their final file offsets +//! in the background. +//! +//! Page data is stored as `Arc<[u8; PAGE_SIZE]>` to enable zero-copy commit +//! (Arc::clone) and implicit COW (Arc::make_mut) when a page is modified while +//! the log task holds a reference. +//! +//! # Write Ordering +//! +//! The cache guarantees that writes are **ordered** through the log. If a +//! caller writes page A, then later writes page B, the only crash-recovery +//! outcomes are: {neither}, {A only}, or {both A and B}. It is never the case +//! that B is persisted without A. +//! +//! This ordering is maintained by **batch-full commit**: when the dirty page +//! count reaches [`MAX_COMMIT_PAGES`] and a new page is about to become dirty, +//! the cache automatically commits the current dirty set to the log before +//! allowing the new page to enter the dirty set. + +use crate::AsyncFile; +use crate::error::CacheError; +use crate::flush::Fsn; +use crate::log_permits::LogPermits; +use crate::log_task::LogClient; +use crate::log_task::LogData; +use crate::log_task::Lsn; +use crate::lsn_watermark::LsnWatermark; +use parking_lot::ArcMutexGuard; +use parking_lot::Mutex; +use std::collections::HashMap; +use std::collections::hash_map; +use std::sync::Arc; + +/// Page size used by the cache (4 KiB). +pub const PAGE_SIZE: usize = 4096; + +/// Maximum number of dirty pages per commit batch. +/// +/// Derived from 1/4 of the minimum 1 MiB VHDX log. With 0 zero ranges: +/// entry_length(N) = ceil((64 + 32*N) / 4096) * 4096 + N * 4096 +/// (N+1)*4096 + 4096 (guard) ≤ 262144 → N ≤ 62 +/// +/// Note: the permit count is a *multiple* of this value (see `open.rs`) +/// to allow pipelining — multiple batches can be in-flight in the +/// log/apply pipeline simultaneously. 
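+///
+/// Sanity check at N = 62: the header is 64 + 32*62 = 2048 bytes (one 4 KiB
+/// page after rounding), so the entry spans 63 pages = 258048 bytes; adding
+/// the 4 KiB guard gives exactly 262144 bytes, i.e. 1/4 of 1 MiB.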
+pub const MAX_COMMIT_PAGES: usize = 62; + +/// Key identifying a cached page. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct PageKey { + /// Tag selecting the region (e.g., 0 = BAT, 1 = metadata). + pub tag: u8, + /// Byte offset within the tagged region. Must be 4 KiB aligned. + pub offset: u64, +} + +/// Write mode for page acquisition. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum WriteMode { + /// Page is loaded from file if not cached. Caller will modify parts. + Modify, + /// Page is NOT loaded from file (caller will overwrite the entire page). + Overwrite, +} + +/// Per-page lifecycle state. +/// +/// Encodes the dirty flag, permit state, and data provenance as a single +/// enum to prevent invalid combinations. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum PageState { + /// Page is not dirty. Data may or may not be loaded (`data` can be + /// `None` for a freshly created entry that hasn't been loaded or + /// written yet). + Clean, + /// Page data is being loaded from disk by another task. + /// Other acquirers wait on `state_event`. + Loading, + /// A log permit is being acquired for this page. + /// Other acquirers wait on `state_event`. + AcquiringPermit, + /// Page has been modified (or a permit has been acquired for it). + /// A permit is consumed (transfers to the log task on commit). + Dirty, +} + +/// Internal per-page data, generic over the file buffer type. +struct PageData { + /// The page contents as `Arc` for zero-copy commit and COW. + /// `Some` when `state` is `Dirty`, and when `Clean` after a + /// successful load or write. + /// `None` when `Clean` (freshly created, not yet loaded), + /// `Loading`, or `AcquiringPermit`. + data: Option>, + /// Page lifecycle state. + state: PageState, + /// If set, the log task must wait for this FSN to complete before + /// including this page in a log entry. + pre_log_fsn: Option, + /// Index into `PageMap::lru`. Allocated on entry creation. + lru_index: usize, + /// Hint: this page is cheap to regenerate; evict before other pages. + demoted: bool, +} + +/// Node in the slab-indexed LRU doubly-linked list. +struct LruNode { + key: PageKey, + prev: usize, + next: usize, + linked: bool, +} + +/// Slab-indexed doubly-linked list for LRU eviction ordering. +/// +/// Index 0 is a sentinel node. `sentinel.next` is the MRU end, +/// `sentinel.prev` is the LRU end (eviction candidate). All +/// operations are O(1). +struct LruList { + nodes: Vec, + free: Vec, +} + +impl LruList { + /// Create a new list with only the sentinel node. + fn new() -> Self { + Self { + nodes: vec![LruNode { + key: PageKey { tag: 0, offset: 0 }, + prev: 0, + next: 0, + linked: true, // sentinel is always "linked" + }], + free: Vec::new(), + } + } + + /// Allocate a slot for a new page. The node is NOT linked. + fn alloc(&mut self, key: PageKey) -> usize { + if let Some(idx) = self.free.pop() { + self.nodes[idx] = LruNode { + key, + prev: 0, + next: 0, + linked: false, + }; + idx + } else { + let idx = self.nodes.len(); + self.nodes.push(LruNode { + key, + prev: 0, + next: 0, + linked: false, + }); + idx + } + } + + /// Returns true if the node is currently linked in the list. + fn is_linked(&self, idx: usize) -> bool { + debug_assert!(idx != 0, "cannot check sentinel linkage"); + self.nodes[idx].linked + } + + /// Remove a node from its current position. No-op if unlinked. 
+ fn unlink(&mut self, idx: usize) { + if idx == 0 || !self.is_linked(idx) { + return; + } + let prev = self.nodes[idx].prev; + let next = self.nodes[idx].next; + self.nodes[prev].next = next; + self.nodes[next].prev = prev; + self.nodes[idx].prev = 0; + self.nodes[idx].next = 0; + self.nodes[idx].linked = false; + } + + /// Move a node to the MRU end. Works whether linked or unlinked. + fn move_to_front(&mut self, idx: usize) { + if idx == 0 { + return; + } + // Already at front? + if self.nodes[0].next == idx { + return; + } + // Remove from current position if linked. + if self.nodes[idx].linked { + let prev = self.nodes[idx].prev; + let next = self.nodes[idx].next; + self.nodes[prev].next = next; + self.nodes[next].prev = prev; + } + // Insert after sentinel. + let old_front = self.nodes[0].next; + self.nodes[idx].prev = 0; + self.nodes[idx].next = old_front; + self.nodes[0].next = idx; + self.nodes[old_front].prev = idx; + self.nodes[idx].linked = true; + } + + /// Move a node to the LRU end. Works whether linked or unlinked. + fn move_to_back(&mut self, idx: usize) { + if idx == 0 { + return; + } + // Already at back? + if self.nodes[0].prev == idx { + return; + } + // Remove from current position if linked. + if self.nodes[idx].linked { + let prev = self.nodes[idx].prev; + let next = self.nodes[idx].next; + self.nodes[prev].next = next; + self.nodes[next].prev = prev; + } + // Insert before sentinel. + let old_back = self.nodes[0].prev; + self.nodes[idx].next = 0; + self.nodes[idx].prev = old_back; + self.nodes[0].prev = idx; + self.nodes[old_back].next = idx; + self.nodes[idx].linked = true; + } + + /// Return the index of the LRU tail node, or 0 if empty. + fn tail(&self) -> usize { + self.nodes[0].prev + } + + /// Unlink and recycle a node. + fn dealloc(&mut self, idx: usize) { + self.unlink(idx); + self.free.push(idx); + } +} + +/// Number of distinct cache tags (BAT=0, METADATA=1, SBM=2). +const TAG_COUNT: usize = 3; + +/// Entry in the page map. Wraps the page data mutex with metadata +/// that can be read under the map lock without taking the page lock. +struct CacheEntry { + page: Arc>>, + /// LSN of the most recent commit that included this page. + /// Set under the map lock in `commit_locked`, read under the map + /// lock in eviction — no page lock needed. 0 = never committed. + committed_lsn: Lsn, + /// True when the page is clean and no writer holds it. + /// Set under the map lock on all write-acquire and write-release + /// paths. Eviction checks this without taking the page lock. + idle: bool, +} + +/// Internal page map wrapping the `HashMap` and dirty page counter. +struct PageMap { + map: HashMap>, + /// Number of pages with a consumed permit (Dirty, or Clean with + /// an active `WritePageGuard` that hasn't called `DerefMut` yet). + /// Maintained under the map lock to prevent races. + dirty_count: usize, + /// Log client for sending transactions. `None` for read-only caches. + log_client: Option>, + /// Base file offset per tag, indexed by tag value. + tag_offsets: [u64; TAG_COUNT], + /// LRU eviction list. Protected by the same lock as `map`. + lru: LruList, +} + +impl PageMap { + /// Resolve a [`PageKey`] to an absolute file offset. + fn resolve_offset(&self, key: PageKey) -> u64 { + self.tag_offsets[key.tag as usize] + key.offset + } +} + +/// Action to perform when a page isn't ready (returned by sync helpers). +/// This enum is `Send` — it never contains `ArcMutexGuard`. +enum PendingAction { + /// Wait for another task to finish loading/acquiring. 
+ Wait(event_listener::EventListener), + /// Load page data from disk at this file offset. Carries the page + /// entry Arc so `complete_load` can skip the map re-lookup. + Load(u64, Arc>>), +} + +/// Action for acquire_write when the page isn't ready. +/// This enum is `Send` — it never contains `ArcMutexGuard`. +enum WritePendingAction { + /// Wait for another task to finish loading/acquiring. + Wait(event_listener::EventListener), + /// Load page data from disk at this file offset. Carries the page + /// entry Arc so `complete_load` can skip the map re-lookup. + Load(u64, Arc>>), + /// Acquire a log permit. Carries the page entry Arc so + /// `finalize_permit` can skip the map re-lookup. + AcquirePermit(Arc>>), +} + +/// Log pipeline state shared between the cache and the log/apply tasks. +/// +/// Present only when the file is opened writable with a log task. +pub(crate) struct CacheLogState { + /// Failable semaphore for log backpressure. + pub permits: Arc, + /// LSN watermark published by the apply task. + pub applied_lsn: Arc, +} + +/// Write-back page cache backed by an [`AsyncFile`]. +pub struct PageCache { + pub(crate) file: Arc, + pages: Mutex>, + log_state: Option, + /// Notified when a page transitions out of `Loading` or `AcquiringPermit`. + state_event: event_listener::Event, + /// Maximum number of pages to keep in the cache. 0 = unlimited. + quota: usize, +} + +impl PageCache { + /// Create a new cache backed by the given file. + pub fn new( + file: Arc, + log_client: Option>, + log_state: Option, + quota: usize, + ) -> Self { + Self { + file, + pages: Mutex::new(PageMap { + map: HashMap::new(), + dirty_count: 0, + log_client, + tag_offsets: [0; TAG_COUNT], + lru: LruList::new(), + }), + log_state, + state_event: event_listener::Event::new(), + quota, + } + } + + /// Take the log client out of the cache, returning it. + pub fn take_log_client(&mut self) -> Option> { + self.pages.lock().log_client.take() + } + + /// Set the log pipeline state (for late initialization after log task spawn). + pub fn set_log_state(&mut self, state: CacheLogState) { + self.log_state = Some(state); + } + + /// Register a tag with its base file offset. + pub fn register_tag(&mut self, tag: u8, base_offset: u64) { + self.pages.lock().tag_offsets[tag as usize] = base_offset; + } + + /// Evict clean, applied pages to bring the cache back under quota. + /// Must be called with the pages map lock held. + /// `skip_key` is the page being acquired — never evict it. + fn try_evict_under_lock(&self, pages: &mut PageMap, skip_key: Option) { + let applied = self + .log_state + .as_ref() + .map(|s| s.applied_lsn.get()) + .unwrap_or(Lsn::ZERO); + + // Walk backward from the LRU tail. Check `idle` and + // `committed_lsn` on the entry — both maintained under the + // map lock, so no page lock needed. + let mut idx = pages.lru.tail(); + while self.quota > 0 && pages.map.len() > self.quota { + if idx == 0 { + break; + } + let prev_idx = pages.lru.nodes[idx].prev; + let key = pages.lru.nodes[idx].key; + if skip_key == Some(key) { + idx = prev_idx; + continue; + } + let entry = pages.map.get(&key).expect("LRU key missing from map"); + if entry.idle && entry.committed_lsn <= applied { + pages.map.remove(&key); + pages.lru.dealloc(idx); + idx = pages.lru.tail(); + continue; + } + idx = prev_idx; + } + } + + /// Acquire read access to a page. 
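+    ///
+    /// Illustrative read sketch (assumes `cache`, `page_offset`, and
+    /// `byte_off` are in scope; `BAT_TAG` is the BAT region tag):
+    ///
+    /// ```ignore
+    /// let page = cache
+    ///     .acquire_read(PageKey { tag: BAT_TAG, offset: page_offset })
+    ///     .await?;
+    /// let entry_bytes = &page[byte_off..byte_off + 8];
+    /// ```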
+ pub async fn acquire_read(&self, key: PageKey) -> Result, CacheError> { + loop { + let action = match self.try_acquire_read(key) { + Ok(guard) => return Ok(guard), + Err(action) => action, + }; + match action { + PendingAction::Wait(listener) => listener.await, + PendingAction::Load(file_offset, entry) => { + let buf = self.file.alloc_buffer(PAGE_SIZE); + match self.file.read_into(file_offset, buf).await { + Ok(buf) => self.complete_load(entry, Some(Arc::new(buf))), + Err(e) => { + self.complete_load(entry, None); + return Err(CacheError::Read { + err: e, + file_offset, + }); + } + } + } + } + } + } + + /// Sync helper: try to acquire read access. + fn try_acquire_read( + &self, + key: PageKey, + ) -> Result, PendingAction> { + assert!( + key.offset.is_multiple_of(PAGE_SIZE as u64), + "page offset {:#x} is not {PAGE_SIZE}-byte aligned", + key.offset + ); + let mut pages = self.pages.lock(); + let file_offset = pages.resolve_offset(key); + + // Pre-allocate an LRU slot. Freed below if the entry already exists. + let lru_index = pages.lru.alloc(key); + let mut inserted = false; + let page = pages + .map + .entry(key) + .or_insert_with(|| { + inserted = true; + CacheEntry { + page: Arc::new(Mutex::new(PageData { + data: None, + state: PageState::Clean, + pre_log_fsn: None, + lru_index, + demoted: false, + })), + committed_lsn: Lsn::ZERO, + idle: true, + } + }) + .page + .clone(); + + if !inserted { + pages.lru.dealloc(lru_index); + } else if self.quota > 0 && pages.map.len() > self.quota { + self.try_evict_under_lock(&mut pages, Some(key)); + } + + let mut guard = Mutex::lock_arc(&page); + + match guard.state { + PageState::Loading | PageState::AcquiringPermit => { + let listener = self.state_event.listen(); + drop(guard); + drop(pages); + Err(PendingAction::Wait(listener)) + } + PageState::Clean if guard.data.is_none() => { + guard.state = PageState::Loading; + let entry_arc = ArcMutexGuard::into_arc(guard); + drop(pages); + Err(PendingAction::Load(file_offset, entry_arc)) + } + PageState::Clean | PageState::Dirty => { + assert!( + guard.data.is_some(), + "page in {:?} has no data", + guard.state + ); + // Promote to MRU. + let idx = guard.lru_index; + guard.demoted = false; + pages.lru.move_to_front(idx); + drop(pages); + Ok(ReadPageGuard { guard }) + } + } + } + + /// Complete a page load: store data and transition out of Loading. + /// + /// On success (`data` is `Some`): stores data, transitions `Loading → Clean`. + /// Uses the `entry` Arc directly — no map re-lookup needed. + /// + /// On failure (`data` is `None`): removes the entry from the cache so the + /// next acquirer creates a fresh entry and retries. + fn complete_load(&self, entry: Arc>>, data: Option>) { + let mut page = entry.lock(); + assert!( + page.state == PageState::Loading, + "complete_load called but page state is {:?}, expected Loading", + page.state + ); + assert!( + page.data.is_none(), + "complete_load called but page already has data" + ); + page.state = PageState::Clean; + page.data = data; + self.state_event.notify(usize::MAX); + } + + /// Acquire write access to a page. + /// + /// If a log is configured, acquires a permit (backpressure). If the + /// dirty batch is full, commits it first (batch-full commit). 
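+    ///
+    /// Illustrative write sketch (names assumed in scope; `entry.as_bytes()`
+    /// stands in for any 8-byte serialized value):
+    ///
+    /// ```ignore
+    /// let mut guard = cache.acquire_write(key, WriteMode::Modify).await?;
+    /// guard[off..off + 8].copy_from_slice(entry.as_bytes());
+    /// drop(guard);
+    /// let lsn = cache.commit()?; // persisted via the log task
+    /// ```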
+ pub async fn acquire_write( + &self, + key: PageKey, + mode: WriteMode, + ) -> Result, CacheError> { + let load = mode == WriteMode::Modify; + + loop { + let action = match self.try_acquire_write(key, load) { + Ok(guard) => return Ok(guard), + Err(action) => action, + }; + match action { + WritePendingAction::Wait(listener) => listener.await, + WritePendingAction::Load(file_offset, entry) => { + let buf = self.file.alloc_buffer(PAGE_SIZE); + match self.file.read_into(file_offset, buf).await { + Ok(buf) => self.complete_load(entry, Some(Arc::new(buf))), + Err(e) => { + self.complete_load(entry, None); + return Err(CacheError::Read { + err: e, + file_offset, + }); + } + } + } + WritePendingAction::AcquirePermit(entry) => { + let permits = &self.log_state.as_ref().unwrap().permits; + let result = permits.acquire(1).await; + match result { + Ok(()) => { + return self.finalize_permit(entry); + } + Err(e) => { + self.finalize_permit_failed(entry); + return Err(CacheError::PipelineFailed(e)); + } + } + } + } + } + } + + /// Sync helper: try to acquire write access. + /// + /// Returns the guard on success, or an action to perform before + /// retrying. Batch-full commit is handled in [`finalize_permit`]. + fn try_acquire_write( + &self, + key: PageKey, + load: bool, + ) -> Result, WritePendingAction> { + assert!( + self.log_state.is_some(), + "acquire_write requires a log (use VhdxFile::open().writable())" + ); + + assert!( + key.offset.is_multiple_of(PAGE_SIZE as u64), + "page offset {:#x} is not {PAGE_SIZE}-byte aligned", + key.offset + ); + + let file_offset; + let mut pages = self.pages.lock(); + let mut guard = { + let pages = &mut *pages; + file_offset = pages.resolve_offset(key); + + // Pre-allocate an LRU slot. Freed below if the entry already exists. + let lru_index = pages.lru.alloc(key); + match pages.map.entry(key) { + hash_map::Entry::Occupied(entry) => { + let entry = entry.into_mut(); + pages.lru.dealloc(lru_index); + entry.idle = false; + entry.page.lock_arc() + } + hash_map::Entry::Vacant(entry) => { + let entry = entry.insert(CacheEntry { + page: Arc::new(Mutex::new(PageData { + data: None, + state: PageState::Clean, + pre_log_fsn: None, + lru_index, + demoted: false, + })), + committed_lsn: Lsn::ZERO, + idle: false, + }); + let page = entry.page.clone(); + if self.quota > 0 && pages.map.len() > self.quota { + self.try_evict_under_lock(pages, Some(key)); + } + page.lock_arc() + } + } + }; + + match guard.state { + PageState::Loading | PageState::AcquiringPermit => { + Err(WritePendingAction::Wait(self.state_event.listen())) + } + PageState::Dirty => { + assert!( + guard.data.is_some(), + "page in {:?} has no data", + guard.state + ); + // Promote to MRU. + let idx = guard.lru_index; + guard.demoted = false; + pages.lru.move_to_front(idx); + drop(pages); + Ok(WritePageGuard { + cache: self, + guard: Some(guard), + overwriting: false, + }) + } + PageState::Clean if load && guard.data.is_none() => { + guard.state = PageState::Loading; + Err(WritePendingAction::Load( + file_offset, + ArcMutexGuard::into_arc(guard), + )) + } + PageState::Clean => { + // Promote to MRU. + let idx = guard.lru_index; + guard.demoted = false; + pages.lru.move_to_front(idx); + guard.state = PageState::AcquiringPermit; + Err(WritePendingAction::AcquirePermit(ArcMutexGuard::into_arc( + guard, + ))) + } + } + } + + /// Finalize a successful permit acquisition. + /// + /// Returns the page guard directly — the caller wraps it in a + /// `WritePageGuard` without re-entering `try_acquire_write`. 
+    /// This eliminates the window where the page is in HasPermit/Overwritten
+    /// state without an active writer.
+    ///
+    /// The dirty_count check, batch-full commit, and dirty_count increment
+    /// are all performed atomically under the map lock — no TOCTOU gap.
+    fn finalize_permit(
+        &self,
+        entry: Arc>>,
+    ) -> Result, CacheError> {
+        let mut pages = self.pages.lock();
+
+        // Batch-full commit: if the dirty batch has reached
+        // MAX_COMMIT_PAGES, commit before adding this page.
+        if pages.dirty_count >= MAX_COMMIT_PAGES {
+            if let Err(e) = self.commit_locked(&mut pages) {
+                self.revert_permit(&entry, &mut pages);
+                return Err(e);
+            }
+        }
+        // Note that this may actually put us over MAX_COMMIT_PAGES, but only due to
+        // transient dirty counts from pages that are clean and have not yet decremented
+        // the count in [`WritePageGuard::drop`]. So, it will still be impossible for a
+        // cache transaction to be larger than MAX_COMMIT_PAGES.
+        pages.dirty_count += 1;
+
+        let mut page = Mutex::lock_arc(&entry);
+        assert!(page.state == PageState::AcquiringPermit);
+        let overwriting = page.data.is_none();
+        if overwriting {
+            page.data = Some(Arc::new(self.file.alloc_buffer(PAGE_SIZE)));
+        }
+        // For overwriting pages (fresh zeros for Overwrite), set Dirty —
+        // the zeros are synthetic, not real disk data, so they must be
+        // committed even if the caller doesn't call DerefMut.
+        page.state = if overwriting {
+            PageState::Dirty
+        } else {
+            PageState::Clean
+        };
+
+        self.state_event.notify(usize::MAX);
+        Ok(WritePageGuard {
+            cache: self,
+            guard: Some(page),
+            overwriting,
+        })
+    }
+
+    /// Release a clean write permit: mark idle, release the permit,
+    /// decrement dirty_count, and notify waiters. The page must already
+    /// be in `Clean` state.
+    fn release_clean_permit(&self, lru_index: usize, pages: &mut PageMap) {
+        pages.dirty_count -= 1;
+        let key = pages.lru.nodes[lru_index].key;
+        if let Some(ce) = pages.map.get_mut(&key) {
+            ce.idle = true;
+        }
+        if let Some(ref state) = self.log_state {
+            state.permits.release(1);
+        }
+        self.state_event.notify(usize::MAX);
+    }
+
+    /// Revert a page from `AcquiringPermit` back to `Clean` and release
+    /// the permit. Used on permit acquisition failure and commit failure.
+    fn revert_permit(
+        &self,
+        entry: &Arc>>,
+        pages: &mut PageMap,
+    ) {
+        let mut page = entry.lock();
+        assert!(page.state == PageState::AcquiringPermit);
+        page.state = PageState::Clean;
+        let lru_index = page.lru_index;
+        drop(page);
+        self.release_clean_permit(lru_index, pages);
+    }
+
+    /// Finalize a failed permit acquisition: revert to Clean.
+    fn finalize_permit_failed(&self, entry: Arc>>) {
+        let mut pages = self.pages.lock();
+        self.revert_permit(&entry, &mut pages);
+    }
+
+    /// Get the pre-log FSN for a specific page, if set.
+    #[cfg(test)]
+    pub fn get_pre_log_fsn(&self, key: PageKey) -> Option<Fsn> {
+        let pages = self.pages.lock();
+        if let Some(entry) = pages.map.get(&key) {
+            let page = entry.page.lock();
+            page.pre_log_fsn
+        } else {
+            None
+        }
+    }
+
+    /// Commit all dirty pages to the log task (fire-and-forget).
+    ///
+    /// Returns the current LSN. If there were dirty pages, they are sent
+    /// to the log task and the returned LSN is the one assigned to that
+    /// batch. If there were no dirty pages, returns the most recently
+    /// assigned LSN (so that concurrent `flush()` callers still wait
+    /// for any in-flight WAL writes).
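+    ///
+    /// # Example
+    ///
+    /// Illustrative sketch only (not a doctest); assumes a writable cache
+    /// with at least one dirty page:
+    ///
+    /// ```ignore
+    /// let lsn1 = cache.commit()?;
+    /// // ... more writes via acquire_write() ...
+    /// let lsn2 = cache.commit()?;
+    /// assert!(lsn2 >= lsn1); // LSNs never decrease across commits
+    /// ```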
+ pub fn commit(&self) -> Result { + let mut pages = self.pages.lock(); + self.commit_locked(&mut pages) + } + + /// Send pre-built page-aligned data through the log, bypassing the + /// cache's dirty-page tracking. Used for non-cache metadata writes + /// (e.g., region table repair). + /// + /// Returns the assigned LSN. + pub fn commit_raw(&self, raw_pages: Vec>, pre_log_fsn: Option) -> Lsn { + let mut map = self.pages.lock(); + let client = map + .log_client + .as_mut() + .expect("commit_raw requires a log client (use VhdxFile::open().writable())"); + let txn = client.begin(); + txn.commit(raw_pages, pre_log_fsn) + } + + /// Inner commit implementation that takes an already-held map lock. + /// + /// This allows `finalize_permit` to check dirty_count and commit + /// atomically under the same lock — no TOCTOU gap. + fn commit_locked(&self, pages: &mut PageMap) -> Result { + let client = pages + .log_client + .as_mut() + .expect("commit requires a log client (use VhdxFile::open().writable())"); + + let mut log_data = Vec::new(); + let mut max_pre_log_fsn: Option = None; + + let txn = client.begin(); + let lsn = txn.lsn(); + + // Destructure to get separate borrows on map, lru, and tag_offsets. + let PageMap { + ref mut map, + ref mut lru, + ref tag_offsets, + .. + } = *pages; + + for (&key, entry) in map.iter_mut() { + let mut page = entry.page.lock(); + if matches!(page.state, PageState::Dirty) { + let file_offset = tag_offsets[key.tag as usize] + key.offset; + let data = page.data.as_ref().expect("dirty page has no data").clone(); + + if let Some(fsn) = page.pre_log_fsn.take() { + max_pre_log_fsn = Some(max_pre_log_fsn.map_or(fsn, |m| m.max(fsn))); + } + + page.state = PageState::Clean; + + if page.demoted { + page.demoted = false; + lru.move_to_back(page.lru_index); + } + + entry.committed_lsn = lsn; + entry.idle = true; + + log_data.push(LogData::new(file_offset, data)); + } + } + + if log_data.is_empty() { + return Ok(client.current_lsn()); + } + + let committed_count = log_data.len(); + pages.dirty_count -= committed_count; + + txn.commit(log_data, max_pre_log_fsn); + + // Do NOT release permits here. Permits stay consumed until the + // apply task writes pages to their final offsets and releases + // them. This bounds the total in-flight page data (Arc clones) + // in the log/apply pipeline, preventing unbounded memory growth. + + Ok(lsn) + } +} + +/// RAII guard providing read-only access to a cached page. +#[must_use = "page guard holds a lock; drop it when done reading"] +pub struct ReadPageGuard { + guard: ArcMutexGuard>, +} + +impl + Send + Sync + 'static> std::ops::Deref for ReadPageGuard { + type Target = [u8; PAGE_SIZE]; + + fn deref(&self) -> &[u8; PAGE_SIZE] { + self.guard + .data + .as_ref() + .expect("page data missing") + .as_ref() + .as_ref() + .try_into() + .expect("buffer is not PAGE_SIZE") + } +} + +/// RAII guard providing write access to a cached page. +/// +/// Mutating via `DerefMut` transitions the page to `Dirty`. Arc COW +/// ensures the writer gets a private copy if the log task holds a +/// reference. +pub struct WritePageGuard<'a, F: AsyncFile> { + cache: &'a PageCache, + guard: Option>>, + /// Data existed before this acquire (loaded or previously written). + /// False for first-touch Overwrite (zeroed data). + overwriting: bool, +} + +impl WritePageGuard<'_, F> { + /// Returns true if the page is being overwritten rather than modified. 
+ /// + /// If true, the page data is freshly zeroed and must be fully written by + /// the caller (unless the caller just wants to commit a zero page). + pub fn is_overwriting(&self) -> bool { + self.overwriting + } + + /// Set the pre-log flush sequence number on this page. + pub fn set_pre_log_fsn(&mut self, fsn: Fsn) { + let guard = self.guard.as_mut().expect("guard consumed"); + guard.pre_log_fsn = Some(match guard.pre_log_fsn { + Some(existing) => existing.max(fsn), + None => fsn, + }); + } + + /// Hint that this page is cheap to regenerate and should be evicted + /// before other pages (e.g., BAT pages that can be rebuilt from + /// in-memory state). + /// + /// If the page becomes dirty, the demotion is applied when the page + /// transitions back to clean in [`PageCache::commit`]. If the page + /// stays clean (guard dropped without mutation), the demotion is + /// applied immediately on drop. + pub fn demote(&mut self) { + self.guard.as_mut().expect("guard consumed").demoted = true; + } +} + +impl std::ops::Deref for WritePageGuard<'_, F> { + type Target = [u8; PAGE_SIZE]; + + fn deref(&self) -> &[u8; PAGE_SIZE] { + self.guard + .as_ref() + .expect("guard consumed") + .data + .as_ref() + .expect("page data missing") + .as_ref() + .as_ref() + .try_into() + .expect("buffer is not PAGE_SIZE") + } +} + +impl std::ops::DerefMut for WritePageGuard<'_, F> { + fn deref_mut(&mut self) -> &mut [u8; PAGE_SIZE] { + let guard = self.guard.as_mut().expect("guard consumed"); + guard.state = PageState::Dirty; + let buf = Arc::make_mut(guard.data.as_mut().expect("page data missing")); + buf.as_mut().try_into().expect("buffer is not PAGE_SIZE") + } +} + +impl Drop for WritePageGuard<'_, F> { + fn drop(&mut self) { + if let Some(guard) = self.guard.take() { + if guard.state != PageState::Dirty { + // Guard dropped without mutation. Page is clean — release + // the permit and mark idle. + let lru_index = guard.lru_index; + drop(guard); + let mut pages = self.cache.pages.lock(); + self.cache.release_clean_permit(lru_index, &mut pages); + } + // If Dirty: permit consumed, page stays not-idle. + // Guard drops, releasing page lock. + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::log_task::LogRequest; + use crate::tests::support::{FailingInterceptor, InMemoryFile}; + use pal_async::async_test; + use std::sync::Arc; + + /// Helper to create a writable cache with log sender + permits. 
+ fn writable_cache( + file: InMemoryFile, + ) -> (PageCache, mesh::Receiver>>) { + let (tx, rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let cache = PageCache::new( + Arc::new(file), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + (cache, rx) + } + + #[async_test] + async fn acquire_read_loads_from_file() { + let file = InMemoryFile::new(PAGE_SIZE as u64); + let pattern: Vec = (0..PAGE_SIZE).map(|i| (i & 0xFF) as u8).collect(); + file.write_at(0, &pattern).await.unwrap(); + + let mut cache = PageCache::new(Arc::new(file), None, None, 0); + cache.register_tag(0, 0); + + let guard = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(&guard[..], &pattern[..]); + } + + #[async_test] + async fn acquire_modify_loads_and_writes_back() { + let file = InMemoryFile::new(PAGE_SIZE as u64); + let pattern: Vec = (0..PAGE_SIZE).map(|i| (i & 0xFF) as u8).collect(); + file.write_at(0, &pattern).await.unwrap(); + + let (_cache, _rx) = writable_cache(InMemoryFile::new(PAGE_SIZE as u64)); + // Re-create with the patterned file. + let file = InMemoryFile::new(PAGE_SIZE as u64); + file.write_at(0, &pattern).await.unwrap(); + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(file), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + { + let mut guard = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Modify) + .await + .unwrap(); + assert_eq!(guard[0], 0x00); + assert_eq!(guard[1], 0x01); + guard[0] = 0xAA; + guard[1] = 0xBB; + } + + let guard = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(guard[0], 0xAA); + assert_eq!(guard[1], 0xBB); + assert_eq!(guard[2], 0x02); + } + + #[async_test] + async fn acquire_overwrite_skips_read() { + let file = InMemoryFile::with_interceptor( + PAGE_SIZE as u64, + Arc::new(FailingInterceptor { + fail_reads: true, + fail_writes: false, + fail_flushes: false, + fail_set_file_size: false, + }), + ); + + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(file), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + { + let mut guard = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Overwrite) + .await + .unwrap(); + guard.fill(0xCC); + } + + let guard = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert!(guard.iter().all(|&b| b == 0xCC)); + } + + #[async_test] + async fn concurrent_reads_return_correct_data() { + let file = InMemoryFile::new(PAGE_SIZE as u64); + let pattern: Vec = (0..PAGE_SIZE).map(|i| ((i * 3) & 0xFF) as u8).collect(); + file.write_at(0, &pattern).await.unwrap(); + + let mut cache = PageCache::new(Arc::new(file), None, None, 0); + cache.register_tag(0, 0); + + let g1 = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(&g1[..], &pattern[..]); + drop(g1); + + let g2 = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(&g2[..], &pattern[..]); + } + + #[async_test] + async fn sequential_modify_acquires_work() { + let (tx, _rx) = mesh::channel::>>(); + let permits = 
Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + { + let mut guard = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Modify) + .await + .unwrap(); + guard[0] = 0x11; + } + + { + let mut guard = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Modify) + .await + .unwrap(); + assert_eq!(guard[0], 0x11); + guard[0] = 0x22; + } + + let guard = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(guard[0], 0x22); + } + + #[async_test] + async fn modify_then_modify_same_page() { + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + { + let mut g = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Modify) + .await + .unwrap(); + g[0] = 0xAA; + } + + { + let mut g = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Modify) + .await + .unwrap(); + assert_eq!(g[0], 0xAA); + g[1] = 0xBB; + } + + let guard = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(guard[0], 0xAA); + assert_eq!(guard[1], 0xBB); + } + + #[async_test] + async fn different_pages_independent() { + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64 * 4)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + { + let mut g = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Modify) + .await + .unwrap(); + g[0] = 0x11; + } + + { + let mut g = cache + .acquire_write( + PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }, + WriteMode::Modify, + ) + .await + .unwrap(); + g[0] = 0x22; + } + + let g1 = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(g1[0], 0x11); + drop(g1); + + let g2 = cache + .acquire_read(PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }) + .await + .unwrap(); + assert_eq!(g2[0], 0x22); + } + + #[async_test] + async fn tag_offset_resolution() { + let base: u64 = 0x10000; + let page_offset: u64 = 0x1000; + let file = InMemoryFile::new(base + page_offset + PAGE_SIZE as u64); + let pattern = [0xDE; PAGE_SIZE]; + file.write_at(base + page_offset, &pattern).await.unwrap(); + + let mut cache = PageCache::new(Arc::new(file), None, None, 0); + cache.register_tag(0, base); + + let guard = cache + .acquire_read(PageKey { + tag: 0, + offset: page_offset, + }) + .await + .unwrap(); + assert_eq!(&guard[..], &pattern[..]); + } + + #[async_test] + async fn commit_sends_transaction() { + let (tx, mut rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + let key = PageKey { tag: 0, offset: 0 }; + + { + let mut g = cache.acquire_write(key, WriteMode::Modify).await.unwrap(); + g.fill(0xAA); + 
} + let lsn = cache.commit().unwrap(); + assert!(lsn > Lsn::ZERO); + + match rx.recv().await.unwrap() { + LogRequest::Commit(txn) => { + assert_eq!(txn.lsn, lsn); + assert_eq!(txn.data.len(), 1); + assert!(txn.data[0].data().iter().all(|&b| b == 0xAA)); + } + _ => panic!("expected Commit"), + } + } + + #[async_test] + async fn consecutive_commits_get_increasing_lsns() { + let (tx, mut rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + let key = PageKey { tag: 0, offset: 0 }; + + { + let mut g = cache.acquire_write(key, WriteMode::Modify).await.unwrap(); + g.fill(0xAA); + } + let lsn1 = cache.commit().unwrap(); + + { + let mut g = cache.acquire_write(key, WriteMode::Modify).await.unwrap(); + g.fill(0xBB); + } + let lsn2 = cache.commit().unwrap(); + + assert!(lsn2 > lsn1); + + match rx.recv().await.unwrap() { + LogRequest::Commit(txn) => assert_eq!(txn.lsn, lsn1), + _ => panic!("expected Commit"), + } + match rx.recv().await.unwrap() { + LogRequest::Commit(txn) => assert_eq!(txn.lsn, lsn2), + _ => panic!("expected Commit"), + } + } + + #[async_test] + async fn commit_sets_committed_lsn() { + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + let key = PageKey { tag: 0, offset: 0 }; + + { + let mut g = cache.acquire_write(key, WriteMode::Modify).await.unwrap(); + g.fill(0xAA); + } + let lsn = cache.commit().unwrap(); + + let pages = cache.pages.lock(); + let entry = pages.map.get(&key).unwrap(); + assert_eq!(entry.committed_lsn, lsn); + } + + async fn dirty_pages(cache: &PageCache, count: usize) { + for i in 0..count { + let key = PageKey { + tag: 0, + offset: (i * PAGE_SIZE) as u64, + }; + let mut g = cache + .acquire_write(key, WriteMode::Overwrite) + .await + .unwrap(); + g.fill(i as u8); + } + } + + #[async_test] + async fn batch_full_commit_on_dirty_overflow() { + let (tx, mut rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64 * 200)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + dirty_pages(&cache, MAX_COMMIT_PAGES).await; + + let new_key = PageKey { + tag: 0, + offset: (MAX_COMMIT_PAGES * PAGE_SIZE) as u64, + }; + { + let mut guard = cache + .acquire_write(new_key, WriteMode::Overwrite) + .await + .unwrap(); + guard.fill(0xFF); + } + + match rx.recv().await.unwrap() { + LogRequest::Commit(txn) => { + assert_eq!(txn.data.len(), MAX_COMMIT_PAGES); + } + _ => panic!("expected Commit from batch-full commit"), + } + + cache.commit().unwrap(); + match rx.recv().await.unwrap() { + LogRequest::Commit(txn) => { + assert_eq!(txn.data.len(), 1); + } + _ => panic!("expected Commit from explicit commit"), + } + } + + #[async_test] + async fn redirty_does_not_trigger_batch_full_commit() { + let (tx, mut rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64 * 
200)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + dirty_pages(&cache, MAX_COMMIT_PAGES).await; + + let key = PageKey { tag: 0, offset: 0 }; + let mut g = cache.acquire_write(key, WriteMode::Modify).await.unwrap(); + g[0] = 0xDD; + + assert!( + rx.try_recv().is_err(), + "re-dirtying an already-dirty page must not trigger batch-full commit" + ); + + assert_eq!(cache.pages.lock().dirty_count, MAX_COMMIT_PAGES); + } + + #[async_test] + async fn write_ordering_across_batches() { + let (tx, mut rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64 * 200)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + dirty_pages(&cache, MAX_COMMIT_PAGES).await; + + let key_b = PageKey { + tag: 0, + offset: (MAX_COMMIT_PAGES * PAGE_SIZE) as u64, + }; + { + let mut g = cache + .acquire_write(key_b, WriteMode::Overwrite) + .await + .unwrap(); + g.fill(0xBB); + } + + let batch1 = match rx.recv().await.unwrap() { + LogRequest::Commit(txn) => txn, + _ => panic!("expected Commit"), + }; + assert_eq!(batch1.data.len(), MAX_COMMIT_PAGES); + + let key_c = PageKey { + tag: 0, + offset: ((MAX_COMMIT_PAGES + 1) * PAGE_SIZE) as u64, + }; + { + let mut g = cache + .acquire_write(key_c, WriteMode::Overwrite) + .await + .unwrap(); + g.fill(0xCC); + } + + cache.commit().unwrap(); + let batch2 = match rx.recv().await.unwrap() { + LogRequest::Commit(txn) => txn, + _ => panic!("expected Commit"), + }; + assert_eq!(batch2.data.len(), 2); + assert!(batch1.lsn < batch2.lsn); + } + + // ---- Eviction tests ---- + + #[async_test] + async fn eviction_removes_clean_page() { + let file = InMemoryFile::new(PAGE_SIZE as u64 * 4); + let pattern_a = [0xAA; PAGE_SIZE]; + let pattern_b = [0xBB; PAGE_SIZE]; + file.write_at(0, &pattern_a).await.unwrap(); + file.write_at(PAGE_SIZE as u64, &pattern_b).await.unwrap(); + + // Quota of 1 page. + let mut cache = PageCache::new(Arc::new(file), None, None, 1); + cache.register_tag(0, 0); + + // Load page A. + let g = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(g[0], 0xAA); + drop(g); + + // Cache has 1 page (at quota). Loading page B should evict page A. + let g = cache + .acquire_read(PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }) + .await + .unwrap(); + assert_eq!(g[0], 0xBB); + drop(g); + + // Page A was evicted — cache should have 1 entry. + assert_eq!(cache.pages.lock().map.len(), 1); + } + + #[async_test] + async fn eviction_reloads_from_disk() { + let file = InMemoryFile::new(PAGE_SIZE as u64 * 4); + let pattern_a = [0xAA; PAGE_SIZE]; + let pattern_b = [0xBB; PAGE_SIZE]; + file.write_at(0, &pattern_a).await.unwrap(); + file.write_at(PAGE_SIZE as u64, &pattern_b).await.unwrap(); + + let mut cache = PageCache::new(Arc::new(file), None, None, 1); + cache.register_tag(0, 0); + + // Load page A. + let g = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(g[0], 0xAA); + drop(g); + + // Load page B (evicts A). + let g = cache + .acquire_read(PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }) + .await + .unwrap(); + assert_eq!(g[0], 0xBB); + drop(g); + + // Re-load page A (evicts B, reloads from disk). 
+ let g = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(g[0], 0xAA); + drop(g); + } + + #[async_test] + async fn eviction_skips_dirty_pages() { + let file = InMemoryFile::new(PAGE_SIZE as u64 * 4); + file.write_at(PAGE_SIZE as u64, &[0xBB; PAGE_SIZE]) + .await + .unwrap(); + + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + // Quota of 1, but page 0 will be dirty. + let mut cache = PageCache::new( + Arc::new(file), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 1, + ); + cache.register_tag(0, 0); + + // Write page A (makes it Dirty). + { + let mut g = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Overwrite) + .await + .unwrap(); + g.fill(0xAA); + } + + // Try to load page B. Eviction should skip dirty page A. + // Cache will have 2 entries (over quota but nothing evictable). + let g = cache + .acquire_read(PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }) + .await + .unwrap(); + assert_eq!(g[0], 0xBB); + drop(g); + + // Both pages present. + assert_eq!(cache.pages.lock().map.len(), 2); + + // Verify page A is still readable (not evicted). + let g = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(g[0], 0xAA); + } + + #[async_test] + async fn eviction_skips_uncommitted_page() { + let file = InMemoryFile::new(PAGE_SIZE as u64 * 4); + file.write_at(0, &[0xAA; PAGE_SIZE]).await.unwrap(); + file.write_at(PAGE_SIZE as u64, &[0xBB; PAGE_SIZE]) + .await + .unwrap(); + + let applied = Arc::new(LsnWatermark::new()); + // applied_lsn = 0, so committed pages with lsn > 0 are not evictable. + + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(file), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: applied.clone(), + }), + 1, + ); + cache.register_tag(0, 0); + + // Write and commit page A (committed_lsn = 1, applied_lsn = 0). + { + let mut g = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Overwrite) + .await + .unwrap(); + g.fill(0xAA); + } + cache.commit().unwrap(); + + // Page A is Clean with committed_lsn=1. applied_lsn=0. + // Eviction should skip it (not yet applied). + let g = cache + .acquire_read(PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }) + .await + .unwrap(); + assert_eq!(g[0], 0xBB); + drop(g); + + // Both pages present (A is not evictable). + assert_eq!(cache.pages.lock().map.len(), 2); + + // Now advance applied_lsn past the committed_lsn. + applied.advance(Lsn::new(1), Fsn::ZERO); + + // Load another page — now A is evictable. + let _file_size = PAGE_SIZE as u64 * 4; + // Load page at offset 2*PAGE_SIZE (need data there). + cache + .file + .write_at(PAGE_SIZE as u64 * 2, &[0xCC; PAGE_SIZE]) + .await + .unwrap(); + let g = cache + .acquire_read(PageKey { + tag: 0, + offset: PAGE_SIZE as u64 * 2, + }) + .await + .unwrap(); + assert_eq!(g[0], 0xCC); + drop(g); + + // Should have evicted one of the old pages (A or B). + assert!(cache.pages.lock().map.len() <= 2); + } + + #[async_test] + async fn no_deadlock_with_quota() { + // Regression test: verify that acquiring pages with a small quota + // doesn't deadlock. The dual-lock pattern + // should prevent lock-order issues. 
+ let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64 * 10)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 2, + ); + cache.register_tag(0, 0); + + // Rapidly acquire and drop pages, cycling through more than the quota. + for i in 0..5u64 { + let mut g = cache + .acquire_write( + PageKey { + tag: 0, + offset: i * PAGE_SIZE as u64, + }, + WriteMode::Overwrite, + ) + .await + .unwrap(); + g.fill(i as u8); + } + // If we get here without hanging, no deadlock. + } + + #[async_test] + async fn overwrite_uncached_reports_not_cached() { + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + let key = PageKey { tag: 0, offset: 0 }; + let g = cache + .acquire_write(key, WriteMode::Overwrite) + .await + .unwrap(); + assert!( + g.is_overwriting(), + "first Overwrite acquire should report overwriting (not cached)" + ); + } + + #[async_test] + async fn overwrite_cached_reports_cached() { + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + let key = PageKey { tag: 0, offset: 0 }; + + // First write populates the cache. + { + let mut g = cache + .acquire_write(key, WriteMode::Overwrite) + .await + .unwrap(); + g.fill(0xAA); + } + + // Second write should find it cached. + let g = cache + .acquire_write(key, WriteMode::Overwrite) + .await + .unwrap(); + assert!( + !g.is_overwriting(), + "second Overwrite acquire should report cached (not overwriting)" + ); + assert_eq!(g[0], 0xAA); + assert_eq!(g[PAGE_SIZE - 1], 0xAA); + } + + #[async_test] + async fn modify_always_reports_cached() { + // Modify loads from disk if not cached, so populated reflects + // map presence after load — always true since load populates it. + let file = InMemoryFile::new(PAGE_SIZE as u64); + file.write_at(0, &[0xBB; PAGE_SIZE]).await.unwrap(); + + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(file), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + let key = PageKey { tag: 0, offset: 0 }; + + // Modify loads from disk then retries — page is in map on retry. + let g = cache.acquire_write(key, WriteMode::Modify).await.unwrap(); + assert!( + !g.is_overwriting(), + "Modify always reports cached (not overwriting)" + ); + assert_eq!(g[0], 0xBB); + } + + #[async_test] + async fn lru_evicts_oldest_first() { + let file = InMemoryFile::new(PAGE_SIZE as u64 * 4); + for i in 0..4 { + file.write_at(i * PAGE_SIZE as u64, &[(i as u8) + 0xA0; PAGE_SIZE]) + .await + .unwrap(); + } + + // Quota of 2. 
+ let mut cache = PageCache::new(Arc::new(file), None, None, 2); + cache.register_tag(0, 0); + + let key_a = PageKey { tag: 0, offset: 0 }; + let key_b = PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }; + let key_c = PageKey { + tag: 0, + offset: 2 * PAGE_SIZE as u64, + }; + + // Load A then B (both in cache, at quota). + let g = cache.acquire_read(key_a).await.unwrap(); + assert_eq!(g[0], 0xA0); + drop(g); + + let g = cache.acquire_read(key_b).await.unwrap(); + assert_eq!(g[0], 0xA1); + drop(g); + + // LRU order: MRU=B, LRU=A. Loading C should evict A. + let g = cache.acquire_read(key_c).await.unwrap(); + assert_eq!(g[0], 0xA2); + drop(g); + + let pages = cache.pages.lock(); + assert!( + !pages.map.contains_key(&key_a), + "A should have been evicted" + ); + assert!(pages.map.contains_key(&key_b), "B should still be cached"); + assert!(pages.map.contains_key(&key_c), "C should be cached"); + } + + #[async_test] + async fn write_demote_defers_to_commit() { + let file = InMemoryFile::new(PAGE_SIZE as u64 * 4); + for i in 0..3 { + file.write_at(i * PAGE_SIZE as u64, &[(i as u8) + 0xC0; PAGE_SIZE]) + .await + .unwrap(); + } + + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let applied = Arc::new(LsnWatermark::new()); + let mut cache = PageCache::new( + Arc::new(file), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: applied.clone(), + }), + 2, + ); + cache.register_tag(0, 0); + + let key_a = PageKey { tag: 0, offset: 0 }; + let key_b = PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }; + let key_c = PageKey { + tag: 0, + offset: 2 * PAGE_SIZE as u64, + }; + + // Write A with demote. Page becomes dirty. + { + let mut g = cache + .acquire_write(key_a, WriteMode::Overwrite) + .await + .unwrap(); + g.fill(0xDD); + g.demote(); + } + + // Read B. + let g = cache.acquire_read(key_b).await.unwrap(); + drop(g); + + // Commit A (dirty→clean). Since demoted, it should go to LRU end. + let lsn = cache.commit().unwrap(); + applied.advance(lsn, Fsn::ZERO); + + // Now load C. Should evict A (demoted at LRU end) not B. + let g = cache.acquire_read(key_c).await.unwrap(); + assert_eq!(g[0], 0xC2); + drop(g); + + let pages = cache.pages.lock(); + assert!( + !pages.map.contains_key(&key_a), + "demoted A should be evicted after commit" + ); + assert!(pages.map.contains_key(&key_b), "B should still be cached"); + } +} diff --git a/vm/devices/storage/vhdx/src/create.rs b/vm/devices/storage/vhdx/src/create.rs new file mode 100644 index 0000000000..0fa3c52717 --- /dev/null +++ b/vm/devices/storage/vhdx/src/create.rs @@ -0,0 +1,936 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! VHDX file creation. +//! +//! Writes a valid, empty VHDX file (file identifier, dual headers, dual +//! region tables, metadata table, and empty BAT) to an [`AsyncFile`]. + +use crate::AsyncFile; +use crate::error::CreateError; +use crate::error::InvalidFormatReason; +use crate::format; +use crate::format::FileIdentifier; +use crate::format::FileParameters; +use crate::format::FileParametersFlags; +use crate::format::Header; +use crate::format::MetadataTableEntry; +use crate::format::MetadataTableEntryFlags; +use crate::format::MetadataTableHeader; +use crate::format::RegionTableEntry; +use crate::format::RegionTableEntryFlags; +use crate::format::RegionTableHeader; +use guid::Guid; +use zerocopy::FromZeros; +use zerocopy::IntoBytes; + +/// Parameters for creating a new VHDX file. 
+pub struct CreateParams { + /// Virtual disk size in bytes. Must be a multiple of `logical_sector_size` + /// and at most 64 TiB. + pub disk_size: u64, + + /// Block size in bytes. Must be a multiple of 1 MiB and at most 256 MiB. + /// Default: 2 MiB. + pub block_size: u32, + + /// Logical sector size. Must be 512 or 4096. Default: 512. + pub logical_sector_size: u32, + + /// Physical sector size. Must be 512 or 4096. Default: 512. + pub physical_sector_size: u32, + + /// Whether this is a differencing disk (has a parent). + pub has_parent: bool, + + /// Block alignment for the data region. 0 means no special alignment. + /// If non-zero, must be a power of 2. + pub block_alignment: u32, + + /// If true, create the file in an incomplete state + /// (adds an "incomplete file" metadata item that prevents open). + pub create_incomplete: bool, + + /// If true, mark all blocks as allocated (fixed VHD). + pub is_fully_allocated: bool, + + /// Data write GUID. If zero GUID, a random one will be generated. + /// Callers can supply a specific GUID for re-parenting workflows. + pub data_write_guid: Guid, + + /// Page 83 SCSI identifier. If zero GUID, a random one will be generated. + pub page_83_data: Guid, +} + +impl Default for CreateParams { + fn default() -> Self { + Self { + disk_size: 0, + block_size: 0, + logical_sector_size: 0, + physical_sector_size: 0, + has_parent: false, + block_alignment: 0, + create_incomplete: false, + is_fully_allocated: false, + data_write_guid: Guid::ZERO, + page_83_data: Guid::ZERO, + } + } +} + +/// Integer ceiling division (a / b, rounded up). Panics if b == 0. +pub(crate) fn ceil_div(a: u64, b: u64) -> u64 { + a.div_ceil(b) +} + +/// Round `value` up to the next multiple of `alignment`. +/// `alignment` must be a power of 2. +pub(crate) fn round_up(value: u64, alignment: u64) -> u64 { + (value + alignment - 1) & !(alignment - 1) +} + +/// Compute the chunk ratio (number of data blocks per sector bitmap block). +pub(crate) fn chunk_block_count(block_size: u32, sector_size: u32) -> u32 { + let sectors_per_block = block_size / sector_size; + (format::SECTORS_PER_CHUNK / sectors_per_block as u64) as u32 +} + +/// Create a new, empty VHDX file. +/// +/// Writes file identifier, dual headers, dual region tables, metadata +/// table with standard metadata items, and an empty BAT to the provided +/// file. The file is truncated/extended to the required size. +/// +/// `params` is updated in place with defaults filled in (e.g. zero +/// `block_size` becomes 2 MiB, zero GUIDs become random). 
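+///
+/// # Example
+///
+/// Illustrative sketch only (not a doctest); `file` is assumed to be any
+/// [`AsyncFile`] implementation:
+///
+/// ```ignore
+/// let mut params = CreateParams {
+///     disk_size: 1024 * 1024 * 1024, // 1 GiB
+///     ..Default::default()
+/// };
+/// create(&file, &mut params).await?;
+/// // Unset fields are filled in with defaults, e.g. a 2 MiB block size.
+/// assert_eq!(params.block_size, format::DEFAULT_BLOCK_SIZE);
+/// ```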
+pub async fn create(file: &impl AsyncFile, params: &mut CreateParams) -> Result<(), CreateError> { + // --- Validate and default parameters --- + + if params.logical_sector_size == 0 { + params.logical_sector_size = format::DEFAULT_SECTOR_SIZE; + } + if params.logical_sector_size != 512 && params.logical_sector_size != 4096 { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::InvalidLogicalSectorSize, + )); + } + + if params.physical_sector_size == 0 { + params.physical_sector_size = format::DEFAULT_SECTOR_SIZE; + } + if params.physical_sector_size != 512 && params.physical_sector_size != 4096 { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::InvalidPhysicalSectorSize, + )); + } + + if params.disk_size == 0 { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::DiskSizeZero, + )); + } + if !params + .disk_size + .is_multiple_of(params.logical_sector_size as u64) + { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::DiskSizeNotAligned, + )); + } + if params.disk_size > format::MAXIMUM_DISK_SIZE { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::DiskSizeTooLarge, + )); + } + + if params.block_size == 0 { + params.block_size = format::DEFAULT_BLOCK_SIZE; + } + if !(params.block_size as u64).is_multiple_of(format::REGION_ALIGNMENT) { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::BlockSizeNotAligned, + )); + } + if params.block_size as u64 > format::MAXIMUM_BLOCK_SIZE { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::BlockSizeTooLarge, + )); + } + + if params.block_alignment != 0 && !params.block_alignment.is_power_of_two() { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::BlockAlignmentNotPowerOfTwo, + )); + } + + // Generate a random page 83 GUID if not provided. + if params.page_83_data == Guid::ZERO { + params.page_83_data = Guid::new_random(); + } + + // --- Compute BAT size --- + + let data_block_count = ceil_div(params.disk_size, params.block_size as u64); + let chunk_ratio = chunk_block_count(params.block_size, params.logical_sector_size); + + if chunk_ratio == 0 { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::InvalidChunkRatio, + )); + } + + let sector_bitmap_block_count = ceil_div(data_block_count, chunk_ratio as u64); + + let bat_entry_count = if params.has_parent { + sector_bitmap_block_count * (chunk_ratio as u64 + 1) + } else { + data_block_count + data_block_count.saturating_sub(1) / chunk_ratio as u64 + }; + + if bat_entry_count > format::ABSOLUTE_MAXIMUM_BAT_ENTRY_COUNT { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::BatEntryCountTooLarge, + )); + } + + let bat_sector_count = ceil_div(bat_entry_count, format::ENTRIES_PER_BAT_PAGE); + let bat_length = round_up( + bat_sector_count * format::CACHE_PAGE_SIZE, + format::REGION_ALIGNMENT, + ); + + if bat_length > format::MAXIMUM_BAT_SIZE { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::BatSizeTooLarge, + )); + } + + // --- Region layout --- + + let log_offset = format::HEADER_AREA_SIZE; + let log_length = format::DEFAULT_LOG_SIZE as u64; + let metadata_offset = log_offset + log_length; + let metadata_length = format::DEFAULT_METADATA_REGION_SIZE as u64; + let bat_offset = metadata_offset + metadata_length; + + // --- Build the 1 MiB header area buffer --- + + let mut buf = file.alloc_buffer(format::HEADER_AREA_SIZE as usize); + let buf_bytes = buf.as_mut(); + + // File identifier at offset 0. 
+ let mut ident = FileIdentifier::new_zeroed(); + ident.signature = format::FILE_IDENTIFIER_SIGNATURE; + buf_bytes[..size_of::()].copy_from_slice(ident.as_bytes()); + + // Generate random GUIDs for the headers. + let file_write_guid = Guid::new_random(); + if params.data_write_guid == Guid::ZERO { + params.data_write_guid = Guid::new_random(); + } + let data_write_guid = params.data_write_guid; + + // Header 1 (sequence number 0). + let mut header = Header::new_zeroed(); + header.signature = format::HEADER_SIGNATURE; + header.sequence_number = 0; + header.file_write_guid = file_write_guid; + header.data_write_guid = data_write_guid; + header.log_guid = Guid::ZERO; + header.log_version = format::LOG_VERSION; + header.version = format::VERSION_1; + header.log_offset = log_offset; + header.log_length = log_length as u32; + header.checksum = 0; + + // Serialize header 1, compute checksum, update. + let h1_start = format::HEADER_OFFSET_1 as usize; + let h1_end = h1_start + size_of::
(); + buf_bytes[h1_start..h1_end].copy_from_slice(header.as_bytes()); + let crc = format::compute_checksum( + &buf_bytes[h1_start..h1_start + format::HEADER_SIZE as usize], + 4, // checksum field offset within Header + ); + buf_bytes[h1_start + 4..h1_start + 8].copy_from_slice(&crc.to_le_bytes()); + + // Header 2 (sequence number 1). + header.sequence_number = 1; + header.checksum = 0; + let h2_start = format::HEADER_OFFSET_2 as usize; + let h2_end = h2_start + size_of::
(); + buf_bytes[h2_start..h2_end].copy_from_slice(header.as_bytes()); + let crc = format::compute_checksum( + &buf_bytes[h2_start..h2_start + format::HEADER_SIZE as usize], + 4, + ); + buf_bytes[h2_start + 4..h2_start + 8].copy_from_slice(&crc.to_le_bytes()); + + // Region table 1. + let rt_start = format::REGION_TABLE_OFFSET as usize; + let mut rt_header = RegionTableHeader::new_zeroed(); + rt_header.signature = format::REGION_TABLE_SIGNATURE; + rt_header.entry_count = 2; + + let rt_header_bytes = rt_header.as_bytes(); + buf_bytes[rt_start..rt_start + rt_header_bytes.len()].copy_from_slice(rt_header_bytes); + + // BAT region entry. + let entry_offset = rt_start + size_of::(); + let bat_entry = RegionTableEntry { + guid: format::BAT_REGION_GUID, + file_offset: bat_offset, + length: bat_length as u32, + flags: RegionTableEntryFlags::new().with_required(true), + }; + let bat_entry_bytes = bat_entry.as_bytes(); + buf_bytes[entry_offset..entry_offset + bat_entry_bytes.len()].copy_from_slice(bat_entry_bytes); + + // Metadata region entry. + let entry_offset2 = entry_offset + size_of::(); + let meta_entry = RegionTableEntry { + guid: format::METADATA_REGION_GUID, + file_offset: metadata_offset, + length: metadata_length as u32, + flags: RegionTableEntryFlags::new().with_required(true), + }; + let meta_entry_bytes = meta_entry.as_bytes(); + buf_bytes[entry_offset2..entry_offset2 + meta_entry_bytes.len()] + .copy_from_slice(meta_entry_bytes); + + // Compute region table checksum over the full 64 KiB region. + let rt_end = rt_start + format::REGION_TABLE_SIZE as usize; + let crc = format::compute_checksum(&buf_bytes[rt_start..rt_end], 4); + buf_bytes[rt_start + 4..rt_start + 8].copy_from_slice(&crc.to_le_bytes()); + + // Copy region table 1 to region table 2. + let alt_start = format::ALT_REGION_TABLE_OFFSET as usize; + buf_bytes.copy_within(rt_start..rt_end, alt_start); + + // Write the header area. + file.write_from(0, buf).await.map_err(CreateError::Write)?; + + // --- Zero the log region --- + + file.zero_range(log_offset, log_length) + .await + .map_err(CreateError::Write)?; + + // --- Build and write the metadata table --- + + let mut meta_buf = file.alloc_buffer(metadata_length as usize); + let meta_bytes = meta_buf.as_mut(); + + let mut table_header = MetadataTableHeader::new_zeroed(); + table_header.signature = format::METADATA_TABLE_SIGNATURE; + + let mut entry_count: u16 = 0; + let entries_start = size_of::(); + let mut entry_write_offset = entries_start; + let mut item_data_offset = format::METADATA_TABLE_SIZE as u32; + + // Helper: write a metadata table entry. + let add_entry = |buf: &mut [u8], + entry_write_offset: &mut usize, + entry_count: &mut u16, + item_id: Guid, + offset: u32, + length: u32, + is_required: bool, + is_virtual_disk: bool| { + let entry = MetadataTableEntry { + item_id, + offset, + length, + flags: MetadataTableEntryFlags::new() + .with_is_required(is_required) + .with_is_virtual_disk(is_virtual_disk), + reserved2: 0, + }; + let bytes = entry.as_bytes(); + buf[*entry_write_offset..*entry_write_offset + bytes.len()].copy_from_slice(bytes); + *entry_write_offset += bytes.len(); + *entry_count += 1; + }; + + // 1. File parameters (IsRequired only). + let file_params_len = size_of::() as u32; + add_entry( + meta_bytes, + &mut entry_write_offset, + &mut entry_count, + format::FILE_PARAMETERS_ITEM_GUID, + item_data_offset, + file_params_len, + true, + false, + ); + let fp_data_offset = item_data_offset; + item_data_offset += file_params_len; + + // 2. 
Virtual disk size (IsRequired + IsVirtualDisk). + let disk_size_len = 8u32; // u64 + add_entry( + meta_bytes, + &mut entry_write_offset, + &mut entry_count, + format::VIRTUAL_DISK_SIZE_ITEM_GUID, + item_data_offset, + disk_size_len, + true, + true, + ); + let ds_data_offset = item_data_offset; + item_data_offset += disk_size_len; + + // 3. Logical sector size (IsRequired + IsVirtualDisk). + let sector_len = 4u32; // u32 + add_entry( + meta_bytes, + &mut entry_write_offset, + &mut entry_count, + format::LOGICAL_SECTOR_SIZE_ITEM_GUID, + item_data_offset, + sector_len, + true, + true, + ); + let lss_data_offset = item_data_offset; + item_data_offset += sector_len; + + // 4. Physical sector size (IsRequired + IsVirtualDisk). + add_entry( + meta_bytes, + &mut entry_write_offset, + &mut entry_count, + format::PHYSICAL_SECTOR_SIZE_ITEM_GUID, + item_data_offset, + sector_len, + true, + true, + ); + let pss_data_offset = item_data_offset; + item_data_offset += sector_len; + + // 5. Page 83 data (IsRequired + IsVirtualDisk). + let guid_len = 16u32; + add_entry( + meta_bytes, + &mut entry_write_offset, + &mut entry_count, + format::PAGE_83_ITEM_GUID, + item_data_offset, + guid_len, + true, + true, + ); + let p83_data_offset = item_data_offset; + item_data_offset += guid_len; + + // 6. Incomplete file (optional, IsRequired only). + if params.create_incomplete { + add_entry( + meta_bytes, + &mut entry_write_offset, + &mut entry_count, + format::INCOMPLETE_FILE_ITEM_GUID, + 0, + 0, + true, + false, + ); + } + + // Verify initial metadata items fit within a single hosting sector. + debug_assert!( + (item_data_offset as u64 - format::METADATA_TABLE_SIZE) <= format::MAX_HOSTING_SECTOR_SIZE + ); + + // Write the metadata table header. + table_header.entry_count = entry_count; + let th_bytes = table_header.as_bytes(); + meta_bytes[..th_bytes.len()].copy_from_slice(th_bytes); + + // Write the file parameters item data. + let fp = FileParameters { + block_size: params.block_size, + flags: FileParametersFlags::new() + .with_has_parent(params.has_parent) + .with_leave_blocks_allocated(params.is_fully_allocated), + }; + let fp_bytes = fp.as_bytes(); + let fp_off = fp_data_offset as usize; + meta_bytes[fp_off..fp_off + fp_bytes.len()].copy_from_slice(fp_bytes); + + // Write the virtual disk size item data. + let ds_off = ds_data_offset as usize; + meta_bytes[ds_off..ds_off + 8].copy_from_slice(¶ms.disk_size.to_le_bytes()); + + // Write the logical sector size item data. + let lss_off = lss_data_offset as usize; + meta_bytes[lss_off..lss_off + 4].copy_from_slice(¶ms.logical_sector_size.to_le_bytes()); + + // Write the physical sector size item data. + let pss_off = pss_data_offset as usize; + meta_bytes[pss_off..pss_off + 4].copy_from_slice(¶ms.physical_sector_size.to_le_bytes()); + + // Write the page 83 item data. + let p83_off = p83_data_offset as usize; + meta_bytes[p83_off..p83_off + 16].copy_from_slice(params.page_83_data.as_bytes()); + + // Write the metadata region. + file.write_from(metadata_offset, meta_buf) + .await + .map_err(CreateError::Write)?; + + // --- Zero the BAT region --- + + file.zero_range(bat_offset, bat_length) + .await + .map_err(CreateError::Write)?; + + // --- Set file size --- + + let mut file_size = bat_offset + bat_length; + + // Apply block alignment padding if requested. 
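+    // For example (illustrative numbers): with block_alignment = 2 MiB and a
+    // layout ending at 3 MiB + 64 KiB, round_up() pads the file size to 4 MiB.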
+ if params.block_alignment as u64 > format::REGION_ALIGNMENT + && params.block_alignment <= params.block_size + { + file_size = round_up(file_size, params.block_alignment as u64); + } + + file.set_file_size(file_size) + .await + .map_err(CreateError::Write)?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::format; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + use zerocopy::FromBytes; + + /// Read a little-endian u64 from a byte slice at the given offset. + fn read_u64(data: &[u8], offset: usize) -> u64 { + u64::from_le_bytes(data[offset..offset + 8].try_into().unwrap()) + } + + /// Read a little-endian u32 from a byte slice at the given offset. + fn read_u32(data: &[u8], offset: usize) -> u32 { + u32::from_le_bytes(data[offset..offset + 4].try_into().unwrap()) + } + + /// Read a Header from the snapshot at the given offset. + fn read_header(snapshot: &[u8], offset: usize) -> Header { + Header::read_from_bytes(&snapshot[offset..offset + size_of::
()]) + .unwrap() + .clone() + } + + /// Read the region table header from the snapshot at the given offset. + fn read_region_table_header(snapshot: &[u8], offset: usize) -> RegionTableHeader { + RegionTableHeader::read_from_bytes( + &snapshot[offset..offset + size_of::()], + ) + .unwrap() + .clone() + } + + /// Read metadata table header from the metadata region. + fn read_metadata_table_header(snapshot: &[u8], meta_offset: usize) -> MetadataTableHeader { + MetadataTableHeader::read_from_bytes( + &snapshot[meta_offset..meta_offset + size_of::()], + ) + .unwrap() + .clone() + } + + /// Read a metadata table entry at the given index (0-based). + fn read_metadata_entry( + snapshot: &[u8], + meta_offset: usize, + index: usize, + ) -> MetadataTableEntry { + let entry_offset = meta_offset + + size_of::() + + index * size_of::(); + MetadataTableEntry::read_from_bytes( + &snapshot[entry_offset..entry_offset + size_of::()], + ) + .unwrap() + .clone() + } + + #[async_test] + async fn create_default_params() { + let disk_size = format::GB1; + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + + assert_eq!(params.disk_size, disk_size); + assert_eq!(params.block_size, format::DEFAULT_BLOCK_SIZE); + assert_eq!(params.logical_sector_size, 512); + assert_eq!(params.physical_sector_size, 512); + + let snapshot = file.snapshot(); + let file_size = file.file_size().await.unwrap(); + + // File identifier signature at offset 0. + let sig = read_u64(&snapshot, 0); + assert_eq!(sig, format::FILE_IDENTIFIER_SIGNATURE); + + // Header 1 at 64K. + let h1 = read_header(&snapshot, format::HEADER_OFFSET_1 as usize); + assert_eq!(h1.signature, format::HEADER_SIGNATURE); + assert!(format::validate_checksum( + &snapshot[format::HEADER_OFFSET_1 as usize + ..format::HEADER_OFFSET_1 as usize + format::HEADER_SIZE as usize], + 4 + )); + + // Header 2 at 128K. + let h2 = read_header(&snapshot, format::HEADER_OFFSET_2 as usize); + assert_eq!(h2.signature, format::HEADER_SIGNATURE); + assert!(format::validate_checksum( + &snapshot[format::HEADER_OFFSET_2 as usize + ..format::HEADER_OFFSET_2 as usize + format::HEADER_SIZE as usize], + 4 + )); + + // Region table 1 at 192K. + let rt = read_region_table_header(&snapshot, format::REGION_TABLE_OFFSET as usize); + assert_eq!(rt.signature, format::REGION_TABLE_SIGNATURE); + assert_eq!(rt.entry_count, 2); + assert!(format::validate_checksum( + &snapshot[format::REGION_TABLE_OFFSET as usize + ..format::REGION_TABLE_OFFSET as usize + format::REGION_TABLE_SIZE as usize], + 4 + )); + + // Region table 2 checksum. + assert!(format::validate_checksum( + &snapshot[format::ALT_REGION_TABLE_OFFSET as usize + ..format::ALT_REGION_TABLE_OFFSET as usize + format::REGION_TABLE_SIZE as usize], + 4 + )); + + // Metadata region starts at 2 MiB. + let meta_offset = 2 * format::MB1 as usize; + let mth = read_metadata_table_header(&snapshot, meta_offset); + assert_eq!(mth.signature, format::METADATA_TABLE_SIGNATURE); + assert_eq!(mth.entry_count, 5); + + // BAT region should be all zeros. + let bat_offset = 3 * format::MB1 as usize; + // Compute expected BAT length. 
+ let data_block_count = ceil_div(disk_size, format::DEFAULT_BLOCK_SIZE as u64); + let chunk_ratio = chunk_block_count(format::DEFAULT_BLOCK_SIZE, 512); + let bat_entry_count = data_block_count + data_block_count / chunk_ratio as u64; + let bat_sec_count = ceil_div(bat_entry_count, format::ENTRIES_PER_BAT_PAGE); + let bat_len = round_up( + bat_sec_count * format::CACHE_PAGE_SIZE, + format::REGION_ALIGNMENT, + ) as usize; + assert!( + snapshot[bat_offset..bat_offset + bat_len] + .iter() + .all(|&b| b == 0) + ); + + // File size should cover all regions. + assert_eq!(file_size, (bat_offset + bat_len) as u64); + } + + #[async_test] + async fn create_validates_disk_size_zero() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: 0, + ..Default::default() + }; + assert!(create(&file, &mut params).await.is_err()); + } + + #[async_test] + async fn create_validates_disk_size_alignment() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: 1000, // not a multiple of 512 + logical_sector_size: 512, + ..Default::default() + }; + assert!(create(&file, &mut params).await.is_err()); + } + + #[async_test] + async fn create_validates_sector_sizes() { + let file = InMemoryFile::new(0); + + // Invalid logical sector size. + let mut params = CreateParams { + disk_size: format::GB1, + logical_sector_size: 1024, + ..Default::default() + }; + assert!(create(&file, &mut params).await.is_err()); + + // Invalid physical sector size. + let mut params = CreateParams { + disk_size: format::GB1, + physical_sector_size: 8192, + ..Default::default() + }; + assert!(create(&file, &mut params).await.is_err()); + } + + #[async_test] + async fn create_validates_block_size() { + let file = InMemoryFile::new(0); + + // Not a multiple of 1 MiB. + let mut params = CreateParams { + disk_size: format::GB1, + block_size: 500_000, + ..Default::default() + }; + assert!(create(&file, &mut params).await.is_err()); + + // Greater than maximum (256 MiB). + let mut params = CreateParams { + disk_size: format::GB1, + block_size: 512 * 1024 * 1024, + ..Default::default() + }; + assert!(create(&file, &mut params).await.is_err()); + } + + #[async_test] + async fn create_validates_block_alignment() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + block_alignment: 3, // not a power of 2 + ..Default::default() + }; + assert!(create(&file, &mut params).await.is_err()); + } + + #[async_test] + async fn create_with_512_sectors() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + logical_sector_size: 512, + physical_sector_size: 512, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + assert_eq!(params.logical_sector_size, 512); + assert_eq!(params.physical_sector_size, 512); + + let snapshot = file.snapshot(); + let meta_offset = 2 * format::MB1 as usize; + + // Find the logical sector size entry (index 2) and physical (index 3). 
+ let lss_entry = read_metadata_entry(&snapshot, meta_offset, 2); + assert_eq!(lss_entry.item_id, format::LOGICAL_SECTOR_SIZE_ITEM_GUID); + let lss_val = read_u32(&snapshot, meta_offset + lss_entry.offset as usize); + assert_eq!(lss_val, 512); + + let pss_entry = read_metadata_entry(&snapshot, meta_offset, 3); + assert_eq!(pss_entry.item_id, format::PHYSICAL_SECTOR_SIZE_ITEM_GUID); + let pss_val = read_u32(&snapshot, meta_offset + pss_entry.offset as usize); + assert_eq!(pss_val, 512); + } + + #[async_test] + async fn create_with_4k_sectors() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + logical_sector_size: 4096, + physical_sector_size: 4096, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + assert_eq!(params.logical_sector_size, 4096); + assert_eq!(params.physical_sector_size, 4096); + + let snapshot = file.snapshot(); + let meta_offset = 2 * format::MB1 as usize; + + let lss_entry = read_metadata_entry(&snapshot, meta_offset, 2); + let lss_val = read_u32(&snapshot, meta_offset + lss_entry.offset as usize); + assert_eq!(lss_val, 4096); + + let pss_entry = read_metadata_entry(&snapshot, meta_offset, 3); + let pss_val = read_u32(&snapshot, meta_offset + pss_entry.offset as usize); + assert_eq!(pss_val, 4096); + } + + #[async_test] + async fn create_various_block_sizes() { + let block_sizes: Vec = vec![1, 2, 4, 8, 16, 32, 64, 128, 256] + .into_iter() + .map(|m| m * format::MB1 as u32) + .collect(); + + for &bs in &block_sizes { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + block_size: bs, + ..Default::default() + }; + let result = create(&file, &mut params).await; + assert!(result.is_ok(), "failed for block_size={bs}"); + + let snapshot = file.snapshot(); + let sig = read_u64(&snapshot, 0); + assert_eq!(sig, format::FILE_IDENTIFIER_SIGNATURE); + } + } + + #[async_test] + async fn create_block_alignment() { + // No alignment: file ends right after BAT. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + block_alignment: 0, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + let size_no_align = file.file_size().await.unwrap(); + + // With 2 MiB alignment. + let file2 = InMemoryFile::new(0); + let align = 2 * format::MB1 as u32; + let mut params2 = CreateParams { + disk_size: format::GB1, + block_alignment: align, + ..Default::default() + }; + create(&file2, &mut params2).await.unwrap(); + let size_aligned = file2.file_size().await.unwrap(); + + // Aligned size should be >= non-aligned and a multiple of alignment. + assert!(size_aligned >= size_no_align); + assert_eq!(size_aligned % align as u64, 0); + + // With alignment == block_size (should be honored since + // block_alignment <= block_size). + let file3 = InMemoryFile::new(0); + let mut params3 = CreateParams { + disk_size: format::GB1, + block_alignment: params.block_size, + ..Default::default() + }; + create(&file3, &mut params3).await.unwrap(); + let size3 = file3.file_size().await.unwrap(); + assert_eq!(size3 % params.block_size as u64, 0); + } + + #[async_test] + async fn create_differencing_disk() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + + let snapshot = file.snapshot(); + let meta_offset = 2 * format::MB1 as usize; + + // File parameters entry (index 0). 
+ let fp_entry = read_metadata_entry(&snapshot, meta_offset, 0); + assert_eq!(fp_entry.item_id, format::FILE_PARAMETERS_ITEM_GUID); + + // Read the FileParameters data. + let fp_off = meta_offset + fp_entry.offset as usize; + let fp = FileParameters::read_from_bytes( + &snapshot[fp_off..fp_off + size_of::()], + ) + .unwrap(); + assert!(fp.flags.has_parent()); + + // BAT entry count should include sector bitmap entries. + let data_block_count = ceil_div(format::GB1, format::DEFAULT_BLOCK_SIZE as u64); + let chunk_ratio = chunk_block_count(format::DEFAULT_BLOCK_SIZE, 512); + let sbm_count = ceil_div(data_block_count, chunk_ratio as u64); + let bat_entry_count_diff = sbm_count * (chunk_ratio as u64 + 1); + let bat_entry_count_nondiff = data_block_count + data_block_count / chunk_ratio as u64; + // Differencing should have more entries. + assert!(bat_entry_count_diff > bat_entry_count_nondiff); + } + + #[async_test] + async fn create_incomplete() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + create_incomplete: true, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + + let snapshot = file.snapshot(); + let meta_offset = 2 * format::MB1 as usize; + let mth = read_metadata_table_header(&snapshot, meta_offset); + assert_eq!(mth.entry_count, 6); + + // The 6th entry (index 5) should be the incomplete file item. + let entry = read_metadata_entry(&snapshot, meta_offset, 5); + assert_eq!(entry.item_id, format::INCOMPLETE_FILE_ITEM_GUID); + assert!(entry.flags.is_required()); + assert!(!entry.flags.is_virtual_disk()); + assert_eq!(entry.offset, 0); + assert_eq!(entry.length, 0); + } + + #[async_test] + async fn create_headers_have_different_sequence_numbers() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + + let snapshot = file.snapshot(); + let h1 = read_header(&snapshot, format::HEADER_OFFSET_1 as usize); + let h2 = read_header(&snapshot, format::HEADER_OFFSET_2 as usize); + + assert_eq!(h1.sequence_number, 0); + assert_eq!(h2.sequence_number, 1); + } + + #[async_test] + async fn create_region_tables_are_identical() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + + let snapshot = file.snapshot(); + let rt1_start = format::REGION_TABLE_OFFSET as usize; + let rt1_end = rt1_start + format::REGION_TABLE_SIZE as usize; + let rt2_start = format::ALT_REGION_TABLE_OFFSET as usize; + let rt2_end = rt2_start + format::REGION_TABLE_SIZE as usize; + + assert_eq!(&snapshot[rt1_start..rt1_end], &snapshot[rt2_start..rt2_end]); + } +} diff --git a/vm/devices/storage/vhdx/src/error.rs b/vm/devices/storage/vhdx/src/error.rs new file mode 100644 index 0000000000..b7722975f3 --- /dev/null +++ b/vm/devices/storage/vhdx/src/error.rs @@ -0,0 +1,349 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Error types for the VHDX parser. +//! +//! Separate error types are provided for each category of operation: +//! +//! - [`CreateError`] — file creation parameter validation +//! - [`OpenError`] — file open and format parsing +//! - [`VhdxIoError`] — runtime I/O (read, write, flush, trim, close) + +use thiserror::Error; + +use crate::log_task::LogTaskError; + +/// The VHDX write pipeline has been poisoned by a previous fatal error. 
+/// +/// Once set, all runtime I/O operations on the file fail permanently. +/// Produced by the log permits semaphore and LSN watermark when the +/// log or apply task encounters a fatal error. +#[derive(Debug, Clone, Error)] +#[error("VHDX pipeline failed: {0}")] +pub(crate) struct PipelineFailed(pub(crate) String); + +/// Errors returned by VHDX file creation ([`create::create`](crate::create::create)). +#[derive(Debug, Error)] +pub enum CreateError { + /// An I/O error occurred while writing the VHDX file. + #[error("write error")] + Write(#[source] std::io::Error), + + /// A parameter validation error. + #[error("invalid format parameters")] + InvalidFormat(#[source] InvalidFormatReason), +} + +/// Errors returned when opening or parsing a VHDX file. +/// +/// Covers file identifier validation, header parsing, region table +/// validation, metadata parsing, BAT loading, and parent locator +/// parsing. +#[derive(Debug, Error)] +pub(crate) enum OpenErrorInner { + /// An I/O error occurred while reading the VHDX file. + #[error("I/O error")] + Io(#[source] std::io::Error), + + /// The VHDX file is corrupt or has an invalid structure. + #[error("VHDX file is corrupt")] + Corrupt(#[from] CorruptionType), + + /// An open option (e.g. block alignment) is invalid. + #[error("invalid parameter")] + InvalidParameter(InvalidFormatReason), + + /// The write pipeline failed during writable open initialization. + #[error("pipeline failed during open")] + PipelineFailed(#[source] PipelineFailed), + + /// A metadata item could not be read through the page cache. + #[error("failed to access metadata page cache")] + MetadataCache(#[source] CacheError), +} + +/// Errors returned when opening or parsing a VHDX file. +/// +/// Covers file identifier validation, header parsing, region table +/// validation, metadata parsing, BAT loading, and parent locator +/// parsing. +#[derive(Debug, Error)] +#[error(transparent)] +pub struct OpenError(pub(crate) OpenErrorInner); + +impl> From for OpenError { + fn from(inner: T) -> Self { + OpenError(inner.into()) + } +} + +/// Errors returned by runtime VHDX I/O operations. +/// +/// Covers read, write, flush, trim, and close. Use [`kind()`](Self::kind) +/// to classify the error. +#[derive(Debug, Error)] +#[error(transparent)] +pub struct VhdxIoError(#[from] pub(crate) VhdxIoErrorInner); + +impl VhdxIoError { + /// Classify this error into a [`VhdxIoErrorKind`]. + pub fn kind(&self) -> VhdxIoErrorKind { + match self.0 { + VhdxIoErrorInner::ReadOnly => VhdxIoErrorKind::ReadOnly, + VhdxIoErrorInner::UnalignedIo => VhdxIoErrorKind::InvalidInput, + VhdxIoErrorInner::BeyondEndOfDisk => VhdxIoErrorKind::InvalidSector, + _ => VhdxIoErrorKind::Other, + } + } +} + +/// Classification of [`VhdxIoError`] for programmatic handling. +#[non_exhaustive] +pub enum VhdxIoErrorKind { + /// The file was opened read-only. + ReadOnly, + /// The I/O request had invalid parameters (e.g., unaligned). + InvalidInput, + /// The I/O request referenced a sector beyond the virtual disk. + InvalidSector, + /// Any other error (I/O failure, pipeline failure, etc.). + Other, +} + +/// Inner representation of [`VhdxIoError`]. 
+#[derive(Debug, Error)] +pub(crate) enum VhdxIoErrorInner { + #[error("failed to write header")] + WriteHeader(#[source] std::io::Error), + #[error("failed to flush")] + Flush(#[source] std::io::Error), + #[error("failed to commit cache")] + CommitCache(#[source] CacheError), + #[error("failed to read sector bitmap")] + ReadSectorBitmap(#[source] CacheError), + #[error("failed to zero block at file offset {file_offset:#x}")] + ZeroBlock { + #[source] + err: std::io::Error, + file_offset: u64, + }, + #[error("failed to extend file to {target_file_size:#x}")] + ExtendFile { + #[source] + err: std::io::Error, + target_file_size: u64, + }, + #[error("failed to truncate file to {target_file_size:#x}")] + TruncateFile { + #[source] + err: std::io::Error, + target_file_size: u64, + }, + #[error("failed to access BAT page cache")] + BatCache(#[source] CacheError), + #[error("failed to access sector bitmap page cache")] + SectorBitmapCache(#[source] CacheError), + #[error("VHDX file is opened read-only")] + ReadOnly, + #[error("VHDX file failed")] + Failed(#[source] PipelineFailed), + #[error("I/O request is not aligned to logical sector size")] + UnalignedIo, + #[error("I/O request extends beyond end of virtual disk")] + BeyondEndOfDisk, + #[error("failed to close log task")] + LogClose(#[source] LogTaskError), +} + +/// Errors from the page cache write path. +/// +/// Produced by [`PageCache::acquire_write`](crate::cache::PageCache::acquire_write) +/// when a page cannot be acquired for writing. +#[derive(Debug, Error)] +pub(crate) enum CacheError { + /// An I/O error occurred while loading the page from disk. + #[error("read error at file offset {file_offset:#x}")] + Read { + #[source] + err: std::io::Error, + file_offset: u64, + }, + + /// The write pipeline has been poisoned by a previous fatal error. + #[error("pipeline failed")] + PipelineFailed(#[source] PipelineFailed), +} + +/// Specific reasons a VHDX creation or parameter validation may fail. +/// +/// Each variant corresponds to a distinct validation error detected +/// when processing VHDX parameters (e.g. during file creation). +#[derive(Debug, Clone, Error)] +pub enum InvalidFormatReason { + /// The logical sector size is not 512 or 4096. + #[error("logical sector size must be 512 or 4096")] + InvalidLogicalSectorSize, + + /// The physical sector size is not 512 or 4096. + #[error("physical sector size must be 512 or 4096")] + InvalidPhysicalSectorSize, + + /// The disk size is zero. + #[error("disk size must be > 0")] + DiskSizeZero, + + /// The disk size is not a multiple of the logical sector size. + #[error("disk size must be a multiple of logical sector size")] + DiskSizeNotAligned, + + /// The disk size exceeds the maximum (64 TiB). + #[error("disk size exceeds maximum (64 TiB)")] + DiskSizeTooLarge, + + /// The block size is not a multiple of 1 MiB. + #[error("block size must be a multiple of 1 MiB")] + BlockSizeNotAligned, + + /// The block size exceeds the maximum (256 MiB). + #[error("block size exceeds maximum (256 MiB)")] + BlockSizeTooLarge, + + /// The block alignment is not a power of 2. + #[error("block alignment must be a power of 2")] + BlockAlignmentNotPowerOfTwo, + + /// The block size / logical sector size combination is invalid (chunk ratio is zero). + #[error("invalid block size / logical sector size combination")] + InvalidChunkRatio, + + /// The computed BAT entry count exceeds the absolute maximum. 
+ #[error("BAT entry count exceeds absolute maximum")] + BatEntryCountTooLarge, + + /// The computed BAT size exceeds the maximum. + #[error("BAT size exceeds maximum")] + BatSizeTooLarge, +} + +/// Specific reasons a VHDX file may be considered corrupt. +/// +/// Each variant corresponds to a distinct corruption condition detected +/// during parsing or validation. Covers all corruption types from the +/// VHDX implementation. +#[derive(Debug, Clone, Error)] +pub(crate) enum CorruptionType { + #[error("user metadata entry is marked as required")] + MetadataUserRequired, + #[error("BAT region is too small for the disk geometry")] + BatTooSmall, + #[error("no valid VHDX headers found")] + NoValidHeaders, + #[error("invalid log offset or length in header")] + InvalidLogOffsetOrLength, + #[error("log offset is not aligned")] + InvalidLogOffset, + #[error("log region extends beyond end of file")] + LogBeyondEndOfFile, + #[error("parent locator item is too small for its header")] + LocatorTooSmallForHeader, + #[error("parent locator item is too small for its entries")] + LocatorTooSmallForEntries, + #[error("parent locator entry key is invalid")] + InvalidLocatorEntryKey, + #[error("parent locator entry value is invalid")] + InvalidLocatorEntryValue, + #[error("metadata table has an invalid signature")] + InvalidMetadataTableSignature, + #[error("metadata table entry count too high")] + MetadataTableEntryCountTooHigh, + #[error("reserved metadata table field is nonzero")] + ReservedMetadataTableFieldNonzero, + #[error("duplicate metadata GUID")] + MetadataDuplicateGuid, + #[error("metadata entries have overlapping ranges")] + MetadataOverlapping, + #[error("user metadata entry count exceeded")] + MetadataUserCountExceeded, + #[error("file is empty")] + EmptyFile, + #[error("file parameters item has invalid size")] + InvalidFileParameterSize, + #[error("reserved file parameters field is nonzero")] + ReservedFileParametersFieldNonzero, + #[error("file parameters marked as virtual disk metadata")] + FileParametersMarkedVirtual, + #[error("invalid block size")] + InvalidBlockSize, + #[error("invalid logical sector size")] + InvalidLogicalSectorSize, + #[error("logical sector size marked as virtual disk metadata")] + LogicalSectorSizeMarkedVirtual, + #[error("invalid sector size")] + InvalidSectorSize, + #[error("logical sector size item has invalid size")] + InvalidLogicalSectorSizeSize, + #[error("disk size item marked as virtual disk metadata")] + DiskMarkedVirtual, + #[error("invalid virtual disk size")] + InvalidDiskSize, + #[error("both region tables are corrupt")] + RegionTablesBothCorrupt, + #[error("invalid entry count in region table")] + InvalidEntryCountInRegionTable, + #[error("reserved region table field is nonzero")] + ReservedRegionTableFieldNonzero, + #[error("duplicate region table entry")] + DuplicateRegionEntry, + #[error("invalid offset or length in region table entry")] + OffsetOrLengthInRegionTable, + #[error("unknown required region")] + UnknownRequiredRegion, + #[error("BAT or metadata region is missing")] + MissingBatOrMetadataRegion, + #[error("bad log entry encountered during replay")] + BadLogEntryOnReplay, + #[error("no valid log entries found")] + NoValidLogEntries, + #[error("BAT entry references range beyond end of file")] + RangeBeyondEof, + #[error("BAT entries reference overlapping file ranges")] + RangeCollision, + #[error("invalid block state in BAT entry")] + InvalidBlockState, + #[error("reserved BAT entry field is nonzero")] + ReservedBatEntryFieldNonzero, + 
#[error("partially present block has no sector bitmap")] + PartiallyPresentWithoutSectorBitmap, + #[error("trimmed range collides with allocated range")] + TrimmedRangeCollision, + #[error("unknown required metadata item")] + UnknownRequiredMetadata, + #[error("file is marked as incomplete")] + IncompleteFile, + #[error("required metadata item is missing")] + MissingRequiredMetadata, + #[error("header has log GUID but log is missing")] + MissingLogHasGuid, + #[error("invalid metadata entry offset")] + InvalidMetadataEntryOffset, + #[error("metadata region is too large")] + MetadataRegionTooLarge, + #[error("metadata item is too large")] + MetadataItemTooLarge, + #[error("total metadata size per category exceeded")] + TotalMetadataSizeExceeded, + #[error("metadata entry has zero item GUID")] + ZeroMetadataItemId, + #[error("invalid file identifier signature")] + InvalidFileIdentifier, + #[error("invalid parent locator key-value count")] + InvalidLocatorKeyValueCount, + #[error("log is full")] + LogFull, + #[error("log replay required (log GUID is non-zero)")] + LogReplayRequired, + #[error("unsupported VHDX version")] + UnsupportedVersion, + #[error("unsupported VHDX log version")] + UnsupportedLogVersion, +} diff --git a/vm/devices/storage/vhdx/src/flush.rs b/vm/devices/storage/vhdx/src/flush.rs new file mode 100644 index 0000000000..753d3af2f4 --- /dev/null +++ b/vm/devices/storage/vhdx/src/flush.rs @@ -0,0 +1,601 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Flush sequencer — FSN tracking and concurrent flush coalescing. +//! +//! The VHDX write path needs a way to order and coalesce file flush operations. +//! Multiple concurrent callers may request flushes simultaneously (e.g., several +//! I/O threads completing writes at the same time). Rather than issuing one file +//! flush per caller, the [`FlushSequencer`] coalesces them: if a flush is already +//! in progress that will satisfy a caller's flush sequence number (FSN), the +//! caller waits for that flush instead of issuing a new one. +//! +//! FSNs increase monotonically. Each `flush()` call is assigned the next FSN. +//! When the flush I/O completes, the completed FSN advances to match. Callers +//! can ensure all data through a specific FSN is flushed via +//! [`FlushSequencer::flush_through`]. +//! +//! Used by both the [log task](crate::log_task) (to make WAL entries durable +//! before advancing the log tail) and +//! [`VhdxFile::flush()`](crate::open::VhdxFile::flush) (for crash safety). + +use crate::AsyncFile; +use crate::open::FailureFlag; +use event_listener::Event; +use parking_lot::Mutex; +use std::sync::Arc; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering::Acquire; +use std::sync::atomic::Ordering::Release; + +/// Tracks flush sequence numbers and coalesces concurrent flush requests. +/// +/// Multiple callers can request flushes concurrently. The sequencer ensures +/// that at most one file flush is in progress at a time. If a flush is +/// in-flight that will satisfy a caller's FSN, the caller waits for that +/// flush instead of issuing a redundant one. +/// +/// FSNs increase monotonically. Each [`flush()`](FlushSequencer::flush) call +/// is assigned the next FSN. [`flush_through()`](FlushSequencer::flush_through) +/// ensures all data through a specific FSN is flushed (used by the log task +/// to enforce ordering constraints like "data must be flushed before BAT +/// is logged"). 
+pub(crate) struct FlushSequencer {
+    state: Mutex<FlushState>,
+    failure_flag: Option<Arc<FailureFlag>>,
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub(crate) struct Fsn(u64);
+
+impl Fsn {
+    pub const ZERO: Self = Fsn(0);
+
+    #[cfg(test)]
+    pub(crate) const fn new(value: u64) -> Self {
+        Self(value)
+    }
+}
+
+struct FlushState {
+    /// The most recently issued FSN that has been assigned. The next flush
+    /// will get `issued_fsn + 1`.
+    issued_fsn: Fsn,
+    /// The most recently completed FSN. All FSNs <= this value have been
+    /// durably flushed.
+    completed_fsn: Fsn,
+    /// The currently in-progress flush, if any. New callers whose target FSN
+    /// is covered by this flush wait on it rather than issuing a redundant
+    /// flush. If a caller needs a higher FSN, it replaces this with a new
+    /// flush (the old one continues running but is no longer advertised).
+    active_flush: Option<Arc<Flush>>,
+}
+
+/// A single in-progress flush operation. Waiters hold an `Arc` clone and
+/// poll [`wait_done`](Flush::wait_done) until the flusher signals completion.
+struct Flush {
+    /// The FSN that this flush will satisfy when it completes.
+    fsn: Fsn,
+    /// Set to `true` by the flusher after the I/O completes (success or failure).
+    done: AtomicBool,
+    /// Notified when `done` becomes `true`.
+    event: Event,
+}
+
+impl FlushSequencer {
+    /// Create a new flush sequencer with FSNs starting at 0.
+    pub fn new() -> Self {
+        Self {
+            state: Mutex::new(FlushState {
+                issued_fsn: Fsn::ZERO,
+                completed_fsn: Fsn::ZERO,
+                active_flush: None,
+            }),
+            failure_flag: None,
+        }
+    }
+
+    /// Set the failure flag for poisoning on I/O errors.
+    pub fn set_failure_flag(&mut self, flag: Arc<FailureFlag>) {
+        self.failure_flag = Some(flag);
+    }
+
+    /// Returns the next FSN that will be assigned to a flush request.
+    ///
+    /// This is `issued_fsn + 1`. Callers use this to capture the "current
+    /// point in time" after performing writes, so they can later
+    /// [`flush_through()`](Self::flush_through) to ensure those writes have
+    /// been flushed.
+    ///
+    /// # Why this is safe without "reserving" an FSN
+    ///
+    /// Because `completed_fsn ≤ issued_fsn` always holds, `current_fsn()`
+    /// always returns a value *strictly greater than* `completed_fsn`.
+    /// Therefore `flush_through(current_fsn())` will never short-circuit
+    /// — it always creates a new `Flush` (or waits for one with a
+    /// sufficiently high FSN) that calls `file.flush()` *after* the
+    /// caller's writes have been issued. No separate reserve/issue
+    /// step is needed.
+    pub fn current_fsn(&self) -> Fsn {
+        let state = self.state.lock();
+        Fsn(state.issued_fsn.0 + 1)
+    }
+
+    /// Request a file flush through the sequencer.
+    ///
+    /// Assigns the next FSN to this flush request and ensures that a file
+    /// flush completes that covers this FSN. Multiple concurrent `flush()`
+    /// calls are coalesced: if a flush is already in progress, the caller
+    /// waits for it to complete. If the completed FSN is still less than the
+    /// caller's FSN after the in-progress flush finishes, a new flush is
+    /// issued.
+    ///
+    /// Returns the completed FSN of the flush that satisfied this request.
+    /// Under concurrency this may be greater than the FSN originally assigned
+    /// to this caller, because a single flush can cover multiple requests.
+    pub async fn flush(&self, file: &impl AsyncFile) -> Result<Fsn, std::io::Error> {
+        self.flush_until(file, None).await
+    }
+
+    /// Ensure all data through the given FSN is durably flushed.
+    ///
+    /// If the FSN has already completed, returns immediately.
Otherwise, + /// bumps `issued_fsn` if needed and waits for a flush to complete that + /// covers the requested FSN. + /// + /// This is the safe replacement for the old `require_fsn` + `wait_for_fsn` + /// pattern — it both issues and waits in a single call. + pub async fn flush_through( + &self, + file: &impl AsyncFile, + fsn: Fsn, + ) -> Result<(), std::io::Error> { + let completed = self.flush_until(file, Some(fsn)).await?; + // Safety invariant: callers only pass FSNs from current_fsn(), which is + // issued_fsn + 1 at capture time. Since issued_fsn only grows, by the + // time we run, issued_fsn + 1 >= fsn, so the flush we create always + // covers the target. If this fires, a caller passed a bogus FSN. + assert!( + completed >= fsn, + "flush_through({fsn:?}) completed only through {completed:?}" + ); + Ok(()) + } + + /// Returns the most recently completed FSN. + pub fn completed_fsn(&self) -> Fsn { + self.state.lock().completed_fsn + } + + /// Inner workhorse: keep flushing until `completed_fsn >= target_fsn`. + /// + /// `target_fsn`: + /// - `None` — assign the next sequential FSN (used by `flush()`). + /// - `Some(fsn)` — ensure completion through that FSN (used by `flush_through()`). + /// + /// Returns the resolved FSN. + async fn flush_until( + &self, + file: &impl AsyncFile, + mut requested_fsn: Option, + ) -> Result { + // Step 1: find or create the Flush we'll execute. + // + // If there's an active flush covering our target FSN, wait for it. + // When it completes, loop back — if completed_fsn >= target we're + // done; otherwise we'll create a new flush ourselves. + // + // If no active flush covers our target, create one and `break` out + // of the loop to proceed to the I/O in step 2. + let my_flush = loop { + let active = { + let mut state = self.state.lock(); + let target_fsn = requested_fsn.unwrap_or(Fsn(state.issued_fsn.0 + 1)); + requested_fsn = Some(target_fsn); + + if target_fsn <= state.completed_fsn { + return Ok(state.completed_fsn); + } + + if let Some(active) = &state.active_flush + && active.fsn >= target_fsn + { + active.clone() + } else { + let fsn = Fsn(state.issued_fsn.0 + 1); + // flush_through() callers capture FSNs via current_fsn(), + // which returns issued_fsn + 1. Since issued_fsn only + // grows, the target can never exceed the next FSN we're + // about to issue. + assert!( + target_fsn <= fsn, + "flush_through target {target_fsn:?} exceeds next FSN {fsn:?}" + ); + let new_flush = Arc::new(Flush { + fsn, + done: false.into(), + event: Default::default(), + }); + state.active_flush = Some(new_flush.clone()); + state.issued_fsn = fsn; + break new_flush; + } + }; + active.wait_done().await; + }; + + // Step 2: perform the actual file flush and update state. + let r = file.flush().await; + let completed_fsn = { + let mut state = self.state.lock(); + if r.is_ok() { + state.completed_fsn = my_flush.fsn.max(state.completed_fsn); + } + if state + .active_flush + .as_ref() + .is_some_and(|p| Arc::ptr_eq(p, &my_flush)) + { + state.active_flush = None; + } + state.completed_fsn + }; + my_flush.done.store(true, Release); + my_flush.event.notify(usize::MAX); + r.inspect_err(|e| { + if let Some(flag) = &self.failure_flag { + flag.set(e); + } + })?; + Ok(completed_fsn) + } +} + +impl Flush { + /// Wait for this flush to complete (success or failure). 
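+    ///
+    /// The listener is registered before `done` is checked, so a notification
+    /// that races with the check cannot be missed.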
+ async fn wait_done(&self) { + loop { + let event = self.event.listen(); + if self.done.load(Acquire) { + break; + } + event.await; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + use std::borrow::Borrow; + use std::sync::Arc; + use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; + + // -- Helper wrappers -- + + /// File wrapper that counts how many times `flush()` is called. + struct CountingFile { + inner: InMemoryFile, + flush_count: AtomicU32, + } + + impl CountingFile { + fn new() -> Self { + Self { + inner: InMemoryFile::new(0), + flush_count: AtomicU32::new(0), + } + } + + fn flush_count(&self) -> u32 { + self.flush_count.load(Ordering::Relaxed) + } + } + + impl AsyncFile for CountingFile { + type Buffer = Vec; + + fn alloc_buffer(&self, len: usize) -> Vec { + self.inner.alloc_buffer(len) + } + + async fn read_into(&self, offset: u64, buf: Vec) -> Result, std::io::Error> { + self.inner.read_into(offset, buf).await + } + + async fn write_from( + &self, + offset: u64, + buf: impl Borrow> + Send + 'static, + ) -> Result<(), std::io::Error> { + self.inner.write_from(offset, buf).await + } + + async fn flush(&self) -> Result<(), std::io::Error> { + self.flush_count.fetch_add(1, Ordering::Relaxed); + self.inner.flush().await + } + + async fn file_size(&self) -> Result { + self.inner.file_size().await + } + + async fn set_file_size(&self, size: u64) -> Result<(), std::io::Error> { + self.inner.set_file_size(size).await + } + } + + /// File wrapper that can be configured to fail flushes. + struct FailingFile { + inner: InMemoryFile, + fail_flush: AtomicBool, + } + + impl FailingFile { + fn new(fail: bool) -> Self { + Self { + inner: InMemoryFile::new(0), + fail_flush: AtomicBool::new(fail), + } + } + + fn set_fail(&self, fail: bool) { + self.fail_flush.store(fail, Ordering::Relaxed); + } + } + + impl AsyncFile for FailingFile { + type Buffer = Vec; + + fn alloc_buffer(&self, len: usize) -> Vec { + self.inner.alloc_buffer(len) + } + + async fn read_into(&self, offset: u64, buf: Vec) -> Result, std::io::Error> { + self.inner.read_into(offset, buf).await + } + + async fn write_from( + &self, + offset: u64, + buf: impl Borrow> + Send + 'static, + ) -> Result<(), std::io::Error> { + self.inner.write_from(offset, buf).await + } + + async fn flush(&self) -> Result<(), std::io::Error> { + if self.fail_flush.load(Ordering::Relaxed) { + return Err(std::io::Error::other("flush failed")); + } + self.inner.flush().await + } + + async fn file_size(&self) -> Result { + self.inner.file_size().await + } + + async fn set_file_size(&self, size: u64) -> Result<(), std::io::Error> { + self.inner.set_file_size(size).await + } + } + + // -- Tests -- + + /// Single `flush()` call → FSN advances from 0 to 1. + #[async_test] + async fn test_basic_flush() { + let file = InMemoryFile::new(0); + let seq = FlushSequencer::new(); + let fsn = seq.flush(&file).await.unwrap(); + assert_eq!(fsn, Fsn::new(1)); + assert_eq!(seq.completed_fsn(), Fsn::new(1)); + } + + /// Three sequential `flush()` calls → FSNs are 1, 2, 3. 
+ #[async_test] + async fn test_fsn_monotonically_increasing() { + let file = InMemoryFile::new(0); + let seq = FlushSequencer::new(); + let fsn1 = seq.flush(&file).await.unwrap(); + let fsn2 = seq.flush(&file).await.unwrap(); + let fsn3 = seq.flush(&file).await.unwrap(); + assert_eq!(fsn1, Fsn::new(1)); + assert_eq!(fsn2, Fsn::new(2)); + assert_eq!(fsn3, Fsn::new(3)); + assert_eq!(seq.completed_fsn(), Fsn::new(3)); + } + + /// `current_fsn()` returns 1 initially, advances after each flush. + #[async_test] + async fn test_current_fsn() { + let file = InMemoryFile::new(0); + let seq = FlushSequencer::new(); + assert_eq!(seq.current_fsn(), Fsn::new(1)); + seq.flush(&file).await.unwrap(); + assert_eq!(seq.current_fsn(), Fsn::new(2)); + seq.flush(&file).await.unwrap(); + assert_eq!(seq.current_fsn(), Fsn::new(3)); + } + + /// Spawn two concurrent `flush()` tasks. Both should complete, and the + /// total number of actual file flushes should be ≤ 2 (possibly 1 if + /// coalesced). + #[async_test] + async fn test_concurrent_flush_coalescing() { + let file = Arc::new(CountingFile::new()); + let seq = Arc::new(FlushSequencer::new()); + + let file1 = file.clone(); + let seq1 = seq.clone(); + let t1 = + futures::FutureExt::boxed(async move { seq1.flush(file1.as_ref()).await.unwrap() }); + + let file2 = file.clone(); + let seq2 = seq.clone(); + let t2 = + futures::FutureExt::boxed(async move { seq2.flush(file2.as_ref()).await.unwrap() }); + + let (fsn1, fsn2) = futures::join!(t1, t2); + + // Both FSNs should be valid (1 or 2). + assert!((Fsn::new(1)..=Fsn::new(2)).contains(&fsn1)); + assert!((Fsn::new(1)..=Fsn::new(2)).contains(&fsn2)); + assert_ne!(fsn1, fsn2); + + // Completed FSN should be at least the max of both. + assert!(seq.completed_fsn() >= fsn1.max(fsn2)); + + // At most 2 actual file flushes should have occurred. + assert!(file.flush_count() <= 2); + } + + /// Call `flush()`, then `flush_through(fsn)` → returns immediately. + #[async_test] + async fn test_flush_through_already_completed() { + let file = CountingFile::new(); + let seq = FlushSequencer::new(); + let fsn = seq.flush(&file).await.unwrap(); + let count_before = file.flush_count(); + // Should return immediately since the FSN is already completed. + seq.flush_through(&file, fsn).await.unwrap(); + assert_eq!(seq.completed_fsn(), fsn); + // No additional flush should have been issued. + assert_eq!(file.flush_count(), count_before); + } + + /// Call `flush_through(fsn)` on an un-issued FSN → triggers a flush + /// and completes. + #[async_test] + async fn test_flush_through_triggers_flush() { + let file = CountingFile::new(); + let seq = FlushSequencer::new(); + // FSN 1 has not been issued yet. + seq.flush_through(&file, Fsn::new(1)).await.unwrap(); + assert!(seq.completed_fsn() >= Fsn::new(1)); + assert!(file.flush_count() >= 1); + } + + /// Spawn a concurrent `flush()` and `flush_through()` — both complete. 
+ #[async_test] + async fn test_flush_through_waits_for_in_progress() { + let file = Arc::new(CountingFile::new()); + let seq = Arc::new(FlushSequencer::new()); + + let file1 = file.clone(); + let seq1 = seq.clone(); + let flusher = futures::FutureExt::boxed(async move { + seq1.flush(file1.as_ref()).await.unwrap(); + }); + + let file2 = file.clone(); + let seq2 = seq.clone(); + let waiter = futures::FutureExt::boxed(async move { + seq2.flush_through(file2.as_ref(), Fsn::new(1)) + .await + .unwrap(); + }); + + futures::join!(flusher, waiter); + assert!(seq.completed_fsn() >= Fsn::new(1)); + } + + /// Use a file wrapper that fails on `flush()` → `flush()` returns error, + /// `completed_fsn` does NOT advance. + #[async_test] + async fn test_flush_error_propagated() { + let file = FailingFile::new(true); + let seq = FlushSequencer::new(); + let result = seq.flush(&file).await; + assert!(result.is_err()); + assert_eq!(seq.completed_fsn(), Fsn::ZERO); + } + + /// Use a file wrapper that fails on the first `flush()` but succeeds on + /// retry → first call fails, second `flush()` succeeds and FSN advances. + #[async_test] + async fn test_flush_error_recovery() { + let file = FailingFile::new(true); + let seq = FlushSequencer::new(); + + // First flush should fail. + let result = seq.flush(&file).await; + assert!(result.is_err()); + assert_eq!(seq.completed_fsn(), Fsn::ZERO); + + // Allow flushes to succeed now. + file.set_fail(false); + + // Second flush should succeed. + let fsn = seq.flush(&file).await.unwrap(); + assert!(fsn >= Fsn::new(1)); + assert!(seq.completed_fsn() >= fsn); + } + + /// `flush_through(0)` returns immediately — FSN 0 is always completed + /// since the sequencer starts with `completed_fsn = 0`. + #[async_test] + async fn test_flush_through_zero_is_noop() { + let file = CountingFile::new(); + let seq = FlushSequencer::new(); + seq.flush_through(&file, Fsn::ZERO).await.unwrap(); + assert_eq!(file.flush_count(), 0); + assert_eq!(seq.completed_fsn(), Fsn::ZERO); + } + + /// `flush_through` on a failing file propagates the error, and a + /// subsequent retry with a working file succeeds. + #[async_test] + async fn test_flush_through_error_recovery() { + let file = FailingFile::new(true); + let seq = FlushSequencer::new(); + + let result = seq.flush_through(&file, Fsn::new(1)).await; + assert!(result.is_err()); + assert_eq!(seq.completed_fsn(), Fsn::ZERO); + + file.set_fail(false); + seq.flush_through(&file, Fsn::new(1)).await.unwrap(); + assert!(seq.completed_fsn() >= Fsn::new(1)); + } + + /// Two concurrent `flush_through` calls for the same FSN — both + /// complete, and the total number of file flushes is reasonable. + #[async_test] + async fn test_concurrent_flush_through_same_fsn() { + let file = Arc::new(CountingFile::new()); + let seq = Arc::new(FlushSequencer::new()); + + let file1 = file.clone(); + let seq1 = seq.clone(); + let t1 = futures::FutureExt::boxed(async move { + seq1.flush_through(file1.as_ref(), Fsn::new(1)) + .await + .unwrap(); + }); + + let file2 = file.clone(); + let seq2 = seq.clone(); + let t2 = futures::FutureExt::boxed(async move { + seq2.flush_through(file2.as_ref(), Fsn::new(1)) + .await + .unwrap(); + }); + + futures::join!(t1, t2); + assert!(seq.completed_fsn() >= Fsn::new(1)); + } + + /// `flush()` returns `completed_fsn`, which may be higher than the + /// caller's own FSN if a concurrent flush for a higher FSN completed. 
+ #[async_test] + async fn test_flush_returns_completed_fsn() { + let file = InMemoryFile::new(0); + let seq = FlushSequencer::new(); + + // Sequential flushes: returned FSN equals completed_fsn. + let fsn = seq.flush(&file).await.unwrap(); + assert_eq!(fsn, seq.completed_fsn()); + } +} diff --git a/vm/devices/storage/vhdx/src/format.rs b/vm/devices/storage/vhdx/src/format.rs new file mode 100644 index 0000000000..473e642f20 --- /dev/null +++ b/vm/devices/storage/vhdx/src/format.rs @@ -0,0 +1,782 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! On-disk format types and constants for the VHDX file format. +//! +//! All structures use `#[repr(C)]` and derive zerocopy traits for safe +//! zero-copy parsing. + +#![allow(dead_code)] + +use bitfield_struct::bitfield; +use guid::Guid; +use guid::guid; +use zerocopy::FromBytes; +use zerocopy::Immutable; +use zerocopy::IntoBytes; +use zerocopy::KnownLayout; + +// --------------------------------------------------------------------------- +// Size constants +// --------------------------------------------------------------------------- + +/// 4 KiB. +pub const KB4: u64 = 4096; +/// 64 KiB. +pub const KB64: u64 = 65536; +/// 1 MiB. +pub const MB1: u64 = 1024 * 1024; +/// 1 GiB. +pub const GB1: u64 = 1024 * MB1; +/// 1 TiB. +pub const TB1: u64 = MB1 * MB1; + +/// Size of a log sector (4 KiB). +pub const LOG_SECTOR_SIZE: u64 = KB4; +/// Size of a large sector (64 KiB). +pub const LARGE_SECTOR_SIZE: u64 = KB64; +/// Alignment requirement for VHDX regions. +pub const REGION_ALIGNMENT: u64 = MB1; +/// Size of a sector bitmap block (1 MiB). +pub const SECTOR_BITMAP_BLOCK_SIZE: u64 = MB1; +/// Number of sectors described per chunk (sector bitmap block size * 8 bits). +pub const SECTORS_PER_CHUNK: u64 = SECTOR_BITMAP_BLOCK_SIZE * 8; +/// Minimum file offset that may be covered by log replay. +pub const LOGGABLE_OFFSET: u64 = REGION_TABLE_OFFSET; + +// --------------------------------------------------------------------------- +// Header area +// --------------------------------------------------------------------------- + +/// Total size of the header area (1 MiB). +pub const HEADER_AREA_SIZE: u64 = MB1; +/// On-disk size of a single header (4 KiB). +pub const HEADER_SIZE: u64 = KB4; +/// File offset of the first (primary) header. +pub const HEADER_OFFSET_1: u64 = LARGE_SECTOR_SIZE; +/// File offset of the second (alternate) header. +pub const HEADER_OFFSET_2: u64 = LARGE_SECTOR_SIZE * 2; + +/// Signature for [`Header`] (`'head'` as a little-endian u32). +pub const HEADER_SIGNATURE: u32 = u32::from_le_bytes(*b"head"); +/// Current VHDX format version. +pub const VERSION_1: u16 = 1; +/// Current log format version. +pub const LOG_VERSION: u16 = 0; + +// --------------------------------------------------------------------------- +// Region table +// --------------------------------------------------------------------------- + +/// Size of a region table (64 KiB). +pub const REGION_TABLE_SIZE: u64 = LARGE_SECTOR_SIZE; +/// File offset of the primary region table. +pub const REGION_TABLE_OFFSET: u64 = LARGE_SECTOR_SIZE * 3; +/// File offset of the alternate region table. +pub const ALT_REGION_TABLE_OFFSET: u64 = LARGE_SECTOR_SIZE * 4; + +/// Signature for [`RegionTableHeader`] (`'regi'` as a little-endian u32). +pub const REGION_TABLE_SIGNATURE: u32 = u32::from_le_bytes(*b"regi"); + +/// Maximum number of entries in a region table. 
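+/// (The 64 KiB table holds a 16-byte header followed by 32-byte entries, so
+/// this works out to 2047.)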
+pub const REGION_TABLE_MAX_ENTRY_COUNT: u64 = (REGION_TABLE_SIZE
+    - size_of::<RegionTableHeader>() as u64)
+    / size_of::<RegionTableEntry>() as u64;
+
+// ---------------------------------------------------------------------------
+// BAT
+// ---------------------------------------------------------------------------
+
+/// Well-known GUID identifying the BAT region.
+pub const BAT_REGION_GUID: Guid = guid!("2dc27766-f623-4200-9d64-115e9bfd4a08");
+
+/// Maximum BAT size in bytes (513 MiB).
+pub const MAXIMUM_BAT_SIZE: u64 = 513 * MB1;
+/// Maximum number of BAT entries.
+pub const MAXIMUM_BAT_ENTRY_COUNT: u64 = MAXIMUM_BAT_SIZE / size_of::<BatEntry>() as u64;
+/// Absolute maximum BAT entry count (2^30).
+pub const ABSOLUTE_MAXIMUM_BAT_ENTRY_COUNT: u64 = 1 << 30;
+/// Maximum block size (256 MiB).
+pub const MAXIMUM_BLOCK_SIZE: u64 = 256 * MB1;
+/// Maximum virtual disk size (64 TiB).
+pub const MAXIMUM_DISK_SIZE: u64 = 64 * TB1;
+
+// ---------------------------------------------------------------------------
+// Metadata
+// ---------------------------------------------------------------------------
+
+/// Well-known GUID identifying the metadata region.
+pub const METADATA_REGION_GUID: Guid = guid!("8b7ca206-4790-4b9a-b8fe-575f050f886e");
+
+/// Signature for [`MetadataTableHeader`] (`"metadata"` as a little-endian u64).
+pub const METADATA_TABLE_SIGNATURE: u64 = u64::from_le_bytes(*b"metadata");
+
+/// Size of the metadata table (64 KiB).
+pub const METADATA_TABLE_SIZE: u64 = LARGE_SECTOR_SIZE;
+
+/// Maximum number of metadata table entries.
+pub const METADATA_ENTRY_MAX_COUNT: u64 = (METADATA_TABLE_SIZE
+    - size_of::<MetadataTableHeader>() as u64)
+    / size_of::<MetadataTableEntry>() as u64;
+
+/// Maximum number of system (non-user) metadata entries.
+pub const METADATA_SYSTEM_ENTRY_MAX_COUNT: u64 = 1023;
+/// Maximum number of user metadata entries.
+pub const METADATA_USER_ENTRY_MAX_COUNT: u64 = 1024;
+
+/// Maximum size of the entire metadata region (128 MiB).
+pub const MAXIMUM_METADATA_REGION_SIZE: u64 = 128 * MB1;
+/// Maximum total metadata size per category (user or system) (40 MiB).
+pub const MAXIMUM_TOTAL_METADATA_SIZE_PER_CATEGORY: u64 = 40 * MB1;
+/// Maximum size of a single metadata item (1 MiB).
+pub const MAXIMUM_METADATA_ITEM_SIZE: u64 = MB1;
+
+// ---------------------------------------------------------------------------
+// Metadata item GUIDs
+// ---------------------------------------------------------------------------
+
+/// File parameters metadata item GUID.
+pub const FILE_PARAMETERS_ITEM_GUID: Guid = guid!("caa16737-fa36-4d43-b3b6-33f0aa44e76b");
+
+/// Virtual disk size metadata item GUID.
+pub const VIRTUAL_DISK_SIZE_ITEM_GUID: Guid = guid!("2fa54224-cd1b-4876-b211-5dbed83bf4b8");
+
+/// Page 83 data metadata item GUID.
+pub const PAGE_83_ITEM_GUID: Guid = guid!("beca12ab-b2e6-4523-93ef-c309e000c746");
+
+/// CHS (cylinder-head-sector) parameters metadata item GUID.
+pub const CHS_PARAMETERS_ITEM_GUID: Guid = guid!("da02d7bc-3d3a-423c-ac88-2a36ab21479b");
+
+/// Logical sector size metadata item GUID.
+pub const LOGICAL_SECTOR_SIZE_ITEM_GUID: Guid = guid!("8141bf1d-a96f-4709-ba47-f233a8faab5f");
+
+/// Physical sector size metadata item GUID.
+pub const PHYSICAL_SECTOR_SIZE_ITEM_GUID: Guid = guid!("cda348c7-445d-4471-9cc9-e9885251c556");
+
+/// Incomplete file metadata item GUID.
+///
+/// Present on VHDs that have been created but not yet fully initialized.
+/// Deleted when creation is complete.
+pub const INCOMPLETE_FILE_ITEM_GUID: Guid = guid!("71cc85f0-1b69-4e28-9558-c3bf83ae75d3");
+
+// ---------------------------------------------------------------------------
+// Parent locator GUIDs
+// ---------------------------------------------------------------------------
+
+/// Parent locator metadata item GUID.
+pub const PARENT_LOCATOR_ITEM_GUID: Guid = guid!("a8d35f2d-b30b-454d-abf7-d3d84834ab0c");
+
+/// Parent locator type GUID for VHDX parent references.
+pub const PARENT_LOCATOR_VHDX_TYPE_GUID: Guid = guid!("b04aefb7-d19e-4a81-b789-25b8e9445913");
+
+/// Maximum number of key-value pairs in a parent locator.
+pub const PARENT_LOCATOR_MAXIMUM_KEY_VALUE_COUNT: u16 = 256;
+
+// ---------------------------------------------------------------------------
+// PMEM label storage area
+// ---------------------------------------------------------------------------
+
+/// PMEM label storage area metadata item GUID.
+pub const PMEM_LABEL_STORAGE_AREA_ITEM_GUID: Guid = guid!("10e1ae8a-4b7e-4169-a40f-cd70de928393");
+
+/// Version 1 of the PMEM label storage area header.
+pub const PMEM_LABEL_STORAGE_AREA_VERSION_1: u16 = 1;
+
+// ---------------------------------------------------------------------------
+// Log signatures
+// ---------------------------------------------------------------------------
+
+/// Signature for [`LogEntryHeader`] (`'loge'` as a little-endian u32).
+pub const LOG_ENTRY_HEADER_SIGNATURE: u32 = u32::from_le_bytes(*b"loge");
+/// Signature for a data log descriptor (`'desc'` as a little-endian u32).
+pub const LOG_DESCRIPTOR_DATA_SIGNATURE: u32 = u32::from_le_bytes(*b"desc");
+/// Signature for a zero log descriptor (`'zero'` as a little-endian u32).
+pub const LOG_DESCRIPTOR_ZERO_SIGNATURE: u32 = u32::from_le_bytes(*b"zero");
+/// Signature for [`LogDataSector`] (`'data'` as a little-endian u32).
+pub const LOG_DATA_SECTOR_SIGNATURE: u32 = u32::from_le_bytes(*b"data");
+
+// ---------------------------------------------------------------------------
+// File identifier signature
+// ---------------------------------------------------------------------------
+
+/// Default block size (2 MiB).
+pub const DEFAULT_BLOCK_SIZE: u32 = 2 * MB1 as u32;
+
+/// Default logical/physical sector size (512 bytes).
+pub const DEFAULT_SECTOR_SIZE: u32 = 512;
+
+/// Default metadata region size (1 MiB).
+pub const DEFAULT_METADATA_REGION_SIZE: u32 = MB1 as u32;
+
+/// Default log region size (1 MiB).
+pub const DEFAULT_LOG_SIZE: u32 = MB1 as u32;
+
+/// Cache page size (4 KiB) — the granularity of BAT page I/O.
+pub const CACHE_PAGE_SIZE: u64 = KB4;
+
+/// Number of BAT entries per cache page (4096 / 8 = 512).
+pub const ENTRIES_PER_BAT_PAGE: u64 = CACHE_PAGE_SIZE / size_of::<BatEntry>() as u64;
+
+/// Maximum hosting sector size (64 KiB) — largest sector the metadata
+/// table items should fit in.
+pub const MAX_HOSTING_SECTOR_SIZE: u64 = KB64;
+
+/// Signature for [`FileIdentifier`] (`"vhdxfile"` as a little-endian u64).
+pub const FILE_IDENTIFIER_SIGNATURE: u64 = u64::from_le_bytes(*b"vhdxfile");
+
+// ===========================================================================
+// On-disk structures
+// ===========================================================================
+
+/// VHDX file identifier — the first structure at offset 0 in a VHDX file.
+///
+/// Contains the file signature and a UTF-16 creator string.
+#[repr(C)]
+#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)]
+pub struct FileIdentifier {
+    /// Must be [`FILE_IDENTIFIER_SIGNATURE`].
+ pub signature: u64, + /// UTF-16LE creator string (informational, not validated by parsers). + pub creator: [u16; 256], +} + +/// VHDX header — one of two dual headers located at [`HEADER_OFFSET_1`] +/// and [`HEADER_OFFSET_2`]. +/// +/// The header with the higher valid sequence number is the current header. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct Header { + /// Must be [`HEADER_SIGNATURE`]. + pub signature: u32, + /// CRC-32C checksum of the entire 4 KiB header (with this field zeroed). + pub checksum: u32, + /// Monotonically increasing sequence number. + pub sequence_number: u64, + /// GUID changed on every file-level write (metadata or data structure writes). + pub file_write_guid: Guid, + /// GUID changed on every virtual-disk data write. + pub data_write_guid: Guid, + /// GUID identifying the active log. Zero GUID means no active log. + pub log_guid: Guid, + /// Log format version (currently [`LOG_VERSION`]). + pub log_version: u16, + /// File format version (currently [`VERSION_1`]). + pub version: u16, + /// Length of the log region in bytes. + pub log_length: u32, + /// File offset of the log region. + pub log_offset: u64, +} + +/// Region table header — precedes an array of [`RegionTableEntry`] values. +/// +/// Two copies exist at [`REGION_TABLE_OFFSET`] and [`ALT_REGION_TABLE_OFFSET`]. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct RegionTableHeader { + /// Must be [`REGION_TABLE_SIGNATURE`]. + pub signature: u32, + /// CRC-32C checksum of the entire 64 KiB region table. + pub checksum: u32, + /// Number of valid entries following this header. + pub entry_count: u32, + /// Reserved, must be zero. + pub reserved: u32, +} + +/// A single entry in the region table. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct RegionTableEntry { + /// GUID identifying the region type (e.g. [`BAT_REGION_GUID`]). + pub guid: Guid, + /// File offset of the region. + pub file_offset: u64, + /// Length of the region in bytes. + pub length: u32, + /// Region table entry flags. + pub flags: RegionTableEntryFlags, +} + +/// Flags for a [`RegionTableEntry`]. +#[bitfield(u32)] +#[derive(IntoBytes, Immutable, KnownLayout, FromBytes, PartialEq, Eq)] +pub struct RegionTableEntryFlags { + /// Whether this region is required for the file to be valid. + pub required: bool, + /// Reserved bits. + #[bits(31)] + _reserved: u32, +} + +/// BAT (Block Allocation Table) entry. +/// +/// Packs a 3-bit block state and a 44-bit file offset (in MiB units) +/// into a single `u64`. +#[bitfield(u64)] +#[derive(IntoBytes, Immutable, KnownLayout, FromBytes, PartialEq, Eq)] +pub struct BatEntry { + /// Block state (see [`BatEntryState`]). + #[bits(3)] + pub state: u8, + /// Reserved bits. + #[bits(17)] + _reserved: u32, + /// File offset in MiB units (bits 20..63). + #[bits(44)] + pub file_offset_mb: u64, +} + +impl BatEntry { + /// Computes the full file offset in bytes. + pub fn file_offset(&self) -> u64 { + self.file_offset_mb() << 20 + } +} + +/// Block states stored in the low 3 bits of a [`BatEntry`]. +#[repr(u8)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum BatEntryState { + /// Block is not present. For data blocks: read from parent or return + /// zeros. For sector bitmap blocks: treat all sectors as not present. + NotPresent = 0, + /// Block has undefined content. Reading returns an error. + Undefined = 1, + /// Block is explicitly zero-filled. 
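+    /// Reads return zeros without consulting a parent, and the block is not
+    /// backed by file data.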
+ Zero = 2, + /// Block is unmapped (trimmed). Content is undefined. + Unmapped = 3, + // Values 4 and 5 are unused / reserved. + /// Block is fully present and backed by file data. + FullyPresent = 6, + /// Block is partially present. A sector bitmap describes which + /// sectors contain data. + PartiallyPresent = 7, +} + +impl BatEntryState { + /// Attempt to convert a raw `u8` state value to a [`BatEntryState`]. + pub fn from_raw(value: u8) -> Option { + match value { + 0 => Some(Self::NotPresent), + 1 => Some(Self::Undefined), + 2 => Some(Self::Zero), + 3 => Some(Self::Unmapped), + 6 => Some(Self::FullyPresent), + 7 => Some(Self::PartiallyPresent), + _ => None, + } + } + + /// Whether this state counts as "allocated" (backed by file space). + pub fn is_allocated(self) -> bool { + matches!(self, Self::FullyPresent | Self::PartiallyPresent) + } +} + +/// Metadata table header. +/// +/// Located at the start of the metadata region, followed by up to +/// [`METADATA_ENTRY_MAX_COUNT`] entries. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct MetadataTableHeader { + /// Must be [`METADATA_TABLE_SIGNATURE`]. + pub signature: u64, + /// Reserved, must be zero. + pub reserved: u16, + /// Number of valid entries following this header. + pub entry_count: u16, + /// Reserved, must be zero. + pub reserved2: [u32; 5], +} + +/// A single entry in the metadata table. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct MetadataTableEntry { + /// GUID identifying the metadata item. + pub item_id: Guid, + /// Offset of the item data relative to the start of the metadata region. + pub offset: u32, + /// Length of the item data in bytes. + pub length: u32, + /// Metadata entry flags. + pub flags: MetadataTableEntryFlags, + /// Reserved, must be zero. + pub reserved2: u32, +} + +/// Flags for a [`MetadataTableEntry`]. +#[bitfield(u32)] +#[derive(IntoBytes, Immutable, KnownLayout, FromBytes, PartialEq, Eq)] +pub struct MetadataTableEntryFlags { + /// Whether this is a user metadata entry. + pub is_user: bool, + /// Whether this is a virtual disk metadata entry. + pub is_virtual_disk: bool, + /// Whether this metadata entry is required. + pub is_required: bool, + /// Reserved bits. + #[bits(29)] + _reserved: u32, +} + +/// File parameters metadata item. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct FileParameters { + /// Block size in bytes. + pub block_size: u32, + /// File parameters flags. + pub flags: FileParametersFlags, +} + +/// Flags for [`FileParameters`]. +#[bitfield(u32)] +#[derive(IntoBytes, Immutable, KnownLayout, FromBytes, PartialEq, Eq)] +pub struct FileParametersFlags { + /// Whether blocks are left allocated (fixed VHD). + pub leave_blocks_allocated: bool, + /// Whether the disk has a parent (differencing disk). + pub has_parent: bool, + /// Reserved bits. + #[bits(30)] + _reserved: u32, +} + +/// CHS (cylinder-head-sector) parameters metadata item. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct ChsParameters { + /// Number of heads per cylinder. + pub heads_per_cylinder: u32, + /// Number of sectors per track. + pub sectors_per_track: u32, +} + +/// Parent locator header. +/// +/// Precedes an array of [`ParentLocatorEntry`] values within the parent +/// locator metadata item. 
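+///
+/// For VHDX parents the entries carry well-known keys such as
+/// `relative_path` and `absolute_win32_path`; see the
+/// `PARENT_LOCATOR_KEY_*` constants later in this module.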
+#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct ParentLocatorHeader { + /// GUID identifying the locator type (e.g. [`PARENT_LOCATOR_VHDX_TYPE_GUID`]). + pub locator_type: Guid, + /// Reserved, must be zero. + pub reserved: u16, + /// Number of key-value entries following this header. + pub key_value_count: u16, +} + +/// A single key-value entry in a parent locator. +/// +/// Keys and values are stored as UTF-16LE strings at the indicated offsets +/// within the parent locator metadata item. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct ParentLocatorEntry { + /// Byte offset of the key string (relative to the locator item start). + pub key_offset: u32, + /// Byte offset of the value string (relative to the locator item start). + pub value_offset: u32, + /// Length of the key string in bytes. + pub key_length: u16, + /// Length of the value string in bytes. + pub value_length: u16, +} + +/// Log entry header. +/// +/// Each log entry starts with this header, followed by an array of +/// log descriptors (data or zero). +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct LogEntryHeader { + /// Must be [`LOG_ENTRY_HEADER_SIGNATURE`]. + pub signature: u32, + /// CRC-32C checksum of the entire log entry (with this field zeroed). + pub checksum: u32, + /// Total length of this log entry in bytes (including header, descriptors, + /// and data sectors). + pub entry_length: u32, + /// Byte offset of the oldest active log entry (the "tail"). + pub tail: u32, + /// Sequence number of this log entry. + pub sequence_number: u64, + /// Number of descriptors in this entry. + pub descriptor_count: u32, + /// Reserved, must be zero. + pub reserved: u32, + /// Must match the log GUID in the active header. + pub log_guid: Guid, + /// File size after all entries up to and including this one are applied. + pub flushed_file_offset: u64, + /// File size required to write this entry's data. + pub last_file_offset: u64, +} + +/// Log data descriptor — describes a range of data to write from log +/// data sectors. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct LogDataDescriptor { + /// Must be [`LOG_DESCRIPTOR_DATA_SIGNATURE`]. + pub signature: u32, + /// Number of trailing bytes from the previous 4 KiB sector that begin + /// this data region. + pub trailing_bytes: u32, + /// Number of leading bytes from the next 4 KiB sector that end this + /// data region. + pub leading_bytes: u64, + /// File offset where this data should be written. + pub file_offset: u64, + /// Sequence number (must match the log entry's sequence number). + pub sequence_number: u64, +} + +/// Log zero descriptor — describes a range of the file that should be +/// zero-filled during replay. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct LogZeroDescriptor { + /// Must be [`LOG_DESCRIPTOR_ZERO_SIGNATURE`]. + pub signature: u32, + /// Reserved, must be zero. + pub reserved: u32, + /// Length of the zero-filled range in bytes. + pub length: u64, + /// File offset where zeroing should begin. + pub file_offset: u64, + /// Sequence number (must match the log entry's sequence number). + pub sequence_number: u64, +} + +/// A single 4 KiB data sector within a log entry (following the +/// descriptors). 
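+///
+/// The sequence number is split between the first and last four bytes of the
+/// sector so that a torn (partially written) sector can be detected during
+/// log replay.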
+#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct LogDataSector { + /// Must be [`LOG_DATA_SECTOR_SIGNATURE`]. + pub signature: u32, + /// High 32 bits of the sequence number. + pub sequence_high: u32, + /// Payload data (4084 bytes). + pub data: [u8; 4084], + /// Low 32 bits of the sequence number. + pub sequence_low: u32, +} + +/// PMEM label storage area header. +/// +/// Describes label storage for NVDIMM-backed VHDX files. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct PmemLabelStorageAreaHeader { + /// Version of this header (currently [`PMEM_LABEL_STORAGE_AREA_VERSION_1`]). + pub version: u16, + /// Reserved, must be zero. + pub reserved: u16, + /// GUID identifying the address abstraction type. + pub address_abstraction_type: Guid, + /// Byte offset of the label data (relative to this item). + pub data_offset: u32, + /// Length of the label data in bytes. + pub data_length: u32, +} + +// =========================================================================== +// Checksum helpers +// =========================================================================== + +/// Compute the CRC-32C checksum of `data`, treating the 4 bytes at +/// `checksum_offset` as zero during computation. +/// +/// This is used for headers and region tables where the checksum field +/// itself must be excluded from the CRC calculation. +pub fn compute_checksum(data: &[u8], checksum_offset: usize) -> u32 { + let mut crc = crc32c::crc32c(&data[..checksum_offset]); + crc = crc32c::crc32c_append(crc, &[0; 4]); + crc32c::crc32c_append(crc, &data[checksum_offset + 4..]) +} + +/// Validate that the CRC-32C checksum stored in `data` at `checksum_offset` +/// matches the computed value. +pub fn validate_checksum(data: &[u8], checksum_offset: usize) -> bool { + let stored = u32::from_le_bytes( + data[checksum_offset..checksum_offset + 4] + .try_into() + .unwrap(), + ); + let computed = compute_checksum(data, checksum_offset); + stored == computed +} + +// =========================================================================== +// Well-known parent locator key names +// =========================================================================== + +/// Parent linkage key name (UTF-16LE). +pub const PARENT_LOCATOR_KEY_PARENT_LINKAGE: &str = "parent_linkage"; +/// Alternative parent linkage key name (UTF-16LE). +pub const PARENT_LOCATOR_KEY_ALT_PARENT_LINKAGE: &str = "parent_linkage2"; +/// Relative path key name (UTF-16LE). +pub const PARENT_LOCATOR_KEY_RELATIVE_PATH: &str = "relative_path"; +/// Absolute Win32 path key name (UTF-16LE). +pub const PARENT_LOCATOR_KEY_ABSOLUTE_PATH: &str = "absolute_win32_path"; +/// Volume path key name (UTF-16LE). +pub const PARENT_LOCATOR_KEY_VOLUME_PATH: &str = "volume_path"; + +// =========================================================================== +// Tests +// =========================================================================== + +// Compile-time layout assertions. +const _: () = { + // FileIdentifier: 8 bytes signature + 256 * 2 bytes creator = 520 bytes + assert!(size_of::() == 8 + 256 * 2); + // Header: Signature(4) + Checksum(4) + SequenceNumber(8) + + // FileWriteGuid(16) + DataWriteGuid(16) + LogGuid(16) + + // LogVersion(2) + Version(2) + LogLength(4) + LogOffset(8) = 80 + assert!(size_of::
() == 80); + assert!(size_of::() == 16); + // RegionTableEntry: GUID(16) + FileOffset(8) + Length(4) + Flags(4) = 32 + assert!(size_of::() == 32); + assert!(size_of::() == 32); + assert!(size_of::() == 32); + assert!(size_of::() == 64); + assert!(size_of::() == 32); + assert!(size_of::() == 32); + assert!(size_of::() == KB4 as usize); + // System + user entries should equal the maximum entry count. + assert!( + METADATA_SYSTEM_ENTRY_MAX_COUNT + METADATA_USER_ENTRY_MAX_COUNT == METADATA_ENTRY_MAX_COUNT + ); +}; + +#[cfg(test)] +mod tests { + use super::*; + use zerocopy::FromZeros; + + #[test] + fn bat_entry_accessors() { + let entry = BatEntry::new().with_state(6).with_file_offset_mb(2); + assert_eq!(entry.state(), 6); + assert_eq!(entry.file_offset_mb(), 2); + assert_eq!(entry.file_offset(), 2 * MB1); + } + + #[test] + fn bat_entry_state_roundtrip() { + for &(raw, expected) in &[ + (0, BatEntryState::NotPresent), + (1, BatEntryState::Undefined), + (2, BatEntryState::Zero), + (3, BatEntryState::Unmapped), + (6, BatEntryState::FullyPresent), + (7, BatEntryState::PartiallyPresent), + ] { + assert_eq!(BatEntryState::from_raw(raw), Some(expected)); + } + // Values 4, 5 are undefined. + assert_eq!(BatEntryState::from_raw(4), None); + assert_eq!(BatEntryState::from_raw(5), None); + } + + #[test] + fn file_parameters_flags() { + let flags = FileParametersFlags::new() + .with_leave_blocks_allocated(true) + .with_has_parent(true); + let params = FileParameters { + block_size: 0, + flags, + }; + assert!(params.flags.leave_blocks_allocated()); + assert!(params.flags.has_parent()); + + let params2 = FileParameters { + block_size: 0, + flags: FileParametersFlags::new(), + }; + assert!(!params2.flags.leave_blocks_allocated()); + assert!(!params2.flags.has_parent()); + } + + #[test] + fn region_table_entry_flags() { + let entry = RegionTableEntry { + guid: Guid::ZERO, + file_offset: 0, + length: 0, + flags: RegionTableEntryFlags::new().with_required(true), + }; + assert!(entry.flags.required()); + + let entry2 = RegionTableEntry { + guid: Guid::ZERO, + file_offset: 0, + length: 0, + flags: RegionTableEntryFlags::new(), + }; + assert!(!entry2.flags.required()); + } + + #[test] + fn checksum_roundtrip() { + // Create a fake header-sized buffer and verify checksum round-trip. + let mut data = vec![0u8; HEADER_SIZE as usize]; + // Write the header signature. + data[0..4].copy_from_slice(&HEADER_SIGNATURE.to_le_bytes()); + // Checksum field is at offset 4. 
+ let checksum_offset = 4; + let crc = compute_checksum(&data, checksum_offset); + data[checksum_offset..checksum_offset + 4].copy_from_slice(&crc.to_le_bytes()); + assert!(validate_checksum(&data, checksum_offset)); + } + + #[test] + fn zero_copy_roundtrip_header() { + let mut header = Header::new_zeroed(); + header.signature = HEADER_SIGNATURE; + header.version = VERSION_1; + header.sequence_number = 42; + + let bytes = header.as_bytes(); + let parsed = Header::read_from_bytes(bytes).unwrap(); + assert_eq!(parsed.signature, HEADER_SIGNATURE); + assert_eq!(parsed.version, VERSION_1); + assert_eq!(parsed.sequence_number, 42); + } + + #[test] + fn zero_copy_roundtrip_bat_entry() { + let entry = BatEntry::new().with_state(6).with_file_offset_mb(100); + let bytes = entry.as_bytes(); + let parsed = BatEntry::read_from_bytes(bytes).unwrap(); + assert_eq!(parsed.state(), 6); + assert_eq!(parsed.file_offset_mb(), 100); + } + + #[test] + fn metadata_table_entry_flags() { + let flags = MetadataTableEntryFlags::new() + .with_is_user(true) + .with_is_virtual_disk(true) + .with_is_required(true); + let entry = MetadataTableEntry { + item_id: Guid::ZERO, + offset: 0, + length: 0, + flags, + reserved2: 0, + }; + assert!(entry.flags.is_user()); + assert!(entry.flags.is_virtual_disk()); + assert!(entry.flags.is_required()); + } +} diff --git a/vm/devices/storage/vhdx/src/header.rs b/vm/devices/storage/vhdx/src/header.rs new file mode 100644 index 0000000000..6c0daf309c --- /dev/null +++ b/vm/devices/storage/vhdx/src/header.rs @@ -0,0 +1,520 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Dual header parsing, validation, and write-mode management for VHDX files. +//! +//! Reads both VHDX headers, validates their signatures and CRC-32C checksums, +//! selects the active header (higher sequence number), and validates log +//! region parameters. +//! +//! Also provides [`HeaderState`], which serializes all header writes behind +//! a `futures::lock::Mutex` and exposes the current [`WriteMode`] via an +//! `AtomicU8` for lock-free hot-path checks. + +use crate::AsyncFile; +use crate::error::CorruptionType; +use crate::error::OpenError; +use crate::error::OpenErrorInner; +use crate::flush::FlushSequencer; +use crate::format; +use crate::format::Header; +use guid::Guid; +use std::sync::atomic::AtomicU8; +use std::sync::atomic::Ordering; +use zerocopy::FromBytes; +use zerocopy::FromZeros; +use zerocopy::IntoBytes; + +/// Parsed and validated header data extracted from a VHDX file. +pub(crate) struct ParsedHeader { + /// The active header's sequence number. + pub sequence_number: u64, + /// GUID changed on every file-level write. + pub file_write_guid: Guid, + /// GUID changed on every virtual-disk data write. + pub data_write_guid: Guid, + /// GUID identifying the active log. Zero means no active log. + pub log_guid: Guid, + /// File offset of the log region. + pub log_offset: u64, + /// Length of the log region in bytes. + pub log_length: u32, + /// True if header 1 was chosen as the active header. + pub first_header_current: bool, +} + +/// Read a single 4 KiB header from the file and validate its signature +/// and CRC-32C checksum. Returns `Some(header)` if valid, `None` otherwise. +async fn read_and_validate_header( + file: &impl AsyncFile, + offset: u64, +) -> Result, OpenError> { + let buf = file.alloc_buffer(format::HEADER_SIZE as usize); + let buf = file + .read_into(offset, buf) + .await + .map_err(OpenErrorInner::Io)?; + + // Check signature. 
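+    // A failed parse or wrong signature is not treated as an error by
+    // itself: this slot is simply reported as invalid (`None`) so the
+    // caller can fall back to the other header slot.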
+ let header = match Header::read_from_prefix(buf.as_ref()) { + Ok((h, _)) => h, + Err(_) => return Ok(None), + }; + if header.signature != format::HEADER_SIGNATURE { + return Ok(None); + } + + // Validate CRC-32C checksum (checksum field is at byte offset 4). + if !format::validate_checksum(buf.as_ref(), 4) { + return Ok(None); + } + + Ok(Some(header.clone())) +} + +/// Read both headers from the file, validate them, and return the active one. +/// +/// If both headers are valid, the one with the higher sequence number wins. +/// If only one is valid, it is used. If neither is valid, returns an error. +pub(crate) async fn parse_headers( + file: &impl AsyncFile, + file_length: u64, +) -> Result { + let header1 = read_and_validate_header(file, format::HEADER_OFFSET_1).await?; + let header2 = read_and_validate_header(file, format::HEADER_OFFSET_2).await?; + + // Choose the active header. + let (header, first_header_current) = match (&header1, &header2) { + (Some(h1), Some(h2)) => { + if h1.sequence_number >= h2.sequence_number { + (h1, true) + } else { + (h2, false) + } + } + (Some(h1), None) => (h1, true), + (None, Some(h2)) => (h2, false), + (None, None) => return Err((CorruptionType::NoValidHeaders).into()), + }; + + // Validate version. + if header.version != format::VERSION_1 { + return Err((CorruptionType::UnsupportedVersion).into()); + } + + // If log GUID is non-zero, validate log version. + if header.log_guid != Guid::ZERO && header.log_version != format::LOG_VERSION { + return Err((CorruptionType::UnsupportedLogVersion).into()); + } + + // Validate log offset and length alignment. + if !header.log_offset.is_multiple_of(format::REGION_ALIGNMENT) + || !(header.log_length as u64).is_multiple_of(format::REGION_ALIGNMENT) + { + return Err((CorruptionType::InvalidLogOffsetOrLength).into()); + } + + let (log_offset, log_length) = if header.log_length == 0 { + // Log is empty — log GUID must also be zero. + if header.log_guid != Guid::ZERO { + return Err((CorruptionType::MissingLogHasGuid).into()); + } + (0, 0) + } else { + // Log is present — validate offset and bounds. + if header.log_offset < format::HEADER_AREA_SIZE { + return Err((CorruptionType::InvalidLogOffset).into()); + } + if header.log_offset.saturating_add(header.log_length as u64) > file_length { + return Err((CorruptionType::LogBeyondEndOfFile).into()); + } + (header.log_offset, header.log_length) + }; + + Ok(ParsedHeader { + sequence_number: header.sequence_number, + file_write_guid: header.file_write_guid, + data_write_guid: header.data_write_guid, + log_guid: header.log_guid, + log_offset, + log_length, + first_header_current, + }) +} + +/// Serialize a VHDX header to a 4 KiB buffer with CRC and determine +/// the target offset (non-current header slot). +/// +/// Returns `(buffer, file_offset)` ready for `write_from`. 
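+///
+/// A minimal usage sketch (the variable names here are illustrative,
+/// chosen by the caller rather than defined in this module):
+///
+/// ```ignore
+/// let (buf, offset) = serialize_header(
+///     &file,
+///     sequence_number,
+///     file_write_guid,
+///     data_write_guid,
+///     Guid::ZERO, // no active log
+///     log_offset,
+///     log_length,
+///     first_header_current,
+/// );
+/// file.write_from(offset, buf).await?;
+/// ```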
+pub(crate) fn serialize_header( + file: &F, + sequence_number: u64, + file_write_guid: Guid, + data_write_guid: Guid, + log_guid: Guid, + log_offset: u64, + log_length: u32, + first_header_current: bool, +) -> (F::Buffer, u64) { + let mut header = Header::new_zeroed(); + header.signature = format::HEADER_SIGNATURE; + header.sequence_number = sequence_number; + header.file_write_guid = file_write_guid; + header.data_write_guid = data_write_guid; + header.log_guid = log_guid; + header.log_version = format::LOG_VERSION; + header.version = format::VERSION_1; + header.log_length = log_length; + header.log_offset = log_offset; + header.checksum = 0; + + let mut buf = file.alloc_buffer(format::HEADER_SIZE as usize); + let hdr_bytes = header.as_bytes(); + buf.as_mut()[..hdr_bytes.len()].copy_from_slice(hdr_bytes); + let crc = format::compute_checksum(buf.as_ref(), 4); + buf.as_mut()[4..8].copy_from_slice(&crc.to_le_bytes()); + + let offset = if first_header_current { + format::HEADER_OFFSET_2 + } else { + format::HEADER_OFFSET_1 + }; + + (buf, offset) +} + +/// The kind of modification being made to the VHDX file. Controls which +/// GUIDs are updated in the header before the first write. +/// +/// Values are ordered: `FileWritable < DataWritable`. Once `DataWritable` +/// is reached, `FileWritable` is a no-op. The `#[repr(u8)]` layout +/// matches the `AtomicU8` stored in [`HeaderState`]. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +#[repr(u8)] +pub(crate) enum WriteMode { + /// The file is being modified (metadata only, e.g. resize/compact). + /// Updates FileWriteGuid. + FileWritable = 1, + /// User-visible virtual disk data is being modified. + /// Updates both FileWriteGuid and DataWriteGuid. + DataWritable = 2, +} + +/// Value used in [`HeaderState::write_mode`] when no write has occurred yet. +const WRITE_MODE_NONE: u8 = 0; + +/// Mutable header state, serialized behind a `futures::lock::Mutex`. +/// +/// All header writes go through [`HeaderState::write()`], which holds the +/// async mutex across the serialize→write→flush→flip sequence, preventing +/// concurrent header writes from interleaving. +/// +/// The current [`WriteMode`] is also published to an `AtomicU8` so that +/// the hot path (`enable_write_mode`) can check it with a single atomic +/// load and avoid taking any lock. +pub(crate) struct HeaderState { + /// Current write mode, published atomically for lock-free fast-path + /// checks. Updated *after* the header is on stable storage. + write_mode: AtomicU8, + /// Data-write GUID, stored separately for the sync public accessor + /// `VhdxFile::data_write_guid()`. Updated under the async mutex, + /// read via `parking_lot::Mutex` (or `AtomicU64` pair if needed). + /// Here we use `parking_lot::Mutex` since it's a brief, non-contended + /// read. + data_write_guid: parking_lot::Mutex, + /// File offset of the log region (immutable after open). + log_offset: u64, + /// Length of the log region in bytes (immutable after open). + log_length: u32, + /// Async mutex serializing all header writes. + inner: futures::lock::Mutex, +} + +/// Fields protected by the async mutex inside [`HeaderState`]. +struct HeaderStateInner { + /// Current header sequence number (bumped on every write). + sequence_number: u64, + /// GUID changed on every file-level write. + file_write_guid: Guid, + /// GUID changed on every virtual-disk data write. + data_write_guid: Guid, + /// Active log GUID. Zero when no log task is running. 
+ log_guid: Guid, + /// True if header slot 1 (offset 64 KiB) is the current header. + first_header_current: bool, +} + +impl HeaderState { + /// Create a new `HeaderState` from a parsed header. + pub fn new(header: &ParsedHeader) -> Self { + Self { + write_mode: AtomicU8::new(WRITE_MODE_NONE), + data_write_guid: parking_lot::Mutex::new(header.data_write_guid), + log_offset: header.log_offset, + log_length: header.log_length, + inner: futures::lock::Mutex::new(HeaderStateInner { + sequence_number: header.sequence_number, + file_write_guid: header.file_write_guid, + data_write_guid: header.data_write_guid, + log_guid: header.log_guid, + first_header_current: header.first_header_current, + }), + } + } + + /// Lock-free check: is the current write mode ≥ `mode`? + pub fn is_mode_enabled(&self, mode: WriteMode) -> bool { + self.write_mode.load(Ordering::Acquire) >= mode as u8 + } + + /// Read the current data-write GUID (sync, brief lock). + pub fn data_write_guid(&self) -> Guid { + *self.data_write_guid.lock() + } + + /// Get the log region offset and length (immutable after open). + pub fn log_region(&self) -> (u64, u32) { + (self.log_offset, self.log_length) + } + + /// Read the current sequence number. Requires the async lock. + #[cfg(test)] + pub async fn sequence_number(&self) -> u64 { + self.inner.lock().await.sequence_number + } + + /// Read the current write mode (for test assertions). + #[cfg(test)] + pub fn write_mode(&self) -> Option { + match self.write_mode.load(Ordering::Acquire) { + 0 => None, + 1 => Some(WriteMode::FileWritable), + 2 => Some(WriteMode::DataWritable), + _ => unreachable!(), + } + } + + /// Ensure the file is in at least write mode `mode`. + /// + /// Hot path (mode already enabled): single atomic load, no lock. + /// + /// Cold path (mode transition): acquires the async mutex, generates + /// new GUIDs, writes the header to the non-current slot, flushes, + /// flips the active slot, then publishes the new mode atomically. + /// + /// Safe to call concurrently — the async mutex serializes transitions. + pub async fn enable_write_mode( + &self, + mode: WriteMode, + file: &impl AsyncFile, + flush_sequencer: Option<&FlushSequencer>, + ) -> Result<(), std::io::Error> { + // Hot path: single atomic load. + if self.is_mode_enabled(mode) { + return Ok(()); + } + + // Cold path: serialize under async mutex. + let mut inner = self.inner.lock().await; + + // Double-check under lock (another caller may have raced). + if self.write_mode.load(Ordering::Relaxed) >= mode as u8 { + return Ok(()); + } + + // Generate new GUIDs. + inner.file_write_guid = Guid::new_random(); + if mode >= WriteMode::DataWritable { + inner.data_write_guid = Guid::new_random(); + *self.data_write_guid.lock() = inner.data_write_guid; + } + + // Write header, flush, flip slot. + self.write_header(&mut inner, file, flush_sequencer).await?; + + // Publish the mode change — only after the header is on stable storage. + self.write_mode.store(mode as u8, Ordering::Release); + + Ok(()) + } + + /// Set the log GUID and write a header update. Used by `open_writable` + /// to mark the file as dirty before spawning the log task. + pub async fn set_log_guid( + &self, + log_guid: Guid, + file: &impl AsyncFile, + flush_sequencer: Option<&FlushSequencer>, + ) -> Result<(), std::io::Error> { + let mut inner = self.inner.lock().await; + inner.log_guid = log_guid; + self.write_header(&mut inner, file, flush_sequencer).await + } + + /// Clear the log GUID (set to ZERO) and write a clean header. 
+ /// Used by `close()` after the log is fully drained. + pub async fn clear_log_guid( + &self, + file: &impl AsyncFile, + flush_sequencer: Option<&FlushSequencer>, + ) -> Result<(), std::io::Error> { + let mut inner = self.inner.lock().await; + inner.log_guid = Guid::ZERO; + self.write_header(&mut inner, file, flush_sequencer).await + } + + /// Bump the sequence number, serialize the header, write to the + /// non-current slot, flush, and flip the active slot. + /// + /// Caller must hold the async mutex (`inner` is `&mut`). + async fn write_header( + &self, + inner: &mut HeaderStateInner, + file: &impl AsyncFile, + flush_sequencer: Option<&FlushSequencer>, + ) -> Result<(), std::io::Error> { + inner.sequence_number += 1; + let (buf, offset) = serialize_header( + file, + inner.sequence_number, + inner.file_write_guid, + inner.data_write_guid, + inner.log_guid, + self.log_offset, + self.log_length, + inner.first_header_current, + ); + + file.write_from(offset, buf).await?; + + if let Some(fs) = flush_sequencer { + fs.flush(file).await?; + } else { + file.flush().await?; + } + + inner.first_header_current = !inner.first_header_current; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + + #[async_test] + async fn parse_valid_dual_headers() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + let file_length = file.file_size().await.unwrap(); + let parsed = parse_headers(&file, file_length).await.unwrap(); + + // Header 2 has sequence_number 1. Header 1 has 0. So header 2 wins. + assert_eq!(parsed.sequence_number, 1); + assert!(!parsed.first_header_current); + assert_eq!(parsed.log_guid, Guid::ZERO); + assert_ne!(parsed.file_write_guid, Guid::ZERO); + assert_ne!(parsed.data_write_guid, Guid::ZERO); + } + + #[async_test] + async fn parse_higher_sequence_wins() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let file_length = file.file_size().await.unwrap(); + + // Corrupt header 1's CRC by flipping a byte. + let mut buf = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_1, &mut buf) + .await + .unwrap(); + buf[10] ^= 0xFF; + file.write_at(format::HEADER_OFFSET_1, &buf).await.unwrap(); + + let parsed = parse_headers(&file, file_length).await.unwrap(); + // Header 1 is invalid, so header 2 is used. + assert!(!parsed.first_header_current); + assert_eq!(parsed.sequence_number, 1); + } + + #[async_test] + async fn parse_both_headers_corrupt() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let file_length = file.file_size().await.unwrap(); + + // Corrupt both headers. + let mut buf1 = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_1, &mut buf1) + .await + .unwrap(); + buf1[10] ^= 0xFF; + file.write_at(format::HEADER_OFFSET_1, &buf1).await.unwrap(); + + let mut buf2 = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_2, &mut buf2) + .await + .unwrap(); + buf2[10] ^= 0xFF; + file.write_at(format::HEADER_OFFSET_2, &buf2).await.unwrap(); + + let result = parse_headers(&file, file_length).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::NoValidHeaders + ))) + )); + } + + #[async_test] + async fn parse_one_valid_header() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let file_length = file.file_size().await.unwrap(); + + // Corrupt header 2's CRC. 
+ let mut buf = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_2, &mut buf) + .await + .unwrap(); + buf[10] ^= 0xFF; + file.write_at(format::HEADER_OFFSET_2, &buf).await.unwrap(); + + let parsed = parse_headers(&file, file_length).await.unwrap(); + assert!(parsed.first_header_current); + assert_eq!(parsed.sequence_number, 0); + } + + #[async_test] + async fn parse_log_validation() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let file_length = file.file_size().await.unwrap(); + + // Manually construct a header with valid signature but misaligned log. + let mut buf = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_1, &mut buf) + .await + .unwrap(); + + let mut header = Header::read_from_prefix(&buf).unwrap().0.clone(); + header.log_offset = 12345; // Not aligned to REGION_ALIGNMENT. + header.log_length = format::REGION_ALIGNMENT as u32; + header.sequence_number = 100; // Make this the winning header. + header.checksum = 0; + + // Write header bytes, recompute CRC. + let header_bytes = IntoBytes::as_bytes(&header); + buf[..header_bytes.len()].copy_from_slice(header_bytes); + let crc = format::compute_checksum(&buf, 4); + buf[4..8].copy_from_slice(&crc.to_le_bytes()); + file.write_at(format::HEADER_OFFSET_1, &buf).await.unwrap(); + + let result = parse_headers(&file, file_length).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::InvalidLogOffsetOrLength + ))) + )); + } +} diff --git a/vm/devices/storage/vhdx/src/io.rs b/vm/devices/storage/vhdx/src/io.rs new file mode 100644 index 0000000000..82da345e80 --- /dev/null +++ b/vm/devices/storage/vhdx/src/io.rs @@ -0,0 +1,979 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! VHDX read/write I/O resolution and guards. +//! +//! Translates guest virtual disk offsets into file-level ranges via +//! [`VhdxFile::resolve_read`] and [`VhdxFile::resolve_write`], handling +//! block allocation, TFP lifecycle, sector bitmap updates, and +//! crash-consistent BAT commits. + +use crate::AsyncFile; +use crate::bat::BatGuard; +use crate::bat::BlockMapping; +use crate::bat::BlockSpan; +use crate::bat::BlockType; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::format::BatEntryState; +use crate::format::MB1; +use crate::header::WriteMode; +use crate::open::VhdxFile; +use crate::space::AllocateFlags; + +/// Record of a block that had Transitioning-to-Fully-Present (TFP) set +/// during the allocation phase of [`VhdxFile::resolve_write`]. +/// +/// Carried inside [`WriteIoGuard`] so that `complete()` can finalize +/// the BAT without re-walking the block range, and `abort()` can revert +/// without guessing which blocks were modified. +struct TfpRecord { + /// Block number in the BAT. + block_number: u32, + /// The block's mapping before TFP was set. Used by the abort path + /// to revert the in-memory BAT. + original_mapping: BlockMapping, + /// File offset of newly allocated space, if any. `None` when TFP + /// was set on an already-allocated block (e.g. PartiallyPresent → + /// FullyPresent promotion). The abort path releases this space back + /// to the free pool. + allocated_offset: Option, +} + +/// Resolved range from a read operation. +/// +/// Each range describes a contiguous portion of the read request and its +/// data source. The caller iterates these ranges to perform the actual I/O. 
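+///
+/// A sketch of how a caller might consume these ranges (`vhdx`, `offset`,
+/// `len`, and the buffer handling are assumed, not provided by this crate):
+///
+/// ```ignore
+/// let mut ranges = Vec::new();
+/// let _guard = vhdx.resolve_read(offset, len, &mut ranges).await?;
+/// for range in &ranges {
+///     match range {
+///         ReadRange::Data { guest_offset, length, file_offset } => {
+///             // Read `length` bytes from the VHDX file at `file_offset`.
+///         }
+///         ReadRange::Zero { guest_offset, length } => {
+///             // Fill the corresponding span of the caller's buffer with zeros.
+///         }
+///         ReadRange::Unmapped { guest_offset, length } => {
+///             // Differencing chain: satisfy this span from the parent disk.
+///         }
+///     }
+/// }
+/// ```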
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ReadRange { + /// Data present at this file offset. Caller should read from the VHDX file. + Data { + /// Byte offset within the virtual disk. + guest_offset: u64, + /// Length in bytes. + length: u32, + /// Byte offset within the VHDX file where the data lives. + file_offset: u64, + }, + /// Range is zero-filled. Caller should return zeros. + Zero { + /// Byte offset within the virtual disk. + guest_offset: u64, + /// Length in bytes. + length: u32, + }, + /// Range is unmapped (transparent to parent). Caller should read from + /// the parent disk in a differencing chain. + Unmapped { + /// Byte offset within the virtual disk. + guest_offset: u64, + /// Length in bytes. + length: u32, + }, +} + +/// Resolved range from a write operation. +/// +/// Each range describes a contiguous portion of the write target and +/// what the caller needs to do. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum WriteRange { + /// Write caller's data at this file offset. + Data { + /// Byte offset within the virtual disk. + guest_offset: u64, + /// Length in bytes. + length: u32, + /// Byte offset within the VHDX file where data should be written. + file_offset: u64, + }, + /// Zero-fill this file range (e.g. newly allocated block padding). + Zero { + /// Byte offset within the VHDX file to zero-fill. + file_offset: u64, + /// Length in bytes. + length: u32, + }, +} + +impl VhdxFile { + /// Validate an I/O request and acquire per-block refcounts. + /// + /// Checks the failure flag, alignment, and bounds. Then increments + /// per-block refcounts atomically, waiting if trim has claimed any + /// block. Returns the [`BatGuard`] that holds the refcounts. + /// + /// Callers must handle zero-length requests before calling this. + async fn validate_and_acquire( + &self, + offset: u64, + len: u32, + ) -> Result, VhdxIoError> { + self.failed.check()?; + + // Validate alignment to logical sector size. + if !offset.is_multiple_of(self.logical_sector_size as u64) + || !(len as u64).is_multiple_of(self.logical_sector_size as u64) + { + return Err(VhdxIoErrorInner::UnalignedIo.into()); + } + + // Validate bounds. + if offset + .checked_add(len as u64) + .is_none_or(|end| end > self.disk_size) + { + return Err(VhdxIoErrorInner::BeyondEndOfDisk.into()); + } + + // Increment per-block refcounts atomically. If trim has claimed + // any block (sentinel), wait and retry. Once the refcount is + // incremented, trim cannot modify that block's mapping, so the + // subsequent mapping reads are guaranteed to see stable state. + let start_block = self.bat.offset_to_block(offset); + let end_block = self.bat.offset_to_block(offset + len as u64 - 1); + let block_count = end_block - start_block + 1; + + let guard = self + .bat + .acquire_io_refcounts(start_block, block_count) + .await; + + Ok(guard) + } + + /// Resolve a read request into file-level ranges. + /// + /// Walks the read request block-by-block, looking up each block's state + /// in the BAT and appending one or more [`ReadRange`] entries to `ranges`. + /// The caller performs actual file I/O based on the returned ranges. + /// + /// # Errors + /// + /// Returns an error if the read extends beyond the virtual disk size, + /// if the offset or length is not aligned to the logical sector size, + /// or if a BAT entry is corrupt. + pub async fn resolve_read( + &self, + offset: u64, + len: u32, + ranges: &mut Vec, + ) -> Result, VhdxIoError> { + // Zero-length reads succeed immediately. 
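+        // (No per-block refcounts are needed for an empty request, so an
+        // empty guard is returned.)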
+ if len == 0 { + return Ok(ReadIoGuard::empty()); + } + + let guard = self.validate_and_acquire(offset, len).await?; + + for span in self.bat.block_spans(offset, len) { + let mapping = self.bat.get_block_mapping(span.block_number); + + match mapping.bat_state() { + BatEntryState::FullyPresent => { + let file_offset = mapping.file_offset() + span.block_offset as u64; + ranges.push(ReadRange::Data { + guest_offset: span.virtual_offset, + length: span.length, + file_offset, + }); + } + BatEntryState::PartiallyPresent => { + self.resolve_partial_block_read( + mapping.file_offset(), + span.virtual_offset, + span.length, + ranges, + ) + .await?; + } + BatEntryState::NotPresent => { + if self.has_parent { + ranges.push(ReadRange::Unmapped { + guest_offset: span.virtual_offset, + length: span.length, + }); + } else { + ranges.push(ReadRange::Zero { + guest_offset: span.virtual_offset, + length: span.length, + }); + } + } + BatEntryState::Zero | BatEntryState::Unmapped | BatEntryState::Undefined => { + ranges.push(ReadRange::Zero { + guest_offset: span.virtual_offset, + length: span.length, + }); + } + } + } + + Ok(ReadIoGuard::new(guard)) + } + + /// Resolve a write request into file-level ranges. + /// + /// Walks the write request block-by-block, allocating blocks as needed. + /// For each block, emits [`WriteRange::Data`] entries describing where + /// the caller should write data, and [`WriteRange::Zero`] entries for + /// any newly allocated regions that must be zero-filled. + /// + /// Blocks that are fully-covering writes use TFP (Transitioning to Fully + /// Present) to defer BAT commit to [`WriteIoGuard::complete()`]. Partial writes + /// commit the BAT immediately via per-entry cache write. + /// + /// Before any ranges are returned, the header is updated with new GUIDs + /// and flushed to disk (first-write gate). + /// + /// After the caller writes data at the returned offsets, it **must** call + /// [`WriteIoGuard::complete()`] to finalize the BAT and sector bitmaps. + /// Dropping the guard without calling `complete()` aborts the write. + pub async fn resolve_write( + &self, + offset: u64, + len: u32, + ranges: &mut Vec, + ) -> Result, VhdxIoError> { + // Check read-only. + if self.read_only { + return Err(VhdxIoErrorInner::ReadOnly.into()); + } + + // Zero-length writes succeed immediately. + if len == 0 { + return Ok(WriteIoGuard::new_completed(self)); + } + + // First-write gate: update header with new GUIDs before any data. + self.enable_write_mode(WriteMode::DataWritable) + .await + .map_err(VhdxIoErrorInner::WriteHeader)?; + + let refcount_guard = self.validate_and_acquire(offset, len).await?; + + let mut blocks_needing_allocation: Vec = Vec::new(); + + // --- Read phase: check BAT state for each block --- + for span in self.bat.block_spans(offset, len) { + let is_full_block = span.is_full_block(self.block_size); + + // Read the in-memory BAT state. + loop { + let (state, file_offset, has_tfp) = { + let mapping = self.bat.get_block_mapping(span.block_number); + ( + mapping.bat_state(), + mapping.file_offset(), + mapping.transitioning_to_fully_present(), + ) + }; + + if has_tfp { + // Block is being allocated by another task — wait and retry. 
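+                    // Register the listener before re-checking the flag so
+                    // a wakeup between the check and `listen()` is not missed.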
+ let listener = self.allocation_event.listen(); + if self + .bat + .get_block_mapping(span.block_number) + .transitioning_to_fully_present() + { + listener.await; + } + continue; + } + + match state { + BatEntryState::FullyPresent => { + ranges.push(WriteRange::Data { + guest_offset: span.virtual_offset, + length: span.length, + file_offset: file_offset + span.block_offset as u64, + }); + break; + } + BatEntryState::PartiallyPresent if !is_full_block => { + // Partial write to already-allocated block — write + // directly. complete_write() updates sector bitmaps. + ranges.push(WriteRange::Data { + guest_offset: span.virtual_offset, + length: span.length, + file_offset: file_offset + span.block_offset as u64, + }); + break; + } + BatEntryState::PartiallyPresent => { + // Fully-covering write to PartiallyPresent block — + // needs TFP to promote to FullyPresent. Fall through + // to allocation phase. + blocks_needing_allocation.push(span); + break; + } + BatEntryState::NotPresent + | BatEntryState::Zero + | BatEntryState::Unmapped + | BatEntryState::Undefined => { + // Unallocated — needs allocation. + blocks_needing_allocation.push(span); + break; + } + } + } + } + + // If nothing needs allocation, we're done. Transfer refcount + // ownership from the ReadIoGuard to the WriteIoGuard. + if blocks_needing_allocation.is_empty() { + return Ok(WriteIoGuard::new_no_alloc( + self, + refcount_guard, + offset, + len, + )); + } + + // --- Allocation phase --- + // Keep refcount_guard alive — its Drop will release refcounts + // if any error path below returns early. On success, we forget + // it and let the WriteIoGuard take ownership. + + // --- Allocation phase: acquire BlockAllocationLock --- + // Wait until no blocks in our allocation set have TFP set by + // a concurrent allocator. This is the OverlappingAllocations + // serialization: if another writer is + // transitioning any of our blocks, we park and wait for that + // writer's post-allocate to clear TFP before proceeding. + // LOCK AUDIT: No synchronous locks held entering allocation loop. + // allocation_lock (futures::Mutex) is acquired via .await — fine. + let mut alloc_guard = loop { + let alloc_guard = self.allocation_lock.lock().await; + + // Check all blocks under BAT lock for TFP overlap. + // Register listener before dropping locks to avoid missed wakes. + let listener = self.allocation_event.listen(); + if !blocks_needing_allocation.iter().any(|span| { + self.bat + .get_block_mapping(span.block_number) + .transitioning_to_fully_present() + }) { + break alloc_guard; + } + drop(alloc_guard); + listener.await; + }; + + // Track blocks that got TFP set (for error cleanup and complete/abort). + let mut tfp_records: Vec = Vec::new(); + + // Track whether any TFP allocation used unsafe (non-safe-data) space. + // When true, complete_write_inner() captures the current FSN and + // attaches it to the BAT page(s) so the log task waits for the + // data flush before logging the BAT update. + let mut needs_flush_before_log = false; + + // Re-check and allocate under the lock. + // No block in our set should have TFP at this point — we waited + // for all concurrent allocators to finish above. + let eof = &mut *alloc_guard; + let allocation_result: Result<(), VhdxIoError> = async { + for span in &blocks_needing_allocation { + let is_full_block = span.is_full_block(self.block_size); + + // Re-read mapping (may have changed since read phase). 
+ let mapping = self.bat.get_block_mapping(span.block_number); + + // Assert no TFP — we serialized against concurrent + // allocators in the loop above. + assert!( + !mapping.transitioning_to_fully_present(), + "block {} has TFP after overlap wait", + span.block_number + ); + + match mapping.bat_state() { + BatEntryState::FullyPresent => { + // Already allocated by a concurrent writer — just emit range. + ranges.push(WriteRange::Data { + guest_offset: span.virtual_offset, + length: span.length, + file_offset: mapping.file_offset() + span.block_offset as u64, + }); + } + BatEntryState::PartiallyPresent if is_full_block => { + // Fully-covering write to PartiallyPresent — set TFP + // on existing mapping, no new space. + // This is always safe (space already has this block's + // data), so no change to needs_flush_before_log. + let original = mapping; + let new_mapping = original.with_transitioning_to_fully_present(true); + + self.bat.set_block_mapping(span.block_number, new_mapping); + + tfp_records.push(TfpRecord { + block_number: span.block_number, + original_mapping: original, + allocated_offset: None, + }); + + ranges.push(WriteRange::Data { + guest_offset: span.virtual_offset, + length: span.length, + file_offset: mapping.file_offset() + span.block_offset as u64, + }); + } + _ => { + // Unallocated block — allocate space. + let original = mapping; + let (new_offset, space_state) = self + .allocate_block_space(span.block_number, mapping, eof) + .await?; + + if is_full_block { + self.allocate_full_block( + span, + original, + new_offset, + space_state, + &mut tfp_records, + &mut needs_flush_before_log, + ranges, + ); + } else { + self.allocate_partial_block( + span, + mapping, + new_offset, + space_state, + eof, + ranges, + ) + .await?; + } + } + } + } + + Ok(()) + } + .await; + + // Error cleanup: revert TFP-marked blocks and release allocated space on failure. + if let Err(e) = allocation_result { + self.abort_write_sync(&tfp_records); + return Err(e); + } + + // Allocation lock is released when _alloc_guard drops (after + // returning ranges to caller). + + Ok(WriteIoGuard::new( + self, + refcount_guard, + offset, + len, + needs_flush_before_log, + tfp_records, + )) + } + + /// Allocate file space for a payload block. + /// + /// Tries three sources in priority order: + /// 1. Deferred releases (non-durable trim) — reclaim same-block space. + /// 2. Soft-anchored blocks (durable trim) — reclaim same-block space. + /// 3. Fresh allocation via [`allocate_space`]. + /// + /// Returns the file offset and [`SpaceState`] of the allocated region. + async fn allocate_block_space( + &self, + block_number: u32, + mapping: BlockMapping, + eof: &mut crate::space::EofState, + ) -> Result<(u64, crate::space::SpaceState), VhdxIoError> { + // 1. Check deferred releases (non-durable trim). + if let Some(deferred_offset) = self.deferred_releases.remove(block_number) { + return Ok((deferred_offset, crate::space::SpaceState::OwnStale)); + } + + // 2. Check soft-anchored blocks (durable trim). + if mapping.is_soft_anchored() { + let old_file_offset = mapping.file_offset(); + if self + .free_space + .unmark_trimmed_block(block_number, old_file_offset, self.block_size) + { + return Ok((old_file_offset, crate::space::SpaceState::OwnStale)); + } + // Unmark failed (race) — fall through to fresh allocation. + } + + // 3. Allocate fresh space. 
+ let r = self + .allocate_space(eof, self.block_size, AllocateFlags::new()) + .await?; + Ok((r.file_offset, r.state)) + } + + /// Handle a full-block allocation: set TFP, record for later completion, + /// and emit the data range. + fn allocate_full_block( + &self, + span: &BlockSpan, + original_mapping: BlockMapping, + new_offset: u64, + space_state: crate::space::SpaceState, + tfp_records: &mut Vec, + needs_flush_before_log: &mut bool, + ranges: &mut Vec, + ) { + let new_mapping = BlockMapping::new() + .with_bat_state(original_mapping.bat_state()) + .with_transitioning_to_fully_present(true) + .with_file_megabyte((new_offset / MB1) as u32); + + self.bat.set_block_mapping(span.block_number, new_mapping); + + tfp_records.push(TfpRecord { + block_number: span.block_number, + original_mapping, + allocated_offset: Some(new_offset), + }); + + if !space_state.is_safe() { + *needs_flush_before_log = true; + } + + ranges.push(WriteRange::Data { + guest_offset: span.virtual_offset, + length: span.length, + file_offset: new_offset + span.block_offset as u64, + }); + } + + /// Handle a partial-block allocation: commit BAT immediately, allocate + /// SBM if needed (differencing disks), and emit data + zero-padding ranges. + async fn allocate_partial_block( + &self, + span: &BlockSpan, + mapping: BlockMapping, + new_offset: u64, + space_state: crate::space::SpaceState, + eof: &mut crate::space::EofState, + ranges: &mut Vec, + ) -> Result<(), VhdxIoError> { + // For differencing disks: if the block was NotPresent (transparent + // to parent), allocate as PartiallyPresent so that unwritten sectors + // remain transparent. The sector bitmap will be updated in + // complete_write_inner() to mark only the written sectors as present. + // + // For non-diff disks or blocks in other states (Zero, Unmapped, + // Undefined): allocate as FullyPresent with zero-padding. + let is_partial_present = + self.has_parent && mapping.bat_state() == BatEntryState::NotPresent; + + // Allocate SBM block if needed for PartiallyPresent. + if is_partial_present { + self.ensure_sbm_allocated(span.block_number, eof).await?; + } + + let new_state = if is_partial_present { + BatEntryState::PartiallyPresent + } else { + BatEntryState::FullyPresent + }; + + let new_mapping = BlockMapping::new() + .with_bat_state(new_state) + .with_transitioning_to_fully_present(false) + .with_file_megabyte((new_offset / MB1) as u32); + + // Capture per-page FSN when !is_safe. The FSN is captured now + // (before the caller writes data) to ensure the data flush + // completes before the BAT update is logged. + let pre_log_fsn = if !space_state.is_safe() { + self.log_state + .as_ref() + .map(|state| state.flush_sequencer.current_fsn()) + } else { + None + }; + + // Per-entry cache write (write-through to disk). + self.bat + .write_block_mapping( + &self.cache, + BlockType::Payload, + span.block_number, + new_mapping, + pre_log_fsn, + ) + .await?; + + // Emit zero + data + zero ranges. + // For PartiallyPresent blocks, skip zero-fill — unwritten sectors + // are transparent to parent (the sector bitmap tracks presence). + // For FullyPresent blocks, zero-fill surround unless the space + // is already safe (zeroed). 
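+        //
+        //   [ zeros: 0..block_offset ][ data: block_offset..end ][ zeros: end..block_size ]
+        //
+        // Both zero spans are omitted when the allocated space is already
+        // zeroed or the block is PartiallyPresent.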
+ if !is_partial_present && span.block_offset > 0 && !space_state.is_zero() { + ranges.push(WriteRange::Zero { + file_offset: new_offset, + length: span.block_offset, + }); + } + + ranges.push(WriteRange::Data { + guest_offset: span.virtual_offset, + length: span.length, + file_offset: new_offset + span.block_offset as u64, + }); + + let end_offset = span.block_offset + span.length; + if !is_partial_present && end_offset < self.block_size && !space_state.is_zero() { + ranges.push(WriteRange::Zero { + file_offset: new_offset + end_offset as u64, + length: self.block_size - end_offset, + }); + } + + Ok(()) + } + + /// Ensure the sector bitmap block for the given payload block's chunk + /// is allocated. No-op if the SBM is already present. + async fn ensure_sbm_allocated( + &self, + block_number: u32, + eof: &mut crate::space::EofState, + ) -> Result<(), VhdxIoError> { + let chunk_number = block_number / self.bat.chunk_ratio; + let sbm_mapping = self.bat.get_sector_bitmap_mapping(chunk_number); + + if sbm_mapping.bat_state() == BatEntryState::FullyPresent { + return Ok(()); + } + + let sbm_alloc = self + .allocate_space( + eof, + crate::bat::SECTOR_BITMAP_BLOCK_SIZE, + AllocateFlags::new().with_zero(true), + ) + .await?; + + let new_sbm = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_file_megabyte((sbm_alloc.file_offset / MB1) as u32); + + self.bat + .write_block_mapping( + &self.cache, + BlockType::SectorBitmap, + chunk_number, + new_sbm, + None, + ) + .await?; + + Ok(()) + } + + /// Finalize a write operation (internal implementation). + /// + /// Called by [`WriteIoGuard::complete()`] after the caller has written + /// data to the resolved ranges. + /// + /// Iterates the TFP records from `resolve_write`, clearing the TFP + /// flag and setting each block to `FullyPresent` via + /// [`write_block_mapping`] (which atomically updates the in-memory + /// BAT and dirties the cache page). Then updates sector bitmaps for + /// any `PartiallyPresent` blocks (differencing disk partial writes). + /// + /// If a cache write fails, the remaining TFP records are reverted + /// via [`abort_write_sync`] and the error is returned. The file will + /// be poisoned by the log pipeline shortly after. + async fn complete_write_inner( + &self, + offset: u64, + len: u32, + tfp_records: &[TfpRecord], + needs_flush_before_log: bool, + ) -> Result<(), VhdxIoError> { + let had_tfp = !tfp_records.is_empty(); + + // Capture FSN after the caller's data writes. + // Passed into write_block_mapping so it's stamped on the + // cache page atomically with the dirty-mark. + let pre_log_fsn = if needs_flush_before_log { + self.log_state + .as_ref() + .map(|state| state.flush_sequencer.current_fsn()) + } else { + None + }; + + for (i, record) in tfp_records.iter().enumerate() { + let mapping = self.bat.get_block_mapping(record.block_number); + let final_mapping = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(mapping.file_megabyte()); + + if let Err(e) = self + .bat + .write_block_mapping( + &self.cache, + BlockType::Payload, + record.block_number, + final_mapping, + pre_log_fsn, + ) + .await + { + self.abort_write_sync(&tfp_records[i..]); + return Err(e); + } + } + + // Notify waiters ALWAYS, even on failure or cache write error. + if had_tfp { + self.allocation_event.notify(usize::MAX); + } + + // Update sector bitmaps for partial writes to differencing disks. 
+ // Walk the block range to find PartiallyPresent blocks that need + // their sector bitmaps updated. This is cheap (BAT state is in memory) + // and avoids carrying SBM records in the guard. + if self.has_parent && len > 0 { + for span in self.bat.block_spans(offset, len) { + let mapping = self.bat.get_block_mapping(span.block_number); + if !mapping.transitioning_to_fully_present() + && mapping.bat_state() == BatEntryState::PartiallyPresent + { + self.set_sector_bitmap_bits(span.virtual_offset, span.length, true) + .await?; + } + } + } + + Ok(()) + } + + /// Revert TFP blocks to their original state. + /// + /// Called on two paths: + /// - `WriteIoGuard::drop()` without `complete()` (write aborted) + /// - `complete_write_inner()` when a cache write fails (partial completion) + /// + /// Restores each block's in-memory BAT to [`TfpRecord::original_mapping`], + /// releases any newly allocated space back to the free pool, and + /// notifies allocation waiters. Does not perform any file I/O. + fn abort_write_sync(&self, tfp_records: &[TfpRecord]) { + if tfp_records.is_empty() { + return; + } + + for record in tfp_records { + self.bat + .set_block_mapping(record.block_number, record.original_mapping); + // Release allocated space back to free pool. + if let Some(offset) = record.allocated_offset { + self.free_space.release(offset, self.block_size); + } + } + + self.allocation_event.notify(usize::MAX); + } + + /// Flush all writes to stable storage. + /// + /// Commits dirty cache pages to the log task, waits for the WAL + /// entry to be written, then flushes to make everything durable: + /// user data writes, WAL entries, and apply-task writes. + pub async fn flush(&self) -> Result<(), VhdxIoError> { + self.failed.check()?; + + if self.read_only { + return Err(VhdxIoErrorInner::ReadOnly.into()); + } + + // Stamp all uncommitted deferred entries with the current + // generation BEFORE commit(). This ensures that entries stamped + // here have their BAT pages swept into this commit's WAL entry. + // Any new trims that run concurrently will insert with + // committed_gen = None and wait for the next flush. + let flush_gen = self.deferred_releases.stamp_uncommitted(); + + let lsn = self.cache.commit().map_err(VhdxIoErrorInner::CommitCache)?; + + let state = self + .log_state + .as_ref() + .expect("writable file has log_state"); + + // Wait for the log task to write WAL entries through this LSN. + state + .logged_lsn + .wait_for(lsn) + .await + .map_err(VhdxIoErrorInner::Failed)?; + + // Flush everything: user data, WAL entries, applied pages. + state + .flush_sequencer + .flush(self.file.as_ref()) + .await + .map_err(VhdxIoErrorInner::Flush)?; + + // Now that the WAL is durable, promote entries committed at or + // before this generation. Their BAT changes are crash-safe. + for (block_number, file_offset, size, anchor) in + self.deferred_releases.drain_committed(flush_gen) + { + if anchor { + let _ = self + .free_space + .mark_trimmed_block(block_number, file_offset, size); + } else { + self.free_space.release(file_offset, size); + } + } + + Ok(()) + } +} + +// --------------------------------------------------------------------------- +// I/O guards +// --------------------------------------------------------------------------- + +/// Guard for read I/O. Drop after file reads are complete. +/// +/// Returned by [`VhdxFile::resolve_read`]. Dropping this guard decrements +/// per-block refcounts, allowing trim to proceed. +pub struct ReadIoGuard<'a, F: AsyncFile> { + // Significant drop. 
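+    // Holding the guard keeps the per-block I/O refcounts elevated;
+    // dropping it releases them, which lets a pending trim proceed.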
+ _bat_guard: BatGuard<'a>, + _phantom: std::marker::PhantomData<&'a VhdxFile>, +} + +impl<'a, F: AsyncFile> ReadIoGuard<'a, F> { + /// Create a new read guard with refcount tracking. + fn new(bat_guard: BatGuard<'a>) -> Self { + Self { + _bat_guard: bat_guard, + _phantom: std::marker::PhantomData, + } + } + + fn empty() -> Self { + Self { + _bat_guard: BatGuard::empty(), + _phantom: std::marker::PhantomData, + } + } +} + +/// Guard for write I/O. Call [`complete()`](Self::complete) to finalize, +/// or drop to abort. +/// +/// Returned by [`VhdxFile::resolve_write`]. Dropping without calling +/// `complete()` aborts the write, reverting TFP blocks and releasing +/// allocated space. In both cases, per-block refcounts are decremented +/// via the owned [`ReadIoGuard`]. +pub struct WriteIoGuard<'a, F: AsyncFile> { + vhdx: &'a VhdxFile, + // Significant drop. + _bat_guard: BatGuard<'a>, + /// The guest offset of the write (needed for SBM bitmap updates). + offset: u64, + /// The length of the write in bytes. + len: u32, + /// Whether `complete()` was called. If false on drop, the write is aborted. + completed: bool, + /// True when at least one TFP block was allocated from space that is + /// NOT safe (could contain stale data from another block). When true, + /// `complete_write_inner` must capture the current FSN and apply it + /// to the BAT pages so the log task waits for the data flush before + /// logging the BAT update. + needs_flush_before_log: bool, + /// TFP records collected during resolve_write, needed by complete/abort. + /// `None` after complete() or for zero-length writes. + tfp_records: Vec, +} + +impl<'a, F: AsyncFile> WriteIoGuard<'a, F> { + /// Create a new write guard that takes ownership of a [`ReadIoGuard`] + /// for refcount management. + fn new( + vhdx: &'a VhdxFile, + bat_guard: BatGuard<'a>, + offset: u64, + len: u32, + needs_flush_before_log: bool, + tfp_records: Vec, + ) -> Self { + Self { + vhdx, + _bat_guard: bat_guard, + offset, + len, + completed: false, + needs_flush_before_log, + tfp_records, + } + } + + /// Create a write guard that is already completed (for zero-length writes). + fn new_completed(vhdx: &'a VhdxFile) -> Self { + Self { + vhdx, + _bat_guard: BatGuard::empty(), + offset: 0, + len: 0, + completed: true, + needs_flush_before_log: false, + tfp_records: Vec::new(), + } + } + + /// Create a write guard with no completion records (no allocation was + /// needed — all blocks were already FullyPresent or PartiallyPresent + /// with a sub-block write). + fn new_no_alloc(vhdx: &'a VhdxFile, bat_guard: BatGuard<'a>, offset: u64, len: u32) -> Self { + Self { + vhdx, + _bat_guard: bat_guard, + offset, + len, + completed: false, + needs_flush_before_log: false, + tfp_records: Vec::new(), + } + } + + /// Finalize the write after data has been written to resolved ranges. + /// + /// Commits TFP -> FullyPresent, updates sector bitmaps. + /// Consumes the guard. Refcounts are decremented when `self` is dropped + /// after this method returns. + pub async fn complete(mut self) -> Result<(), VhdxIoError> { + self.completed = true; + self.vhdx + .complete_write_inner( + self.offset, + self.len, + &self.tfp_records, + self.needs_flush_before_log, + ) + .await + } +} + +impl Drop for WriteIoGuard<'_, F> { + fn drop(&mut self) { + // If complete() was not called, abort the write. + if !self.completed { + self.vhdx.abort_write_sync(&self.tfp_records); + } + // Refcounts are decremented when self.bat_guard drops. 
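+        // The abort path is purely in-memory (BAT revert plus free-space
+        // release); no file I/O happens in drop.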
+ } +} diff --git a/vm/devices/storage/vhdx/src/known_meta.rs b/vm/devices/storage/vhdx/src/known_meta.rs new file mode 100644 index 0000000000..ca90450151 --- /dev/null +++ b/vm/devices/storage/vhdx/src/known_meta.rs @@ -0,0 +1,473 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Known metadata parsing for VHDX files. +//! +//! Verifies that all required system metadata items are recognized, then +//! reads and parses the well-known items (file parameters, disk size, +//! sector sizes, page 83 data) into typed Rust values. + +use crate::AsyncFile; +use crate::cache::PageCache; +use crate::error::CorruptionType; +use crate::error::OpenError; +use crate::format; +use crate::format::FileParameters; +use crate::format::FileParametersFlags; +use crate::metadata::MetadataTable; +use guid::Guid; +use zerocopy::FromBytes; + +/// Parsed metadata from a VHDX file's metadata region. +pub(crate) struct KnownMetadata { + /// Block size in bytes. + pub block_size: u32, + /// Whether this is a differencing disk. + pub has_parent: bool, + /// Whether blocks should remain allocated (fixed VHD). + pub leave_blocks_allocated: bool, + /// Virtual disk size in bytes. + pub disk_size: u64, + /// Logical sector size (512 or 4096). + pub logical_sector_size: u32, + /// Physical sector size (512 or 4096). + pub physical_sector_size: u32, + /// Page 83 data GUID. + pub page_83_data: Guid, +} + +/// Known system metadata item GUIDs that this parser understands. +const KNOWN_ITEM_IDS: &[Guid] = &[ + format::FILE_PARAMETERS_ITEM_GUID, + format::VIRTUAL_DISK_SIZE_ITEM_GUID, + format::PAGE_83_ITEM_GUID, + format::CHS_PARAMETERS_ITEM_GUID, + format::LOGICAL_SECTOR_SIZE_ITEM_GUID, + format::PHYSICAL_SECTOR_SIZE_ITEM_GUID, + format::PARENT_LOCATOR_ITEM_GUID, + format::PMEM_LABEL_STORAGE_AREA_ITEM_GUID, +]; + +/// Verify that all required system metadata items in the table are known to +/// this parser. Unknown required items cause an error (except the incomplete +/// file marker, which has special handling). +pub(crate) fn verify_known_metadata( + table: &MetadataTable, + allow_incomplete: bool, +) -> Result<(), OpenError> { + for entry in table.entries() { + // Only check system (non-user) entries that are required. + if entry.flags.is_user() || !entry.flags.is_required() { + continue; + } + + if KNOWN_ITEM_IDS.contains(&entry.item_id) { + continue; + } + + if entry.item_id == format::INCOMPLETE_FILE_ITEM_GUID { + if allow_incomplete { + continue; + } + return Err(CorruptionType::IncompleteFile.into()); + } + + return Err(CorruptionType::UnknownRequiredMetadata.into()); + } + Ok(()) +} + +/// Read and parse all known metadata items from the file. 
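+///
+/// Items are read in dependency order: the logical sector size comes
+/// first (the virtual disk size must be a multiple of it), followed by
+/// the file parameters, virtual disk size, physical sector size, and
+/// page 83 data. Each item's length and virtual-disk flag are validated
+/// before its value is accepted.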
+pub(crate) async fn read_known_metadata( + cache: &PageCache, + table: &MetadataTable, +) -> Result { + // --- Logical sector size (read first, needed for disk size validation) --- + let logical_sector_size = { + let entry = table + .find_entry(false, &format::LOGICAL_SECTOR_SIZE_ITEM_GUID) + .ok_or(CorruptionType::MissingRequiredMetadata)?; + + if entry.length != 4 { + return Err(CorruptionType::InvalidLogicalSectorSizeSize.into()); + } + if !entry.flags.is_virtual_disk() { + return Err(CorruptionType::LogicalSectorSizeMarkedVirtual.into()); + } + + let data = table + .read_item(cache, false, &format::LOGICAL_SECTOR_SIZE_ITEM_GUID) + .await?; + let value = u32::from_le_bytes(data.try_into().unwrap()); + if value != 512 && value != 4096 { + return Err(CorruptionType::InvalidLogicalSectorSize.into()); + } + value + }; + + // --- File parameters --- + let (block_size, has_parent, leave_blocks_allocated) = { + let entry = table + .find_entry(false, &format::FILE_PARAMETERS_ITEM_GUID) + .ok_or(CorruptionType::MissingRequiredMetadata)?; + + if entry.length as usize != size_of::() { + return Err(CorruptionType::InvalidFileParameterSize.into()); + } + if entry.flags.is_virtual_disk() { + return Err(CorruptionType::FileParametersMarkedVirtual.into()); + } + + let data = table + .read_item(cache, false, &format::FILE_PARAMETERS_ITEM_GUID) + .await?; + let params = FileParameters::read_from_bytes(&data) + .map_err(|_| CorruptionType::InvalidFileParameterSize)?; + let supported = u32::from( + FileParametersFlags::new() + .with_leave_blocks_allocated(true) + .with_has_parent(true), + ); + if u32::from(params.flags) & !supported != 0 { + return Err(CorruptionType::ReservedFileParametersFieldNonzero.into()); + } + + let bs = params.block_size; + if !bs.is_power_of_two() + || (bs as u64) < format::MB1 + || bs as u64 > format::MAXIMUM_BLOCK_SIZE + { + return Err(CorruptionType::InvalidBlockSize.into()); + } + + ( + bs, + params.flags.has_parent(), + params.flags.leave_blocks_allocated(), + ) + }; + + // --- Virtual disk size --- + let disk_size = { + let entry = table + .find_entry(false, &format::VIRTUAL_DISK_SIZE_ITEM_GUID) + .ok_or(CorruptionType::MissingRequiredMetadata)?; + + if entry.length != 8 { + return Err(CorruptionType::InvalidDiskSize.into()); + } + if !entry.flags.is_virtual_disk() { + return Err(CorruptionType::DiskMarkedVirtual.into()); + } + + let data = table + .read_item(cache, false, &format::VIRTUAL_DISK_SIZE_ITEM_GUID) + .await?; + let value = u64::from_le_bytes(data.try_into().unwrap()); + if value == 0 + || value > format::MAXIMUM_DISK_SIZE + || !value.is_multiple_of(logical_sector_size as u64) + { + return Err(CorruptionType::InvalidDiskSize.into()); + } + value + }; + + // --- Physical sector size --- + let physical_sector_size = { + let entry = table + .find_entry(false, &format::PHYSICAL_SECTOR_SIZE_ITEM_GUID) + .ok_or(CorruptionType::MissingRequiredMetadata)?; + + if entry.length != 4 { + return Err(CorruptionType::InvalidSectorSize.into()); + } + if !entry.flags.is_virtual_disk() { + return Err(CorruptionType::InvalidSectorSize.into()); + } + + let data = table + .read_item(cache, false, &format::PHYSICAL_SECTOR_SIZE_ITEM_GUID) + .await?; + let value = u32::from_le_bytes(data.try_into().unwrap()); + if value != 512 && value != 4096 { + return Err(CorruptionType::InvalidSectorSize.into()); + } + value + }; + + // --- Page 83 data --- + let page_83_data = { + let entry = table + .find_entry(false, &format::PAGE_83_ITEM_GUID) + .ok_or(CorruptionType::MissingRequiredMetadata)?; 
+ + if entry.length != 16 { + return Err(CorruptionType::MissingRequiredMetadata.into()); + } + if !entry.flags.is_virtual_disk() { + return Err(CorruptionType::MissingRequiredMetadata.into()); + } + + let data = table + .read_item(cache, false, &format::PAGE_83_ITEM_GUID) + .await?; + Guid::read_from_bytes(&data).map_err(|_| CorruptionType::MissingRequiredMetadata)? + }; + + Ok(KnownMetadata { + block_size, + has_parent, + leave_blocks_allocated, + disk_size, + logical_sector_size, + physical_sector_size, + page_83_data, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::cache::PageCache; + use crate::error::OpenErrorInner; + use crate::metadata::METADATA_TAG; + use crate::region; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + use std::sync::Arc; + use zerocopy::IntoBytes; + + fn metadata_cache(file: InMemoryFile, metadata_offset: u64) -> PageCache { + let mut cache = PageCache::new(Arc::new(file), None, None, 0); + cache.register_tag(METADATA_TAG, metadata_offset); + cache + } + + #[async_test] + async fn read_known_metadata_from_created_file() { + let (file, params) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + let cache = metadata_cache(file, regions.metadata_offset); + + let meta = read_known_metadata(&cache, &table).await.unwrap(); + + assert_eq!(meta.disk_size, format::GB1); + assert_eq!(meta.block_size, params.block_size); + assert_eq!(meta.logical_sector_size, params.logical_sector_size); + assert_eq!(meta.physical_sector_size, params.physical_sector_size); + assert!(!meta.has_parent); + assert!(!meta.leave_blocks_allocated); + assert_ne!(meta.page_83_data, Guid::ZERO); + } + + #[async_test] + async fn verify_known_metadata_all_known() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + + // All standard entries should be recognized. + verify_known_metadata(&table, false).unwrap(); + } + + #[async_test] + async fn verify_unknown_required_item() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + // Add a fake required system metadata entry to the table. + let mut buf = vec![0u8; format::METADATA_TABLE_SIZE as usize]; + file.read_at(regions.metadata_offset, &mut buf) + .await + .unwrap(); + + let mut header = format::MetadataTableHeader::read_from_prefix(&buf) + .unwrap() + .0 + .clone(); + let old_count = header.entry_count; + header.entry_count = old_count + 1; + let h_bytes = header.as_bytes(); + buf[..h_bytes.len()].copy_from_slice(h_bytes); + + // Add a fake entry with unknown GUID. 
+        let fake_guid = guid::guid!("deadbeef-dead-beef-dead-beefdeadbeef");
+        let fake_entry = format::MetadataTableEntry {
+            item_id: fake_guid,
+            offset: 0,
+            length: 0,
+            flags: format::MetadataTableEntryFlags::new().with_is_required(true),
+            reserved2: 0,
+        };
+        let entry_start = size_of::<format::MetadataTableHeader>();
+        let entry_size = size_of::<format::MetadataTableEntry>();
+        let off = entry_start + old_count as usize * entry_size;
+        let e_bytes = fake_entry.as_bytes();
+        buf[off..off + e_bytes.len()].copy_from_slice(e_bytes);
+
+        file.write_at(regions.metadata_offset, &buf).await.unwrap();
+
+        let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length)
+            .await
+            .unwrap();
+
+        let result = verify_known_metadata(&table, false);
+        assert!(matches!(
+            result,
+            Err(OpenError(OpenErrorInner::Corrupt(
+                CorruptionType::UnknownRequiredMetadata
+            )))
+        ));
+    }
+
+    #[async_test]
+    async fn verify_incomplete_file() {
+        let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+        let regions = region::parse_region_tables(&file).await.unwrap();
+
+        // Add the incomplete file marker.
+        let mut buf = vec![0u8; format::METADATA_TABLE_SIZE as usize];
+        file.read_at(regions.metadata_offset, &mut buf)
+            .await
+            .unwrap();
+
+        let mut header = format::MetadataTableHeader::read_from_prefix(&buf)
+            .unwrap()
+            .0
+            .clone();
+        let old_count = header.entry_count;
+        header.entry_count = old_count + 1;
+        let h_bytes = header.as_bytes();
+        buf[..h_bytes.len()].copy_from_slice(h_bytes);
+
+        let incomplete_entry = format::MetadataTableEntry {
+            item_id: format::INCOMPLETE_FILE_ITEM_GUID,
+            offset: 0,
+            length: 0,
+            flags: format::MetadataTableEntryFlags::new().with_is_required(true),
+            reserved2: 0,
+        };
+        let entry_start = size_of::<format::MetadataTableHeader>();
+        let entry_size = size_of::<format::MetadataTableEntry>();
+        let off = entry_start + old_count as usize * entry_size;
+        let e_bytes = incomplete_entry.as_bytes();
+        buf[off..off + e_bytes.len()].copy_from_slice(e_bytes);
+
+        file.write_at(regions.metadata_offset, &buf).await.unwrap();
+
+        let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length)
+            .await
+            .unwrap();
+
+        // With allow_incomplete=false, should fail.
+        let result = verify_known_metadata(&table, false);
+        assert!(matches!(
+            result,
+            Err(OpenError(OpenErrorInner::Corrupt(
+                CorruptionType::IncompleteFile
+            )))
+        ));
+
+        // With allow_incomplete=true, should pass.
+        verify_known_metadata(&table, true).unwrap();
+    }
+
+    #[async_test]
+    async fn validate_block_size_power_of_two() {
+        let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+        let regions = region::parse_region_tables(&file).await.unwrap();
+        let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length)
+            .await
+            .unwrap();
+
+        // Overwrite file parameters with a non-power-of-2 block size.
+ let entry = table + .find_entry(false, &format::FILE_PARAMETERS_ITEM_GUID) + .unwrap(); + let item_offset = regions.metadata_offset + entry.offset as u64; + + let bad_params = FileParameters { + block_size: 3 * format::MB1 as u32, // not power of 2 + flags: FileParametersFlags::new(), + }; + file.write_at(item_offset, bad_params.as_bytes()) + .await + .unwrap(); + + let cache = metadata_cache(file, regions.metadata_offset); + let result = read_known_metadata(&cache, &table).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::InvalidBlockSize + ))) + )); + } + + #[async_test] + async fn validate_file_parameters_reserved_flags() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + + let entry = table + .find_entry(false, &format::FILE_PARAMETERS_ITEM_GUID) + .unwrap(); + let item_offset = regions.metadata_offset + entry.offset as u64; + + let params = FileParameters { + block_size: format::DEFAULT_BLOCK_SIZE, + flags: FileParametersFlags::new(), + }; + let mut bytes = params.as_bytes().to_vec(); + bytes[4..8].copy_from_slice(&4_u32.to_le_bytes()); + file.write_at(item_offset, &bytes).await.unwrap(); + + let cache = metadata_cache(file, regions.metadata_offset); + let result = read_known_metadata(&cache, &table).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::ReservedFileParametersFieldNonzero + ))) + )); + } + + #[async_test] + async fn validate_sector_sizes() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + + // Overwrite logical sector size with an invalid value. + let entry = table + .find_entry(false, &format::LOGICAL_SECTOR_SIZE_ITEM_GUID) + .unwrap(); + let item_offset = regions.metadata_offset + entry.offset as u64; + + let bad_value: u32 = 1024; // not 512 or 4096 + file.write_at(item_offset, &bad_value.to_le_bytes()) + .await + .unwrap(); + + let cache = metadata_cache(file, regions.metadata_offset); + let result = read_known_metadata(&cache, &table).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::InvalidLogicalSectorSize + ))) + )); + } +} diff --git a/vm/devices/storage/vhdx/src/lib.rs b/vm/devices/storage/vhdx/src/lib.rs new file mode 100644 index 0000000000..96e03a8e76 --- /dev/null +++ b/vm/devices/storage/vhdx/src/lib.rs @@ -0,0 +1,264 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Pure-Rust VHDX file format parser and writer. +//! +//! This crate implements the +//! [VHDX format specification](https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-vhdx/) +//! with no platform-specific dependencies, enabling cross-platform support +//! for dynamic, fixed, and differencing VHDX virtual hard disk files. +//! +//! # Overview +//! +//! A VHDX file stores a virtual disk as a collection of fixed-size data +//! blocks tracked by a Block Allocation Table (BAT). Crash consistency is +//! maintained through a write-ahead log (WAL) that journals metadata +//! changes before they reach their final file locations. +//! +//! ## Lifecycle +//! +//! ```text +//! 
create() ──► VhdxFile::open(file) ──► VhdxBuilder +//! ├── .read_only() ──► VhdxFile (read) +//! └── .writable() ──► VhdxFile (read/write) +//! │ +//! resolve_read / resolve_write / flush / trim +//! │ +//! close() +//! ``` +//! +//! 1. **Create** — [`create::create()`] writes a valid, empty VHDX file. +//! 2. **Open** — [`VhdxFile::open()`] returns a [`VhdxBuilder`] for +//! configuring options (block alignment, log replay policy) before +//! calling [`read_only()`](VhdxBuilder::read_only) or +//! [`writable()`](VhdxBuilder::writable). +//! 3. **I/O** — [`VhdxFile::resolve_read()`](open::VhdxFile::resolve_read) +//! and [`VhdxFile::resolve_write()`](open::VhdxFile::resolve_write) +//! translate virtual disk offsets into file-level ranges. The caller +//! performs actual data I/O at the returned offsets, then calls +//! [`WriteIoGuard::complete()`] to finalize metadata. +//! 4. **Flush** — [`VhdxFile::flush()`](open::VhdxFile::flush) commits +//! dirty pages through the WAL and flushes to stable storage. +//! 5. **Close** — [`VhdxFile::close()`](open::VhdxFile::close) drains the +//! pipeline and clears the log GUID, leaving the file clean. +//! +//! ## Write pipeline (cache → log → apply) +//! +//! Writable opens spawn two background tasks that form a three-stage +//! pipeline for crash-consistent metadata persistence: +//! +//! ```text +//! ┌───────────┐ commit() ┌──────────┐ apply ┌────────────┐ +//! │ Cache │ ──────────────►│ Log Task │ ───────────►│ Apply Task │ +//! │ (dirty │ dirty pages │ (WAL │ logged │ (final │ +//! │ pages) │ │ writer) │ pages │ offsets) │ +//! └───────────┘ └──────────┘ └────────────┘ +//! ▲ │ │ +//! LogPermits logged_lsn applied_lsn +//! (backpressure) (LsnWatermark) (LsnWatermark) +//! ``` +//! +//! - The **cache** accumulates dirty 4 KiB metadata pages (BAT entries, +//! sector bitmap bits). On commit, pages are sent to the log task. +//! - The **log task** writes WAL entries to the circular log region and +//! publishes `logged_lsn`. +//! - The **apply task** writes logged pages to their final file offsets +//! and publishes `applied_lsn`. +//! - A permit semaphore limits in-flight pages for backpressure. A flush +//! sequencer coalesces concurrent flush requests. +//! +//! # I/O model +//! +//! The crate separates **metadata I/O** from **payload I/O**. +//! +//! Metadata I/O (headers, BAT pages, sector bitmaps, WAL entries) is +//! handled internally through [`AsyncFile`] — the caller provides an +//! implementation at open time and never thinks about metadata again. +//! +//! Payload I/O (guest data reads and writes) is the caller's +//! responsibility. [`resolve_read()`](open::VhdxFile::resolve_read) and +//! [`resolve_write()`](open::VhdxFile::resolve_write) translate virtual +//! disk offsets into file-level byte ranges ([`ReadRange`] / +//! [`WriteRange`]). The caller performs its own data I/O at those +//! offsets using whatever mechanism it prefers (io_uring, standard file +//! I/O, etc.), then finalizes metadata via the returned I/O guard. +//! This separation lets the caller use a different, potentially more +//! performant I/O path for bulk data without the crate imposing any +//! particular strategy. 
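+//!
+//! The sketch below walks the write path end to end. It is pseudocode:
+//! variable names and argument shapes are illustrative assumptions, not the
+//! exact API; see the items linked above for the real signatures.
+//!
+//! ```text
+//! let vhdx = VhdxFile::open(file).writable().await?;       // open read/write
+//! let write = vhdx.resolve_write(disk_offset, len).await?; // WriteRange + guard
+//! /* caller performs its own payload I/O at the returned file offsets */
+//! write.complete().await?;                                 // finalize metadata
+//! vhdx.flush().await?;                                     // WAL commit + flush
+//! vhdx.close().await?;                                     // drain, clear log GUID
+//! ```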
+
+#![forbid(unsafe_code)]
+#![allow(async_fn_in_trait)]
+
+use std::borrow::Borrow;
+use std::future::Future;
+
+pub(crate) mod apply_task;
+pub(crate) mod bat;
+pub(crate) mod cache;
+pub(crate) mod create;
+pub(crate) mod error;
+pub(crate) mod flush;
+pub(crate) mod format;
+pub(crate) mod header;
+pub(crate) mod io;
+pub(crate) mod known_meta;
+pub(crate) mod locator;
+pub(crate) mod log;
+pub(crate) mod log_permits;
+pub(crate) mod log_task;
+pub(crate) mod lsn_watermark;
+pub(crate) mod metadata;
+pub(crate) mod open;
+pub(crate) mod region;
+pub(crate) mod sector_bitmap;
+pub(crate) mod space;
+pub(crate) mod trim;
+
+pub use create::CreateParams;
+pub use create::create;
+pub use error::CreateError;
+pub use error::InvalidFormatReason;
+pub use error::OpenError;
+pub use error::VhdxIoError;
+pub use error::VhdxIoErrorKind;
+pub use io::ReadIoGuard;
+pub use io::ReadRange;
+pub use io::WriteIoGuard;
+pub use io::WriteRange;
+pub use locator::LocatorKeyValue;
+pub use locator::ParentLocator;
+pub use locator::ParentPaths;
+pub use open::VhdxBuilder;
+pub use open::VhdxFile;
+pub use trim::TrimMode;
+pub use trim::TrimRequest;
+
+#[cfg(test)]
+mod tests;
+
+/// Trait abstracting metadata file I/O for the VHDX parser.
+///
+/// The crate uses this trait for all internal metadata access (headers,
+/// BAT pages, sector bitmaps, WAL entries, log replay). Payload data
+/// I/O is **not** routed through this trait — the caller handles it
+/// directly at the file offsets returned by
+/// [`VhdxFile::resolve_read()`](open::VhdxFile::resolve_read) and
+/// [`VhdxFile::resolve_write()`](open::VhdxFile::resolve_write).
+///
+/// All async methods return `Send` futures so that the log task (spawned
+/// on a multi-threaded executor) can call them.
+///
+/// This trait is **not** dyn-compatible due to `impl Future` return types.
+/// When dynamic dispatch is needed (e.g. `disk_backend` integration),
+/// create a separate dyn-compatible wrapper trait with a blanket impl.
+pub trait AsyncFile: Send + Sync {
+    /// Buffer type for owned I/O operations.
+    ///
+    /// Implementations control allocation strategy (e.g., alignment for
+    /// O_DIRECT). Buffers are owned and `'static`, so they can be safely
+    /// moved into `blocking::unblock`, io_uring submissions, etc.
+    type Buffer: AsRef<[u8]> + AsMut<[u8]> + Clone + Send + Sync + 'static;
+
+    /// Allocate a zero-initialized buffer of the given length.
+    fn alloc_buffer(&self, len: usize) -> Self::Buffer;
+
+    /// Read from the file into an owned buffer. Returns the filled buffer.
+    ///
+    /// The read starts at `offset` and fills `buf.as_mut().len()` bytes.
+    fn read_into(
+        &self,
+        offset: u64,
+        buf: Self::Buffer,
+    ) -> impl Future<Output = Result<Self::Buffer, std::io::Error>> + Send;
+
+    /// Write a buffer to the file at the given offset.
+    ///
+    /// Accepts any type that borrows as `Self::Buffer`, enabling zero-copy
+    /// writes from `Arc<Self::Buffer>` (the `Arc` moves into the I/O
+    /// closure; data is borrowed in place without copying).
+    fn write_from(
+        &self,
+        offset: u64,
+        buf: impl Borrow<Self::Buffer> + Send + 'static,
+    ) -> impl Future<Output = Result<(), std::io::Error>> + Send;
+
+    /// Flush all buffered writes to stable storage.
+    fn flush(&self) -> impl Future<Output = Result<(), std::io::Error>> + Send;
+
+    /// Return the current size of the file in bytes.
+    fn file_size(&self) -> impl Future<Output = Result<u64, std::io::Error>> + Send;
+
+    /// Set (truncate or extend) the file to the given size in bytes.
+    fn set_file_size(&self, size: u64) -> impl Future<Output = Result<(), std::io::Error>> + Send;
+
+    /// Zero a byte range of the file.
+    ///
+    /// Implementations may use platform-specific APIs (e.g., `fallocate`
+    /// with `FALLOC_FL_ZERO_RANGE` on Linux, or `FSCTL_SET_ZERO_DATA` on
+    /// Windows) for efficiency. The default implementation writes zeros
+    /// in fixed-size chunks via [`write_from`](Self::write_from).
+    fn zero_range(
+        &self,
+        offset: u64,
+        len: u64,
+    ) -> impl Future<Output = Result<(), std::io::Error>> + Send {
+        async move {
+            const CHUNK: usize = 64 * 1024;
+            let zeros = self.alloc_buffer(CHUNK);
+            let mut remaining = len;
+            let mut pos = offset;
+            while remaining > 0 {
+                let n = (remaining as usize).min(CHUNK);
+                if n < CHUNK {
+                    let small = self.alloc_buffer(n);
+                    self.write_from(pos, small).await?;
+                } else {
+                    self.write_from(pos, zeros.clone()).await?;
+                }
+                pos += n as u64;
+                remaining -= n as u64;
+            }
+            Ok(())
+        }
+    }
+}
+
+/// Extension trait providing slice-based `read_at`/`write_at` convenience
+/// methods. These allocate a temporary buffer internally, so they involve
+/// an extra copy compared to `read_into`/`write_from`.
+///
+/// Automatically implemented for all [`AsyncFile`] types.
+///
+/// Only used by tests within this crate; not exposed publicly.
+#[cfg(test)]
+pub trait AsyncFileExt: AsyncFile {
+    /// Read exactly `buf.len()` bytes from the file at the given byte offset.
+    fn read_at(
+        &self,
+        offset: u64,
+        buf: &mut [u8],
+    ) -> impl Future<Output = Result<(), std::io::Error>> + Send {
+        async move {
+            let owned = self.alloc_buffer(buf.len());
+            let owned = self.read_into(offset, owned).await?;
+            buf.copy_from_slice(owned.as_ref());
+            Ok(())
+        }
+    }
+
+    /// Write exactly `buf.len()` bytes to the file at the given byte offset.
+    fn write_at(
+        &self,
+        offset: u64,
+        buf: &[u8],
+    ) -> impl Future<Output = Result<(), std::io::Error>> + Send {
+        async move {
+            let mut owned = self.alloc_buffer(buf.len());
+            owned.as_mut().copy_from_slice(buf);
+            self.write_from(offset, owned).await
+        }
+    }
+}
+
+#[cfg(test)]
+impl<T: AsyncFile> AsyncFileExt for T {}
diff --git a/vm/devices/storage/vhdx/src/locator.rs b/vm/devices/storage/vhdx/src/locator.rs
new file mode 100644
index 0000000000..612b96972e
--- /dev/null
+++ b/vm/devices/storage/vhdx/src/locator.rs
@@ -0,0 +1,393 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! Parent locator parsing for VHDX files.
+//!
+//! Parses the parent locator metadata item (a key-value table of UTF-16LE
+//! strings) into a structured Rust type.
+
+use crate::error::CorruptionType;
+use crate::error::OpenError;
+use crate::format;
+use crate::format::ParentLocatorEntry;
+use crate::format::ParentLocatorHeader;
+use guid::Guid;
+use zerocopy::FromBytes;
+
+/// A parsed key-value pair from a parent locator.
+#[derive(Debug, Clone)]
+pub struct LocatorKeyValue {
+    /// The key string.
+    pub key: String,
+    /// The value string.
+    pub value: String,
+}
+
+/// A parsed parent locator.
+#[derive(Debug, Clone)]
+pub struct ParentLocator {
+    /// The locator type GUID.
+    pub locator_type: Guid,
+    /// The key-value entries.
+    pub entries: Vec<LocatorKeyValue>,
+}
+
+/// Decode a UTF-16LE string from `data` at the given byte offset and length.
+///
+/// Returns `None` if the string is empty, misaligned, out of bounds,
+/// contains embedded nulls, or is not valid UTF-16.
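+///
+/// For example, the bytes `[0x61, 0x00, 0x62, 0x00]` at offset 0 with length 4
+/// decode to `"ab"`, while an odd `length`, a range past the end of `data`, or
+/// an embedded `0x0000` code unit all yield `None`.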
+fn decode_utf16le(data: &[u8], offset: usize, length: usize) -> Option<String> {
+    if length == 0 || !length.is_multiple_of(2) || !offset.is_multiple_of(2) {
+        return None;
+    }
+    let end = offset.checked_add(length)?;
+    if end > data.len() {
+        return None;
+    }
+    let slice = &data[offset..end];
+    if has_embedded_null(slice) {
+        return None;
+    }
+    let u16s: Vec<u16> = slice
+        .chunks_exact(2)
+        .map(|c| u16::from_le_bytes([c[0], c[1]]))
+        .collect();
+    String::from_utf16(&u16s).ok()
+}
+
+/// Check that a UTF-16LE byte slice does not contain embedded null characters.
+fn has_embedded_null(data: &[u8]) -> bool {
+    data.chunks_exact(2)
+        .any(|c| u16::from_le_bytes([c[0], c[1]]) == 0)
+}
+
+impl ParentLocator {
+    /// Parse a parent locator from its raw metadata item bytes.
+    pub fn parse(data: &[u8]) -> Result<Self, OpenError> {
+        let header_size = size_of::<ParentLocatorHeader>();
+
+        // Check minimum size for the header.
+        if data.len() < header_size {
+            return Err((CorruptionType::LocatorTooSmallForHeader).into());
+        }
+
+        let header = ParentLocatorHeader::read_from_prefix(data)
+            .map_err(|_| CorruptionType::LocatorTooSmallForHeader)?
+            .0
+            .clone();
+
+        // Validate key-value count.
+        if header.key_value_count == 0
+            || header.key_value_count > format::PARENT_LOCATOR_MAXIMUM_KEY_VALUE_COUNT
+        {
+            return Err((CorruptionType::InvalidLocatorKeyValueCount).into());
+        }
+
+        // Check that the buffer is large enough for header + all entries.
+        let entry_size = size_of::<ParentLocatorEntry>();
+        let entries_end = header_size + header.key_value_count as usize * entry_size;
+        if data.len() < entries_end {
+            return Err((CorruptionType::LocatorTooSmallForEntries).into());
+        }
+
+        // Parse each entry.
+        let mut entries = Vec::with_capacity(header.key_value_count as usize);
+        for i in 0..header.key_value_count as usize {
+            let off = header_size + i * entry_size;
+            let entry = ParentLocatorEntry::read_from_prefix(&data[off..])
+                .unwrap()
+                .0
+                .clone();
+
+            // Validate and decode key.
+            let key_offset = entry.key_offset as usize;
+            let key_length = entry.key_length as usize;
+            let key = decode_utf16le(data, key_offset, key_length)
+                .ok_or(CorruptionType::InvalidLocatorEntryKey)?;
+
+            // Validate and decode value.
+            let value_offset = entry.value_offset as usize;
+            let value_length = entry.value_length as usize;
+            let value = decode_utf16le(data, value_offset, value_length)
+                .ok_or(CorruptionType::InvalidLocatorEntryValue)?;
+
+            entries.push(LocatorKeyValue { key, value });
+        }
+
+        Ok(ParentLocator {
+            locator_type: header.locator_type,
+            entries,
+        })
+    }
+
+    /// Find a value by key name (case-sensitive match).
+    pub fn find(&self, key: &str) -> Option<&str> {
+        self.entries
+            .iter()
+            .find(|e| e.key == key)
+            .map(|e| e.value.as_str())
+    }
+
+    /// Extract well-known parent paths from the locator.
+    ///
+    /// This looks up the standard VHDX parent locator keys and returns
+    /// them in a structured form. Returns `None` for any key not present.
+    pub fn parent_paths(&self) -> ParentPaths {
+        ParentPaths {
+            parent_linkage: self
+                .find(format::PARENT_LOCATOR_KEY_PARENT_LINKAGE)
+                .map(String::from),
+            relative_path: self
+                .find(format::PARENT_LOCATOR_KEY_RELATIVE_PATH)
+                .map(String::from),
+            absolute_win32_path: self
+                .find(format::PARENT_LOCATOR_KEY_ABSOLUTE_PATH)
+                .map(String::from),
+            volume_path: self
+                .find(format::PARENT_LOCATOR_KEY_VOLUME_PATH)
+                .map(String::from),
+        }
+    }
+}
+
+/// Paths extracted from a VHDX parent locator.
+///
+/// Contains the well-known path entries from the standard VHDX parent
+/// locator type. The caller should try paths in order of preference:
+/// relative, then absolute, then volume path.
+#[derive(Debug, Clone)]
+#[non_exhaustive]
+pub struct ParentPaths {
+    /// Parent's data write GUID at child creation time.
+    pub parent_linkage: Option<String>,
+    /// Relative path from child to parent.
+    pub relative_path: Option<String>,
+    /// Absolute Win32 path to parent.
+    pub absolute_win32_path: Option<String>,
+    /// Volume GUID path to parent.
+    pub volume_path: Option<String>,
+}
+
+/// Helper to encode a Rust string into a UTF-16LE byte vector.
+#[cfg(test)]
+fn encode_utf16le(s: &str) -> Vec<u8> {
+    s.encode_utf16().flat_map(|c| c.to_le_bytes()).collect()
+}
+
+/// Build a valid parent locator binary blob from parts.
+#[cfg(test)]
+pub(crate) fn build_locator(locator_type: Guid, kvs: &[(&str, &str)]) -> Vec<u8> {
+    use zerocopy::IntoBytes;
+
+    let header_size = size_of::<ParentLocatorHeader>();
+    let entry_size = size_of::<ParentLocatorEntry>();
+    let entries_end = header_size + kvs.len() * entry_size;
+
+    // Encode all key/value strings.
+    let encoded: Vec<(Vec<u8>, Vec<u8>)> = kvs
+        .iter()
+        .map(|(k, v)| (encode_utf16le(k), encode_utf16le(v)))
+        .collect();
+
+    // Compute total size.
+    let strings_size: usize = encoded.iter().map(|(k, v)| k.len() + v.len()).sum();
+    let total = entries_end + strings_size;
+    let mut buf = vec![0u8; total];
+
+    // Write header.
+    let header = ParentLocatorHeader {
+        locator_type,
+        reserved: 0,
+        key_value_count: kvs.len() as u16,
+    };
+    let h_bytes = header.as_bytes();
+    buf[..h_bytes.len()].copy_from_slice(h_bytes);
+
+    // Write entries and string data.
+    let mut string_offset = entries_end;
+    for (i, (key_bytes, val_bytes)) in encoded.iter().enumerate() {
+        let entry = ParentLocatorEntry {
+            key_offset: string_offset as u32,
+            value_offset: (string_offset + key_bytes.len()) as u32,
+            key_length: key_bytes.len() as u16,
+            value_length: val_bytes.len() as u16,
+        };
+        let e_bytes = entry.as_bytes();
+        let off = header_size + i * entry_size;
+        buf[off..off + e_bytes.len()].copy_from_slice(e_bytes);
+
+        buf[string_offset..string_offset + key_bytes.len()].copy_from_slice(key_bytes);
+        string_offset += key_bytes.len();
+        buf[string_offset..string_offset + val_bytes.len()].copy_from_slice(val_bytes);
+        string_offset += val_bytes.len();
+    }
+
+    buf
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::error::OpenErrorInner;
+
+    use super::*;
+
+    #[test]
+    fn parse_valid_locator() {
+        let data = build_locator(
+            format::PARENT_LOCATOR_VHDX_TYPE_GUID,
+            &[
+                ("parent_linkage", "guid-value-here"),
+                ("relative_path", "..\\parent.vhdx"),
+                ("absolute_win32_path", "C:\\vms\\parent.vhdx"),
+            ],
+        );
+
+        let locator = ParentLocator::parse(&data).unwrap();
+        assert_eq!(locator.locator_type, format::PARENT_LOCATOR_VHDX_TYPE_GUID);
+        assert_eq!(locator.entries.len(), 3);
+        assert_eq!(locator.entries[0].key, "parent_linkage");
+        assert_eq!(locator.entries[0].value, "guid-value-here");
+        assert_eq!(locator.entries[1].key, "relative_path");
+        assert_eq!(locator.entries[1].value, "..\\parent.vhdx");
+        assert_eq!(locator.entries[2].key, "absolute_win32_path");
+        assert_eq!(locator.entries[2].value, "C:\\vms\\parent.vhdx");
+    }
+
+    #[test]
+    fn find_by_key() {
+        let data = build_locator(
+            format::PARENT_LOCATOR_VHDX_TYPE_GUID,
+            &[("parent_linkage", "link-val"), ("relative_path", "rel-val")],
+        );
+
+        let locator = ParentLocator::parse(&data).unwrap();
+        assert_eq!(locator.find("parent_linkage"), Some("link-val"));
+        assert_eq!(locator.find("relative_path"), Some("rel-val"));
+        assert_eq!(locator.find("nonexistent"), None);
+    }
+
+    #[test]
+    fn parse_empty_locator() {
+        // Build a header with 0 entries.
+        use zerocopy::IntoBytes;
+        let header = ParentLocatorHeader {
+            locator_type: format::PARENT_LOCATOR_VHDX_TYPE_GUID,
+            reserved: 0,
+            key_value_count: 0,
+        };
+        let data = header.as_bytes().to_vec();
+        let result = ParentLocator::parse(&data);
+        assert!(matches!(
+            result,
+            Err(OpenError(OpenErrorInner::Corrupt(
+                CorruptionType::InvalidLocatorKeyValueCount
+            )))
+        ));
+    }
+
+    #[test]
+    fn parse_invalid_utf16() {
+        // Build a locator where key has odd byte length.
+        use zerocopy::IntoBytes;
+
+        let header_size = size_of::<ParentLocatorHeader>();
+        let entry_size = size_of::<ParentLocatorEntry>();
+
+        // Total buffer: header + 1 entry + key(3 bytes, odd) + value(2 bytes)
+        let total = header_size + entry_size + 3 + 2;
+        let mut buf = vec![0u8; total];
+
+        let header = ParentLocatorHeader {
+            locator_type: format::PARENT_LOCATOR_VHDX_TYPE_GUID,
+            reserved: 0,
+            key_value_count: 1,
+        };
+        let h_bytes = header.as_bytes();
+        buf[..h_bytes.len()].copy_from_slice(h_bytes);
+
+        let string_start = header_size + entry_size;
+        let entry = ParentLocatorEntry {
+            key_offset: string_start as u32,
+            value_offset: (string_start + 3) as u32,
+            key_length: 3, // odd = invalid
+            value_length: 2,
+        };
+        let e_bytes = entry.as_bytes();
+        buf[header_size..header_size + e_bytes.len()].copy_from_slice(e_bytes);
+
+        let result = ParentLocator::parse(&buf);
+        assert!(matches!(
+            result,
+            Err(OpenError(OpenErrorInner::Corrupt(
+                CorruptionType::InvalidLocatorEntryKey
+            )))
+        ));
+    }
+
+    #[test]
+    fn parse_embedded_null() {
+        // Build a locator where key contains an embedded null.
+        use zerocopy::IntoBytes;
+
+        let header_size = size_of::<ParentLocatorHeader>();
+        let entry_size = size_of::<ParentLocatorEntry>();
+
+        // Key: "a\0b" in UTF-16LE = [0x61, 0x00, 0x00, 0x00, 0x62, 0x00] (6 bytes)
+        let key_data: Vec<u8> = vec![0x61, 0x00, 0x00, 0x00, 0x62, 0x00];
+        let value_data = encode_utf16le("val");
+
+        let total = header_size + entry_size + key_data.len() + value_data.len();
+        let mut buf = vec![0u8; total];
+
+        let header = ParentLocatorHeader {
+            locator_type: format::PARENT_LOCATOR_VHDX_TYPE_GUID,
+            reserved: 0,
+            key_value_count: 1,
+        };
+        let h_bytes = header.as_bytes();
+        buf[..h_bytes.len()].copy_from_slice(h_bytes);
+
+        let string_start = header_size + entry_size;
+        let entry = ParentLocatorEntry {
+            key_offset: string_start as u32,
+            value_offset: (string_start + key_data.len()) as u32,
+            key_length: key_data.len() as u16,
+            value_length: value_data.len() as u16,
+        };
+        let e_bytes = entry.as_bytes();
+        buf[header_size..header_size + e_bytes.len()].copy_from_slice(e_bytes);
+
+        buf[string_start..string_start + key_data.len()].copy_from_slice(&key_data);
+        let vs = string_start + key_data.len();
+        buf[vs..vs + value_data.len()].copy_from_slice(&value_data);
+
+        let result = ParentLocator::parse(&buf);
+        assert!(matches!(
+            result,
+            Err(OpenError(OpenErrorInner::Corrupt(
+                CorruptionType::InvalidLocatorEntryKey
+            )))
+        ));
+    }
+
+    #[test]
+    fn parse_truncated_locator() {
+        // Header claims 5 entries but buffer only holds header.
+        use zerocopy::IntoBytes;
+
+        let header = ParentLocatorHeader {
+            locator_type: format::PARENT_LOCATOR_VHDX_TYPE_GUID,
+            reserved: 0,
+            key_value_count: 5,
+        };
+        let data = header.as_bytes().to_vec();
+
+        let result = ParentLocator::parse(&data);
+        assert!(matches!(
+            result,
+            Err(OpenError(OpenErrorInner::Corrupt(
+                CorruptionType::LocatorTooSmallForEntries
+            )))
+        ));
+    }
+}
diff --git a/vm/devices/storage/vhdx/src/log.rs b/vm/devices/storage/vhdx/src/log.rs
new file mode 100644
index 0000000000..34360773d8
--- /dev/null
+++ b/vm/devices/storage/vhdx/src/log.rs
@@ -0,0 +1,2134 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! VHDX write-ahead log (WAL) — replay and entry construction.
+//!
+//! The VHDX format uses a write-ahead log stored in a circular "log region"
+//! to ensure crash consistency of metadata (BAT, sector bitmaps). This
+//! module provides:
+//!
+//! - [`replay_log`] — scans the log region for valid entries and applies
+//!   them to the file.
+//! - [`LogWriter`] — constructs and writes new log entries.
+//!
+//! This module is self-contained and depends only on [`crate::format`],
+//! [`crate::error`], and external crates.
+
+use crate::AsyncFile;
+use crate::error::CorruptionType;
+use crate::error::OpenError;
+use crate::error::OpenErrorInner;
+use crate::format::LOG_DATA_SECTOR_SIGNATURE;
+use crate::format::LOG_DESCRIPTOR_DATA_SIGNATURE;
+use crate::format::LOG_DESCRIPTOR_ZERO_SIGNATURE;
+use crate::format::LOG_ENTRY_HEADER_SIGNATURE;
+use crate::format::LOG_SECTOR_SIZE;
+use crate::format::LOGGABLE_OFFSET;
+use crate::format::LogDataDescriptor;
+use crate::format::LogDataSector;
+use crate::format::LogEntryHeader;
+use crate::format::LogZeroDescriptor;
+use crate::format::compute_checksum;
+use guid::Guid;
+use zerocopy::FromBytes;
+use zerocopy::IntoBytes;
+
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+
+const SECTOR: u32 = LOG_SECTOR_SIZE as u32;
+const HEADER_SIZE: u32 = size_of::<LogEntryHeader>() as u32; // 64
+const DESCRIPTOR_SIZE: u32 = size_of::<LogDataDescriptor>() as u32; // 32
+
+// ---------------------------------------------------------------------------
+// LogRegion — circular buffer helpers
+// ---------------------------------------------------------------------------
+
+/// Describes the circular log region in the VHDX file.
+#[derive(Debug, Clone)]
+pub struct LogRegion {
+    /// File offset where the log region starts.
+    pub file_offset: u64,
+    /// Length of the log region in bytes (always a multiple of `LOG_SECTOR_SIZE`).
+    pub length: u32,
+}
+
+impl LogRegion {
+    /// Modular add within the region: `(offset + length) % self.length`.
+    fn log_add(&self, offset: u32, len: u32) -> u32 {
+        let sum = offset + len;
+        if sum >= self.length {
+            sum - self.length
+        } else {
+            sum
+        }
+    }
+
+    /// Length of the circular sequence `[tail, head)`.
+    ///
+    /// If `head == tail`, the sequence spans the entire log (full).
+    fn sequence_length(&self, tail: u32, head: u32) -> u32 {
+        if head > tail {
+            head - tail
+        } else if head < tail {
+            head + self.length - tail
+        } else {
+            // head == tail → full
+            self.length
+        }
+    }
+
+    /// Free (unused) space in the log given the current `[tail, head)`.
+    fn free_space(&self, tail: u32, head: u32) -> u32 {
+        self.length - self.sequence_length(tail, head)
+    }
+
+    /// Returns `true` if `offset` lies within the circular range `[tail, head)`.
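+    ///
+    /// For example, with `length = 1000`: in the range `[100, 500)`, offsets
+    /// `100` and `300` are inside while `500` and `50` are not; in the wrapped
+    /// range `[800, 200)`, offsets `900` and `100` are inside while `500` is
+    /// not (see `is_within_sequence_cases` in the tests below).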
+    fn is_within_sequence(&self, tail: u32, head: u32, offset: u32) -> bool {
+        self.sequence_length(offset, head) <= self.sequence_length(tail, head)
+    }
+
+    /// Read a single sector from the log at `base + offset` (modular).
+    async fn read_sector<F: AsyncFile>(
+        &self,
+        file: &F,
+        base: u32,
+        offset: u32,
+        buf: F::Buffer,
+    ) -> Result<F::Buffer, OpenError> {
+        let pos = self.log_add(base, offset);
+        let buf = file
+            .read_into(self.file_offset + pos as u64, buf)
+            .await
+            .map_err(OpenErrorInner::Io)?;
+        Ok(buf)
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Entry size helpers
+// ---------------------------------------------------------------------------
+
+/// Length of the descriptor area (header + descriptor sectors), rounded up
+/// to a multiple of `LOG_SECTOR_SIZE`.
+fn descriptor_area_length(descriptor_count: u32) -> u32 {
+    let raw = HEADER_SIZE + descriptor_count * DESCRIPTOR_SIZE;
+    raw.div_ceil(SECTOR) * SECTOR
+}
+
+/// Total entry length: descriptor area + one data sector per data descriptor.
+fn entry_length(data_count: u32, zero_count: u32) -> u32 {
+    descriptor_area_length(data_count + zero_count) + data_count * SECTOR
+}
+
+// ---------------------------------------------------------------------------
+// Replay
+// ---------------------------------------------------------------------------
+
+/// Result of log replay.
+#[derive(Debug, Clone)]
+pub struct ReplayResult {
+    /// Whether any entries were replayed.
+    pub replayed: bool,
+}
+
+/// A validated log sequence found during scanning.
+#[derive(Debug, Clone)]
+struct LogSequence {
+    tail: u32,
+    head: u32,
+    last_lsn: u64,
+}
+
+/// Replay the VHDX log.
+///
+/// Scans the log region for valid entries matching `log_guid`, applies
+/// them to the file, and flushes. Returns information about what was done.
+///
+/// This function is self-contained: it takes a file handle and log region
+/// parameters, not a `VhdxFile`. It can be called before the file is
+/// fully parsed.
+pub async fn replay_log<F: AsyncFile>(
+    file: &F,
+    log_region: &LogRegion,
+    log_guid: Guid,
+) -> Result<ReplayResult, OpenError> {
+    // Step 1: find the best valid sequence.
+    let sequence = find_log_sequence(file, log_region, &log_guid).await?;
+
+    // Step 2: apply the sequence.
+    apply_sequence(file, log_region, &log_guid, &sequence).await
+}
+
+/// Scan the entire log for the sequence with the highest LSN.
+async fn find_log_sequence<F: AsyncFile>(
+    file: &F,
+    region: &LogRegion,
+    log_guid: &Guid,
+) -> Result<LogSequence, OpenError> {
+    let mut best: Option<LogSequence> = None;
+    let mut tail: u32 = 0;
+
+    loop {
+        match find_sequence_from_tail(file, region, log_guid, tail).await {
+            Ok(seq) => {
+                let dominated = best.as_ref().is_some_and(|b| b.last_lsn >= seq.last_lsn);
+                if !dominated {
+                    best = Some(seq.clone());
+                }
+                // Advance past this sequence.
+                if seq.head <= tail {
+                    break; // wrapped
+                }
+                tail = seq.head;
+            }
+            Err(FindError::NoSequence) => {
+                tail += SECTOR;
+                if tail >= region.length {
+                    break;
+                }
+            }
+            Err(FindError::Vhdx(e)) => return Err(e),
+        }
+    }
+
+    Ok(best.ok_or(CorruptionType::NoValidLogEntries)?)
+}
+
+enum FindError {
+    NoSequence,
+    Vhdx(OpenError),
+}
+
+/// Try to build a sequence starting at `original_tail`.
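+///
+/// Starting from an empty candidate whose head is `original_tail`, this
+/// repeatedly appends the next valid entry via `expand_sequence` until no
+/// further entry validates, then confirms that the tail recorded by the last
+/// entry lies within the validated range `[original_tail, head)`.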
+async fn find_sequence_from_tail<F: AsyncFile>(
+    file: &F,
+    region: &LogRegion,
+    log_guid: &Guid,
+    original_tail: u32,
+) -> Result<LogSequence, FindError> {
+    let mut seq = LogSequence {
+        tail: 0,
+        head: original_tail,
+        last_lsn: 0,
+    };
+    let mut first = true;
+
+    loop {
+        match expand_sequence(file, region, log_guid, &mut seq, first).await {
+            Ok(()) => {
+                first = false;
+            }
+            Err(FindError::NoSequence) if !first => break,
+            Err(e) => return Err(e),
+        }
+    }
+
+    // Verify that the sequence's tail lies within the validated range.
+    if !region.is_within_sequence(original_tail, seq.head, seq.tail) {
+        return Err(FindError::NoSequence);
+    }
+
+    Ok(seq)
+}
+
+/// Try to grow the sequence by one entry at `seq.head`.
+async fn expand_sequence<F: AsyncFile>(
+    file: &F,
+    region: &LogRegion,
+    log_guid: &Guid,
+    seq: &mut LogSequence,
+    first: bool,
+) -> Result<(), FindError> {
+    let mut sector_buf = file.alloc_buffer(SECTOR as usize);
+
+    // Read the first sector at the candidate position.
+    sector_buf = region
+        .read_sector(file, seq.head, 0, sector_buf)
+        .await
+        .map_err(FindError::Vhdx)?;
+
+    let header = LogEntryHeader::read_from_bytes(&sector_buf.as_ref()[..HEADER_SIZE as usize])
+        .map_err(|_| FindError::NoSequence)?
+        .clone();
+
+    // Validate header fields.
+    if header.signature != LOG_ENTRY_HEADER_SIGNATURE {
+        return Err(FindError::NoSequence);
+    }
+    if header.log_guid != *log_guid {
+        return Err(FindError::NoSequence);
+    }
+    if !validate_entry_header_fields(&header, region) {
+        return Err(FindError::NoSequence);
+    }
+
+    let new_head = region.log_add(seq.head, header.entry_length);
+
+    if !first {
+        // Check sequence continuity.
+        if header.sequence_number != seq.last_lsn + 1 {
+            return Err(FindError::NoSequence);
+        }
+        // Entry must fit in the free space.
+        if header.entry_length > region.free_space(seq.tail, seq.head) {
+            return Err(FindError::NoSequence);
+        }
+        // New tail must be within the growing sequence.
+        if !region.is_within_sequence(seq.tail, new_head, header.tail) {
+            return Err(FindError::NoSequence);
+        }
+    }
+
+    // Compute CRC-32C over the entire entry.
+    let buf_ref = sector_buf.as_ref();
+    let mut crc = compute_checksum(buf_ref, 4);
+
+    // Read and checksum additional descriptor sectors.
+    let desc_area_len = descriptor_area_length(header.descriptor_count);
+    let mut data_descriptor_count: u32 = 0;
+
+    // Validate descriptors.
+    for i in 0..header.descriptor_count {
+        let byte_offset = HEADER_SIZE + i * DESCRIPTOR_SIZE;
+        if byte_offset.is_multiple_of(SECTOR) {
+            // Need to read a new sector.
+            sector_buf = region
+                .read_sector(file, seq.head, byte_offset, sector_buf)
+                .await
+                .map_err(FindError::Vhdx)?;
+            crc = crc32c::crc32c_append(crc, sector_buf.as_ref());
+        }
+
+        let local_off = (byte_offset % SECTOR) as usize;
+        let desc_bytes = &sector_buf.as_ref()[local_off..local_off + DESCRIPTOR_SIZE as usize];
+
+        // Check descriptor signature.
+ let sig = u32::from_le_bytes(desc_bytes[0..4].try_into().unwrap()); + if sig == LOG_DESCRIPTOR_DATA_SIGNATURE { + let desc = LogDataDescriptor::read_from_bytes(desc_bytes) + .map_err(|_| FindError::NoSequence)?; + if !validate_data_descriptor(&desc, &header, region) { + return Err(FindError::NoSequence); + } + data_descriptor_count += 1; + } else if sig == LOG_DESCRIPTOR_ZERO_SIGNATURE { + let desc = LogZeroDescriptor::read_from_bytes(desc_bytes) + .map_err(|_| FindError::NoSequence)?; + if !validate_zero_descriptor(&desc, &header, region) { + return Err(FindError::NoSequence); + } + } else { + return Err(FindError::NoSequence); + } + } + + // Validate that the entry length matches. + let expected_len = entry_length( + data_descriptor_count, + header.descriptor_count - data_descriptor_count, + ); + if header.entry_length != expected_len { + return Err(FindError::NoSequence); + } + + // Read and validate data sectors. + for i in 0..data_descriptor_count { + let offset = desc_area_len + i * SECTOR; + sector_buf = region + .read_sector(file, seq.head, offset, sector_buf) + .await + .map_err(FindError::Vhdx)?; + crc = crc32c::crc32c_append(crc, sector_buf.as_ref()); + + let data_sector = LogDataSector::read_from_bytes(sector_buf.as_ref()) + .map_err(|_| FindError::NoSequence)?; + if data_sector.signature != LOG_DATA_SECTOR_SIGNATURE { + return Err(FindError::NoSequence); + } + if data_sector.sequence_low != header.sequence_number as u32 + || data_sector.sequence_high != (header.sequence_number >> 32) as u32 + { + return Err(FindError::NoSequence); + } + } + + // Verify CRC. + if crc != header.checksum { + return Err(FindError::NoSequence); + } + + // Entry is valid — update the sequence. + seq.last_lsn = header.sequence_number; + seq.tail = header.tail; + seq.head = new_head; + + Ok(()) +} + +fn validate_entry_header_fields(header: &LogEntryHeader, region: &LogRegion) -> bool { + if header.tail >= region.length { + return false; + } + if !header.entry_length.is_multiple_of(SECTOR) + || header.entry_length < SECTOR + || header.entry_length >= region.length + { + return false; + } + // Descriptor count must fit within the entry. + let max_desc = (header.entry_length - HEADER_SIZE) / DESCRIPTOR_SIZE; + if header.descriptor_count > max_desc { + return false; + } + true +} + +fn validate_data_descriptor( + desc: &LogDataDescriptor, + header: &LogEntryHeader, + region: &LogRegion, +) -> bool { + if !desc.file_offset.is_multiple_of(LOG_SECTOR_SIZE) { + return false; + } + if desc.file_offset < LOGGABLE_OFFSET { + return false; + } + if desc.sequence_number != header.sequence_number { + return false; + } + // Must not overlap the log region. + let write_end = desc.file_offset.checked_add(LOG_SECTOR_SIZE); + if let Some(end) = write_end { + if desc.file_offset < region.file_offset + region.length as u64 && end > region.file_offset + { + return false; + } + } else { + return false; + } + true +} + +fn validate_zero_descriptor( + desc: &LogZeroDescriptor, + header: &LogEntryHeader, + region: &LogRegion, +) -> bool { + if !desc.file_offset.is_multiple_of(LOG_SECTOR_SIZE) { + return false; + } + if desc.file_offset < LOGGABLE_OFFSET { + return false; + } + if !desc.length.is_multiple_of(LOG_SECTOR_SIZE) { + return false; + } + if desc.sequence_number != header.sequence_number { + return false; + } + // Must not overlap the log region. 
+    let write_end = desc.file_offset.checked_add(desc.length);
+    if let Some(end) = write_end {
+        if desc.file_offset < region.file_offset + region.length as u64 && end > region.file_offset
+        {
+            return false;
+        }
+    } else {
+        return false;
+    }
+    true
+}
+
+/// Apply a validated sequence to the file.
+async fn apply_sequence<F: AsyncFile>(
+    file: &F,
+    region: &LogRegion,
+    log_guid: &Guid,
+    sequence: &LogSequence,
+) -> Result<ReplayResult, OpenError> {
+    let mut tail = sequence.tail;
+    let head = sequence.head;
+    let mut last_file_offset: u64 = 0;
+    let mut replayed = false;
+
+    let mut sector_buf = file.alloc_buffer(SECTOR as usize);
+
+    while tail != head {
+        // Read header.
+        sector_buf = region.read_sector(file, tail, 0, sector_buf).await?;
+        let header = LogEntryHeader::read_from_bytes(&sector_buf.as_ref()[..HEADER_SIZE as usize])
+            .map_err(|_| CorruptionType::BadLogEntryOnReplay)?
+            .clone();
+
+        if header.signature != LOG_ENTRY_HEADER_SIGNATURE || header.log_guid != *log_guid {
+            return Err(CorruptionType::BadLogEntryOnReplay.into());
+        }
+
+        last_file_offset = header.last_file_offset;
+
+        let desc_area_len = descriptor_area_length(header.descriptor_count);
+        let mut data_sector_index: u32 = 0;
+
+        for i in 0..header.descriptor_count {
+            let byte_offset = HEADER_SIZE + i * DESCRIPTOR_SIZE;
+            if byte_offset.is_multiple_of(SECTOR) || i == 0 {
+                // (Re-)read the descriptor sector. For the first descriptor
+                // the sector_buf already contains the header sector.
+                if byte_offset >= SECTOR {
+                    sector_buf = region
+                        .read_sector(file, tail, byte_offset - (byte_offset % SECTOR), sector_buf)
+                        .await?;
+                }
+            }
+
+            let local_off = (byte_offset % SECTOR) as usize;
+            let desc_bytes = &sector_buf.as_ref()[local_off..local_off + DESCRIPTOR_SIZE as usize];
+            let sig = u32::from_le_bytes(desc_bytes[0..4].try_into().unwrap());
+
+            if sig == LOG_DESCRIPTOR_ZERO_SIGNATURE {
+                let desc = LogZeroDescriptor::read_from_bytes(desc_bytes)
+                    .map_err(|_| CorruptionType::BadLogEntryOnReplay)?;
+
+                // Write zeros.
+                file.zero_range(desc.file_offset, desc.length)
+                    .await
+                    .map_err(OpenErrorInner::Io)?;
+            } else if sig == LOG_DESCRIPTOR_DATA_SIGNATURE {
+                let desc = LogDataDescriptor::read_from_bytes(desc_bytes)
+                    .map_err(|_| CorruptionType::BadLogEntryOnReplay)?
+                    .clone();
+
+                // Read the data sector from the log.
+                let data_offset = desc_area_len + data_sector_index * SECTOR;
+                let data_buf = file.alloc_buffer(SECTOR as usize);
+                let mut data_buf = region
+                    .read_sector(file, tail, data_offset, data_buf)
+                    .await?;
+
+                // Reconstruct the original 4096-byte sector.
+                // Replace first 8 bytes (signature + sequence_high) with leading_bytes.
+                let leading = desc.leading_bytes.to_le_bytes();
+                data_buf.as_mut()[0..8].copy_from_slice(&leading);
+                // Replace last 4 bytes (sequence_low) with trailing_bytes.
+                let trailing = desc.trailing_bytes.to_le_bytes();
+                data_buf.as_mut()[SECTOR as usize - 4..].copy_from_slice(&trailing);
+
+                file.write_from(desc.file_offset, data_buf)
+                    .await
+                    .map_err(OpenErrorInner::Io)?;
+                data_sector_index += 1;
+            } else {
+                return Err(CorruptionType::BadLogEntryOnReplay.into());
+            }
+        }
+
+        replayed = true;
+        tail = region.log_add(tail, header.entry_length);
+    }
+
+    // Extend file if needed.
+    if replayed {
+        let file_sz = file.file_size().await.map_err(OpenErrorInner::Io)?;
+        if file_sz < last_file_offset {
+            file.set_file_size(last_file_offset)
+                .await
+                .map_err(OpenErrorInner::Io)?;
+        }
+        file.flush().await.map_err(OpenErrorInner::Io)?;
+    }
+
+    Ok(ReplayResult { replayed })
+}
+
+// ---------------------------------------------------------------------------
+// LogWriter
+// ---------------------------------------------------------------------------
+
+/// A page to be logged: 4096 bytes of data at a file offset.
+pub struct DataPage<'a> {
+    /// Target file offset (must be aligned to LOG_SECTOR_SIZE).
+    pub file_offset: u64,
+    /// The 4096-byte data payload.
+    pub payload: &'a [u8; SECTOR as usize],
+}
+
+/// A range to be zeroed during replay.
+pub struct ZeroRange {
+    /// Target file offset (must be aligned to LOG_SECTOR_SIZE).
+    pub file_offset: u64,
+    /// Length in bytes (must be a multiple of LOG_SECTOR_SIZE).
+    pub length: u64,
+}
+
+/// Active log state for writing new entries.
+pub struct LogWriter {
+    region: LogRegion,
+    tail: u32,
+    head: u32,
+    sequence_number: u64,
+    log_guid: Guid,
+    flushed_file_offset: u64,
+    last_file_offset: u64,
+}
+
+impl LogWriter {
+    /// Create a new `LogWriter` for an empty log.
+    ///
+    /// Writes an initial empty entry (zero data descriptors, zero zero-ranges)
+    /// and flushes. Returns the writer ready for subsequent entries.
+    pub async fn initialize<F: AsyncFile>(
+        file: &F,
+        region: LogRegion,
+        log_guid: Guid,
+        last_file_offset: u64,
+    ) -> Result<Self, OpenError> {
+        let mut writer = LogWriter {
+            region,
+            tail: 0,
+            head: 0,
+            sequence_number: 0,
+            log_guid,
+            flushed_file_offset: last_file_offset,
+            last_file_offset,
+        };
+
+        // Write an initial empty entry with sequence number 1.
+        writer
+            .write_entry(file, &[], &[])
+            .await
+            .map_err(OpenErrorInner::Io)?
+            .ok_or(CorruptionType::LogFull)?;
+        file.flush().await.map_err(OpenErrorInner::Io)?;
+        Ok(writer)
+    }
+
+    /// Returns the amount of free space remaining in the log.
+    pub fn free_space(&self) -> u32 {
+        self.region.free_space(self.tail, self.head)
+    }
+
+    /// Advance the log tail to `new_tail`, reclaiming the space before it.
+    ///
+    /// The caller must ensure that all entries in the range `[old_tail, new_tail)`
+    /// have been fully applied and their pages are durable at final file offsets.
+    pub fn advance_tail(&mut self, new_tail: u32) {
+        self.tail = new_tail;
+    }
+
+    /// Returns the current head offset within the log region.
+    pub fn head(&self) -> u32 {
+        self.head
+    }
+
+    /// Write a log entry containing the given data pages and zero ranges.
+    ///
+    /// Returns `Some(sequence_number)` on success, or `None` if the log
+    /// doesn't have enough free space (caller should drain and retry).
+    pub async fn write_entry<F: AsyncFile>(
+        &mut self,
+        file: &F,
+        data_pages: &[DataPage<'_>],
+        zero_ranges: &[ZeroRange],
+    ) -> Result<Option<u64>, std::io::Error> {
+        let data_count = data_pages.len() as u32;
+        let zero_count = zero_ranges.len() as u32;
+        let total_desc = data_count + zero_count;
+        let elen = entry_length(data_count, zero_count);
+
+        // We always leave room for at least one more sector to avoid completely
+        // filling the log.
+        let needed = elen + SECTOR;
+        if self.tail == self.head {
+            // Empty log — the full region is free.
+ if needed > self.region.length { + return Ok(None); + } + } else if needed > self.free_space() { + return Ok(None); + } + + self.sequence_number += 1; + let seq = self.sequence_number; + + // --- Allocate entry buffers --- + // The entry may wrap around the circular log boundary. Since head + // and elen are both sector-aligned, the split always falls on a + // sector boundary. Allocate one or two buffers accordingly. + let remaining = (self.region.length - self.head) as usize; + let wraps = (elen as usize) > remaining; + let mut buf1 = file.alloc_buffer(if wraps { remaining } else { elen as usize }); + let b1 = buf1.as_mut(); + let mut buf2 = if wraps { + Some(file.alloc_buffer(elen as usize - remaining)) + } else { + None + }; + let b2 = buf2.as_mut().map_or(&mut [][..], |b| b.as_mut()); + let split = remaining; // byte offset within the entry where the split occurs + + fn entry_slice<'a>( + b1: &'a mut [u8], + b2: &'a mut [u8], + split: usize, + offset: usize, + len: usize, + ) -> &'a mut [u8] { + if offset < split { + debug_assert!(offset + len <= split, "access straddles split boundary"); + &mut b1[offset..offset + len] + } else { + let off2 = offset - split; + &mut b2[off2..off2 + len] + } + } + + // --- Build the entry --- + + // Header (first 64 bytes of first sector). + let header = LogEntryHeader { + signature: LOG_ENTRY_HEADER_SIGNATURE, + checksum: 0, + entry_length: elen, + tail: self.tail, + sequence_number: seq, + descriptor_count: total_desc, + reserved: 0, + log_guid: self.log_guid, + flushed_file_offset: self.flushed_file_offset, + last_file_offset: self.last_file_offset, + }; + entry_slice(b1, b2, split, 0, HEADER_SIZE as usize).copy_from_slice(header.as_bytes()); + + // Descriptors. + let desc_area_len = descriptor_area_length(total_desc); + let mut desc_offset = HEADER_SIZE as usize; + let mut data_sector_offset = desc_area_len as usize; + + for dp in data_pages { + let leading = u64::from_le_bytes(dp.payload[0..8].try_into().unwrap()); + let trailing = u32::from_le_bytes( + dp.payload[SECTOR as usize - 4..SECTOR as usize] + .try_into() + .unwrap(), + ); + let desc = LogDataDescriptor { + signature: LOG_DESCRIPTOR_DATA_SIGNATURE, + trailing_bytes: trailing, + leading_bytes: leading, + file_offset: dp.file_offset, + sequence_number: seq, + }; + entry_slice(b1, b2, split, desc_offset, DESCRIPTOR_SIZE as usize) + .copy_from_slice(desc.as_bytes()); + desc_offset += DESCRIPTOR_SIZE as usize; + + let ds = build_data_sector(dp.payload, seq); + entry_slice(b1, b2, split, data_sector_offset, SECTOR as usize) + .copy_from_slice(ds.as_bytes()); + data_sector_offset += SECTOR as usize; + } + + for zr in zero_ranges { + let desc = LogZeroDescriptor { + signature: LOG_DESCRIPTOR_ZERO_SIGNATURE, + reserved: 0, + length: zr.length, + file_offset: zr.file_offset, + sequence_number: seq, + }; + entry_slice(b1, b2, split, desc_offset, DESCRIPTOR_SIZE as usize) + .copy_from_slice(desc.as_bytes()); + desc_offset += DESCRIPTOR_SIZE as usize; + } + + // Compute CRC-32C across both buffers (checksum field is already zero). + let mut crc = crc32c::crc32c(buf1.as_ref()); + if let Some(ref b2) = buf2 { + crc = crc32c::crc32c_append(crc, b2.as_ref()); + } + // Write checksum into the header (always in buf1, bytes 4..8). + buf1.as_mut()[4..8].copy_from_slice(&crc.to_le_bytes()); + + // --- Write to file --- + file.write_from(self.region.file_offset + self.head as u64, buf1) + .await?; + if let Some(buf2) = buf2 { + file.write_from(self.region.file_offset, buf2).await?; + } + + // Advance head. 
+ self.head = self.region.log_add(self.head, elen); + + Ok(Some(seq)) + } +} + +/// Build a `LogDataSector` from a page of original data and a sequence number. +fn build_data_sector(source: &[u8; SECTOR as usize], sequence_number: u64) -> LogDataSector { + let mut data = [0u8; 4084]; + data.copy_from_slice(&source[8..SECTOR as usize - 4]); + LogDataSector { + signature: LOG_DATA_SECTOR_SIGNATURE, + sequence_high: (sequence_number >> 32) as u32, + data, + sequence_low: sequence_number as u32, + } +} + +// =========================================================================== +// Tests +// =========================================================================== + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + + /// Helpers for tests. + const TEST_LOG_SIZE: u32 = 64 * SECTOR; // 256 KiB + const TEST_LOG_OFFSET: u64 = 1024 * 1024; // 1 MiB into the file + + fn test_region() -> LogRegion { + LogRegion { + file_offset: TEST_LOG_OFFSET, + length: TEST_LOG_SIZE, + } + } + + fn test_guid() -> Guid { + guid::guid!("12345678-1234-1234-1234-123456789abc") + } + + /// Create a file large enough for the log region and a target area. + fn test_file() -> InMemoryFile { + // 4 MiB file: enough for the log at 1 MiB and target writes at 192 KiB+. + InMemoryFile::new(4 * 1024 * 1024) + } + + // ----------------------------------------------------------------------- + // Circular buffer helper tests + // ----------------------------------------------------------------------- + + #[test] + fn log_add_no_wrap() { + let r = LogRegion { + file_offset: 0, + length: 1000, + }; + assert_eq!(r.log_add(100, 200), 300); + } + + #[test] + fn log_add_with_wrap() { + let r = LogRegion { + file_offset: 0, + length: 1000, + }; + assert_eq!(r.log_add(800, 300), 100); + } + + #[test] + fn sequence_length_head_gt_tail() { + let r = LogRegion { + file_offset: 0, + length: 1000, + }; + assert_eq!(r.sequence_length(100, 500), 400); + } + + #[test] + fn sequence_length_head_lt_tail() { + let r = LogRegion { + file_offset: 0, + length: 1000, + }; + assert_eq!(r.sequence_length(800, 200), 400); + } + + #[test] + fn sequence_length_head_eq_tail() { + let r = LogRegion { + file_offset: 0, + length: 1000, + }; + // Full log. 
+ assert_eq!(r.sequence_length(500, 500), 1000); + } + + #[test] + fn free_space_computation() { + let r = LogRegion { + file_offset: 0, + length: 1000, + }; + assert_eq!(r.free_space(100, 500), 600); + assert_eq!(r.free_space(800, 200), 600); + } + + #[test] + fn is_within_sequence_cases() { + let r = LogRegion { + file_offset: 0, + length: 1000, + }; + // Normal range [100, 500) + assert!(r.is_within_sequence(100, 500, 100)); // at tail + assert!(r.is_within_sequence(100, 500, 300)); // in middle + assert!(!r.is_within_sequence(100, 500, 500)); // at head (not within) + assert!(!r.is_within_sequence(100, 500, 50)); // before tail + + // Wrapped range [800, 200) + assert!(r.is_within_sequence(800, 200, 900)); // in first part + assert!(r.is_within_sequence(800, 200, 100)); // in second part + assert!(!r.is_within_sequence(800, 200, 500)); // outside + } + + // ----------------------------------------------------------------------- + // Log Writer tests + // ----------------------------------------------------------------------- + + #[async_test] + async fn writer_initialize_creates_empty_entry() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + assert_eq!(writer.sequence_number, 1); + assert_eq!(writer.tail, 0); + // The empty entry is 1 sector (header only, 0 descriptors). + assert_eq!(writer.head, SECTOR); + + // Read back and validate. + let mut buf = [0u8; SECTOR as usize]; + file.read_at(TEST_LOG_OFFSET, &mut buf).await.unwrap(); + let header = LogEntryHeader::read_from_bytes(&buf[..HEADER_SIZE as usize]).unwrap(); + assert_eq!(header.signature, LOG_ENTRY_HEADER_SIGNATURE); + assert_eq!(header.sequence_number, 1); + assert_eq!(header.descriptor_count, 0); + assert_eq!(header.entry_length, SECTOR); + assert_eq!(header.log_guid, guid); + + // Validate CRC. + let stored_crc = header.checksum; + let computed_crc = compute_checksum(&buf, 4); + assert_eq!(stored_crc, computed_crc); + } + + #[async_test] + async fn writer_one_data_page() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let mut page_data = [0u8; SECTOR as usize]; + for (i, b) in page_data.iter_mut().enumerate() { + *b = (i % 256) as u8; + } + let target_offset = LOGGABLE_OFFSET + 4096; + + let lsn = writer + .write_entry( + &file, + &[DataPage { + file_offset: target_offset, + payload: &page_data, + }], + &[], + ) + .await + .unwrap() + .unwrap(); + + assert_eq!(lsn, 2); + + // Read back the entry header and verify. 
+ let entry_start = TEST_LOG_OFFSET + SECTOR as u64; // after the init entry + let mut buf = [0u8; SECTOR as usize]; + file.read_at(entry_start, &mut buf).await.unwrap(); + let header = LogEntryHeader::read_from_bytes(&buf[..HEADER_SIZE as usize]).unwrap(); + assert_eq!(header.signature, LOG_ENTRY_HEADER_SIGNATURE); + assert_eq!(header.sequence_number, 2); + assert_eq!(header.descriptor_count, 1); + assert_eq!(header.entry_length, entry_length(1, 0)); + } + + #[async_test] + async fn writer_one_zero_range() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let target_offset = LOGGABLE_OFFSET; + let lsn = writer + .write_entry( + &file, + &[], + &[ZeroRange { + file_offset: target_offset, + length: 8192, + }], + ) + .await + .unwrap() + .unwrap(); + + assert_eq!(lsn, 2); + + // Read back and verify descriptor. + let entry_start = TEST_LOG_OFFSET + SECTOR as u64; + let mut buf = [0u8; SECTOR as usize]; + file.read_at(entry_start, &mut buf).await.unwrap(); + let header = LogEntryHeader::read_from_bytes(&buf[..HEADER_SIZE as usize]).unwrap(); + assert_eq!(header.descriptor_count, 1); + // Zero descriptor: entry is just 1 sector (header + descriptor fits) + assert_eq!(header.entry_length, entry_length(0, 1)); + } + + #[async_test] + async fn writer_multiple_entries_advance() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let page = [0xAAu8; SECTOR as usize]; + for i in 0..3 { + let lsn = writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + (i as u64) * 4096, + payload: &page, + }], + &[], + ) + .await + .unwrap() + .unwrap(); + assert_eq!(lsn, i as u64 + 2); + } + + // Head should have advanced past the initial empty entry + 3 data entries. + let one_data_entry_len = entry_length(1, 0); + let expected_head = SECTOR + 3 * one_data_entry_len; + assert_eq!(writer.head, expected_head); + assert_eq!(writer.sequence_number, 4); + } + + #[async_test] + async fn writer_wrap_around() { + let file = test_file(); + // Use a small log that will force wrap-around. + let small_log_size = 16 * SECTOR; // 64 KiB + let region = LogRegion { + file_offset: TEST_LOG_OFFSET, + length: small_log_size, + }; + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + // Each data entry = entry_length(1, 0) = 2 sectors. + // After init (1 sector), we have 15 sectors free minus 1 reserved = 14 usable. + // Each entry = 2 sectors, so we can fit 7 entries before needing to advance tail. + // But the writer doesn't advance tail on its own — we just write until full. + let page = [0xBBu8; SECTOR as usize]; + let entry_len = entry_length(1, 0); + + let mut entries_written = 0u32; + loop { + // Check if we have space for entry + 1 sector. + let needed = entry_len + SECTOR; + if writer.tail == writer.head { + // Empty — full space available. 
+ if needed > writer.region.length { + break; + } + } else if needed > writer.free_space() { + break; + } + + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + (entries_written as u64) * 4096, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + entries_written += 1; + } + + assert!(entries_written > 0); + // Verify that the head has wrapped around or is near the end. + // The exact value depends on the arithmetic, but the write should have succeeded. + } + + #[async_test] + async fn writer_log_full_error() { + let file = test_file(); + // Tiny log: 4 sectors. + let region = LogRegion { + file_offset: TEST_LOG_OFFSET, + length: 4 * SECTOR, + }; + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + // After init entry (1 sector), 3 sectors free. A data entry needs 2 sectors + 1 reserved = 3. Fits. + let page = [0xCCu8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + + // Now head - tail = 3 sectors used, 1 free. Next entry needs 2 + 1 = 3. Won't fit. + let result = writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + 4096, + payload: &page, + }], + &[], + ) + .await; + + // write_entry returns None when the log is full. + assert_eq!(result.unwrap(), None); + } + + // ----------------------------------------------------------------------- + // Log Replay tests + // ----------------------------------------------------------------------- + + #[async_test] + async fn replay_single_data_entry() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + // Write a single data entry. + let mut page_data = [0u8; SECTOR as usize]; + for (i, b) in page_data.iter_mut().enumerate() { + *b = ((i + 1) % 256) as u8; + } + let target_offset = LOGGABLE_OFFSET + 4096; + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + writer + .write_entry( + &file, + &[DataPage { + file_offset: target_offset, + payload: &page_data, + }], + &[], + ) + .await + .unwrap(); + + // Now replay. + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // Verify that the data was written to the target offset. + let mut read_buf = [0u8; SECTOR as usize]; + file.read_at(target_offset, &mut read_buf).await.unwrap(); + assert_eq!(read_buf, page_data); + } + + #[async_test] + async fn replay_data_and_zero_descriptors() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + // Write some non-zero data to the zero target first. + let zero_target = LOGGABLE_OFFSET + 8192; + let garbage = [0xFFu8; 8192]; + file.write_at(zero_target, &garbage).await.unwrap(); + + let mut page_data = [0x42u8; SECTOR as usize]; + page_data[0] = 0xDE; + page_data[1] = 0xAD; + let data_target = LOGGABLE_OFFSET + 4096; + + writer + .write_entry( + &file, + &[DataPage { + file_offset: data_target, + payload: &page_data, + }], + &[ZeroRange { + file_offset: zero_target, + length: 8192, + }], + ) + .await + .unwrap(); + + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // Verify data was applied. 
+ let mut read_buf = [0u8; SECTOR as usize]; + file.read_at(data_target, &mut read_buf).await.unwrap(); + assert_eq!(read_buf, page_data); + + // Verify zeros were applied. + let mut zero_buf = vec![0u8; 8192]; + file.read_at(zero_target, &mut zero_buf).await.unwrap(); + assert!(zero_buf.iter().all(|&b| b == 0)); + } + + #[async_test] + async fn replay_multiple_sequential_entries() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let mut pages = Vec::new(); + for n in 0..3 { + let mut page = [0u8; SECTOR as usize]; + page.fill(n as u8 + 1); + pages.push(page); + } + + for (n, page) in pages.iter().enumerate() { + let offset = LOGGABLE_OFFSET + (n as u64) * 4096; + writer + .write_entry( + &file, + &[DataPage { + file_offset: offset, + payload: page, + }], + &[], + ) + .await + .unwrap(); + } + + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + for (n, page) in pages.iter().enumerate() { + let offset = LOGGABLE_OFFSET + (n as u64) * 4096; + let mut buf = [0u8; SECTOR as usize]; + file.read_at(offset, &mut buf).await.unwrap(); + assert_eq!(buf, *page, "mismatch at entry {n}"); + } + } + + #[async_test] + async fn replay_invalid_crc_skipped() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + // Write a self-contained entry: start the writer at a non-zero position + // so the entry's tail == its own start. The first (corrupt) entry + // comes before it and is skipped. + // + // Strategy: write two independent sequences. The first is the init + // entry whose CRC we corrupt. The second is an independent entry + // that references itself as the tail. + let mut writer = LogWriter { + region: region.clone(), + tail: 4 * SECTOR, + head: 4 * SECTOR, + sequence_number: 10, + log_guid: guid, + flushed_file_offset: 4 * 1024 * 1024, + last_file_offset: 4 * 1024 * 1024, + }; + + let page = [0xAAu8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + + // Also write a garbage entry at offset 0 that looks like a valid header + // but has a corrupt CRC. + let _bad_writer = LogWriter { + region: region.clone(), + tail: 0, + head: 0, + sequence_number: 0, + log_guid: guid, + flushed_file_offset: 4 * 1024 * 1024, + last_file_offset: 4 * 1024 * 1024, + }; + // We just need a valid-looking header at offset 0 with bad CRC. + // Write an init-like entry, then corrupt its CRC. + let header = LogEntryHeader { + signature: LOG_ENTRY_HEADER_SIGNATURE, + checksum: 0xDEADBEEF, // intentionally wrong + entry_length: SECTOR, + tail: 0, + sequence_number: 1, + descriptor_count: 0, + reserved: 0, + log_guid: guid, + flushed_file_offset: 4 * 1024 * 1024, + last_file_offset: 4 * 1024 * 1024, + }; + let mut buf = [0u8; SECTOR as usize]; + buf[..HEADER_SIZE as usize].copy_from_slice(header.as_bytes()); + file.write_at(TEST_LOG_OFFSET, &buf).await.unwrap(); + + // Replay should skip the bad entry at offset 0 and find the good entry + // at offset 4*SECTOR. + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // The data should have been applied. 
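Aside: the `replay_*_skipped` tests in this part of the file all hinge on the same acceptance checks for a scanned entry. As a hedged summary (reusing `LogEntryHeader`, `HEADER_SIZE`, `LOG_ENTRY_HEADER_SIGNATURE`, and `compute_checksum` exactly as the tests use them, and assuming `entry` spans the full entry), the predicate looks roughly like:

```rust
// Illustrative only; a compact restatement of what the skip tests rely on,
// not the crate's actual scanner code.
fn entry_acceptable(entry: &[u8], expected_log_guid: guid::Guid) -> bool {
    let Ok(header) = LogEntryHeader::read_from_bytes(&entry[..HEADER_SIZE as usize]) else {
        return false;
    };
    // Bad signature (e.g. the "XXXX" header below): skip.
    header.signature == LOG_ENTRY_HEADER_SIGNATURE
        // Entry written for a different log GUID: skip.
        && header.log_guid == expected_log_guid
        // CRC mismatch (corrupt or torn entry): skip. As in the test above,
        // the checksum is computed with the 4-byte checksum field excluded.
        && header.checksum == compute_checksum(entry, 4)
}
```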
+ let mut read_buf = [0u8; SECTOR as usize]; + file.read_at(LOGGABLE_OFFSET, &mut read_buf).await.unwrap(); + assert_eq!(read_buf, page); + } + + #[async_test] + async fn replay_wrong_guid_skipped() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + let wrong_guid = guid::guid!("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let page = [0x55u8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + + // Try to replay with the wrong GUID — should find nothing. + let result = replay_log(&file, ®ion, wrong_guid).await; + assert!(result.is_err()); + match result.unwrap_err() { + OpenError(OpenErrorInner::Corrupt(CorruptionType::NoValidLogEntries)) => {} + other => panic!("expected NoValidLogEntries, got {:?}", other), + } + } + + #[async_test] + async fn replay_bad_signature_skipped() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + // Place a valid self-contained entry at sector 4. + let mut writer = LogWriter { + region: region.clone(), + tail: 4 * SECTOR, + head: 4 * SECTOR, + sequence_number: 20, + log_guid: guid, + flushed_file_offset: 4 * 1024 * 1024, + last_file_offset: 4 * 1024 * 1024, + }; + + let page = [0x33u8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + + // Write a bad-signature entry at offset 0 so the scanner has something + // to skip. + let mut bad_header_buf = [0u8; SECTOR as usize]; + bad_header_buf[0..4].copy_from_slice(b"XXXX"); + file.write_at(TEST_LOG_OFFSET, &bad_header_buf) + .await + .unwrap(); + + // The entry at sector 4 should still be found. + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + } + + #[async_test] + async fn replay_empty_log_errors() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let result = replay_log(&file, ®ion, guid).await; + assert!(result.is_err()); + match result.unwrap_err() { + OpenError(OpenErrorInner::Corrupt(CorruptionType::NoValidLogEntries)) => {} + other => panic!("expected NoValidLogEntries, got {:?}", other), + } + } + + #[async_test] + async fn replay_torn_write_last_entry() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + // Write two entries. + let page1 = [0x11u8; SECTOR as usize]; + let page2 = [0x22u8; SECTOR as usize]; + + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page1, + }], + &[], + ) + .await + .unwrap(); + + let entry3_start = writer.head; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + 4096, + payload: &page2, + }], + &[], + ) + .await + .unwrap(); + + // Simulate torn write: zero out part of the last entry (its second sector). + let torn_offset = TEST_LOG_OFFSET + region.log_add(entry3_start, SECTOR) as u64; + let zeros = [0u8; SECTOR as usize]; + file.write_at(torn_offset, &zeros).await.unwrap(); + + // Replay should apply only entries 1 and 2 (the init entry + first data entry). + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // page1 should be applied. 
+        let mut buf = [0u8; SECTOR as usize];
+        file.read_at(LOGGABLE_OFFSET, &mut buf).await.unwrap();
+        assert_eq!(buf, page1);
+
+        // page2 should NOT be applied (it was in the torn entry).
+        // The file might have whatever garbage was at that location.
+    }
+
+    #[async_test]
+    async fn writer_then_replay_roundtrip() {
+        let file = test_file();
+        let region = test_region();
+        let guid = test_guid();
+
+        let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024)
+            .await
+            .unwrap();
+
+        // Write several entries.
+        let mut pages = Vec::new();
+        for n in 0..5 {
+            let mut page = [0u8; SECTOR as usize];
+            for (i, b) in page.iter_mut().enumerate() {
+                *b = ((n * 37 + i) % 256) as u8;
+            }
+            pages.push((LOGGABLE_OFFSET + (n as u64) * 4096, page));
+        }
+
+        for (offset, page) in &pages {
+            writer
+                .write_entry(
+                    &file,
+                    &[DataPage {
+                        file_offset: *offset,
+                        payload: page,
+                    }],
+                    &[],
+                )
+                .await
+                .unwrap();
+        }
+
+        // Replay.
+        let result = replay_log(&file, &region, guid).await.unwrap();
+        assert!(result.replayed);
+
+        // Verify all pages.
+        for (offset, expected) in &pages {
+            let mut buf = [0u8; SECTOR as usize];
+            file.read_at(*offset, &mut buf).await.unwrap();
+            assert_eq!(&buf, expected, "mismatch at offset {offset:#x}");
+        }
+    }
+
+    #[async_test]
+    async fn replay_entry_wrapping_circular_buffer() {
+        // Use a log small enough that an entry wraps around.
+        let file = test_file();
+        let small_log = 8 * SECTOR;
+        let region = LogRegion {
+            file_offset: TEST_LOG_OFFSET,
+            length: small_log,
+        };
+        let guid = test_guid();
+
+        let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024)
+            .await
+            .unwrap();
+
+        // Fill the log partially to get head near the end.
+        // Init entry = 1 sector, data entry = 2 sectors.
+        // After init: head=1. Write 2 data entries: head=1+2+2=5.
+        let page_a = [0xAAu8; SECTOR as usize];
+        let page_b = [0xBBu8; SECTOR as usize];
+        writer
+            .write_entry(
+                &file,
+                &[DataPage {
+                    file_offset: LOGGABLE_OFFSET,
+                    payload: &page_a,
+                }],
+                &[],
+            )
+            .await
+            .unwrap();
+        writer
+            .write_entry(
+                &file,
+                &[DataPage {
+                    file_offset: LOGGABLE_OFFSET + 4096,
+                    payload: &page_b,
+                }],
+                &[],
+            )
+            .await
+            .unwrap();
+
+        // head is now at 5 sectors. Advance the tail past the init entry and
+        // the first data entry (to sector 3) so there is room to keep writing;
+        // the test simulates tail advancement by setting the field directly.
+        writer.tail = 3 * SECTOR;
+        // Free space = 8S - sequence_length(3S, 5S) = 8S - 2S = 6S. The next
+        // entry (2S) plus the reserved sector (1S) needs 3S, so it fits.
+
+        let page_wrap = [0xCCu8; SECTOR as usize];
+        writer
+            .write_entry(
+                &file,
+                &[DataPage {
+                    file_offset: LOGGABLE_OFFSET + 8192,
+                    payload: &page_wrap,
+                }],
+                &[],
+            )
+            .await
+            .unwrap();
+
+        // head is now at 7 sectors. Free = 8S - seq_len(3S, 7S) = 8S - 4S = 4S.
Write one more. + let page_wrap2 = [0xDDu8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + 12288, + payload: &page_wrap2, + }], + &[], + ) + .await + .unwrap(); + + // head should now be at (7+2) % 8 = 1 sector. This entry wrapped! + assert_eq!(writer.head, SECTOR); + + // Replay: the scanner needs to find the sequence starting at tail=3S up to head=1S. + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // Verify the wrapped entry was applied. + let mut buf = [0u8; SECTOR as usize]; + file.read_at(LOGGABLE_OFFSET + 8192, &mut buf) + .await + .unwrap(); + assert_eq!(buf, page_wrap); + file.read_at(LOGGABLE_OFFSET + 12288, &mut buf) + .await + .unwrap(); + assert_eq!(buf, page_wrap2); + } + + #[async_test] + async fn replay_highest_lsn_sequence_chosen() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + // Write a sequence with lower LSNs. + let mut writer1 = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let page_old = [0x11u8; SECTOR as usize]; + writer1 + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page_old, + }], + &[], + ) + .await + .unwrap(); + + // Now write a second sequence with higher LSNs starting at a different + // position in the log. We'll manually create a writer at a different head. + let new_head = writer1.head + 4 * SECTOR; // skip a gap + let mut writer2 = LogWriter { + region: region.clone(), + tail: new_head, + head: new_head, + sequence_number: 100, // much higher + log_guid: guid, + flushed_file_offset: 4 * 1024 * 1024, + last_file_offset: 4 * 1024 * 1024, + }; + + let page_new = [0x99u8; SECTOR as usize]; + writer2 + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page_new, + }], + &[], + ) + .await + .unwrap(); + + // Replay should pick the sequence with LSN 101 over LSN 1-2. + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // The data at LOGGABLE_OFFSET should be from the newer sequence. + let mut buf = [0u8; SECTOR as usize]; + file.read_at(LOGGABLE_OFFSET, &mut buf).await.unwrap(); + assert_eq!(buf, page_new); + } + + #[async_test] + async fn roundtrip_crash_replay() { + // Write entries, "crash" (no cleanup), then replay. + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let page = [0x77u8; SECTOR as usize]; + let target = LOGGABLE_OFFSET + 4096; + writer + .write_entry( + &file, + &[DataPage { + file_offset: target, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + + // Simulating crash — no cleanup. + let _ = writer; + + // Replay should recover the data. + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + let mut buf = [0u8; SECTOR as usize]; + file.read_at(target, &mut buf).await.unwrap(); + assert_eq!(buf, page); + } + + #[async_test] + async fn file_extension_on_replay() { + // Start with a small file, write entries referencing a large last_file_offset. 
+        let file = InMemoryFile::new(2 * 1024 * 1024); // 2 MiB
+        let region = test_region();
+        let guid = test_guid();
+
+        let desired_size = 4 * 1024 * 1024u64;
+        let mut writer = LogWriter::initialize(&file, region.clone(), guid, desired_size)
+            .await
+            .unwrap();
+
+        let page = [0xABu8; SECTOR as usize];
+        writer
+            .write_entry(
+                &file,
+                &[DataPage {
+                    file_offset: LOGGABLE_OFFSET,
+                    payload: &page,
+                }],
+                &[],
+            )
+            .await
+            .unwrap();
+
+        // The file might still be 2 MiB.
+        let result = replay_log(&file, &region, guid).await.unwrap();
+        assert!(result.replayed);
+
+        // After replay, the file should be at least desired_size.
+        let sz = file.file_size().await.unwrap();
+        assert!(sz >= desired_size, "expected >= {desired_size}, got {sz}");
+    }
+
+    // -----------------------------------------------------------------------
+    // Tail advancement tests
+    // -----------------------------------------------------------------------
+
+    /// advance_tail reclaims space visible to free_space().
+    #[async_test]
+    async fn advance_tail_reclaims_free_space() {
+        let file = test_file();
+        let region = test_region();
+        let guid = test_guid();
+
+        let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024)
+            .await
+            .unwrap();
+
+        let initial_free = writer.free_space();
+        let page = [0xAAu8; SECTOR as usize];
+
+        // Write an entry — free space decreases.
+        writer
+            .write_entry(
+                &file,
+                &[DataPage {
+                    file_offset: LOGGABLE_OFFSET,
+                    payload: &page,
+                }],
+                &[],
+            )
+            .await
+            .unwrap();
+
+        let after_write = writer.free_space();
+        assert!(after_write < initial_free, "writing should consume space");
+
+        // Advance tail to head — reclaims all space.
+        writer.advance_tail(writer.head);
+
+        // When tail == head, free_space() reports 0 (sequence_length treats
+        // that case as a full log), even though everything was just reclaimed.
+        // write_entry() special-cases tail == head and uses the full region,
+        // so instead of asserting on free_space() here, verify that another
+        // entry can be written.
+        writer
+            .write_entry(
+                &file,
+                &[DataPage {
+                    file_offset: LOGGABLE_OFFSET + 4096,
+                    payload: &page,
+                }],
+                &[],
+            )
+            .await
+            .unwrap();
+
+        // After advancing tail and writing one more, free space should be
+        // close to what it was after just the init entry + one data entry.
+        assert!(
+            writer.free_space() >= after_write,
+            "after advancing tail and writing, free space should be >= previous"
+        );
+    }
+
+    /// Write entries until the log is full, advance tail, write more.
+    ///
+    /// This is the core scenario: without advance_tail, the log fills up
+    /// and write_entry starts returning `None`. With it, space is reclaimed.
+    #[async_test]
+    async fn write_advance_write_more() {
+        let file = test_file();
+        // Use a small log (16 sectors = 64 KiB) to hit the limit quickly.
+        let small_log = 16 * SECTOR;
+        let region = LogRegion {
+            file_offset: TEST_LOG_OFFSET,
+            length: small_log,
+        };
+        let guid = test_guid();
+
+        let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024)
+            .await
+            .unwrap();
+
+        let page = [0xBBu8; SECTOR as usize];
+        let elen = entry_length(1, 0); // 2 sectors per entry
+
+        // Fill the log until we can't write anymore.
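Aside: the fill loop that follows (and the similar loops in `writer_wrap_around` and `incremental_advance_exceeds_log_capacity`) all repeat the same "does the next entry fit?" check. Factored out as a sketch, with a hypothetical helper name and the writer fields used exactly as these tests use them:

```rust
// Illustrative helper only; the tests below inline this logic.
fn can_append(writer: &LogWriter, entry_len: u32) -> bool {
    // The entry itself plus one sector that always stays reserved, so the
    // head can never catch up to the tail from behind.
    let needed = entry_len + SECTOR;
    if writer.tail == writer.head {
        // Everything has been reclaimed; free_space() reports 0 by
        // convention, but write_entry treats the whole region as usable.
        needed <= writer.region.length
    } else {
        needed <= writer.free_space()
    }
}
```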
+ let mut entries_written = 0u32; + loop { + let needed = elen + SECTOR; // entry + 1 reserved + if writer.tail == writer.head { + if needed > writer.region.length { + break; + } + } else if needed > writer.free_space() { + break; + } + + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + (entries_written as u64) * 4096, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + entries_written += 1; + } + + assert!( + entries_written > 0, + "should have written at least one entry" + ); + + // Confirm the log is now full. + let result = writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page, + }], + &[], + ) + .await; + assert!(matches!(result, Ok(None)), "log should be full"); + + // Advance tail past all entries — reclaim everything. + writer.advance_tail(writer.head); + + // Now we should be able to write again. + let mut more_written = 0u32; + loop { + let needed = elen + SECTOR; + if writer.tail == writer.head { + if needed > writer.region.length { + break; + } + } else if needed > writer.free_space() { + break; + } + + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + + ((entries_written + more_written) as u64) * 4096, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + more_written += 1; + } + + assert!( + more_written > 0, + "should write more entries after advancing tail" + ); + } + + /// Incremental tail advancement: advance after each entry, write many + /// more entries than the log can hold without reclamation. + #[async_test] + async fn incremental_advance_exceeds_log_capacity() { + let file = test_file(); + // Tiny log: 8 sectors = 32 KiB. + let small_log = 8 * SECTOR; + let region = LogRegion { + file_offset: TEST_LOG_OFFSET, + length: small_log, + }; + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let page = [0xCCu8; SECTOR as usize]; + + // The log has 8 sectors. Init takes 1. Each data entry takes 2. + // Without advancement, we can fit ~3 entries before full. + // With incremental advancement, we can write indefinitely. + // Write 50 entries — well beyond the log's raw capacity. + for i in 0..50u32 { + let head_before = writer.head; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + (i as u64) * 4096, + payload: &page, + }], + &[], + ) + .await + .unwrap_or_else(|e| panic!("entry {i} failed: {e}")); + // Advance tail to where head was before this entry. + // This simulates "apply completed for the previous entry." + writer.advance_tail(head_before); + } + } + + /// Replay after tail advancement: entries before the advanced tail are + /// not part of the valid sequence, so replay only applies entries from + /// the new tail onward. + #[async_test] + async fn replay_after_tail_advance() { + let file = test_file(); + let small_log = 16 * SECTOR; + let region = LogRegion { + file_offset: TEST_LOG_OFFSET, + length: small_log, + }; + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + // Write entry A at LOGGABLE_OFFSET. + let page_a = [0xAAu8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page_a, + }], + &[], + ) + .await + .unwrap(); + + let head_after_a = writer.head; + + // Write entry B at LOGGABLE_OFFSET + 4096. 
+ let page_b = [0xBBu8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + 4096, + payload: &page_b, + }], + &[], + ) + .await + .unwrap(); + + // Advance tail past the init entry and entry A. + // The next write_entry will embed this new tail in its header. + writer.advance_tail(head_after_a); + + // Write entry C to embed the new tail. + let page_c = [0xCCu8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + 8192, + payload: &page_c, + }], + &[], + ) + .await + .unwrap(); + + // Zero out the target areas to prove replay writes them. + let zeros = [0u8; SECTOR as usize]; + file.write_at(LOGGABLE_OFFSET, &zeros).await.unwrap(); + file.write_at(LOGGABLE_OFFSET + 4096, &zeros).await.unwrap(); + file.write_at(LOGGABLE_OFFSET + 8192, &zeros).await.unwrap(); + + // Replay. The scanner should find the sequence starting at the + // new tail (head_after_a), which includes entries B and C. + // Entry A is before tail — it may or may not be replayed depending + // on scanner behavior (it's idempotent either way). + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // Entries B and C must be replayed. + let mut buf = [0u8; SECTOR as usize]; + file.read_at(LOGGABLE_OFFSET + 4096, &mut buf) + .await + .unwrap(); + assert_eq!(buf, page_b, "entry B should be replayed"); + file.read_at(LOGGABLE_OFFSET + 8192, &mut buf) + .await + .unwrap(); + assert_eq!(buf, page_c, "entry C should be replayed"); + } + + /// Wrap-around with incremental tail advancement: write enough entries + /// with per-entry advancement to force both head and tail past the + /// circular boundary. + #[async_test] + async fn wrap_around_with_incremental_advance() { + let file = test_file(); + // 8-sector log. Each data entry = 2 sectors. After init (1 sector), + // without advancement we'd fit ~3 entries. With advancement we wrap. + let small_log = 8 * SECTOR; + let region = LogRegion { + file_offset: TEST_LOG_OFFSET, + length: small_log, + }; + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let page = [0xDDu8; SECTOR as usize]; + + // Write 20 entries, advancing tail before each write to keep + // only the last entry valid. This forces both head and tail + // to wrap multiple times. + let mut last_head = writer.head; + for i in 0..20u32 { + writer.advance_tail(last_head); + last_head = writer.head; + + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + (i as u64) * 4096, + payload: &page, + }], + &[], + ) + .await + .unwrap_or_else(|e| panic!("entry {i} failed during wrap-around: {e}")); + } + + // Head and tail should both have wrapped past the log boundary. + // With 20 entries of 2 sectors each in an 8-sector log, we've + // gone around 5+ times. + // Verify replay works with the final state. + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // The last entry wrote to LOGGABLE_OFFSET + 19*4096. 
+ let mut buf = [0u8; SECTOR as usize]; + file.read_at(LOGGABLE_OFFSET + 19 * 4096, &mut buf) + .await + .unwrap(); + assert_eq!(buf, page, "last entry should be replayed correctly"); + } +} diff --git a/vm/devices/storage/vhdx/src/log_permits.rs b/vm/devices/storage/vhdx/src/log_permits.rs new file mode 100644 index 0000000000..4bb1c1465d --- /dev/null +++ b/vm/devices/storage/vhdx/src/log_permits.rs @@ -0,0 +1,211 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Failable semaphore for log pipeline backpressure. +//! +//! [`LogPermits`] limits how many pages can be in-flight in the +//! cache → log → apply pipeline at once. This bounds memory +//! consumption: each in-flight page holds an `Arc<[u8; 4096]>` +//! that cannot be freed until the apply task writes it to its +//! final file offset. +//! +//! **Lifecycle of a permit:** +//! 1. Cache acquires a permit before transitioning a page to +//! `HasPermit` / `Dirty`. +//! 2. The permit stays consumed through commit → log → apply. +//! 3. The apply task releases the permit after writing the page +//! to its final offset and flushing. +//! +//! If the log task fails, the semaphore is **poisoned** — all +//! pending and future acquires return an error. + +use crate::error::PipelineFailed; +use event_listener::Event; +use parking_lot::Mutex; + +/// Failable semaphore shared between the cache and the apply task. +/// +/// The cache acquires permits before dirtying pages. The **apply task** +/// releases permits after writing pages to their final file offsets. +/// Do NOT release permits at commit time — that defeats backpressure +/// and allows unbounded in-flight allocations. +/// +/// If the log task fails, it poisons the semaphore — all waiters and +/// future callers get errors. +pub(crate) struct LogPermits { + state: Mutex, + event: Event, + max_permits: usize, +} + +struct PermitState { + available: usize, + failed: Option, +} + +impl LogPermits { + /// Create a new semaphore with `max_in_flight` permits. + pub fn new(max_in_flight: usize) -> Self { + Self { + state: Mutex::new(PermitState { + available: max_in_flight, + failed: None, + }), + event: Event::new(), + max_permits: max_in_flight, + } + } + + /// Acquire `count` permits. + /// + /// Blocks if insufficient permits are available. Returns an error + /// if the semaphore has been poisoned. + pub async fn acquire(&self, count: usize) -> Result<(), PipelineFailed> { + loop { + let listener = self.event.listen(); + { + let mut state = self.state.lock(); + if let Some(ref err) = state.failed { + return Err(PipelineFailed(err.clone())); + } + if state.available >= count { + state.available -= count; + return Ok(()); + } + } + listener.await; + } + } + + /// Release `count` permits back to the pool. + /// + /// Called by the apply task after writing pages to their final offsets. + pub fn release(&self, count: usize) { + { + let mut state = self.state.lock(); + state.available += count; + assert!( + state.available <= self.max_permits, + "released more permits than were acquired: available {} > max {}", + state.available, + self.max_permits, + ); + } + self.event.notify(usize::MAX); + } + + /// Poison the semaphore. All pending and future acquires will fail. + /// + /// Called by the log task on error. + pub fn fail(&self, error: String) { + { + let mut state = self.state.lock(); + state.failed = Some(error); + } + self.event.notify(usize::MAX); + } + + /// Returns the number of currently available permits. 
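Aside: a minimal sketch of the permit lifecycle described in the module docs above. The three pipeline stages are collapsed into one function purely for illustration; in the real pipeline the cache, log task, and apply task share the semaphore via `Arc` and run on separate tasks.

```rust
// Illustrative only; not part of the module above.
async fn one_batch(permits: &LogPermits, pages: usize) -> Result<(), PipelineFailed> {
    // 1. Cache: reserve capacity before transitioning pages to Dirty.
    permits.acquire(pages).await?;

    // 2. The permits stay consumed through commit, log, and apply, which is
    //    what bounds the number of in-flight Arc'd pages.

    // 3. Apply task: release only after the pages reach their final file
    //    offsets (never at commit time, or backpressure is lost).
    permits.release(pages);
    Ok(())
}

// On a fatal error the log task poisons the semaphore instead:
//     permits.fail("log write failed".into());
// after which every pending and future acquire() returns an error.
```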
+ #[cfg(test)] + pub fn available(&self) -> usize { + self.state.lock().available + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pal_async::async_test; + + #[async_test] + async fn acquire_and_release() { + let permits = LogPermits::new(10); + permits.acquire(3).await.unwrap(); + assert_eq!(permits.available(), 7); + permits.release(3); + assert_eq!(permits.available(), 10); + } + + #[async_test] + async fn acquire_exact_capacity() { + let permits = LogPermits::new(5); + permits.acquire(5).await.unwrap(); + assert_eq!(permits.available(), 0); + permits.release(5); + assert_eq!(permits.available(), 5); + } + + #[async_test] + async fn acquire_blocks_then_unblocks() { + let permits = std::sync::Arc::new(LogPermits::new(2)); + permits.acquire(2).await.unwrap(); + assert_eq!(permits.available(), 0); + + let p = permits.clone(); + let (acquired_tx, acquired_rx) = mesh::oneshot(); + let handle = std::thread::spawn(move || { + futures::executor::block_on(async { + p.acquire(1).await.unwrap(); + acquired_tx.send(()); + }); + }); + + // Give the thread time to block on acquire. + std::thread::sleep(std::time::Duration::from_millis(50)); + // Should still be blocked (0 available). + assert_eq!(permits.available(), 0); + + // Release one permit — unblocks the waiter. + permits.release(1); + acquired_rx.await.unwrap(); + handle.join().unwrap(); + } + + #[async_test] + async fn poison_fails_pending_acquire() { + let permits = std::sync::Arc::new(LogPermits::new(0)); + + let p = permits.clone(); + let handle = std::thread::spawn(move || { + futures::executor::block_on(async { + let result = p.acquire(1).await; + assert!(result.is_err()); + }); + }); + + // Give the thread time to block. + std::thread::sleep(std::time::Duration::from_millis(50)); + permits.fail("log write failed".into()); + handle.join().unwrap(); + } + + #[async_test] + async fn poison_fails_future_acquire() { + let permits = LogPermits::new(10); + permits.fail("log write failed".into()); + let result = permits.acquire(1).await; + assert!(result.is_err()); + } + + #[async_test] + async fn release_after_poison_is_harmless() { + let permits = LogPermits::new(5); + permits.acquire(3).await.unwrap(); + permits.fail("oops".into()); + // Release after poison doesn't panic. + permits.release(3); + // But acquire still fails. + assert!(permits.acquire(1).await.is_err()); + } + + #[async_test] + async fn multiple_acquires_serialize() { + let permits = LogPermits::new(3); + permits.acquire(2).await.unwrap(); + permits.acquire(1).await.unwrap(); + assert_eq!(permits.available(), 0); + permits.release(1); + permits.acquire(1).await.unwrap(); + assert_eq!(permits.available(), 0); + } +} diff --git a/vm/devices/storage/vhdx/src/log_task.rs b/vm/devices/storage/vhdx/src/log_task.rs new file mode 100644 index 0000000000..8e109dcb6f --- /dev/null +++ b/vm/devices/storage/vhdx/src/log_task.rs @@ -0,0 +1,727 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Log task — a single async task that owns all log state and provides +//! crash-consistent metadata persistence. +//! +//! The log task receives [`LogRequest`] messages via a `mesh` channel. +//! [`LogRequest::Commit`] is fire-and-forget: the cache sends a batch +//! of dirty pages and moves on. The log task writes WAL entries, +//! releases permits, and publishes `logged_through_lsn`. +//! +//! After logging a batch, the log task sends it to the +//! [apply task](crate::apply_task) for writing to final file offsets. +//! 
The apply task publishes `applied_through_lsn`, which the log task +//! reads to advance its tail. +//! +//! # Crash Consistency +//! +//! Metadata changes (BAT entries, sector bitmap bits) are journaled before +//! being committed to their final locations. On crash, +//! [`replay_log()`](crate::log::replay_log) restores them. + +use crate::AsyncFile; +use crate::apply_task::ApplyBatch; +use crate::error::PipelineFailed; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::flush::FlushSequencer; +use crate::flush::Fsn; +use crate::format::LOG_SECTOR_SIZE; +use crate::log::DataPage; +use crate::log::LogWriter; +use crate::log_permits::LogPermits; +use crate::lsn_watermark::LsnWatermark; +use crate::open::FailureFlag; +use mesh::rpc::Rpc; +use std::collections::VecDeque; +use std::sync::Arc; +use thiserror::Error; + +const LOG_DATA_PAGE_SIZE: usize = LOG_SECTOR_SIZE as usize; + +/// Internal error type for the log task. +/// +/// Captures the three failure modes of the log pipeline without +/// pulling in the public [`VhdxIoError`]. Converted to [`VhdxIoError`] +/// only at the [`LogClient::close`] boundary. +#[derive(Debug, Error)] +pub(crate) enum LogTaskError { + /// An I/O error from WAL writes or flushes. + #[error("flush error")] + Flush(#[source] std::io::Error), + /// The apply task (or another pipeline stage) has failed. + #[error("pipeline failed")] + PipelineFailed(#[source] PipelineFailed), + #[error("failed to write log entry")] + Write(#[source] std::io::Error), + #[error("log transaction too big ({0} pages)")] + TransactionTooBig(usize), +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub(crate) struct Lsn(u64); + +impl Lsn { + pub const ZERO: Lsn = Lsn(0); + + #[cfg(test)] + pub(crate) const fn new(value: u64) -> Self { + Self(value) + } +} + +/// A request to the log task. +pub(crate) enum LogRequest { + /// Log a batch of dirty pages (fire-and-forget). + Commit(Transaction), + + /// Graceful shutdown: log all pending, wait for apply, clear log GUID. + Close(Rpc<(), Result<(), LogTaskError>>), +} + +/// Committed data at a log-data-page-aligned file offset. +pub(crate) struct LogData { + /// File offset where this data should ultimately be written. + file_offset: u64, + /// Log-data-page-aligned data (shared with the cache via Arc COW). + data: Arc, +} + +impl> LogData { + pub(crate) fn new(file_offset: u64, data: Arc) -> Self { + let len = data.as_ref().as_ref().len(); + assert_ne!(len, 0); + assert!( + file_offset.is_multiple_of(LOG_DATA_PAGE_SIZE as u64), + "committed data offset {file_offset:#x} is not {LOG_DATA_PAGE_SIZE}-byte aligned", + ); + assert!( + len.is_multiple_of(LOG_DATA_PAGE_SIZE), + "committed data length {len} is not {LOG_DATA_PAGE_SIZE}-byte aligned" + ); + Self { file_offset, data } + } + + pub(crate) fn page_count(&self) -> usize { + self.data.as_ref().as_ref().len() / LOG_DATA_PAGE_SIZE + } + + #[cfg(test)] + pub(crate) fn data(&self) -> &[u8] { + self.data.as_ref().as_ref() + } + + pub(crate) fn into_parts(self) -> (u64, Arc) { + (self.file_offset, self.data) + } +} + +/// A batch of dirty pages to be logged atomically. +pub(crate) struct Transaction { + /// The LSN assigned by the cache at commit time. + pub lsn: Lsn, + /// The data in this batch. + pub data: Vec>, + /// If set, the log task must wait for this FSN to complete before + /// writing the WAL entry. + pub pre_log_fsn: Option, +} + +/// Client-side handle for sending transactions to the log task. 
+/// +/// Couples the `Sender` with the LSN counter so that +/// LSN assignment and channel send are always atomic. All methods +/// take `&mut self` — the caller (cache's `PageMap` lock) provides +/// exclusivity. +pub(crate) struct LogClient { + sender: mesh::Sender>, + current_lsn: Lsn, +} + +impl LogClient { + /// Create a new log client wrapping the given sender. + pub fn new(sender: mesh::Sender>) -> Self { + Self { + sender, + current_lsn: Lsn::ZERO, + } + } + + /// Returns the most recently committed LSN (0 if none). + pub fn current_lsn(&self) -> Lsn { + self.current_lsn + } + + /// Begin a new transaction. The returned [`LogTransaction`] borrows + /// `self` mutably, preventing interleaved transactions. + /// + /// The LSN is not assigned until [`LogTransaction::commit()`] is + /// called. Dropping the transaction without committing is a no-op. + pub fn begin(&mut self) -> LogTransaction<'_, B> { + LogTransaction { client: self } + } + + /// Send a graceful close request to the log task and wait for + /// it to finish processing all pending batches. + /// + /// Consumes the client (drops the sender after the RPC completes). + pub async fn close(self) -> Result<(), VhdxIoError> { + use mesh::rpc::RpcSend; + self.sender + .call(LogRequest::Close, ()) + .await + .map_err(|_| VhdxIoErrorInner::Failed(PipelineFailed("log task closed".into())))? + .map_err(VhdxIoErrorInner::LogClose)?; + Ok(()) + } +} + +/// An in-progress log transaction. Borrows the [`LogClient`] mutably +/// to prevent interleaved sends. +/// +/// Call [`commit()`](Self::commit) to assign an LSN and send the +/// transaction to the log task. Dropping without committing is safe +/// and does not advance the LSN. +pub(crate) struct LogTransaction<'a, B> { + client: &'a mut LogClient, +} + +impl LogTransaction<'_, B> { + /// The LSN that will be assigned if this transaction is committed. + pub fn lsn(&self) -> Lsn { + Lsn(self.client.current_lsn.0 + 1) + } + + /// Commit the transaction: assign the next LSN and send it to the + /// log task. Consumes the transaction. + pub fn commit(self, log_data: Vec>, pre_log_fsn: Option) -> Lsn { + self.client.current_lsn.0 += 1; + let lsn = self.client.current_lsn; + self.client.sender.send(LogRequest::Commit(Transaction { + lsn, + data: log_data, + pre_log_fsn, + })); + lsn + } +} +/// Tracks a batch that has been sent to the applier but whose tail +/// hasn't been advanced yet. +struct PendingTail { + /// The LSN of the batch. Once `applied_lsn >= lsn`, the tail + /// can advance to `new_tail`. + lsn: Lsn, + /// The log-region offset to advance the tail to. + new_tail: u32, +} + +/// All mutable state owned by the log task. +pub(crate) struct LogTask { + file: Arc, + log_writer: LogWriter, + flush_sequencer: Arc, + log_permits: Arc, + logged_lsn: Arc, + applied_lsn: Arc, + apply_tx: mesh::Sender>, + pending_tails: VecDeque, + failure_flag: Arc, +} + +impl LogTask { + /// Create a new log task with the given dependencies. + pub(crate) fn new( + file: Arc, + log_writer: LogWriter, + flush_sequencer: Arc, + log_permits: Arc, + logged_lsn: Arc, + applied_lsn: Arc, + apply_tx: mesh::Sender>, + failure_flag: Arc, + ) -> Self { + Self { + file, + log_writer, + flush_sequencer, + log_permits, + logged_lsn, + applied_lsn, + apply_tx, + pending_tails: VecDeque::new(), + failure_flag, + } + } + + /// Run the log task main loop. + /// + /// Consumes requests from `rx` until a `Close` request is received + /// or the channel is dropped. 
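Aside: putting `LogClient`, `LogTransaction`, and `LogData` together, a cache-side commit against this API looks roughly like the following. The `Vec<u8>` page buffer type and the `dirty` input are assumptions for illustration (the tests further down use the same buffer type); the mesh channel and the running `LogTask` are assumed to already exist.

```rust
// Illustrative only; a sketch of the caller side, not code from this change.
use std::sync::Arc;

fn commit_dirty_pages(
    client: &mut LogClient<Vec<u8>>,
    dirty: Vec<(u64, Arc<Vec<u8>>)>, // (final file offset, 4 KiB-aligned payload)
) -> Lsn {
    // begin() borrows the client mutably, so LSN assignment and the channel
    // send cannot interleave with another transaction.
    let txn = client.begin();
    let batch: Vec<_> = dirty
        .into_iter()
        .map(|(offset, data)| LogData::new(offset, data))
        .collect();
    // No pre-log flush constraint (pre_log_fsn = None) in this sketch.
    txn.commit(batch, None)
}
```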
+ pub async fn run(mut self, mut rx: mesh::Receiver>) { + loop { + self.advance_tails(); + + let request = match rx.recv().await { + Ok(req) => req, + Err(_) => { + tracing::warn!("VHDX log task: channel closed without close() — file is dirty"); + break; + } + }; + + match request { + LogRequest::::Commit(txn) => { + if let Err(e) = self.handle_commit(txn).await { + tracing::error!("VHDX log task fatal error: {e}"); + self.log_permits.fail(e.to_string()); + self.logged_lsn.fail(e.to_string()); + self.failure_flag.set(&e); + break; + } + } + LogRequest::::Close(rpc) => { + rpc.handle(async |()| self.graceful_close().await).await; + break; + } + } + } + } + + /// Advance the log tail for all batches whose applied data has + /// been flushed (i.e., `applied_fsn <= completed_fsn`). + fn advance_tails(&mut self) { + let flushed_fsn = self.flush_sequencer.completed_fsn(); + let (applied, applied_fsn) = self.applied_lsn.get_with_fsn(); + while let Some(front) = self.pending_tails.front() { + if front.lsn <= applied && applied_fsn <= flushed_fsn { + self.log_writer.advance_tail(front.new_tail); + self.pending_tails.pop_front(); + } else { + break; + } + } + } + + /// Flush applied data and advance tails. Used when the log is full + /// and we need to reclaim space. + async fn flush_and_advance_tails(&mut self) -> Result<(), LogTaskError> { + if let Some(front) = self.pending_tails.front() { + let target = front.lsn; + let applied_fsn = self + .applied_lsn + .wait_for(target) + .await + .map_err(LogTaskError::PipelineFailed)?; + self.flush_sequencer + .flush_through(self.file.as_ref(), applied_fsn) + .await + .map_err(LogTaskError::Flush)?; + self.advance_tails(); + } + Ok(()) + } + + /// Write a WAL entry for the given pages (no flush). + /// + /// Returns `Ok(true)` if the entry was written, `Ok(false)` if the + /// log is full (caller should drain and retry), or `Err` on I/O error. + async fn write_log_entry( + &mut self, + pages: &[LogData], + ) -> Result { + let page_count = pages.iter().map(LogData::page_count).sum(); + let mut data_pages = Vec::with_capacity(page_count); + for p in pages { + for (i, payload) in p.data.as_ref().as_ref().as_chunks().0.iter().enumerate() { + data_pages.push(DataPage { + file_offset: p.file_offset + (i * LOG_DATA_PAGE_SIZE) as u64, + payload, + }); + } + } + + Ok(self + .log_writer + .write_entry(self.file.as_ref(), &data_pages, &[]) + .await + .map_err(LogTaskError::Write)? + .is_some()) + } + + /// Handle a Commit request: write WAL entry, publish LSN, send batch + /// to applier. If the log is full, flushes applied data and retries. + /// + /// Returns `Err` on any fatal error. The caller (`run`) poisons + /// the permits and watermarks — individual methods don't. + async fn handle_commit(&mut self, txn: Transaction) -> Result<(), LogTaskError> { + let lsn = txn.lsn; + + // Ensure pre_log_fsn constraint is met before logging. + if let Some(fsn) = txn.pre_log_fsn { + self.flush_sequencer + .flush_through(self.file.as_ref(), fsn) + .await + .map_err(LogTaskError::Flush)?; + } + + // Write WAL entry, retrying if the log is full. + while !self.write_log_entry(&txn.data).await? { + if self.pending_tails.is_empty() { + return Err(LogTaskError::TransactionTooBig( + txn.data.iter().map(LogData::page_count).sum(), + )); + } + self.flush_and_advance_tails().await?; + } + + // Capture FSN after the WAL write. Flushing through this FSN + // makes the WAL entry durable. 
We don't flush here — + // VhdxFile::flush() will do it, or the LogFull path will if + // space is needed. + let wal_fsn = self.flush_sequencer.current_fsn(); + self.logged_lsn.advance(lsn, wal_fsn); + + let new_tail = self.log_writer.head(); + + // Send to applier for background apply. + self.apply_tx.send(ApplyBatch { + data: txn.data, + lsn, + }); + + self.pending_tails.push_back(PendingTail { lsn, new_tail }); + Ok(()) + } + + /// Graceful close: wait for all applies, flush, advance tails. + /// + /// After this returns, the log region is fully drained. The caller + /// is responsible for clearing the log GUID in the header. + async fn graceful_close(&mut self) -> Result<(), LogTaskError> { + // Wait for all pending applies and flush. + if let Some(last) = self.pending_tails.back() { + let target_lsn = last.lsn; + let applied_fsn = self + .applied_lsn + .wait_for(target_lsn) + .await + .map_err(LogTaskError::PipelineFailed)?; + self.flush_sequencer + .flush_through(self.file.as_ref(), applied_fsn) + .await + .map_err(LogTaskError::Flush)?; + } + + // Advance all tails — data is durable at final offsets. + for pt in self.pending_tails.drain(..) { + self.log_writer.advance_tail(pt.new_tail); + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::apply_task; + use crate::log::LogRegion; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + use pal_async::task::Spawn; + + const LOG_SIZE: u32 = 64 * 4096; // 256 KiB — deliberately small + const LOG_OFFSET: u64 = 1024 * 1024; // 1 MiB into the file + + /// Set up a log task + apply task connected via channels. + /// Returns (log_tx, file, permits, logged_lsn, applied_lsn, + /// log_task_handle, apply_task_handle). + async fn setup_pipeline( + driver: &pal_async::DefaultDriver, + log_size: u32, + permit_count: usize, + ) -> ( + mesh::Sender>>, + Arc, + Arc, + Arc, + Arc, + pal_async::task::Task<()>, + pal_async::task::Task<()>, + ) { + let file = Arc::new(InMemoryFile::new(4 * 1024 * 1024)); + setup_pipeline_with_file(driver, file, log_size, permit_count).await + } + + /// Like `setup_pipeline`, but with a caller-provided file. + async fn setup_pipeline_with_file( + driver: &pal_async::DefaultDriver, + file: Arc, + log_size: u32, + permit_count: usize, + ) -> ( + mesh::Sender>>, + Arc, + Arc, + Arc, + Arc, + pal_async::task::Task<()>, + pal_async::task::Task<()>, + ) { + let region = LogRegion { + file_offset: LOG_OFFSET, + length: log_size, + }; + let guid = guid::Guid::new_random(); + let log_writer = LogWriter::initialize(file.as_ref(), region, guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let flush_sequencer = Arc::new(FlushSequencer::new()); + let log_permits = Arc::new(LogPermits::new(permit_count)); + let logged_lsn = Arc::new(LsnWatermark::new()); + let applied_lsn = Arc::new(LsnWatermark::new()); + + let failure_flag = Arc::new(FailureFlag::new()); + + let (apply_tx, apply_rx) = mesh::channel::>>(); + let (log_tx, log_rx) = mesh::channel::>>(); + + // Spawn apply task. + let apply_task = driver.spawn( + "test-apply", + apply_task::run_apply_task( + apply_rx, + file.clone(), + flush_sequencer.clone(), + applied_lsn.clone(), + log_permits.clone(), + failure_flag.clone(), + ), + ); + + // Spawn log task. 
+ let log_task = driver.spawn( + "test-log", + LogTask::new( + file.clone(), + log_writer, + flush_sequencer, + log_permits.clone(), + logged_lsn.clone(), + applied_lsn.clone(), + apply_tx, + failure_flag, + ) + .run(log_rx), + ); + + ( + log_tx, + file, + log_permits, + logged_lsn, + applied_lsn, + log_task, + apply_task, + ) + } + + /// Build a Transaction with `n` fake pages. + fn make_txn(lsn: Lsn, n: usize) -> Transaction> { + let pages = (0..n) + .map(|i| { + LogData::new( + (2 * 1024 * 1024 + i * LOG_DATA_PAGE_SIZE) as u64, + Arc::new(vec![lsn.0 as u8; LOG_DATA_PAGE_SIZE]), + ) + }) + .collect(); + Transaction { + lsn, + data: pages, + pre_log_fsn: None, + } + } + + /// Acquire permits and send a commit. Mirrors what the cache does: + /// acquire permits for each page, then commit (which sends the + /// transaction to the log task). + async fn send_commit( + tx: &mesh::Sender>>, + permits: &LogPermits, + lsn: Lsn, + page_count: usize, + ) { + permits.acquire(page_count).await.unwrap(); + tx.send(LogRequest::Commit(make_txn(lsn, page_count))); + } + + #[async_test] + async fn single_commit_publishes_lsn(driver: pal_async::DefaultDriver) { + let (tx, _file, permits, logged_lsn, _applied_lsn, _log_task, _apply_task) = + setup_pipeline(&driver, LOG_SIZE, 100).await; + + send_commit(&tx, &permits, Lsn(1), 1).await; + logged_lsn.wait_for(Lsn(1)).await.unwrap(); + } + + #[async_test] + async fn permits_return_after_apply(driver: pal_async::DefaultDriver) { + let permit_count = 10; + let (tx, _file, permits, logged_lsn, applied_lsn, _log_task, _apply_task) = + setup_pipeline(&driver, LOG_SIZE, permit_count).await; + + // Send a commit of 5 pages (acquires 5 permits). + send_commit(&tx, &permits, Lsn(1), 5).await; + + // Wait for the apply task to finish. + logged_lsn.wait_for(Lsn(1)).await.unwrap(); + applied_lsn.wait_for(Lsn(1)).await.unwrap(); + + // The apply task should have released 5 permits. + // All 10 should be available again. + assert_eq!(permits.available(), permit_count); + } + + #[async_test] + async fn multiple_commits_sequential(driver: pal_async::DefaultDriver) { + let (tx, _file, permits, logged_lsn, _applied_lsn, _log_task, _apply_task) = + setup_pipeline(&driver, LOG_SIZE, 100).await; + + for lsn in 1..=10u64 { + send_commit(&tx, &permits, Lsn(lsn), 1).await; + } + + // All 10 should be logged. + logged_lsn.wait_for(Lsn(10)).await.unwrap(); + } + + #[async_test] + async fn log_full_retry_makes_progress(driver: pal_async::DefaultDriver) { + // Use a small log (256 KiB). Each page + entry overhead ~ 8 KiB. + // With ~30 entries the log will fill up, forcing the retry path. + let (tx, _file, permits, logged_lsn, _applied_lsn, _log_task, _apply_task) = + setup_pipeline(&driver, LOG_SIZE, 500).await; + + // Send 50 single-page commits. This will exceed the 256 KiB log + // and force LogFull → wait for apply → advance tail → retry. + for lsn in 1..=50u64 { + send_commit(&tx, &permits, Lsn(lsn), 1).await; + } + + // If LogFull retry works, all 50 will eventually be logged. + logged_lsn.wait_for(Lsn(50)).await.unwrap(); + } + + #[async_test] + async fn large_batches_through_small_log(driver: pal_async::DefaultDriver) { + // Each batch has 5 pages (~24 KiB with overhead). 256 KiB log + // fits maybe 10 batches. Send 30 — forces multiple cycles of + // LogFull → drain → retry. 
+ let (tx, _file, permits, logged_lsn, _applied_lsn, _log_task, _apply_task) = + setup_pipeline(&driver, LOG_SIZE, 500).await; + + for lsn in 1..=30u64 { + send_commit(&tx, &permits, Lsn(lsn), 5).await; + } + + logged_lsn.wait_for(Lsn(30)).await.unwrap(); + } + + #[async_test] + async fn close_after_commits(driver: pal_async::DefaultDriver) { + use mesh::rpc::RpcSend; + + let (tx, _file, permits, logged_lsn, applied_lsn, _log_task, _apply_task) = + setup_pipeline(&driver, LOG_SIZE, 100).await; + + for lsn in 1..=5u64 { + send_commit(&tx, &permits, Lsn(lsn), 1).await; + } + logged_lsn.wait_for(Lsn(5)).await.unwrap(); + + // Graceful close should wait for all applies and succeed. + let result = tx.call(LogRequest::>::Close, ()).await.unwrap(); + result.unwrap(); + + // All commits should be applied. + assert!(applied_lsn.get() >= Lsn(5)); + } + + #[async_test] + async fn applied_data_is_at_final_offset(driver: pal_async::DefaultDriver) { + let (tx, file, permits, logged_lsn, applied_lsn, _log_task, _apply_task) = + setup_pipeline(&driver, LOG_SIZE, 100).await; + + let target_offset: u64 = 2 * 1024 * 1024; // 2 MiB + let data = Arc::new(vec![0xAB_u8; LOG_DATA_PAGE_SIZE]); + permits.acquire(1).await.unwrap(); + tx.send(LogRequest::Commit(Transaction { + lsn: Lsn(1), + data: vec![LogData::new(target_offset, data.clone())], + pre_log_fsn: None, + })); + + logged_lsn.wait_for(Lsn(1)).await.unwrap(); + applied_lsn.wait_for(Lsn(1)).await.unwrap(); + + // Read back from the final offset — should match. + let mut buf = [0u8; LOG_DATA_PAGE_SIZE]; + file.read_at(target_offset, &mut buf).await.unwrap(); + assert!(buf.iter().all(|&b| b == 0xAB)); + } + + #[async_test] + async fn apply_write_failure_poisons_pipeline(driver: pal_async::DefaultDriver) { + use crate::tests::support::IoInterceptor; + + // Interceptor that fails writes only outside the log region + // (i.e., apply writes to final offsets), not WAL writes. + struct FailApplyInterceptor { + fail: std::sync::atomic::AtomicBool, + } + impl IoInterceptor for FailApplyInterceptor { + fn before_write(&self, offset: u64, _data: &[u8]) -> Result<(), std::io::Error> { + // Log region is at LOG_OFFSET (1 MiB). Apply writes go + // to 2 MiB+. Only fail writes outside the log region. + if self.fail.load(std::sync::atomic::Ordering::Relaxed) && offset >= 2 * 1024 * 1024 + { + return Err(std::io::Error::other("injected apply write failure")); + } + Ok(()) + } + } + + let interceptor = Arc::new(FailApplyInterceptor { + fail: std::sync::atomic::AtomicBool::new(false), + }); + let file = Arc::new(InMemoryFile::with_interceptor( + 4 * 1024 * 1024, + interceptor.clone() as Arc, + )); + + let (tx, _file, permits, logged_lsn, _applied_lsn, _log_task, _apply_task) = + setup_pipeline_with_file(&driver, file, LOG_SIZE, 100).await; + + // First commit succeeds end-to-end. + send_commit(&tx, &permits, Lsn(1), 1).await; + logged_lsn.wait_for(Lsn(1)).await.unwrap(); + + // Now fail apply writes (but not WAL writes). + interceptor + .fail + .store(true, std::sync::atomic::Ordering::Relaxed); + + // Second commit: WAL write succeeds, but apply write will fail. + send_commit(&tx, &permits, Lsn(2), 1).await; + logged_lsn.wait_for(Lsn(2)).await.unwrap(); + + // The apply task should have poisoned permits after the write failure. + // Future permit acquires must fail. 
+ let result = permits.acquire(1).await; + assert!(result.is_err(), "acquire should fail after apply error"); + } +} diff --git a/vm/devices/storage/vhdx/src/lsn_watermark.rs b/vm/devices/storage/vhdx/src/lsn_watermark.rs new file mode 100644 index 0000000000..8d15f43e39 --- /dev/null +++ b/vm/devices/storage/vhdx/src/lsn_watermark.rs @@ -0,0 +1,224 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! LSN watermark — a shared monotonic counter with async waiters. +//! +//! Used to publish progress through the log/apply pipeline: +//! +//! - `logged_lsn`: the log task updates this after writing each WAL +//! entry. The paired FSN lets callers flush through the sequencer +//! to make the WAL entry durable. +//! - `applied_lsn`: the apply task updates this after writing pages +//! to their final offsets. The paired FSN lets the log task flush +//! to make the applied data durable (needed for tail advancement). +//! +//! Both watermarks carry an `(lsn, fsn)` pair. The LSN tracks progress; +//! the FSN tells consumers which flush sequence number to +//! [`flush_through()`](crate::flush::FlushSequencer::flush_through) +//! to make that progress durable on disk. + +use crate::{error::PipelineFailed, flush::Fsn, log_task::Lsn}; +use event_listener::Event; +use parking_lot::Mutex; + +/// A shared monotonic `(lsn, fsn)` counter with async waiting and poisoning. +/// +/// Writers publish new values via [`advance()`](Self::advance). +/// Readers wait for the LSN to reach a target via +/// [`wait_for()`](Self::wait_for), which returns the associated FSN. +/// +/// If the producer fails, it calls [`fail()`](Self::fail) to poison the +/// watermark — all pending and future [`wait_for()`](Self::wait_for) calls +/// return an error. +pub(crate) struct LsnWatermark { + state: Mutex, + event: Event, +} + +struct WatermarkState { + lsn: Lsn, + fsn: Fsn, + failed: Option, +} + +impl LsnWatermark { + /// Create a new watermark starting at LSN 0, FSN 0. + pub fn new() -> Self { + Self { + state: Mutex::new(WatermarkState { + lsn: Lsn::ZERO, + fsn: Fsn::ZERO, + failed: None, + }), + event: Event::new(), + } + } + + /// Read the current LSN value. + pub fn get(&self) -> Lsn { + self.state.lock().lsn + } + + /// Read the current `(lsn, fsn)` pair atomically. + pub fn get_with_fsn(&self) -> (Lsn, Fsn) { + let s = self.state.lock(); + (s.lsn, s.fsn) + } + + /// Advance the watermark to `(new_lsn, new_fsn)`. + /// + /// Both values are advanced independently via `max()`. Callers are + /// sequential task loops, so in practice LSN and FSN always advance + /// together. + pub fn advance(&self, new_lsn: Lsn, new_fsn: Fsn) { + { + let mut s = self.state.lock(); + s.lsn = s.lsn.max(new_lsn); + s.fsn = s.fsn.max(new_fsn); + } + self.event.notify(usize::MAX); + } + + /// Wait until the LSN reaches at least `target`. + /// + /// Returns the FSN associated with the reached LSN. Callers should + /// [`flush_through()`](crate::flush::FlushSequencer::flush_through) + /// the returned FSN to ensure durability. + /// + /// Returns an error if the watermark has been poisoned. + pub async fn wait_for(&self, target: Lsn) -> Result { + loop { + let listener = self.event.listen(); + { + let s = self.state.lock(); + if let Some(ref err) = s.failed { + return Err(PipelineFailed(err.clone())); + } + if s.lsn >= target { + return Ok(s.fsn); + } + } + listener.await; + } + } + + /// Poison the watermark. All pending and future `wait_for()` calls + /// will return an error. 
+ pub fn fail(&self, error: String) { + self.state.lock().failed = Some(error); + self.event.notify(usize::MAX); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pal_async::async_test; + + #[async_test] + async fn starts_at_zero() { + let wm = LsnWatermark::new(); + assert_eq!(wm.get(), Lsn::ZERO); + assert_eq!(wm.get_with_fsn(), (Lsn::ZERO, Fsn::ZERO)); + } + + #[async_test] + async fn advance_and_read() { + let wm = LsnWatermark::new(); + wm.advance(Lsn::new(5), Fsn::new(100)); + assert_eq!(wm.get(), Lsn::new(5)); + assert_eq!(wm.get_with_fsn(), (Lsn::new(5), Fsn::new(100))); + wm.advance(Lsn::new(10), Fsn::new(200)); + assert_eq!(wm.get(), Lsn::new(10)); + assert_eq!(wm.get_with_fsn(), (Lsn::new(10), Fsn::new(200))); + } + + #[async_test] + async fn advance_is_monotonic() { + let wm = LsnWatermark::new(); + wm.advance(Lsn::new(10), Fsn::new(200)); + wm.advance(Lsn::new(5), Fsn::new(100)); // no-op (both LSN and FSN stay at max) + assert_eq!(wm.get(), Lsn::new(10)); + assert_eq!(wm.get_with_fsn(), (Lsn::new(10), Fsn::new(200))); + } + + #[async_test] + async fn wait_for_already_reached() { + let wm = LsnWatermark::new(); + wm.advance(Lsn::new(10), Fsn::new(100)); + let fsn = wm.wait_for(Lsn::new(5)).await.unwrap(); + assert_eq!(fsn, Fsn::new(100)); + let fsn = wm.wait_for(Lsn::new(10)).await.unwrap(); + assert_eq!(fsn, Fsn::new(100)); + } + + #[async_test] + async fn wait_for_returns_fsn() { + let wm = LsnWatermark::new(); + wm.advance(Lsn::new(5), Fsn::new(42)); + let fsn = wm.wait_for(Lsn::new(5)).await.unwrap(); + assert_eq!(fsn, Fsn::new(42)); + } + + #[async_test] + async fn wait_for_blocks_then_completes() { + let wm = std::sync::Arc::new(LsnWatermark::new()); + + let w = wm.clone(); + let (done_tx, done_rx) = mesh::oneshot(); + let handle = std::thread::spawn(move || { + futures::executor::block_on(async { + let fsn = w.wait_for(Lsn::new(5)).await.unwrap(); + done_tx.send(fsn); + }); + }); + + std::thread::sleep(std::time::Duration::from_millis(50)); + wm.advance(Lsn::new(5), Fsn::new(77)); + let fsn = done_rx.await.unwrap(); + assert_eq!(fsn, Fsn::new(77)); + handle.join().unwrap(); + } + + #[async_test] + async fn wait_for_zero_returns_immediately() { + let wm = LsnWatermark::new(); + let fsn = wm.wait_for(Lsn::ZERO).await.unwrap(); + assert_eq!(fsn, Fsn::ZERO); + } + + #[async_test] + async fn poison_fails_future_wait() { + let wm = LsnWatermark::new(); + wm.fail("broken".into()); + assert!(wm.wait_for(Lsn::new(1)).await.is_err()); + } + + #[async_test] + async fn poison_fails_pending_wait() { + let wm = std::sync::Arc::new(LsnWatermark::new()); + + let w = wm.clone(); + let (done_tx, done_rx) = mesh::oneshot(); + let handle = std::thread::spawn(move || { + futures::executor::block_on(async { + let result = w.wait_for(Lsn::new(5)).await; + assert!(result.is_err()); + done_tx.send(()); + }); + }); + + std::thread::sleep(std::time::Duration::from_millis(50)); + wm.fail("task died".into()); + done_rx.await.unwrap(); + handle.join().unwrap(); + } + + #[async_test] + async fn poison_fails_even_for_already_reached() { + let wm = LsnWatermark::new(); + wm.advance(Lsn::new(10), Fsn::new(100)); + wm.fail("broken".into()); + assert!(wm.wait_for(Lsn::new(5)).await.is_err()); + } +} diff --git a/vm/devices/storage/vhdx/src/metadata.rs b/vm/devices/storage/vhdx/src/metadata.rs new file mode 100644 index 0000000000..b7e58d8755 --- /dev/null +++ b/vm/devices/storage/vhdx/src/metadata.rs @@ -0,0 +1,477 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
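A minimal usage sketch of the `LsnWatermark` added above in `lsn_watermark.rs`: the producer advances the watermark, the consumer waits for a target LSN and receives the FSN it would flush through. This is illustrative only; it assumes the crate-internal types (`LsnWatermark`, `Lsn`, `Fsn`, `PipelineFailed`) are in scope and uses only methods exercised by the module's own tests.

```rust
async fn watermark_sketch() -> Result<(), PipelineFailed> {
    // In the real pipeline this lives in an Arc shared between tasks.
    let logged = LsnWatermark::new();

    // Producer side (the log task): publish progress after writing the WAL
    // entry for LSN 7, whose durability is gated on flush sequence number 42.
    logged.advance(Lsn::new(7), Fsn::new(42));

    // Consumer side (e.g. flush()): wait for the LSN, then use the returned
    // FSN to flush through the sequencer and make the entry durable.
    let fsn = logged.wait_for(Lsn::new(7)).await?;
    assert_eq!(fsn, Fsn::new(42));

    // On a fatal producer error the watermark is poisoned, and every pending
    // or future wait_for() returns PipelineFailed.
    logged.fail("log task died".into());
    assert!(logged.wait_for(Lsn::new(8)).await.is_err());
    Ok(())
}
```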
+
+//! Metadata table parsing and item reading for VHDX files.
+//!
+//! Reads the metadata table from the metadata region, validates entries,
+//! and provides lookup and raw-read access for individual metadata items.
+
+use crate::AsyncFile;
+use crate::cache::PAGE_SIZE;
+use crate::cache::PageCache;
+use crate::cache::PageKey;
+use crate::error::CorruptionType;
+use crate::error::OpenError;
+use crate::error::OpenErrorInner;
+use crate::format;
+use crate::format::MetadataTableEntry;
+use crate::format::MetadataTableHeader;
+use guid::Guid;
+use zerocopy::FromBytes;
+
+/// Cache tag for metadata region pages.
+pub(crate) const METADATA_TAG: u8 = 1;
+
+/// In-memory representation of the validated metadata table.
+pub(crate) struct MetadataTable {
+    /// The validated entries (sorted by offset for free-space scanning).
+    entries: Vec<MetadataTableEntry>,
+}
+
+impl MetadataTable {
+    /// Read and validate the metadata table from the file.
+    pub async fn read(
+        file: &impl AsyncFile,
+        metadata_offset: u64,
+        metadata_length: u32,
+    ) -> Result<Self, OpenError> {
+        // Read the metadata table (first 64 KiB of the metadata region).
+        let buf = file.alloc_buffer(format::METADATA_TABLE_SIZE as usize);
+        let buf = file
+            .read_into(metadata_offset, buf)
+            .await
+            .map_err(OpenErrorInner::Io)?;
+        let buf = buf.as_ref();
+
+        // Validate signature.
+        let header = MetadataTableHeader::read_from_prefix(buf)
+            .map_err(|_| CorruptionType::InvalidMetadataTableSignature)?
+            .0
+            .clone();
+        if header.signature != format::METADATA_TABLE_SIGNATURE {
+            return Err(CorruptionType::InvalidMetadataTableSignature.into());
+        }
+
+        // Validate entry count.
+        if header.entry_count as u64 > format::METADATA_ENTRY_MAX_COUNT {
+            return Err(CorruptionType::MetadataTableEntryCountTooHigh.into());
+        }
+        if header.reserved != 0 || header.reserved2.iter().any(|&value| value != 0) {
+            return Err(CorruptionType::ReservedMetadataTableFieldNonzero.into());
+        }
+
+        // Validate metadata region size.
+        if metadata_length as u64 > format::MAXIMUM_METADATA_REGION_SIZE {
+            return Err(CorruptionType::MetadataRegionTooLarge.into());
+        }
+
+        // Parse entries.
+        let header_size = size_of::<MetadataTableHeader>();
+        let entry_size = size_of::<MetadataTableEntry>();
+        let mut entries = Vec::with_capacity(header.entry_count as usize);
+        for i in 0..header.entry_count as usize {
+            let off = header_size + i * entry_size;
+            let entry = MetadataTableEntry::read_from_prefix(&buf[off..])
+                .unwrap()
+                .0
+                .clone();
+            let supported = u32::from(
+                format::MetadataTableEntryFlags::new()
+                    .with_is_user(true)
+                    .with_is_virtual_disk(true)
+                    .with_is_required(true),
+            );
+            if entry.reserved2 != 0 || u32::from(entry.flags) & !supported != 0 {
+                return Err(CorruptionType::ReservedMetadataTableFieldNonzero.into());
+            }
+            entries.push(entry);
+        }
+
+        // Sort by (is_user, item_id) for duplicate detection.
+        entries.sort_by(|a, b| {
+            a.flags
+                .is_user()
+                .cmp(&b.flags.is_user())
+                .then_with(|| a.item_id.cmp(&b.item_id))
+        });
+
+        // Check for duplicates.
+        for i in 1..entries.len() {
+            if entries[i].flags.is_user() == entries[i - 1].flags.is_user()
+                && entries[i].item_id == entries[i - 1].item_id
+            {
+                return Err(CorruptionType::MetadataDuplicateGuid.into());
+            }
+        }
+
+        // Re-sort by offset for overlap checking.
+        entries.sort_by_key(|e| e.offset);
+
+        // Validate each entry and check for overlaps.
+ let mut user_item_count: u16 = 0; + let mut system_item_count: u16 = 0; + let mut system_metadata_size: u64 = 0; + let mut user_metadata_size: u64 = 0; + let mut last_end: u32 = 0; + + for entry in &entries { + // User + required is invalid. + if entry.flags.is_user() && entry.flags.is_required() { + return Err(CorruptionType::MetadataUserRequired.into()); + } + + // Item size limit. + if entry.length as u64 > format::MAXIMUM_METADATA_ITEM_SIZE { + return Err(CorruptionType::MetadataItemTooLarge.into()); + } + + // Zero GUID is invalid. + if entry.item_id == Guid::ZERO { + return Err(CorruptionType::ZeroMetadataItemId.into()); + } + + if entry.length == 0 { + // Zero-length entries must have zero offset. + if entry.offset != 0 { + return Err(CorruptionType::InvalidMetadataEntryOffset.into()); + } + } else { + // Non-zero entries: offset must be >= table size and fit in region. + if entry.offset < format::METADATA_TABLE_SIZE as u32 { + return Err(CorruptionType::MetadataOverlapping.into()); + } + let end = entry + .offset + .checked_add(entry.length) + .ok_or(CorruptionType::MetadataOverlapping)?; + if end > metadata_length { + return Err(CorruptionType::MetadataOverlapping.into()); + } + // Check overlap with previous entry. + if entry.offset < last_end { + return Err(CorruptionType::MetadataOverlapping.into()); + } + last_end = end; + } + + // Track sizes per category. + if entry.flags.is_user() { + user_item_count += 1; + user_metadata_size += entry.length as u64; + } else { + system_item_count += 1; + system_metadata_size += entry.length as u64; + } + } + + // Validate entry counts. + if user_item_count as u64 > format::METADATA_USER_ENTRY_MAX_COUNT { + return Err(CorruptionType::MetadataUserCountExceeded.into()); + } + if system_item_count as u64 > format::METADATA_SYSTEM_ENTRY_MAX_COUNT { + return Err(CorruptionType::MetadataTableEntryCountTooHigh.into()); + } + + // Validate total sizes per category. + if system_metadata_size > format::MAXIMUM_TOTAL_METADATA_SIZE_PER_CATEGORY + || user_metadata_size > format::MAXIMUM_TOTAL_METADATA_SIZE_PER_CATEGORY + { + return Err(CorruptionType::TotalMetadataSizeExceeded.into()); + } + + Ok(MetadataTable { entries }) + } + + /// Find an entry by GUID and user/system flag. + pub fn find_entry(&self, is_user: bool, item_id: &Guid) -> Option<&MetadataTableEntry> { + self.entries + .iter() + .find(|e| e.flags.is_user() == is_user && &e.item_id == item_id) + } + + /// Read the raw bytes of a metadata item through the page cache. + pub async fn read_item( + &self, + cache: &PageCache, + is_user: bool, + item_id: &Guid, + ) -> Result, OpenError> { + let entry = self + .find_entry(is_user, item_id) + .ok_or(CorruptionType::MissingRequiredMetadata)?; + + let mut data = vec![0; entry.length as usize]; + let mut data_offset = 0; + let mut item_offset = entry.offset as u64; + + while data_offset < data.len() { + let page_offset = item_offset & !(PAGE_SIZE as u64 - 1); + let page_delta = (item_offset - page_offset) as usize; + let len = (data.len() - data_offset).min(PAGE_SIZE - page_delta); + + let page = cache + .acquire_read(PageKey { + tag: METADATA_TAG, + offset: page_offset, + }) + .await + .map_err(OpenErrorInner::MetadataCache)?; + data[data_offset..data_offset + len] + .copy_from_slice(&page[page_delta..page_delta + len]); + + data_offset += len; + item_offset += len as u64; + } + + Ok(data) + } + + /// Returns an iterator over all entries. 
+ pub fn entries(&self) -> &[MetadataTableEntry] { + &self.entries + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::cache::PageCache; + use crate::region; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + use std::sync::Arc; + use zerocopy::IntoBytes; + + fn metadata_cache(file: InMemoryFile, metadata_offset: u64) -> PageCache { + let mut cache = PageCache::new(Arc::new(file), None, None, 0); + cache.register_tag(METADATA_TAG, metadata_offset); + cache + } + + #[async_test] + async fn parse_valid_metadata_table() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + + // Should have 5 system entries: file params, disk size, logical sector, + // physical sector, page 83. + assert_eq!(table.entries.len(), 5); + } + + #[async_test] + async fn find_entry_by_guid() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + + // Known GUIDs should be found. + assert!( + table + .find_entry(false, &format::FILE_PARAMETERS_ITEM_GUID) + .is_some() + ); + assert!( + table + .find_entry(false, &format::VIRTUAL_DISK_SIZE_ITEM_GUID) + .is_some() + ); + assert!( + table + .find_entry(false, &format::LOGICAL_SECTOR_SIZE_ITEM_GUID) + .is_some() + ); + assert!( + table + .find_entry(false, &format::PHYSICAL_SECTOR_SIZE_ITEM_GUID) + .is_some() + ); + assert!( + table + .find_entry(false, &format::PAGE_83_ITEM_GUID) + .is_some() + ); + + // Unknown GUID should not be found. + assert!( + table + .find_entry(false, &format::PARENT_LOCATOR_ITEM_GUID) + .is_none() + ); + } + + #[async_test] + async fn read_item_bytes() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + let cache = metadata_cache(file, regions.metadata_offset); + + // Read disk size — should be 1 GiB. + let data = table + .read_item(&cache, false, &format::VIRTUAL_DISK_SIZE_ITEM_GUID) + .await + .unwrap(); + assert_eq!(data.len(), 8); + let disk_size = u64::from_le_bytes(data.try_into().unwrap()); + assert_eq!(disk_size, format::GB1); + } + + #[async_test] + async fn invalid_signature() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + // Corrupt the metadata table signature. + let mut buf = vec![0u8; format::METADATA_TABLE_SIZE as usize]; + file.read_at(regions.metadata_offset, &mut buf) + .await + .unwrap(); + buf[0] ^= 0xFF; + file.write_at(regions.metadata_offset, &buf).await.unwrap(); + + let result = + MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::InvalidMetadataTableSignature + ))) + )); + } + + #[async_test] + async fn duplicate_guid() { + let file = InMemoryFile::new(4 * format::MB1); + + // Build a metadata table with two entries sharing the same GUID. 
+ let mut buf = vec![0u8; format::METADATA_TABLE_SIZE as usize]; + let header = MetadataTableHeader { + signature: format::METADATA_TABLE_SIGNATURE, + reserved: 0, + entry_count: 2, + reserved2: [0; 5], + }; + let h_bytes = header.as_bytes(); + buf[..h_bytes.len()].copy_from_slice(h_bytes); + + let entry = MetadataTableEntry { + item_id: format::FILE_PARAMETERS_ITEM_GUID, + offset: format::METADATA_TABLE_SIZE as u32, + length: 8, + flags: format::MetadataTableEntryFlags::new().with_is_required(true), + reserved2: 0, + }; + let e_bytes = entry.as_bytes(); + let entry_start = size_of::(); + buf[entry_start..entry_start + e_bytes.len()].copy_from_slice(e_bytes); + + let entry2 = MetadataTableEntry { + item_id: format::FILE_PARAMETERS_ITEM_GUID, + offset: format::METADATA_TABLE_SIZE as u32 + 8, + length: 8, + flags: format::MetadataTableEntryFlags::new().with_is_required(true), + reserved2: 0, + }; + let e2_bytes = entry2.as_bytes(); + let off2 = entry_start + e_bytes.len(); + buf[off2..off2 + e2_bytes.len()].copy_from_slice(e2_bytes); + + let metadata_offset = 2 * format::MB1; + file.write_at(metadata_offset, &buf).await.unwrap(); + + let result = MetadataTable::read(&file, metadata_offset, format::MB1 as u32).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::MetadataDuplicateGuid + ))) + )); + } + + #[async_test] + async fn user_required_invalid() { + let file = InMemoryFile::new(4 * format::MB1); + + let mut buf = vec![0u8; format::METADATA_TABLE_SIZE as usize]; + let header = MetadataTableHeader { + signature: format::METADATA_TABLE_SIGNATURE, + reserved: 0, + entry_count: 1, + reserved2: [0; 5], + }; + let h_bytes = header.as_bytes(); + buf[..h_bytes.len()].copy_from_slice(h_bytes); + + let entry = MetadataTableEntry { + item_id: format::FILE_PARAMETERS_ITEM_GUID, + offset: format::METADATA_TABLE_SIZE as u32, + length: 8, + flags: format::MetadataTableEntryFlags::new() + .with_is_user(true) + .with_is_required(true), + reserved2: 0, + }; + let e_bytes = entry.as_bytes(); + let entry_start = size_of::(); + buf[entry_start..entry_start + e_bytes.len()].copy_from_slice(e_bytes); + + let metadata_offset = 2 * format::MB1; + file.write_at(metadata_offset, &buf).await.unwrap(); + + let result = MetadataTable::read(&file, metadata_offset, format::MB1 as u32).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::MetadataUserRequired + ))) + )); + } + + #[async_test] + async fn reserved_metadata_entry_flags_invalid() { + let file = InMemoryFile::new(4 * format::MB1); + + let mut buf = vec![0u8; format::METADATA_TABLE_SIZE as usize]; + let header = MetadataTableHeader { + signature: format::METADATA_TABLE_SIGNATURE, + reserved: 0, + entry_count: 1, + reserved2: [0; 5], + }; + let h_bytes = header.as_bytes(); + buf[..h_bytes.len()].copy_from_slice(h_bytes); + + let entry = MetadataTableEntry { + item_id: format::FILE_PARAMETERS_ITEM_GUID, + offset: format::METADATA_TABLE_SIZE as u32, + length: 8, + flags: format::MetadataTableEntryFlags::new().with_is_required(true), + reserved2: 0, + }; + let e_bytes = entry.as_bytes(); + let entry_start = size_of::(); + buf[entry_start..entry_start + e_bytes.len()].copy_from_slice(e_bytes); + let flags_offset = entry_start + 24; + buf[flags_offset..flags_offset + 4].copy_from_slice(&8_u32.to_le_bytes()); + + let metadata_offset = 2 * format::MB1; + file.write_at(metadata_offset, &buf).await.unwrap(); + + let result = MetadataTable::read(&file, metadata_offset, format::MB1 as 
u32).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::ReservedMetadataTableFieldNonzero + ))) + )); + } +} diff --git a/vm/devices/storage/vhdx/src/open.rs b/vm/devices/storage/vhdx/src/open.rs new file mode 100644 index 0000000000..f1b7dd0dc6 --- /dev/null +++ b/vm/devices/storage/vhdx/src/open.rs @@ -0,0 +1,1490 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! VHDX file open orchestration. +//! +//! Ties together header, region, metadata, and BAT parsing into +//! [`VhdxFile::open()`], which returns a [`VhdxBuilder`] for +//! configuring options before finalizing as read-only or writable. + +use crate::AsyncFile; +use crate::bat::BAT_TAG; +use crate::bat::Bat; +use crate::cache::PageCache; +use crate::error::CorruptionType; +use crate::error::OpenError; +use crate::error::OpenErrorInner; +use crate::error::PipelineFailed; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::flush::FlushSequencer; +use crate::format; +use crate::format::FileIdentifier; +use crate::header::HeaderState; +use crate::header::WriteMode; +use crate::header::parse_headers; +use crate::header::serialize_header; +use crate::known_meta::read_known_metadata; +use crate::known_meta::verify_known_metadata; +use crate::log; +use crate::log::LogRegion; +use crate::log_task::LogRequest; +use crate::metadata::METADATA_TAG; +use crate::metadata::MetadataTable; +use crate::region::parse_region_tables; +use crate::sector_bitmap::SBM_TAG; +use crate::space::DeferredReleases; +use crate::space::EofState; +use crate::space::FreeSpaceTracker; +use guid::Guid; +use parking_lot::Mutex; +use std::sync::Arc; +use zerocopy::FromBytes; + +/// Builder for opening a VHDX file. +/// +/// Created via [`VhdxFile::open()`], then configured with builder methods +/// before calling [`read_only()`](Self::read_only) or +/// [`writable()`](Self::writable) to produce a [`VhdxFile`]. +/// +/// # Examples +/// +/// ```ignore +/// // Default options: +/// let vhdx = VhdxFile::open(file).read_only().await?; +/// +/// // With block alignment (for NTFS-DAX / PMEM volumes): +/// let vhdx = VhdxFile::open(file) +/// .block_alignment(2 * 1024 * 1024) +/// .writable(&spawner) +/// .await?; +/// ``` +pub struct VhdxBuilder { + file: F, + options: OpenOptions, +} + +/// Internal options collected by [`VhdxBuilder`]. +#[derive(Debug, Clone)] +struct OpenOptions { + /// Block data alignment in bytes. Must be 0 or a power of 2. + /// + /// When non-zero and ≤ the VHDX block size, new data block allocations + /// from the end of the file are rounded up to this alignment. This + /// matches the host filesystem's cluster size (e.g. 2 MiB on NTFS-DAX + /// volumes) so that data blocks land on cluster boundaries. + /// + /// Default: 0 (no alignment — blocks use the natural 1 MiB granularity). + block_alignment: u32, + /// Whether to allow log replay on a read-only open. + /// + /// When true, a dirty log is replayed (the file handle must support + /// writes for the replay I/O) but the resulting `VhdxFile` is still + /// read-only. When false, a dirty log returns + /// [`CorruptionType::LogReplayRequired`]. + /// + /// Ignored for writable opens (log replay always happens). + /// + /// Default: false. + allow_replay: bool, +} + +impl OpenOptions { + fn new() -> Self { + Self { + block_alignment: 0, + allow_replay: false, + } + } +} + +impl VhdxBuilder { + /// Set the block data alignment in bytes. + /// + /// Must be 0 or a power of 2. 
If larger than the VHDX block size, + /// it is silently ignored at open time. + /// + /// This should be set to the host filesystem's cluster size when the + /// VHDX file lives on a volume with clusters larger than 1 MiB (e.g. + /// NTFS-DAX with 2 MiB clusters). + pub fn block_alignment(mut self, alignment: u32) -> Self { + self.options.block_alignment = alignment; + self + } + + /// Allow log replay when opening read-only. + /// + /// When true, a dirty log is replayed (the file handle must support + /// writes for the replay I/O) but the resulting [`VhdxFile`] is still + /// read-only. When false, a dirty log returns an error. + /// + /// Has no effect on [`writable()`](Self::writable) opens, which always + /// replay. + pub fn allow_replay(mut self, allow: bool) -> Self { + self.options.allow_replay = allow; + self + } + + /// Open the VHDX file in read-only mode. + pub async fn read_only(self) -> Result, OpenError> + where + F: AsyncFile, + { + VhdxFile::open_read_only(self.file, &self.options).await + } + + /// Open the VHDX file in writable mode with a log task. + /// + /// Replays a dirty log if needed, then spawns a log task for + /// crash-consistent metadata writes. + /// + /// Call [`VhdxFile::close()`] for a clean shutdown. + pub async fn writable( + self, + spawner: &impl pal_async::task::Spawn, + ) -> Result, OpenError> + where + F: AsyncFile, + { + VhdxFile::open_writable(self.file, spawner, &self.options).await + } +} + +/// An open VHDX file handle. +/// +/// Created via [`VhdxFile::open()`], which returns a [`VhdxBuilder`] +/// for configuring options before calling +/// [`read_only()`](VhdxBuilder::read_only) or +/// [`writable()`](VhdxBuilder::writable). +// +// Lock ordering (must acquire in this order, never reverse): +// 1. header_state.inner (futures::lock::Mutex — async, may be held across .await) +// 2. allocation_lock (futures::lock::Mutex — async, may be held across .await) +// 3. bat_state (parking_lot::RwLock — synchronous, NEVER across .await) +// 4. free_space.inner (parking_lot::Mutex — synchronous, NEVER across .await) +// 5. cache.pages/tags (parking_lot::Mutex — brief, NEVER across .await) +// +// header_state.inner serializes all header writes (enable_write_mode, set_log_guid, +// clear_log_guid). Its write_mode AtomicU8 provides a lock-free fast path for +// enable_write_mode, which is called on every write. +// The allocation_lock serializes the entire allocation decision (check BAT, allocate +// space, mark TFP). It is released AFTER TFP is set but BEFORE data I/O begins. +// The bat_state RwLock is held for < 1μs per access (reading/writing in-memory entries). +pub struct VhdxFile { + pub(crate) file: Arc, + pub(crate) cache: PageCache, + pub(crate) bat: Bat, + + // Parsed metadata + pub(crate) disk_size: u64, + pub(crate) block_size: u32, + pub(crate) logical_sector_size: u32, + physical_sector_size: u32, + pub(crate) has_parent: bool, + is_fully_allocated: bool, + page_83_data: Guid, + + // Metadata table (kept for on-demand metadata reads). + metadata_table: MetadataTable, + + // Header and write-mode state (async mutex for serialization, + // AtomicU8 for lock-free hot-path write-mode checks). + pub(crate) header_state: HeaderState, + + /// Serializes block allocation decisions and protects EOF geometry + /// state. Only one allocation sequence runs at a time. + /// Uses futures::lock::Mutex because it may be held across .await points. 
+ pub(crate) allocation_lock: futures::lock::Mutex, + + /// Broadcast event notified when a TFP block completes post-allocation. + /// Writers that encounter a TFP block listen on this event and retry. + pub(crate) allocation_event: event_listener::Event, + + /// Free space tracker. Manages all space allocation within the file, + /// replacing the simple EOF-bump allocator. + pub(crate) free_space: FreeSpaceTracker, + + /// Space releases deferred until their BAT changes are durable. + /// Uses generation-based stamping to coordinate with flush(). + pub(crate) deferred_releases: DeferredReleases, + + // Mode + pub(crate) read_only: bool, + + /// Region table bytes to rewrite (set when the two on-disk copies + /// don't match). Consumed by [`VhdxBuilder::writable`]. + region_rewrite_data: Option, + + /// Error state: once set, all I/O operations fail. + /// Shared with log and apply tasks so they + /// can poison the file directly on fatal error. + pub(crate) failed: Arc, + + // Log task state (set when opened writable via VhdxBuilder::writable). + pub(crate) log_state: Option, +} + +/// Log pipeline state for a writable VHDX file. +/// +/// Created during [`VhdxBuilder::writable`] and consumed by +/// [`VhdxFile::close`] / [`VhdxFile::abort`]. All fields are set +/// together when the log task is spawned. +pub(crate) struct LogTaskState { + /// Handle to the spawned log task. + log_task: pal_async::task::Task<()>, + /// Handle to the spawned apply task. + apply_task: pal_async::task::Task<()>, + /// Flush sequencer for FSN-gated ordering. + pub flush_sequencer: Arc, + /// Failable semaphore for log backpressure. + pub log_permits: Arc, + /// LSN watermark published by the log task. `flush()` waits on this. + pub logged_lsn: Arc, +} + +impl VhdxFile { + /// Begin opening a VHDX file, returning a [`VhdxBuilder`] to configure + /// options before finalizing with [`read_only()`](VhdxBuilder::read_only) + /// or [`writable()`](VhdxBuilder::writable). + pub fn open(file: F) -> VhdxBuilder { + VhdxBuilder { + file, + options: OpenOptions::new(), + } + } + + /// Internal open logic shared by [`VhdxBuilder::read_only`] and + /// [`VhdxBuilder::writable`]. + /// + /// Validates the file identifier, headers, region tables, and metadata. + /// If the log GUID is non-zero (indicating a dirty log), replays the + /// log to recover the file. Read-only opens with a dirty log return + /// [`CorruptionType::LogReplayRequired`]. + async fn open_inner( + file: F, + read_only: bool, + log_sender: Option>>, + options: &OpenOptions, + ) -> Result { + // 1. Validate minimum file size. + let file_length = file.file_size().await.map_err(OpenErrorInner::Io)?; + if file_length < format::HEADER_AREA_SIZE { + return Err(CorruptionType::EmptyFile.into()); + } + + // 2. Validate the file identifier signature. + validate_file_identifier(&file).await?; + + // 3. Parse dual headers. + let mut header = parse_headers(&file, file_length).await?; + + // 4. If log_guid is non-zero, replay the log. + if header.log_guid != Guid::ZERO { + // A dirty log requires writing to the file to replay. If the caller + // opened read-only, we cannot proceed — the metadata may be + // inconsistent and we're not allowed to fix it. + if read_only { + return Err((CorruptionType::LogReplayRequired).into()); + } + + // The file handle hasn't been Arc-wrapped yet — pass &file directly. 
+ let log_region = LogRegion { + file_offset: header.log_offset, + length: header.log_length, + }; + + let replay_result = log::replay_log(&file, &log_region, header.log_guid).await?; + + if replay_result.replayed { + // Write a clean header: clear log_guid, bump sequence number. + let new_seq = header.sequence_number + 1; + let (buf, write_offset) = serialize_header( + &file, + new_seq, + header.file_write_guid, + header.data_write_guid, + Guid::ZERO, + header.log_offset, + header.log_length, + header.first_header_current, + ); + file.write_from(write_offset, buf) + .await + .map_err(OpenErrorInner::Io)?; + file.flush().await.map_err(OpenErrorInner::Io)?; + + // Update the in-flight header state for the rest of the open path. + header.sequence_number = new_seq; + header.log_guid = Guid::ZERO; + header.first_header_current = !header.first_header_current; + } + } + + // 5. Parse region tables. + let regions = parse_region_tables(&file).await?; + + // 6. Read metadata table. + let metadata_table = + MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length).await?; + + // 7. Verify known metadata (all required system items are recognized). + verify_known_metadata(&metadata_table, false)?; + + // 8. Wrap file in Arc for shared access. + let file = Arc::new(file); + + // 9. Create PageCache and register tags before reading metadata items. + let mut cache = PageCache::new( + file.clone(), + log_sender.map(crate::log_task::LogClient::new), + None, + 0, + ); + cache.register_tag(BAT_TAG, regions.bat_offset); + cache.register_tag(METADATA_TAG, regions.metadata_offset); + cache.register_tag(SBM_TAG, 0); + + // 10. Read known metadata values. + let known = read_known_metadata(&cache, &metadata_table).await?; + + // 11. Create BAT manager. + let mut bat = Bat::new( + known.disk_size, + known.block_size, + known.logical_sector_size, + known.has_parent, + regions.bat_length, + )?; + + // 12. Create FreeSpaceTracker. + let (free_space, mut eof_state) = FreeSpaceTracker::new( + file_length, + known.block_size, + options.block_alignment, + format::HEADER_AREA_SIZE, + header.log_offset, + header.log_length, + regions.bat_offset, + regions.bat_length, + regions.metadata_offset, + regions.metadata_length, + bat.data_block_count, + )?; + + // 13. Load in-memory BAT from disk. + bat.load_bat_state( + &*file, + regions.bat_offset, + regions.bat_length, + &free_space, + &mut eof_state, + ) + .await?; + + // 14. Finalize free space initialization after BAT parse. + free_space.complete_initialization(&eof_state); + + // 15. Construct VhdxFile. + Ok(VhdxFile { + file, + cache, + bat, + disk_size: known.disk_size, + block_size: known.block_size, + logical_sector_size: known.logical_sector_size, + physical_sector_size: known.physical_sector_size, + has_parent: known.has_parent, + is_fully_allocated: known.leave_blocks_allocated, + page_83_data: known.page_83_data, + metadata_table, + header_state: HeaderState::new(&header), + allocation_lock: futures::lock::Mutex::new(eof_state), + allocation_event: event_listener::Event::new(), + free_space, + deferred_releases: DeferredReleases::new(), + + read_only, + region_rewrite_data: regions.rewrite_data, + failed: Arc::new(FailureFlag::new()), + + log_state: None, + }) + } + + /// Open an existing VHDX file in read-only mode. + /// + /// If [`OpenOptions::allow_replay`] is true, a dirty log is replayed + /// (requires the file handle to support writes for the replay I/O), + /// but the resulting `VhdxFile` is still read-only. 
If false, a dirty + /// log returns [`CorruptionType::LogReplayRequired`]. + async fn open_read_only(file: F, options: &OpenOptions) -> Result { + if options.allow_replay { + let mut vhdx = Self::open_inner(file, false, None, options).await?; + vhdx.read_only = true; + Ok(vhdx) + } else { + Self::open_inner(file, true, None, options).await + } + } + + /// Open an existing VHDX file in writable mode with a log task. + /// + /// Replays a dirty log if needed, then spawns a log task for + /// crash-consistent metadata writes. The log task receives dirty pages + /// on `flush()` and writes them as WAL entries. + /// + /// The spawner must implement [`pal_async::task::Spawn`] to spawn the + /// background log task. + /// + /// Call [`close()`](Self::close) for a clean shutdown. Dropping without + /// close leaves the VHDX file dirty (log will be replayed on next open). + async fn open_writable( + file: F, + spawner: &impl pal_async::task::Spawn, + options: &OpenOptions, + ) -> Result { + // Create mesh channel before open_inner so the cache gets the + // sender at construction time. + let (tx, rx) = mesh::channel::>(); + let mut vhdx = Self::open_inner(file, false, Some(tx.clone()), options).await?; + + // Create shared state for log task communication. + let flush_sequencer = { + let mut fs = FlushSequencer::new(); + fs.set_failure_flag(vhdx.failed.clone()); + Arc::new(fs) + }; + let log_permits = Arc::new(crate::log_permits::LogPermits::new( + // Permit count is a multiple of MAX_COMMIT_PAGES to allow + // pipelining: multiple batches can be in-flight (committed + // but not yet applied) simultaneously. Permits are released + // by the apply task, not at commit time. + crate::cache::MAX_COMMIT_PAGES * 4, + )); + let logged_lsn = Arc::new(crate::lsn_watermark::LsnWatermark::new()); + + // Initialize the log writer. + let log_guid = Guid::new_random(); + let (log_offset, log_length) = vhdx.header_state.log_region(); + let log_region = LogRegion { + file_offset: log_offset, + length: log_length, + }; + let file_length = vhdx.file.file_size().await.map_err(OpenErrorInner::Io)?; + let log_writer = + log::LogWriter::initialize(vhdx.file.as_ref(), log_region, log_guid, file_length) + .await?; + + // Write header with log_guid set (marks file as dirty). + // This is done BEFORE spawning the log task so the file is marked + // dirty before any log entries are written. + vhdx.header_state + .set_log_guid(log_guid, vhdx.file.as_ref(), None) + .await + .map_err(OpenErrorInner::Io)?; + + // Spawn the apply task. + let applied_lsn = Arc::new(crate::lsn_watermark::LsnWatermark::new()); + let (apply_tx, apply_rx) = mesh::channel::>(); + let apply_task = spawner.spawn( + "vhdx-apply-task", + crate::apply_task::run_apply_task( + apply_rx, + vhdx.file.clone(), + flush_sequencer.clone(), + applied_lsn.clone(), + log_permits.clone(), + vhdx.failed.clone(), + ), + ); + + // Spawn the log task. + let task = spawner.spawn( + "vhdx-log-task", + crate::log_task::LogTask::new( + vhdx.file.clone(), + log_writer, + flush_sequencer.clone(), + log_permits.clone(), + logged_lsn.clone(), + applied_lsn.clone(), + apply_tx, + vhdx.failed.clone(), + ) + .run(rx), + ); + + // Set log state on the cache. + vhdx.cache.set_log_state(crate::cache::CacheLogState { + permits: log_permits.clone(), + applied_lsn: applied_lsn.clone(), + }); + + vhdx.log_state = Some(LogTaskState { + log_task: task, + apply_task, + flush_sequencer, + log_permits, + logged_lsn, + }); + + // Repair mismatched region tables through the write-ahead log. 
+ // The pages enter the log pipeline and will be applied in due + // course; the next caller-initiated flush() covers them via LSN + // ordering. If we crash before that, either log replay applies + // the entry or the mismatch is re-detected on reopen. + if let Some(table_data) = vhdx.region_rewrite_data.take() { + crate::region::rewrite_region_tables( + &vhdx.cache, + &vhdx + .log_state + .as_ref() + .expect("writable file has log_state") + .log_permits, + table_data, + ) + .await + .map_err(OpenErrorInner::PipelineFailed)?; + } + + Ok(vhdx) + } + + /// Gracefully close the VHDX file. + /// + /// Flushes all dirty pages through the log, applies all logged entries, + /// clears the log GUID in the header, and waits for the log task to exit. + /// + /// After this returns, the file is in a clean state (no log replay needed + /// on next open). + /// + /// If no log task is running (read-only or opened without log), this is + /// a no-op. + pub async fn close(mut self) -> Result<(), VhdxIoError> { + if let Some(state) = self.log_state.take() { + // Ship any remaining dirty pages to the log task. + // This is fire-and-forget — the Close RPC below will + // process after this batch due to channel ordering. + self.cache.commit().map_err(VhdxIoErrorInner::CommitCache)?; + + // Take the log client out of the cache to get the sender. + let client = self + .cache + .take_log_client() + .expect("log client disappeared"); + + // Send Close RPC — the log task will log+apply all pending + // batches, then respond. + client.close().await?; + + state.log_task.await; + // The log task dropping its apply_tx closes the apply channel, + // causing the apply task to exit. + state.apply_task.await; + + // Clear log GUID in the header now that the log is fully drained. + // Done BEFORE truncation so that a crash during truncation + // doesn't leave a non-zero log GUID pointing at a file that + // may have been partially shrunk. With the GUID cleared first, + // a crash at any later point just leaves a larger-than-necessary + // file — no replay is attempted. + self.header_state + .clear_log_guid(self.file.as_ref(), Some(state.flush_sequencer.as_ref())) + .await + .map_err(VhdxIoErrorInner::WriteHeader)?; + + // Truncate the file to reclaim unused trailing space. + // Best-effort: if this fails, the file is still correct, + // just not compacted. + if let Err(e) = self.truncate_file().await { + tracing::warn!( + error = &e as &dyn std::error::Error, + "failed to truncate VHDX file on close" + ); + } + } + Ok(()) + } + + /// Abort the VHDX file without graceful close. + /// + /// Drops the log channel (causing the log task to exit on its next + /// recv) and waits for the log task to finish. No pending batches are + /// applied and the log GUID is NOT cleared — the file remains dirty, + /// requiring log replay on the next open. + /// + /// This is the test-friendly equivalent of a crash: all state held by + /// the log task (including its `Arc`) is released, but no new I/O + /// is issued. + pub async fn abort(mut self) { + // Drop the log client so the log task's recv() returns Err. + self.cache.take_log_client(); + + // Wait for the log task to notice the closed channel and exit. + // The log task dropping its apply_tx closes the apply channel too. + if let Some(state) = self.log_state.take() { + state.log_task.await; + state.apply_task.await; + } + } +} + +impl VhdxFile { + /// Virtual disk size in bytes. + pub fn disk_size(&self) -> u64 { + self.disk_size + } + + /// Block size in bytes. 
+ pub fn block_size(&self) -> u32 { + self.block_size + } + + /// Logical sector size (512 or 4096). + pub fn logical_sector_size(&self) -> u32 { + self.logical_sector_size + } + + /// Physical sector size (512 or 4096). + pub fn physical_sector_size(&self) -> u32 { + self.physical_sector_size + } + + /// Whether this is a differencing disk (has a parent). + pub fn has_parent(&self) -> bool { + self.has_parent + } + + /// Read and parse the parent locator from the metadata region. + /// + /// Returns `Ok(None)` for base (non-differencing) disks. + /// Returns an error if the locator item is missing or corrupt. + pub async fn parent_locator(&self) -> Result, OpenError> { + if !self.has_parent { + return Ok(None); + } + let locator_data = self + .metadata_table + .read_item(&self.cache, false, &format::PARENT_LOCATOR_ITEM_GUID) + .await?; + Ok(Some(crate::locator::ParentLocator::parse(&locator_data)?)) + } + + /// Whether the disk was created with all blocks pre-allocated (fixed VHD). + pub fn is_fully_allocated(&self) -> bool { + self.is_fully_allocated + } + + /// SCSI VPD Page 83 identifier (stable disk identity). + pub fn page_83_data(&self) -> Guid { + self.page_83_data + } + + /// GUID changed on every virtual-disk data write. + pub fn data_write_guid(&self) -> Guid { + self.header_state.data_write_guid() + } + + /// Whether the file was opened in read-only mode. + pub fn is_read_only(&self) -> bool { + self.read_only + } + + /// Ensures the requested write mode is enabled, updating the header + /// and flushing if needed. If the current mode already satisfies the + /// request, this is a no-op. + /// + /// Hot path (mode already enabled): single atomic load, no lock. + /// Cold path (mode transition): acquires the header async mutex, + /// generates new GUIDs, writes the header, flushes, then publishes. + pub(crate) async fn enable_write_mode(&self, mode: WriteMode) -> Result<(), std::io::Error> { + let flush_sequencer = self.log_state.as_ref().map(|s| s.flush_sequencer.as_ref()); + self.header_state + .enable_write_mode(mode, self.file.as_ref(), flush_sequencer) + .await + } +} + +/// Validate the file identifier signature at offset 0. +async fn validate_file_identifier(file: &impl AsyncFile) -> Result<(), OpenError> { + // Read a full sector (not just the identifier struct) so that O_DIRECT + // backends with sector-alignment requirements work correctly. + let buf = file.alloc_buffer(4096); + let buf = file.read_into(0, buf).await.map_err(OpenErrorInner::Io)?; + + let ident = FileIdentifier::read_from_prefix(buf.as_ref()) + .map_err(|_| CorruptionType::InvalidFileIdentifier)? + .0; + + if ident.signature != format::FILE_IDENTIFIER_SIGNATURE { + return Err(CorruptionType::InvalidFileIdentifier.into()); + } + + Ok(()) +} + +/// Shared failure flag for poisoning the VHDX file from any task. +/// +/// Uses an `AtomicBool` for the fast path (`check`) and a mutex for +/// the error message. Once set, the flag is never cleared. +pub(crate) struct FailureFlag { + flag: std::sync::atomic::AtomicBool, + message: Mutex>, +} + +impl FailureFlag { + pub fn new() -> Self { + Self { + flag: std::sync::atomic::AtomicBool::new(false), + message: Mutex::new(None), + } + } + + /// Check whether the flag is set. Fast path: single atomic load. + pub fn check(&self) -> Result<(), VhdxIoError> { + if self.flag.load(std::sync::atomic::Ordering::Relaxed) + && let Some(msg) = self.message.lock().clone() + { + return Err(VhdxIoErrorInner::Failed(msg).into()); + } + Ok(()) + } + + /// Set the failure flag. 
First caller's message wins. + pub fn set(&self, error: &dyn std::error::Error) { + let mut msg = self.message.lock(); + if msg.is_none() { + *msg = Some(PipelineFailed(error.to_string())); + } + self.flag.store(true, std::sync::atomic::Ordering::Relaxed); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::create::{self, CreateParams}; + use crate::error::OpenError; + use crate::format::BatEntry; + use crate::format::BatEntryState; + use crate::format::Header; + use crate::format::MB1; + use crate::space::AllocateFlags; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + use zerocopy::IntoBytes; + + #[async_test] + async fn open_default_vhdx() { + let (file, params) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + assert_eq!(vhdx.disk_size(), format::GB1); + assert_eq!(vhdx.block_size(), format::DEFAULT_BLOCK_SIZE); + assert_eq!(vhdx.logical_sector_size(), 512); + assert_eq!(vhdx.physical_sector_size(), 512); + assert!(!vhdx.has_parent()); + assert!(!vhdx.is_fully_allocated()); + assert!(vhdx.is_read_only()); + assert_ne!(vhdx.data_write_guid(), Guid::ZERO); + assert_eq!(vhdx.data_write_guid(), params.data_write_guid); + } + + #[async_test] + async fn open_4k_sector_vhdx() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + logical_sector_size: 4096, + physical_sector_size: 4096, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert_eq!(vhdx.logical_sector_size(), 4096); + assert_eq!(vhdx.physical_sector_size(), 4096); + } + + #[async_test] + async fn open_512_sector_vhdx() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + logical_sector_size: 512, + physical_sector_size: 512, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert_eq!(vhdx.logical_sector_size(), 512); + assert_eq!(vhdx.physical_sector_size(), 512); + } + + #[async_test] + async fn open_various_block_sizes() { + for &block_size in &[ + MB1 as u32, + 2 * MB1 as u32, + 32 * MB1 as u32, + 256 * MB1 as u32, + ] { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + block_size, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert_eq!(vhdx.block_size(), block_size); + } + } + + #[async_test] + async fn open_differencing_disk() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert!(vhdx.has_parent()); + } + + #[async_test] + async fn open_fully_allocated() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + is_fully_allocated: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert!(vhdx.is_fully_allocated()); + } + + #[async_test] + async fn open_dirty_log_no_valid_entries() { + // Setting log_guid to a random GUID without writing matching log + // entries causes replay_log to return NoValidLogEntries. 
+ let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Overwrite header 2's log_guid with a non-zero GUID, then fix the CRC. + let mut buf = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_2, &mut buf) + .await + .unwrap(); + + let mut header = Header::read_from_prefix(&buf).unwrap().0.clone(); + header.log_guid = Guid::new_random(); + header.checksum = 0; + + let header_bytes = header.as_bytes(); + buf[..header_bytes.len()].copy_from_slice(header_bytes); + let crc = format::compute_checksum(&buf, 4); + buf[4..8].copy_from_slice(&crc.to_le_bytes()); + file.write_at(format::HEADER_OFFSET_2, &buf).await.unwrap(); + + let result = VhdxFile::open(file).allow_replay(true).read_only().await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::NoValidLogEntries + ))) + )); + } + + #[async_test] + async fn open_invalid_file_identifier() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Corrupt the file identifier signature. + file.write_at(0, b"BADMAGIC").await.unwrap(); + + let result = VhdxFile::open(file).read_only().await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::InvalidFileIdentifier + ))) + )); + } + + #[async_test] + async fn open_empty_file() { + // File smaller than HEADER_AREA_SIZE (1 MiB). + let file = InMemoryFile::new(512); + let result = VhdxFile::open(file).read_only().await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::EmptyFile + ))) + )); + } + + #[async_test] + async fn open_bat_block_lookup() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + // A newly created dynamic disk has all blocks as NotPresent. + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::NotPresent); + assert_eq!(mapping.file_offset(), 0); + } + + #[async_test] + async fn open_bat_all_blocks_default() { + let disk_size = 4 * MB1; // Small disk → 2 blocks. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let block_count = (disk_size / vhdx.block_size() as u64) as u32; + + for block in 0..block_count { + let mapping = vhdx.bat.get_block_mapping(block); + assert_eq!(mapping.bat_state(), BatEntryState::NotPresent); + assert_eq!(mapping.file_offset(), 0); + } + } + + #[async_test] + async fn open_read_only_flag() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert!(vhdx.is_read_only()); + } + + #[async_test] + async fn open_populates_in_memory_bat() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + // All payload entries should be NotPresent. 
+ for i in 0..vhdx.bat.data_block_count { + assert_eq!( + vhdx.bat.get_block_mapping(i).bat_state(), + BatEntryState::NotPresent, + "block {i} should be NotPresent" + ); + } + } + + #[async_test] + async fn open_with_allocated_blocks() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = parse_region_tables(&file).await.unwrap(); + + // Manually write a FullyPresent BAT entry for block 0 at offset 4 MB + // (just after the metadata region, within the file). + // First extend the file to cover the block (4 MB offset + 2 MB block = 6 MB). + file.set_file_size(6 * MB1).await.unwrap(); + + let entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(4); + file.write_at(regions.bat_offset, entry.as_bytes()) + .await + .unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::FullyPresent,); + assert_eq!(mapping.file_megabyte(), 4); + } + + #[async_test] + async fn bat_lookup_is_synchronous() { + // Compile-time verification: get_block_mapping() is a regular fn, + // not an async fn. We call it without .await. + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::NotPresent); + } + + #[async_test] + async fn eof_counter_no_overlap() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open_inner(file, false, None, &OpenOptions::new()) + .await + .unwrap(); + let mut eof = vhdx.allocation_lock.lock().await; + let a = vhdx + .allocate_space(&mut eof, MB1 as u32, AllocateFlags::new()) + .await + .unwrap(); + let b = vhdx + .allocate_space(&mut eof, MB1 as u32, AllocateFlags::new()) + .await + .unwrap(); + // Two allocations must not overlap. + assert_ne!(a.file_offset, b.file_offset); + assert!(b.file_offset >= a.file_offset + MB1); + } + + #[async_test] + async fn eof_counter_mb_aligned() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open_inner(file, false, None, &OpenOptions::new()) + .await + .unwrap(); + let mut eof = vhdx.allocation_lock.lock().await; + let result = vhdx + .allocate_space(&mut eof, MB1 as u32, AllocateFlags::new()) + .await + .unwrap(); + assert_eq!(result.file_offset % MB1, 0, "offset must be MB1-aligned"); + } + + #[async_test] + async fn open_with_allocated_blocks_inits_space() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = parse_region_tables(&file).await.unwrap(); + + // Extend file to 8 MB then write a FullyPresent BAT entry at offset 4 MB. + file.set_file_size(8 * MB1).await.unwrap(); + + let entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(4); + file.write_at(regions.bat_offset, entry.as_bytes()) + .await + .unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + // The free space tracker should have offset 4*MB marked as in-use. 
+ let eof = vhdx.allocation_lock.lock().await; + assert!( + vhdx.free_space + .is_range_in_use(&eof, 4 * MB1, vhdx.block_size()) + ); + } + + #[async_test] + async fn non_differencing_no_locator() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert!(!vhdx.has_parent()); + assert!(vhdx.parent_locator().await.unwrap().is_none()); + } + + /// Helper: inject a parent locator metadata entry and blob into a diff disk. + /// + /// Reads the existing metadata table, appends a new entry for the parent + /// locator GUID, writes the locator blob at the entry's data offset, and + /// updates the metadata table header's entry count. + async fn inject_parent_locator(file: &InMemoryFile, locator_blob: &[u8]) { + use crate::format::{MetadataTableEntry, MetadataTableEntryFlags, MetadataTableHeader}; + use zerocopy::{FromBytes, IntoBytes}; + + let regions = parse_region_tables(file).await.unwrap(); + + // Read the full metadata table (first 64 KiB of metadata region). + let mut table_buf = vec![0u8; format::METADATA_TABLE_SIZE as usize]; + file.read_at(regions.metadata_offset, &mut table_buf) + .await + .unwrap(); + + // Parse header to get current entry count. + let mut header = MetadataTableHeader::read_from_prefix(&table_buf) + .unwrap() + .0 + .clone(); + let old_count = header.entry_count as usize; + let entry_size = size_of::(); + let header_size = size_of::(); + + // Find the max data offset used by existing entries to place our blob after them. + let mut max_data_end: u32 = format::METADATA_TABLE_SIZE as u32; + for i in 0..old_count { + let off = header_size + i * entry_size; + let entry = MetadataTableEntry::read_from_prefix(&table_buf[off..]) + .unwrap() + .0 + .clone(); + if entry.length > 0 { + let end = entry.offset + entry.length; + if end > max_data_end { + max_data_end = end; + } + } + } + + // Place the parent locator blob right after existing data. + let locator_offset = max_data_end; + + // Write the new entry. + let new_entry = MetadataTableEntry { + item_id: format::PARENT_LOCATOR_ITEM_GUID, + offset: locator_offset, + length: locator_blob.len() as u32, + flags: MetadataTableEntryFlags::new().with_is_required(true), + reserved2: 0, + }; + let new_entry_file_offset = header_size + old_count * entry_size; + let e_bytes = new_entry.as_bytes(); + table_buf[new_entry_file_offset..new_entry_file_offset + e_bytes.len()] + .copy_from_slice(e_bytes); + + // Update header entry count. + header.entry_count = (old_count + 1) as u16; + let h_bytes = header.as_bytes(); + table_buf[..h_bytes.len()].copy_from_slice(h_bytes); + + // Write back the metadata table. + file.write_at(regions.metadata_offset, &table_buf) + .await + .unwrap(); + + // Write the locator blob into the metadata region data area. + file.write_at( + regions.metadata_offset + locator_offset as u64, + locator_blob, + ) + .await + .unwrap(); + } + + #[async_test] + async fn differencing_has_locator() { + use crate::locator; + + // Create a differencing disk. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + // Build a parent locator blob and inject it into the metadata region. 
+ let locator_blob = locator::build_locator( + format::PARENT_LOCATOR_VHDX_TYPE_GUID, + &[ + ("parent_linkage", "{some-guid}"), + ("relative_path", ".\\parent.vhdx"), + ("absolute_win32_path", "C:\\VMs\\parent.vhdx"), + ], + ); + inject_parent_locator(&file, &locator_blob).await; + + // Open and verify. + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert!(vhdx.has_parent()); + + let loc = vhdx + .parent_locator() + .await + .unwrap() + .expect("should have locator"); + assert_eq!(loc.locator_type, format::PARENT_LOCATOR_VHDX_TYPE_GUID); + assert_eq!(loc.find("parent_linkage"), Some("{some-guid}")); + assert_eq!(loc.find("relative_path"), Some(".\\parent.vhdx")); + assert_eq!( + loc.find("absolute_win32_path"), + Some("C:\\VMs\\parent.vhdx") + ); + } + + #[async_test] + async fn parent_paths_extraction() { + use crate::locator; + + // Create a differencing disk with a parent locator. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let locator_blob = locator::build_locator( + format::PARENT_LOCATOR_VHDX_TYPE_GUID, + &[ + ("parent_linkage", "{some-guid}"), + ("relative_path", ".\\parent.vhdx"), + ("absolute_win32_path", "C:\\VMs\\parent.vhdx"), + ], + ); + inject_parent_locator(&file, &locator_blob).await; + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let loc = vhdx + .parent_locator() + .await + .unwrap() + .expect("should have locator"); + let paths = loc.parent_paths(); + assert_eq!(paths.parent_linkage.as_deref(), Some("{some-guid}")); + assert_eq!(paths.relative_path.as_deref(), Some(".\\parent.vhdx")); + assert_eq!( + paths.absolute_win32_path.as_deref(), + Some("C:\\VMs\\parent.vhdx") + ); + assert!(paths.volume_path.is_none()); + } + + #[async_test] + async fn differencing_missing_locator_errors() { + // Create a diff disk but don't write any locator data. + // create() doesn't add a parent locator entry, so read_item() will + // return MissingRequiredMetadata. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert!(vhdx.has_parent()); + let result = vhdx.parent_locator().await; + assert!(result.is_err()); + } + + // ----------------------------------------------------------------------- + // Log replay integration tests + // ----------------------------------------------------------------------- + + /// Inject a dirty log into a VHDX file: + /// 1. Write log entries using LogWriter + /// 2. Set the header's log_guid to match + /// 3. Update header CRC + /// + /// Returns the log_guid used. + async fn inject_dirty_log( + file: &InMemoryFile, + data_pages: &[log::DataPage<'_>], + zero_ranges: &[log::ZeroRange], + ) -> Guid { + // Read the active header (header 2, sequence_number=1 after create). + let mut hdr_buf = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_2, &mut hdr_buf) + .await + .unwrap(); + let header = Header::read_from_prefix(&hdr_buf).unwrap().0.clone(); + + let log_guid = Guid::new_random(); + let log_region = LogRegion { + file_offset: header.log_offset, + length: header.log_length, + }; + + // Initialize a LogWriter and write the entry. 
+ let file_size = file.file_size().await.unwrap(); + let mut writer = log::LogWriter::initialize(file, log_region, log_guid, file_size) + .await + .unwrap(); + + if !data_pages.is_empty() || !zero_ranges.is_empty() { + writer + .write_entry(file, data_pages, zero_ranges) + .await + .unwrap(); + } + + // Set log_guid in a new header with bumped sequence number. + // Write to header 1 (the non-current slot) with a higher sequence + // number so it becomes the active header. + let mut header_copy = header; + header_copy.log_guid = log_guid; + header_copy.sequence_number += 1; + header_copy.checksum = 0; + + let mut buf = vec![0u8; format::HEADER_SIZE as usize]; + let hdr_bytes = header_copy.as_bytes(); + buf[..hdr_bytes.len()].copy_from_slice(hdr_bytes); + let crc = format::compute_checksum(&buf, 4); + buf[4..8].copy_from_slice(&crc.to_le_bytes()); + + // Write to header 1 (which now has a higher seq, becoming active). + file.write_at(format::HEADER_OFFSET_1, &buf).await.unwrap(); + + log_guid + } + + #[async_test] + async fn open_replays_dirty_log_data() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Pick a target offset >= LOGABLE_OFFSET (192 KiB = region table offset). + // Use 320 KiB (= 5 * 64 KiB) to be past both region tables. + let target_offset: u64 = 5 * format::KB64; + + // Build a recognizable data pattern. + let pattern = [0xABu8; 4096]; + let data_page = log::DataPage { + file_offset: target_offset, + payload: &pattern, + }; + + inject_dirty_log(&file, &[data_page], &[]).await; + + // Open should replay the log and succeed. + let vhdx = VhdxFile::open(file) + .allow_replay(true) + .read_only() + .await + .unwrap(); + assert_eq!(vhdx.disk_size(), format::GB1); + + // Verify the data pattern was written at the target offset via the + // Arc inside the VhdxFile. + let mut readback = [0u8; 4096]; + vhdx.file + .read_at(target_offset, &mut readback) + .await + .unwrap(); + assert_eq!(readback, pattern); + } + + #[async_test] + async fn open_replays_dirty_log_zeros() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Write non-zero data at a target offset first. + let target_offset: u64 = 5 * format::KB64; + let non_zero = [0xFFu8; 4096]; + file.write_at(target_offset, &non_zero).await.unwrap(); + + // Inject a dirty log with a zero descriptor targeting that offset. + let zero_range = log::ZeroRange { + file_offset: target_offset, + length: 4096, + }; + + inject_dirty_log(&file, &[], &[zero_range]).await; + + // Open should replay the log and succeed. + let vhdx = VhdxFile::open(file) + .allow_replay(true) + .read_only() + .await + .unwrap(); + assert_eq!(vhdx.disk_size(), format::GB1); + + // Verify the range is now zeroed. + let mut readback = [0u8; 4096]; + vhdx.file + .read_at(target_offset, &mut readback) + .await + .unwrap(); + assert_eq!(readback, [0u8; 4096]); + } + + #[async_test] + async fn open_replay_then_reopen_clean() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + + let target_offset: u64 = 5 * format::KB64; + let pattern = [0xCDu8; 4096]; + let data_page = log::DataPage { + file_offset: target_offset, + payload: &pattern, + }; + + inject_dirty_log(&file, &[data_page], &[]).await; + + // First open triggers replay. + let vhdx = VhdxFile::open(file) + .allow_replay(true) + .read_only() + .await + .unwrap(); + // The clean header was written to the file inside vhdx. + // Make a snapshot of the replayed file for the second open. 
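+        // (`snapshot`/`from_snapshot` are assumed test-support helpers on
+        // InMemoryFile that copy the backing bytes, so the second open below
+        // observes exactly the post-replay on-disk state, including the
+        // header the replay path rewrote.)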
+ let snapshot = vhdx.file.snapshot(); + drop(vhdx); + + // Create a new InMemoryFile from the snapshot for the second open. + let file3 = InMemoryFile::from_snapshot(snapshot); + + // Second open should succeed without replay (log_guid is now ZERO). + let vhdx2 = VhdxFile::open(file3).read_only().await.unwrap(); + assert_eq!(vhdx2.disk_size(), format::GB1); + } + + #[async_test] + async fn open_replay_corrupt_log_entry() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + + let target_offset: u64 = 5 * format::KB64; + let pattern = [0xEEu8; 4096]; + let data_page = log::DataPage { + file_offset: target_offset, + payload: &pattern, + }; + + let _log_guid = inject_dirty_log(&file, &[data_page], &[]).await; + + // Read the active header to find the log region offset. + let mut hdr_buf = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_1, &mut hdr_buf) + .await + .unwrap(); + let header = Header::read_from_prefix(&hdr_buf).unwrap().0.clone(); + + // Corrupt the first byte of the log region (flip a byte in the CRC + // of the log entry). + let mut corrupt_buf = [0u8; 1]; + file.read_at(header.log_offset + 4, &mut corrupt_buf) + .await + .unwrap(); + corrupt_buf[0] ^= 0xFF; + file.write_at(header.log_offset + 4, &corrupt_buf) + .await + .unwrap(); + + // Open should fail because there are no valid log entries for this GUID. + let result = VhdxFile::open(file).allow_replay(true).read_only().await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::NoValidLogEntries + ))) + )); + } + + #[async_test] + async fn open_read_only_dirty_log_rejected() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + + let target_offset: u64 = 5 * format::KB64; + let pattern = [0xBBu8; 4096]; + let data_page = log::DataPage { + file_offset: target_offset, + payload: &pattern, + }; + + inject_dirty_log(&file, &[data_page], &[]).await; + + // Read-only open with a dirty log should return LogReplayRequired. + let result = VhdxFile::open(file).read_only().await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::LogReplayRequired + ))) + )); + } +} diff --git a/vm/devices/storage/vhdx/src/region.rs b/vm/devices/storage/vhdx/src/region.rs new file mode 100644 index 0000000000..69e13f775f --- /dev/null +++ b/vm/devices/storage/vhdx/src/region.rs @@ -0,0 +1,527 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Region table parsing and validation for VHDX files. +//! +//! Reads both region tables, validates their signatures and CRC-32C checksums, +//! identifies BAT and metadata regions, and checks for overlaps and duplicates. + +use crate::AsyncFile; +use crate::cache::PageCache; +use crate::error::CorruptionType; +use crate::error::OpenError; +use crate::error::OpenErrorInner; +use crate::format; +use crate::format::RegionTableEntry; +use crate::format::RegionTableHeader; +use crate::log_task::LogData; +use crate::log_task::Lsn; +use std::sync::Arc; +use zerocopy::FromBytes; + +/// Parsed region table data. +pub(crate) struct ParsedRegions { + /// File offset of the BAT region. + pub bat_offset: u64, + /// Length of the BAT region in bytes. + pub bat_length: u32, + /// File offset of the metadata region. + pub metadata_offset: u64, + /// Length of the metadata region in bytes. + pub metadata_length: u32, + /// The validated region table bytes. Present only when the two on-disk + /// copies don't match and need rewriting. 
+ pub rewrite_data: Option, +} + +/// Read and validate a single 64 KiB region table from the file. +/// Returns the raw bytes if valid, or `None` if the table is corrupt. +async fn read_and_validate_region_table( + file: &F, + offset: u64, +) -> Result, OpenError> { + let buf = file.alloc_buffer(format::REGION_TABLE_SIZE as usize); + let buf = file + .read_into(offset, buf) + .await + .map_err(OpenErrorInner::Io)?; + let buf_ref = buf.as_ref(); + + // Check signature. + let header = match RegionTableHeader::read_from_prefix(buf_ref) { + Ok((h, _)) => h, + Err(_) => return Ok(None), + }; + if header.signature != format::REGION_TABLE_SIGNATURE { + return Ok(None); + } + + // Validate CRC-32C checksum (checksum field is at byte offset 4). + if !format::validate_checksum(buf_ref, 4) { + return Ok(None); + } + + Ok(Some(buf)) +} + +/// Read both region tables from the file, validate, and extract BAT/metadata +/// region locations. +pub(crate) async fn parse_region_tables( + file: &F, +) -> Result, OpenError> { + let table1 = read_and_validate_region_table(file, format::REGION_TABLE_OFFSET).await?; + let table2 = read_and_validate_region_table(file, format::ALT_REGION_TABLE_OFFSET).await?; + + let (table, needs_rewrite) = match (table1, table2) { + (Some(t1), Some(t2)) => { + let needs_rewrite = t1.as_ref() != t2.as_ref(); + (t1, needs_rewrite) + } + (Some(t1), None) => (t1, true), + (None, Some(t2)) => (t2, true), + (None, None) => return Err(CorruptionType::RegionTablesBothCorrupt.into()), + }; + + // Parse the header to get entry count. + let header = RegionTableHeader::read_from_prefix(table.as_ref()) + .unwrap() + .0 + .clone(); + + if header.entry_count as u64 > format::REGION_TABLE_MAX_ENTRY_COUNT { + return Err(CorruptionType::InvalidEntryCountInRegionTable.into()); + } + if header.reserved != 0 { + return Err(CorruptionType::ReservedRegionTableFieldNonzero.into()); + } + + // Parse all entries. + let entry_size = size_of::(); + let header_size = size_of::(); + let mut entries = Vec::with_capacity(header.entry_count as usize); + for i in 0..header.entry_count as usize { + let offset = header_size + i * entry_size; + let entry = RegionTableEntry::read_from_prefix(&table.as_ref()[offset..]) + .unwrap() + .0 + .clone(); + + let supported = u32::from(format::RegionTableEntryFlags::new().with_required(true)); + if u32::from(entry.flags) & !supported != 0 { + return Err(CorruptionType::ReservedRegionTableFieldNonzero.into()); + } + entries.push(entry); + } + + // Sort by GUID for duplicate detection. + entries.sort_by_key(|a| a.guid); + + // Check for duplicate GUIDs. + for i in 1..entries.len() { + if entries[i].guid == entries[i - 1].guid { + return Err(CorruptionType::DuplicateRegionEntry.into()); + } + } + + // Validate each entry's offset and length. + for entry in &entries { + if entry.length == 0 { + return Err(CorruptionType::OffsetOrLengthInRegionTable.into()); + } + if !entry.file_offset.is_multiple_of(format::REGION_ALIGNMENT) + || !(entry.length as u64).is_multiple_of(format::REGION_ALIGNMENT) + { + return Err(CorruptionType::OffsetOrLengthInRegionTable.into()); + } + } + + // Check for overlapping regions by sorting by offset. + let mut by_offset: Vec<(u64, u64)> = entries + .iter() + .map(|e| (e.file_offset, e.length as u64)) + .collect(); + // Also include the 1 MiB header area as a reserved region. 
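+    // (The fixed header area occupies file offsets [0, 1 MiB); adding it as
+    // an implicit entry lets the same sorted-overlap scan below also reject a
+    // region entry whose offset falls inside the headers, e.g. one claiming
+    // to start at offset 0.)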
+ by_offset.push((0, format::HEADER_AREA_SIZE)); + by_offset.sort_by_key(|&(offset, _)| offset); + + for i in 1..by_offset.len() { + let prev_end = by_offset[i - 1] + .0 + .checked_add(by_offset[i - 1].1) + .ok_or(CorruptionType::OffsetOrLengthInRegionTable)?; + if prev_end > by_offset[i].0 { + return Err(CorruptionType::OffsetOrLengthInRegionTable.into()); + } + } + + // Identify known regions. + let mut bat_offset = None; + let mut bat_length = None; + let mut metadata_offset = None; + let mut metadata_length = None; + + for entry in &entries { + if entry.guid == format::BAT_REGION_GUID { + bat_offset = Some(entry.file_offset); + bat_length = Some(entry.length); + } else if entry.guid == format::METADATA_REGION_GUID { + metadata_offset = Some(entry.file_offset); + metadata_length = Some(entry.length); + } else if entry.flags.required() { + return Err(CorruptionType::UnknownRequiredRegion.into()); + } + // Unknown non-required regions are silently ignored. + } + + let bat_offset = bat_offset.ok_or(CorruptionType::MissingBatOrMetadataRegion)?; + let bat_length = bat_length.ok_or(CorruptionType::MissingBatOrMetadataRegion)?; + let metadata_offset = metadata_offset.ok_or(CorruptionType::MissingBatOrMetadataRegion)?; + let metadata_length = metadata_length.ok_or(CorruptionType::MissingBatOrMetadataRegion)?; + + Ok(ParsedRegions { + bat_offset, + bat_length, + metadata_offset, + metadata_length, + rewrite_data: if needs_rewrite { + Some(table.clone()) + } else { + None + }, + }) +} + +/// Write the region table to both on-disk slots via the write-ahead log. +/// +/// Called during [`VhdxBuilder::writable`](crate::open::VhdxBuilder::writable) +/// when one region table was corrupt or the two copies didn't match. Acquires +/// log permits, sends the pages through [`PageCache::commit_raw`], and returns +/// the LSN. The caller must wait for the LSN and flush to make the writes +/// durable. +pub(crate) async fn rewrite_region_tables( + cache: &PageCache, + log_permits: &crate::log_permits::LogPermits, + table: F::Buffer, +) -> Result { + assert_eq!( + table.as_ref().len(), + format::REGION_TABLE_SIZE as usize, + "region table must be exactly {} bytes", + format::REGION_TABLE_SIZE + ); + + let log_data_page_size = format::LOG_SECTOR_SIZE as usize; + let pages_per_table = format::REGION_TABLE_SIZE as usize / log_data_page_size; + let total_pages = pages_per_table * 2; + let table = Arc::new(table); + let mut pages = Vec::with_capacity(2); + + for base_offset in [format::REGION_TABLE_OFFSET, format::ALT_REGION_TABLE_OFFSET] { + pages.push(LogData::new(base_offset, table.clone())); + } + + log_permits.acquire(total_pages).await?; + Ok(cache.commit_raw(pages, None)) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::error::OpenErrorInner; + use crate::open::VhdxFile; + use crate::tests::support::InMemoryFile; + use pal_async::DefaultDriver; + use pal_async::async_test; + use zerocopy::IntoBytes; + + #[async_test] + async fn parse_valid_region_tables() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = parse_region_tables(&file).await.unwrap(); + + // Metadata at 2 MiB, BAT at 3 MiB (based on create layout). 
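+        // (rewrite_data being None below additionally confirms that both
+        // on-disk table copies were valid and identical, so no repair is
+        // pending.)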
+ assert_eq!(regions.metadata_offset, 2 * format::MB1); + assert_eq!( + regions.metadata_length, + format::DEFAULT_METADATA_REGION_SIZE + ); + assert_eq!(regions.bat_offset, 3 * format::MB1); + assert!(regions.rewrite_data.is_none()); + } + + #[async_test] + async fn parse_one_corrupt_table() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Corrupt the first region table's CRC. + let mut buf = vec![0u8; format::REGION_TABLE_SIZE as usize]; + file.read_at(format::REGION_TABLE_OFFSET, &mut buf) + .await + .unwrap(); + buf[10] ^= 0xFF; + file.write_at(format::REGION_TABLE_OFFSET, &buf) + .await + .unwrap(); + + let regions = parse_region_tables(&file).await.unwrap(); + assert!(regions.rewrite_data.is_some()); + // Should still parse successfully using table 2. + assert_eq!(regions.metadata_offset, 2 * format::MB1); + } + + #[async_test] + async fn parse_both_corrupt() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Corrupt both region tables. + for offset in [format::REGION_TABLE_OFFSET, format::ALT_REGION_TABLE_OFFSET] { + let mut buf = vec![0u8; format::REGION_TABLE_SIZE as usize]; + file.read_at(offset, &mut buf).await.unwrap(); + buf[10] ^= 0xFF; + file.write_at(offset, &buf).await.unwrap(); + } + + let result = parse_region_tables(&file).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::RegionTablesBothCorrupt + ))) + )); + } + + /// Helper to build a custom region table and write it to both locations. + async fn write_custom_region_table(file: &InMemoryFile, entries: &[RegionTableEntry]) { + let mut buf = vec![0u8; format::REGION_TABLE_SIZE as usize]; + let header = RegionTableHeader { + signature: format::REGION_TABLE_SIGNATURE, + checksum: 0, + entry_count: entries.len() as u32, + reserved: 0, + }; + let h_bytes = header.as_bytes(); + buf[..h_bytes.len()].copy_from_slice(h_bytes); + + let entry_start = size_of::(); + for (i, entry) in entries.iter().enumerate() { + let off = entry_start + i * size_of::(); + let e_bytes = entry.as_bytes(); + buf[off..off + e_bytes.len()].copy_from_slice(e_bytes); + } + + let crc = format::compute_checksum(&buf, 4); + buf[4..8].copy_from_slice(&crc.to_le_bytes()); + + file.write_at(format::REGION_TABLE_OFFSET, &buf) + .await + .unwrap(); + file.write_at(format::ALT_REGION_TABLE_OFFSET, &buf) + .await + .unwrap(); + } + + #[async_test] + async fn parse_missing_bat_region() { + let file = InMemoryFile::new(format::HEADER_AREA_SIZE); + // Only metadata region, no BAT. 
+ let entries = vec![RegionTableEntry { + guid: format::METADATA_REGION_GUID, + file_offset: 2 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }]; + write_custom_region_table(&file, &entries).await; + + let result = parse_region_tables(&file).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::MissingBatOrMetadataRegion + ))) + )); + } + + #[async_test] + async fn parse_duplicate_region() { + let file = InMemoryFile::new(format::HEADER_AREA_SIZE); + let entries = vec![ + RegionTableEntry { + guid: format::BAT_REGION_GUID, + file_offset: 2 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + RegionTableEntry { + guid: format::BAT_REGION_GUID, + file_offset: 3 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + ]; + write_custom_region_table(&file, &entries).await; + + let result = parse_region_tables(&file).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::DuplicateRegionEntry + ))) + )); + } + + async fn corrupt_region_table_bytes( + file: &InMemoryFile, + entries: &[RegionTableEntry], + corrupt: impl FnOnce(&mut [u8]), + ) { + write_custom_region_table(file, entries).await; + + let mut buf = vec![0u8; format::REGION_TABLE_SIZE as usize]; + file.read_at(format::REGION_TABLE_OFFSET, &mut buf) + .await + .unwrap(); + corrupt(&mut buf); + let crc = format::compute_checksum(&buf, 4); + buf[4..8].copy_from_slice(&crc.to_le_bytes()); + + file.write_at(format::REGION_TABLE_OFFSET, &buf) + .await + .unwrap(); + file.write_at(format::ALT_REGION_TABLE_OFFSET, &buf) + .await + .unwrap(); + } + + #[async_test] + async fn parse_region_table_with_nonzero_reserved_header() { + let file = InMemoryFile::new(format::HEADER_AREA_SIZE); + let entries = vec![ + RegionTableEntry { + guid: format::BAT_REGION_GUID, + file_offset: 2 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + RegionTableEntry { + guid: format::METADATA_REGION_GUID, + file_offset: 3 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + ]; + corrupt_region_table_bytes(&file, &entries, |buf| { + buf[12..16].copy_from_slice(&1_u32.to_le_bytes()); + }) + .await; + + let result = parse_region_tables(&file).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::ReservedRegionTableFieldNonzero + ))) + )); + } + + #[async_test] + async fn parse_region_table_with_reserved_entry_flags() { + let file = InMemoryFile::new(format::HEADER_AREA_SIZE); + let entries = vec![ + RegionTableEntry { + guid: format::BAT_REGION_GUID, + file_offset: 2 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + RegionTableEntry { + guid: format::METADATA_REGION_GUID, + file_offset: 3 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + ]; + corrupt_region_table_bytes(&file, &entries, |buf| { + let entry_flags_offset = size_of::() + 28; + buf[entry_flags_offset..entry_flags_offset + 4].copy_from_slice(&3_u32.to_le_bytes()); + }) + .await; + + let result = parse_region_tables(&file).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + 
CorruptionType::ReservedRegionTableFieldNonzero + ))) + )); + } + + #[async_test] + async fn parse_overlapping_regions() { + let file = InMemoryFile::new(format::HEADER_AREA_SIZE); + // Two regions that overlap at the 2 MiB mark. + let entries = vec![ + RegionTableEntry { + guid: format::BAT_REGION_GUID, + file_offset: 2 * format::MB1, + length: 2 * format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + RegionTableEntry { + guid: format::METADATA_REGION_GUID, + file_offset: 3 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + ]; + write_custom_region_table(&file, &entries).await; + + let result = parse_region_tables(&file).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::OffsetOrLengthInRegionTable + ))) + )); + } + + #[async_test] + async fn rewrite_repairs_corrupt_table(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Corrupt the first region table. + let mut buf = vec![0u8; format::REGION_TABLE_SIZE as usize]; + file.read_at(format::REGION_TABLE_OFFSET, &mut buf) + .await + .unwrap(); + buf[10] ^= 0xFF; + file.write_at(format::REGION_TABLE_OFFSET, &buf) + .await + .unwrap(); + + // Opening writable should detect and repair the mismatch via the log. + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let file_ref = vhdx.file.clone(); + vhdx.close().await.unwrap(); + + // Parse again — both should match now. + let regions2 = parse_region_tables(&*file_ref).await.unwrap(); + assert!( + regions2.rewrite_data.is_none(), + "tables should match after rewrite" + ); + + // Verify both on-disk copies are identical. + let mut t1 = vec![0u8; format::REGION_TABLE_SIZE as usize]; + let mut t2 = vec![0u8; format::REGION_TABLE_SIZE as usize]; + file_ref + .read_at(format::REGION_TABLE_OFFSET, &mut t1) + .await + .unwrap(); + file_ref + .read_at(format::ALT_REGION_TABLE_OFFSET, &mut t2) + .await + .unwrap(); + assert_eq!(t1, t2, "both region tables should be identical"); + } +} diff --git a/vm/devices/storage/vhdx/src/sector_bitmap.rs b/vm/devices/storage/vhdx/src/sector_bitmap.rs new file mode 100644 index 0000000000..b29e1309ef --- /dev/null +++ b/vm/devices/storage/vhdx/src/sector_bitmap.rs @@ -0,0 +1,688 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Sector bitmap read logic for partially-present VHDX blocks. +//! +//! A sector bitmap is a 1-MiB block of bits where each bit represents one +//! logical sector. Bit = 1 means the sector's data is present in this VHDX +//! file; bit = 0 means the sector is transparent (should be read from the +//! parent disk in a differencing chain). +//! +//! The bitmap is cached in 4-KiB pages via the [`PageCache`]. Each page +//! covers `4096 * 8 = 32768` sectors. + +use crate::AsyncFile; +use crate::cache::PageKey; +use crate::cache::WriteMode; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::format::BatEntryState; +use crate::format::CACHE_PAGE_SIZE; +use crate::format::SECTORS_PER_CHUNK; +use crate::io::ReadRange; +use crate::open::VhdxFile; +use bitvec::prelude::*; + +/// Cache tag for sector bitmap pages. +/// +/// SBM pages are at absolute file offsets (not region-relative), so the +/// base offset for this tag is 0. +pub(crate) const SBM_TAG: u8 = 2; + +/// Number of sectors tracked per bitmap cache page (4 KiB * 8 bits = 32768). 
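+/// With 512-byte logical sectors that is 16 MiB of guest address space per
+/// cached page, and the full 1 MiB bitmap block (2^23 bits) covers one chunk
+/// of 2^23 sectors.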
+const SECTORS_PER_BITMAP_PAGE: u64 = CACHE_PAGE_SIZE * 8; + +impl VhdxFile { + /// Resolve a read for a partially-present block by reading the sector bitmap. + /// + /// For each sector in the range, checks the corresponding bit in the sector + /// bitmap. Emits runs of [`ReadRange::Data`] (bit=1, sector present in file) + /// and [`ReadRange::Unmapped`] (bit=0, sector transparent to parent). + /// + /// # Arguments + /// + /// * `data_file_offset` - The file offset of the data block (from the + /// payload BAT entry). Used to compute file offsets for present sectors. + /// * `virtual_offset` - The virtual disk byte offset of the start of this + /// sub-request (already clamped to a single block). + /// * `length` - The length in bytes (already clamped to a single block). + /// * `ranges` - Output vector to append ranges to. + pub(crate) async fn resolve_partial_block_read( + &self, + data_file_offset: u64, + virtual_offset: u64, + length: u32, + ranges: &mut Vec, + ) -> Result<(), VhdxIoError> { + // 1. Compute sector coordinates. + let sector_number = virtual_offset / self.logical_sector_size as u64; + let chunk_number = (sector_number / SECTORS_PER_CHUNK) as u32; + let sector_count = length as u64 / self.logical_sector_size as u64; + + // 2. Get sector bitmap block mapping (synchronous). + // The SBM is guaranteed to be allocated — validated during BAT + // loading (PartiallyPresentWithoutSectorBitmap check) and + // maintained at runtime by ensure_sbm_allocated. + let sbm_mapping = self.bat.get_sector_bitmap_mapping(chunk_number); + assert!( + sbm_mapping.bat_state() == BatEntryState::FullyPresent, + "SBM for chunk {chunk_number} must be allocated for PartiallyPresent block" + ); + + // 3. Iterate over bitmap pages (outer loop for multi-page support). + let mut remaining_sectors = sector_count; + let mut current_virtual_offset = virtual_offset; + + while remaining_sectors > 0 { + // Recompute bitmap page coordinates for current position. + let cur_sector = current_virtual_offset / self.logical_sector_size as u64; + let cur_chunk_sector = cur_sector % SECTORS_PER_CHUNK; + let cur_page_number = cur_chunk_sector / SECTORS_PER_BITMAP_PAGE; + let start_bit = cur_chunk_sector % SECTORS_PER_BITMAP_PAGE; + let bits_in_this_page = + std::cmp::min(start_bit + remaining_sectors, SECTORS_PER_BITMAP_PAGE); + + // Acquire the bitmap page for this portion. + let page_file_offset = sbm_mapping.file_offset() + cur_page_number * CACHE_PAGE_SIZE; + { + let guard = self + .cache + .acquire_read(PageKey { + tag: SBM_TAG, + offset: page_file_offset, + }) + .await + .map_err(VhdxIoErrorInner::ReadSectorBitmap)?; + + // Scan bits within this page using BitSlice for word-level acceleration. + let bits = BitSlice::::from_slice(&*guard); + let window = &bits[start_bit as usize..bits_in_this_page as usize]; + let mut pos = 0usize; + let len = window.len(); + while pos < len { + // Find first set bit (data present). + let one = window[pos..].first_one().map_or(len, |i| pos + i); + if one > pos { + let unmapped_sectors = (one - pos) as u64; + let unmapped_bytes = unmapped_sectors * self.logical_sector_size as u64; + ranges.push(ReadRange::Unmapped { + guest_offset: current_virtual_offset, + length: unmapped_bytes as u32, + }); + current_virtual_offset += unmapped_bytes; + } + + if one < len { + // Find first clear bit (end of data run). 
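+                    // (Example: a window of bits 1,1,0,1 with `one` = 0 gives
+                    // `next_zero` = 2, so a two-sector Data range is emitted
+                    // and the scan resumes at bit 2.)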
+ let next_zero = window[one..].first_zero().map_or(len, |i| one + i); + let data_sectors = (next_zero - one) as u64; + let data_bytes = data_sectors * self.logical_sector_size as u64; + let block_offset = (current_virtual_offset % self.block_size as u64) as u32; + let file_offset = data_file_offset + block_offset as u64; + ranges.push(ReadRange::Data { + guest_offset: current_virtual_offset, + length: data_bytes as u32, + file_offset, + }); + current_virtual_offset += data_bytes; + pos = next_zero; + } else { + pos = len; + } + } + } + + // Advance to next page. + let sectors_processed = bits_in_this_page - start_bit; + remaining_sectors -= sectors_processed; + } + + Ok(()) + } + + /// Set or clear sector bitmap bits for a range of sectors. + /// + /// For each sector in the virtual range, sets (or clears) the corresponding + /// bit in the sector bitmap. The bitmap page is acquired in Modify mode + /// and written through to disk on release. + /// + /// # Arguments + /// + /// * `virtual_offset` - Virtual disk byte offset of the start of the range. + /// * `length` - Length in bytes. + /// * `set` - If true, set bits (mark sectors present); if false, clear bits. + pub(crate) async fn set_sector_bitmap_bits( + &self, + virtual_offset: u64, + length: u32, + set: bool, + ) -> Result<(), VhdxIoError> { + let sector_number = virtual_offset / self.logical_sector_size as u64; + let chunk_number = (sector_number / SECTORS_PER_CHUNK) as u32; + let sector_count = length as u64 / self.logical_sector_size as u64; + + // Get sector bitmap block mapping (synchronous). + // The SBM is guaranteed to be allocated — validated during BAT + // loading and maintained at runtime by ensure_sbm_allocated. + let sbm_mapping = self.bat.get_sector_bitmap_mapping(chunk_number); + assert!( + sbm_mapping.bat_state() == BatEntryState::FullyPresent, + "SBM for chunk {chunk_number} must be allocated for PartiallyPresent block" + ); + + let mut remaining_sectors = sector_count; + let mut current_virtual_offset = virtual_offset; + + while remaining_sectors > 0 { + let cur_sector = current_virtual_offset / self.logical_sector_size as u64; + let cur_chunk_sector = cur_sector % SECTORS_PER_CHUNK; + let cur_page_number = cur_chunk_sector / SECTORS_PER_BITMAP_PAGE; + let start_bit = cur_chunk_sector % SECTORS_PER_BITMAP_PAGE; + let bits_in_this_page = + std::cmp::min(start_bit + remaining_sectors, SECTORS_PER_BITMAP_PAGE); + + let page_file_offset = sbm_mapping.file_offset() + cur_page_number * CACHE_PAGE_SIZE; + + // If the range covers the entire page, skip the disk read. + let full_page = start_bit == 0 && bits_in_this_page == SECTORS_PER_BITMAP_PAGE; + let mode = if full_page { + WriteMode::Overwrite + } else { + WriteMode::Modify + }; + + let mut guard = self + .cache + .acquire_write( + PageKey { + tag: SBM_TAG, + offset: page_file_offset, + }, + mode, + ) + .await + .map_err(VhdxIoErrorInner::SectorBitmapCache)?; + + if full_page { + // Overwrite entire page without reading existing data. + // Overwriting pages are zero-initialized by the cache. + if set || !guard.is_overwriting() { + guard.fill(if set { 0xFF } else { 0x00 }); + } + } else { + // Check via read-only Deref whether any bits actually differ. + // If not, DerefMut is never called, the page stays clean, + // and no write-back occurs. 
+ let bits = BitSlice::::from_slice(&*guard); + let window = &bits[start_bit as usize..bits_in_this_page as usize]; + let needs_change = if set { !window.all() } else { window.any() }; + + if needs_change { + let bits_mut = BitSlice::::from_slice_mut(&mut *guard); + bits_mut[start_bit as usize..bits_in_this_page as usize].fill(set); + } + } + + let sectors_processed = bits_in_this_page - start_bit; + remaining_sectors -= sectors_processed; + current_virtual_offset += sectors_processed * self.logical_sector_size as u64; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::bat::Bat; + use crate::create::{self, CreateParams}; + use crate::format; + use crate::format::BatEntry; + use crate::io::ReadRange; + use crate::open::VhdxFile; + use crate::region; + use crate::tests::support::InMemoryFile; + use pal_async::DefaultDriver; + use pal_async::async_test; + use zerocopy::IntoBytes; + + /// Create a differencing VHDX with block 0 set to PartiallyPresent and + /// a sector bitmap at a known file offset. + /// + /// The `bitmap_data` should be exactly 4096 bytes of bitmap data for the + /// first bitmap page. + /// + /// Returns `(VhdxFile, data_block_file_offset, sbm_block_file_offset)`. + async fn create_partial_block_vhdx( + bitmap_data: &[u8; 4096], + ) -> (VhdxFile, u64, u64) { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + // Parse regions to find the BAT offset. + let regions = region::parse_region_tables(&file).await.unwrap(); + let bat_offset = regions.bat_offset; + + // Compute entry indices. With 2 MiB blocks, 512-byte sectors, + // chunk_ratio = 2048. Block 0 is payload entry 0. SBM entry for + // chunk 0 is at index chunk_ratio = 2048. + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + true, + format::MB1 as u32, + ) + .unwrap(); + let payload_index = bat.payload_entry_index(0); + let sbm_index = bat.sector_bitmap_entry_index(0); + + // Place data block at 8 MiB (file_offset_mb = 8). + let data_block_offset = 8 * format::MB1; + let data_entry = BatEntry::new() + .with_state(BatEntryState::PartiallyPresent as u8) + .with_file_offset_mb(data_block_offset >> 20); + file.write_at(bat_offset + payload_index as u64 * 8, data_entry.as_bytes()) + .await + .unwrap(); + + // Place SBM block at 10 MiB (file_offset_mb = 10). + let sbm_block_offset = 10 * format::MB1; + let sbm_entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(sbm_block_offset >> 20); + file.write_at(bat_offset + sbm_index as u64 * 8, sbm_entry.as_bytes()) + .await + .unwrap(); + + // Write the bitmap data at the SBM page offset (first page of SBM block). + file.write_at(sbm_block_offset, bitmap_data).await.unwrap(); + + // Open the VHDX. + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + (vhdx, data_block_offset, sbm_block_offset) + } + + #[async_test] + async fn partial_block_all_present() { + // All bits set → single Data range. + let bitmap = [0xFFu8; 4096]; + let (vhdx, data_offset, _) = create_partial_block_vhdx(&bitmap).await; + + let mut ranges = Vec::new(); + // Read first 4096 bytes (8 sectors * 512) of block 0. 
+ vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Data { + guest_offset: 0, + length: 4096, + file_offset: data_offset, + } + ); + } + + #[async_test] + async fn partial_block_all_transparent() { + // All bits clear → single Unmapped range. + let bitmap = [0x00u8; 4096]; + let (vhdx, _, _) = create_partial_block_vhdx(&bitmap).await; + + let mut ranges = Vec::new(); + vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Unmapped { + guest_offset: 0, + length: 4096, + } + ); + } + + #[async_test] + async fn partial_block_mixed() { + // First 4 sectors (bits 0-3) set, next 4 (bits 4-7) clear. + // Byte 0 = 0x0F (bits 0-3 set, 4-7 clear). + let mut bitmap = [0x00u8; 4096]; + bitmap[0] = 0x0F; + let (vhdx, data_offset, _) = create_partial_block_vhdx(&bitmap).await; + + let mut ranges = Vec::new(); + // Read 8 sectors = 4096 bytes. + vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 2); + assert_eq!( + ranges[0], + ReadRange::Data { + guest_offset: 0, + length: 2048, // 4 sectors * 512 + file_offset: data_offset, + } + ); + assert_eq!( + ranges[1], + ReadRange::Unmapped { + guest_offset: 2048, + length: 2048, + } + ); + } + + #[async_test] + async fn partial_block_alternating() { + // Alternating: sector 0 set, 1 clear, 2 set, 3 clear, ... + // Byte pattern: 0b01010101 = 0x55 → bits 0,2,4,6 set + let mut bitmap = [0x00u8; 4096]; + bitmap[0] = 0x55; + let (vhdx, data_offset, _) = create_partial_block_vhdx(&bitmap).await; + + let mut ranges = Vec::new(); + // Read 8 sectors = 4096 bytes. + vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + // 0x55 = 0b01010101: bits 0,2,4,6 set; bits 1,3,5,7 clear. 
+ // Expected: Data(0) Unmapped(1) Data(2) Unmapped(3) + // Data(4) Unmapped(5) Data(6) Unmapped(7) + assert_eq!(ranges.len(), 8); + assert_eq!( + ranges[0], + ReadRange::Data { + guest_offset: 0, + length: 512, + file_offset: data_offset, + } + ); + assert_eq!( + ranges[1], + ReadRange::Unmapped { + guest_offset: 512, + length: 512, + } + ); + assert_eq!( + ranges[2], + ReadRange::Data { + guest_offset: 1024, + length: 512, + file_offset: data_offset + 1024, + } + ); + assert_eq!( + ranges[3], + ReadRange::Unmapped { + guest_offset: 1536, + length: 512, + } + ); + assert_eq!( + ranges[4], + ReadRange::Data { + guest_offset: 2048, + length: 512, + file_offset: data_offset + 2048, + } + ); + assert_eq!( + ranges[5], + ReadRange::Unmapped { + guest_offset: 2560, + length: 512, + } + ); + assert_eq!( + ranges[6], + ReadRange::Data { + guest_offset: 3072, + length: 512, + file_offset: data_offset + 3072, + } + ); + assert_eq!( + ranges[7], + ReadRange::Unmapped { + guest_offset: 3584, + length: 512, + } + ); + } + + #[async_test] + async fn partial_block_single_sector_present() { + // Only bit 2 set: Unmapped(0,1024) Data(1024,512) Unmapped(1536,2560) + let mut bitmap = [0x00u8; 4096]; + bitmap[0] = 0x04; // bit 2 set + let (vhdx, data_offset, _) = create_partial_block_vhdx(&bitmap).await; + + let mut ranges = Vec::new(); + vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 3); + assert_eq!( + ranges[0], + ReadRange::Unmapped { + guest_offset: 0, + length: 1024, // 2 sectors + } + ); + assert_eq!( + ranges[1], + ReadRange::Data { + guest_offset: 1024, + length: 512, + file_offset: data_offset + 1024, + } + ); + assert_eq!( + ranges[2], + ReadRange::Unmapped { + guest_offset: 1536, + length: 2560, // 5 sectors + } + ); + } + + #[async_test] + async fn partial_block_first_and_last_sector() { + // Bits 0 and 7 set in byte 0: Data(0,512) Unmapped(512,3072) Data(3584,512) + let mut bitmap = [0x00u8; 4096]; + bitmap[0] = 0x81; // bits 0 and 7 set + let (vhdx, data_offset, _) = create_partial_block_vhdx(&bitmap).await; + + let mut ranges = Vec::new(); + vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 3); + assert_eq!( + ranges[0], + ReadRange::Data { + guest_offset: 0, + length: 512, + file_offset: data_offset, + } + ); + assert_eq!( + ranges[1], + ReadRange::Unmapped { + guest_offset: 512, + length: 3072, // 6 sectors + } + ); + assert_eq!( + ranges[2], + ReadRange::Data { + guest_offset: 3584, + length: 512, + file_offset: data_offset + 3584, + } + ); + } + + #[async_test] + async fn partial_block_read_at_offset() { + // All bits set. Read starting at sector 4 (offset 2048 within block). + let bitmap = [0xFFu8; 4096]; + let (vhdx, data_offset, _) = create_partial_block_vhdx(&bitmap).await; + + let mut ranges = Vec::new(); + // Read 4 sectors starting at byte offset 2048. + vhdx.resolve_read(2048, 2048, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Data { + guest_offset: 2048, + length: 2048, + file_offset: data_offset + 2048, + } + ); + } + + #[async_test] + async fn partial_block_unallocated_sbm_error() { + // Set up a PartiallyPresent data block but leave the SBM entry as NotPresent. 
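+        // (Per the validation done at BAT load time -- the
+        // PartiallyPresentWithoutSectorBitmap check referenced in
+        // resolve_partial_block_read -- a PartiallyPresent payload block is
+        // only valid when its chunk's sector-bitmap block is FullyPresent,
+        // so the open below is expected to fail.)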
+ let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let regions = region::parse_region_tables(&file).await.unwrap(); + let bat_offset = regions.bat_offset; + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + true, + format::MB1 as u32, + ) + .unwrap(); + let payload_index = bat.payload_entry_index(0); + + // Set block 0 to PartiallyPresent but do NOT set the SBM entry. + let data_block_offset = 8 * format::MB1; + let data_entry = BatEntry::new() + .with_state(BatEntryState::PartiallyPresent as u8) + .with_file_offset_mb(data_block_offset >> 20); + file.write_at(bat_offset + payload_index as u64 * 8, data_entry.as_bytes()) + .await + .unwrap(); + + // Extend file to cover the data block offset + block size. + let needed = data_block_offset + format::DEFAULT_BLOCK_SIZE as u64; + file.set_file_size(needed).await.unwrap(); + + // Open should fail because the PartiallyPresent block has no + // corresponding SBM allocation. + let result = VhdxFile::open(file).read_only().await; + assert!( + result.is_err(), + "open should reject PartiallyPresent block without SBM" + ); + } + + #[async_test] + async fn set_sector_bitmap_bits_roundtrip(driver: DefaultDriver) { + // Create a differencing VHDX with all-zero bitmap (all transparent). + // This test writes SBM bits, so it needs a writable VhdxFile. + let bitmap = [0x00u8; 4096]; + let (_, data_offset, _) = create_partial_block_vhdx(&bitmap).await; + + // Re-create the same setup but open writable. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let regions = region::parse_region_tables(&file).await.unwrap(); + let bat_offset = regions.bat_offset; + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + true, + format::MB1 as u32, + ) + .unwrap(); + let payload_index = bat.payload_entry_index(0); + let sbm_index = bat.sector_bitmap_entry_index(0); + + let data_block_offset = 8 * format::MB1; + let data_entry = BatEntry::new() + .with_state(BatEntryState::PartiallyPresent as u8) + .with_file_offset_mb(data_block_offset >> 20); + file.write_at(bat_offset + payload_index as u64 * 8, data_entry.as_bytes()) + .await + .unwrap(); + + let sbm_block_offset = 10 * format::MB1; + let sbm_entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(sbm_block_offset >> 20); + file.write_at(bat_offset + sbm_index as u64 * 8, sbm_entry.as_bytes()) + .await + .unwrap(); + + file.write_at(sbm_block_offset, &bitmap).await.unwrap(); + + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Verify initial state: sectors 0-7 are transparent. + let mut ranges = Vec::new(); + vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Unmapped { + guest_offset: 0, + length: 4096, + } + ); + + // Set bits for sectors 0-3 (first 2048 bytes). + vhdx.set_sector_bitmap_bits( + 0, // virtual_offset + 2048, // length (4 sectors * 512) + true, // set + ) + .await + .unwrap(); + + // Now read again: first 4 sectors should be Data, last 4 Unmapped. 
+ let mut ranges2 = Vec::new(); + vhdx.resolve_read(0, 4096, &mut ranges2).await.unwrap(); + assert_eq!(ranges2.len(), 2); + assert_eq!( + ranges2[0], + ReadRange::Data { + guest_offset: 0, + length: 2048, + file_offset: data_offset, + } + ); + assert_eq!( + ranges2[1], + ReadRange::Unmapped { + guest_offset: 2048, + length: 2048, + } + ); + } +} diff --git a/vm/devices/storage/vhdx/src/space.rs b/vm/devices/storage/vhdx/src/space.rs new file mode 100644 index 0000000000..efba84cef0 --- /dev/null +++ b/vm/devices/storage/vhdx/src/space.rs @@ -0,0 +1,2000 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Free space management for VHDX files. +//! +//! Tracks which megabyte-granularity regions of the file are free, in-use, +//! or soft-anchored (from trimmed blocks). Implements a four-priority +//! allocation strategy: +//! +//! 1. **Free space pool** — reuse interior free blocks +//! 2. **Near-EOF space** — allocate from zeroed space before file end +//! 3. **Soft-anchored blocks** — reclaim trimmed blocks (in-memory only) +//! 4. **Extend EOF** — grow the file +//! +//! The bitmap uses 1-bit-per-megabyte granularity with SET = free / anchored +//! and CLEAR = in-use. + +use crate::bat::Bat; +use crate::error::CorruptionType; +use crate::error::OpenError; +use crate::error::OpenErrorInner; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::format::BatEntryState; +use crate::format::MB1; +use bitfield_struct::bitfield; +use bitvec::prelude::*; +use parking_lot::Mutex; +use std::collections::HashMap; + +/// Default EOF extension length: 32 MiB. +const DEFAULT_EOF_EXTENSION_LENGTH: u32 = 32 * MB1 as u32; + +// --------------------------------------------------------------------------- +// SpaceBitmap — RTL_BITMAP equivalent +// --------------------------------------------------------------------------- + +/// Bitmap wrapper providing `RTL_BITMAP`-equivalent operations. +/// +/// Uses [`BitVec`] with LSB-first bit ordering on `u64` words for +/// word-level accelerated operations. +/// SET bits (1) denote the property tracked by the containing structure +/// (free, anchored, or trimmed); CLEAR bits (0) denote the opposite. +#[derive(Clone)] +struct SpaceBitmap { + bits: BitVec, +} + +impl SpaceBitmap { + /// Create a new bitmap with `bit_count` bits, all initially clear. + fn new(bit_count: usize) -> Self { + SpaceBitmap { + bits: bitvec![u64, Lsb0; 0; bit_count], + } + } + + /// Number of valid bits. + fn len(&self) -> usize { + self.bits.len() + } + + /// Set a single bit. + fn set_bit(&mut self, index: usize) { + self.bits.set(index, true); + } + + /// Clear a single bit. + fn clear_bit(&mut self, index: usize) { + self.bits.set(index, false); + } + + /// Check whether a single bit is set. + fn check_bit(&self, index: usize) -> bool { + self.bits[index] + } + + /// Set a contiguous range of bits `[start..start+count)`. + fn set_range(&mut self, start: usize, count: usize) { + self.bits[start..start + count].fill(true); + } + + /// Clear a contiguous range of bits `[start..start+count)`. + fn clear_range(&mut self, start: usize, count: usize) { + self.bits[start..start + count].fill(false); + } + + /// Check whether all bits in `[start..start+count)` are set. + fn are_bits_set(&self, start: usize, count: usize) -> bool { + count == 0 || self.bits[start..start + count].all() + } + + /// Check whether all bits in `[start..start+count)` are clear. 
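+    /// An empty range (`count == 0`) vacuously returns true, mirroring
+    /// [`Self::are_bits_set`].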
+ fn are_bits_clear(&self, start: usize, count: usize) -> bool { + count == 0 || self.bits[start..start + count].not_any() + } + + /// Find the first contiguous run of `count` SET bits, starting the + /// scan at `hint`. Returns `None` if no such run exists. + /// + /// Scans `[hint..len)` first, then `[0..hint)`. Uses word-level + /// `first_one` / `first_zero` operations for efficient run detection. + fn find_set_bits(&self, count: usize, hint: usize) -> Option { + let total = self.bits.len(); + if count == 0 || count > total { + return None; + } + let hint = hint.min(total); + + // Pass 1: [hint..total) + if let Some(idx) = Self::find_run(&self.bits, count, hint, total) { + return Some(idx); + } + // Pass 2: [0..hint) — only the region not covered by pass 1. + if hint > 0 { + if let Some(idx) = Self::find_run(&self.bits, count, 0, hint) { + return Some(idx); + } + } + None + } + + /// Set all valid bits. + fn set_all(&mut self) { + self.bits.fill(true); + } + + /// Resize the bitmap to `new_bit_count`. New bits are cleared. + /// Preserves existing data up to `min(old_count, new_count)`. + fn resize(&mut self, new_bit_count: usize) { + self.bits.resize(new_bit_count, false); + } + + /// Find a contiguous run of `count` SET bits within `[start..end)`. + fn find_run( + bits: &BitSlice, + count: usize, + start: usize, + end: usize, + ) -> Option { + if end - start < count { + return None; + } + let window = &bits[start..end]; + let mut pos = 0; + while pos + count <= window.len() { + // Skip clear bits — find next set bit. + let run_start = match window[pos..].first_one() { + Some(i) => pos + i, + None => return None, + }; + if run_start + count > window.len() { + return None; + } + // Find end of the set-bit run. + let run_end = window[run_start..] + .first_zero() + .map_or(window.len(), |i| run_start + i); + if run_end - run_start >= count { + return Some(start + run_start); + } + pos = run_end; + } + None + } +} + +// --------------------------------------------------------------------------- +// Sub-structures +// --------------------------------------------------------------------------- + +/// Free space pool state. Tracks 1-bit-per-megabyte: SET = free. +struct FreeSpacePool { + bitmap: SpaceBitmap, + lowest_bit_hint: u32, + /// Fast-path flag: if true, skip free-pool scan for block-sized allocations. + no_free_blocks: bool, +} + +/// Anchored space state. Tracks 1-bit-per-megabyte: SET = soft-anchored. +struct AnchoredSpacePool { + bitmap: SpaceBitmap, + lowest_bit_hint: u32, +} + +/// Tracks which data blocks have been trimmed but still hold a +/// "soft anchor" to their file space. +/// +/// When a block is trimmed with `TrimMode::FileSpace`, the BAT entry +/// transitions to Unmapped but the `file_megabyte` field is preserved. +/// The space is *not* released to the free pool. This avoids the cost +/// of zeroing + flushing the space before a future BAT commit, because +/// the space still contains only the block's own old data — no +/// cross-block data leak is possible on power failure. +/// +/// Bitmap: 1-bit-per-block-number, SET = has soft-anchored file offset. +struct TrimmedBlockTracker { + bitmap: SpaceBitmap, + lowest_block_number_hint: u32, + num_trimmed_blocks: u32, +} + +/// EOF geometry state — describes where new space comes from. +/// +/// These fields are only mutated under the `allocation_lock` (the async +/// `futures::lock::Mutex<()>` on `VhdxFile` that serializes the +/// allocate→TFP→write sequence). 
They live outside `FreeSpaceInner` +/// so they don't contend with the sync mutex. +pub(crate) struct EofState { + /// Current file length (always MB1-aligned). + pub file_length: u64, + /// Highest in-use file offset. + pub last_file_offset: u64, + /// Offset at which all data beyond is guaranteed zero. + pub zero_offset: u64, + /// Minimum chunk for EOF extension (constant after init). + pub eof_extension_length: u32, +} + +/// Internal mutable state of the free space tracker. +struct FreeSpaceInner { + free_space: FreeSpacePool, + anchored_space: AnchoredSpacePool, + trimmed_blocks: TrimmedBlockTracker, + + /// Block size in bytes. + block_size: u32, + /// Number of data blocks. + data_block_count: u32, +} + +// --------------------------------------------------------------------------- +// FreeSpaceTracker — public API +// --------------------------------------------------------------------------- + +/// Free space tracker for VHDX files. All internal state is protected by +/// a synchronous `parking_lot::Mutex`. +/// +/// This mutex must **never** be held across `.await` points. The outer +/// `allocation_lock` (an async mutex on `VhdxFile`) serializes the full +/// allocation sequence including any file I/O. +pub(crate) struct FreeSpaceTracker { + inner: Mutex, + /// Block alignment (0 or power of 2 ≤ block_size). Constant after construction. + block_alignment: u32, +} + +/// Flags for [`VhdxFile::allocate_space()`]. +#[bitfield(u8)] +#[derive(PartialEq, Eq)] +pub(crate) struct AllocateFlags { + /// Align the allocation to `block_alignment`. + #[bits(1)] + pub aligned: bool, + /// Zero the allocated region if not already zeroed on disk. + #[bits(1)] + pub zero: bool, + #[bits(6)] + _reserved: u8, +} + +/// Describes the state of newly allocated space. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum SpaceState { + /// Fresh space from file extension — zeroed on disk. Safe to commit + /// BAT before flushing the data write (no data leak possible). + Zero, + /// Recycled space containing the same block's own old data. Safe to + /// commit BAT before flushing (a power failure only exposes the + /// block's own stale data, not another block's). NOT zero. + OwnStale, + /// Recycled space that may contain another block's data. Must flush + /// data writes before committing BAT to prevent cross-block data + /// leaks on power failure. NOT zero. + CrossStale, +} + +impl SpaceState { + /// Safe to commit BAT entry before data flush completes? + pub fn is_safe(self) -> bool { + matches!(self, Self::Zero | Self::OwnStale) + } + + /// Guaranteed zeroed on disk? + pub fn is_zero(self) -> bool { + matches!(self, Self::Zero) + } +} + +/// Result from a successful space allocation. +pub(crate) struct AllocateResult { + /// File byte offset of the allocated region. + pub file_offset: u64, + /// State of the allocated space. + pub state: SpaceState, + /// If this allocation reclaimed a cross-block soft anchor, the old + /// block number whose `file_megabyte` must be cleared by the caller. + pub unanchored_block: Option, +} + +impl FreeSpaceTracker { + /// Create and initialize the free space tracker. + /// + /// Called during `VhdxFile::open_inner()`, before the BAT parse. Sets all file + /// space as free, then marks the header area, log, BAT, and metadata + /// regions as in-use. + /// + /// Returns both the tracker and the initial [`EofState`]. 
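+    /// The BAT and metadata lengths are rounded up to 1 MiB before being
+    /// marked, matching the 1-bit-per-megabyte granularity of the space
+    /// bitmaps.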
+ pub fn new( + file_length: u64, + block_size: u32, + block_alignment: u32, + header_area_size: u64, + log_offset: u64, + log_length: u32, + bat_offset: u64, + bat_length: u32, + metadata_offset: u64, + metadata_length: u32, + data_block_count: u32, + ) -> Result<(Self, EofState), OpenError> { + // Validate alignment. + if block_alignment != 0 && !block_alignment.is_power_of_two() { + return Err(OpenErrorInner::InvalidParameter( + crate::error::InvalidFormatReason::BlockAlignmentNotPowerOfTwo, + ) + .into()); + } + let effective_alignment = if block_alignment > block_size { + 0 + } else { + block_alignment + }; + // File length must be MB1-aligned. + let aligned_file_length = (file_length + MB1 - 1) & !(MB1 - 1); + let bit_count = (aligned_file_length / MB1) as usize; + + // Create bitmaps. + let mut free_space_bitmap = SpaceBitmap::new(bit_count); + let anchored_space_bitmap = SpaceBitmap::new(bit_count); + let trimmed_block_bitmap = SpaceBitmap::new(data_block_count as usize); + + // Mark entire file as free. + free_space_bitmap.set_all(); + + let mut eof_state = EofState { + file_length: aligned_file_length, + last_file_offset: 0, + zero_offset: 0, + eof_extension_length: DEFAULT_EOF_EXTENSION_LENGTH, + }; + + let mut inner = FreeSpaceInner { + free_space: FreeSpacePool { + bitmap: free_space_bitmap, + lowest_bit_hint: 0, + no_free_blocks: false, + }, + anchored_space: AnchoredSpacePool { + bitmap: anchored_space_bitmap, + lowest_bit_hint: bit_count as u32, + }, + trimmed_blocks: TrimmedBlockTracker { + bitmap: trimmed_block_bitmap, + lowest_block_number_hint: data_block_count, + num_trimmed_blocks: 0, + }, + block_size, + data_block_count, + }; + + // Mark header area as in-use. + inner.mark_range_in_use_inner(&mut eof_state, 0, header_area_size as u32)?; + + // Mark log as in-use. + if log_length > 0 { + inner.mark_range_in_use_inner(&mut eof_state, log_offset, log_length)?; + } + + // Mark BAT region as in-use. + // BAT length is rounded up to MB1 for space tracking. + let bat_length_aligned = round_up_mb1(bat_length as u64) as u32; + inner.mark_range_in_use_inner(&mut eof_state, bat_offset, bat_length_aligned)?; + + // Mark metadata region as in-use. + let metadata_length_aligned = round_up_mb1(metadata_length as u64) as u32; + inner.mark_range_in_use_inner(&mut eof_state, metadata_offset, metadata_length_aligned)?; + + Ok(( + FreeSpaceTracker { + inner: Mutex::new(inner), + block_alignment: effective_alignment, + }, + eof_state, + )) + } + + /// Block alignment (0 or power of 2). Constant after construction. + pub fn block_alignment(&self) -> u32 { + self.block_alignment + } + + /// Mark a file range as in-use during BAT parse. + /// + /// Validates that the range doesn't overlap with an already-in-use range + /// and doesn't extend past EOF. + pub fn mark_range_in_use( + &self, + eof: &mut EofState, + offset: u64, + length: u32, + ) -> Result<(), CorruptionType> { + self.inner + .lock() + .mark_range_in_use_inner(eof, offset, length) + } + + /// Mark a trimmed block as soft-anchored during BAT parse. + pub fn mark_trimmed_block( + &self, + block_number: u32, + file_offset: u64, + block_size: u32, + ) -> Result<(), CorruptionType> { + self.inner + .lock() + .mark_trimmed_block_inner(block_number, file_offset, block_size) + } + + /// Finalize after BAT parse. Separates EOF free space from pool free space. + /// + /// Blocks from `ZeroOffset` to `FileLength` are "near-EOF free space" + /// (tracked separately, not in the bitmap pool). 
Clear those bits from + /// the FreeSpace bitmap. + pub fn complete_initialization(&self, eof: &EofState) { + let mut inner = self.inner.lock(); + let bit_base = (eof.zero_offset / MB1) as usize; + let bit_count = ((eof.file_length - eof.zero_offset) / MB1) as usize; + if bit_count > 0 { + debug_assert!(inner.free_space.bitmap.are_bits_set(bit_base, bit_count)); + inner.free_space.bitmap.clear_range(bit_base, bit_count); + } + } + + /// Try to allocate using priorities 1–3, with access to the BAT state + /// for soft-anchor lookup (priority 3). + pub fn try_allocate_with_bat( + &self, + eof: &mut EofState, + size: u32, + aligned: bool, + bat: &Bat, + ) -> Option { + self.try_allocate_inner(eof, size, aligned, Some(bat)) + } + + /// Try all three in-memory allocation priorities. + fn try_allocate_inner( + &self, + eof: &mut EofState, + size: u32, + aligned: bool, + bat: Option<&Bat>, + ) -> Option { + let mut inner = self.inner.lock(); + // Priority 1: free space pool. + if let Some(offset) = inner.free_space_pool_alloc(eof, size) { + return Some(AllocateResult { + file_offset: offset, + state: SpaceState::CrossStale, + unanchored_block: None, + }); + } + + // Priority 2: near-EOF space (between ZeroOffset and FileLength). + let aligned_zero_offset = if aligned && self.block_alignment != 0 { + round_up(eof.zero_offset, self.block_alignment as u64) + } else { + eof.zero_offset + }; + + if eof.file_length >= aligned_zero_offset + size as u64 { + let offset = aligned_zero_offset; + eof.zero_offset = aligned_zero_offset + size as u64; + eof.last_file_offset = eof.zero_offset; + return Some(AllocateResult { + file_offset: offset, + state: SpaceState::Zero, + unanchored_block: None, + }); + } + + // Priority 3: soft-anchored space from trimmed blocks. + // + // Only considers blocks in TrimmedBlockTracker, which are populated + // by flush() — so they are always durable. The caller must clear + // the old block's file_megabyte in BatState and write its BAT page + // to cache. + if size <= inner.block_size { + if let Some(bat) = bat { + if let Some((file_offset, block_number)) = + inner.find_and_unanchor_in_memory_inner(bat) + { + // If the allocated block is larger than needed, release excess. + if size < inner.block_size { + let excess_offset = file_offset + size as u64; + let excess_size = inner.block_size - size; + inner.release_inner(excess_offset, excess_size); + } + return Some(AllocateResult { + file_offset, + state: SpaceState::CrossStale, + unanchored_block: Some(block_number), + }); + } + } + } + + // Priority 4: caller must extend EOF. + None + } + + /// Release space back to the free pool. + pub fn release(&self, offset: u64, size: u32) { + self.inner.lock().release_inner(offset, size); + } + + /// Unmark a trimmed block (when its space is reclaimed). + #[must_use] + pub fn unmark_trimmed_block( + &self, + block_number: u32, + file_offset: u64, + block_size: u32, + ) -> bool { + let mut inner = self.inner.lock(); + inner.unmark_trimmed_block_inner(block_number, file_offset, block_size) + } + + /// Compute truncation target size. + pub fn truncate_target(&self, eof: &EofState, is_fully_allocated: bool) -> u64 { + let inner = self.inner.lock(); + let mut target = eof.last_file_offset; + if is_fully_allocated { + let excess = inner.compute_excess_block_count(eof, target); + let extra = (excess as u64) * inner.block_size as u64; + target = (target + extra).min(eof.file_length); + } + target + } + + /// Update state after truncation. 
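+    /// The new length is rounded up to 1 MiB, both space bitmaps are shrunk
+    /// to match, and `zero_offset` is clamped so it never points past the new
+    /// end of file.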
+ pub fn apply_truncate(&self, eof: &mut EofState, new_file_length: u64) { + let mut inner = self.inner.lock(); + let aligned = (new_file_length + MB1 - 1) & !(MB1 - 1); + let new_bit_count = (aligned / MB1) as usize; + let old_bit_count = inner.free_space.bitmap.len(); + + if new_bit_count < old_bit_count { + inner.free_space.bitmap.resize(new_bit_count); + inner.anchored_space.bitmap.resize(new_bit_count); + } + + eof.file_length = aligned; + eof.zero_offset = eof.zero_offset.min(aligned); + } +} + +impl EofState { + /// Compute the target file size for EOF extension. + /// + /// Includes `eof_extension_length` minimum chunk. + pub fn required_file_length(&self, block_alignment: u32, size: u32, aligned: bool) -> u64 { + let aligned_zero_offset = if aligned && block_alignment != 0 { + round_up(self.zero_offset, block_alignment as u64) + } else { + self.zero_offset + }; + let target = aligned_zero_offset + size as u64; + let min_target = self.file_length + self.eof_extension_length as u64; + target.max(min_target) + } + + /// Update state after file extension completed. + /// + /// Resizes bitmaps if needed and updates `file_length`. + pub fn complete_file_extend(&mut self, tracker: &FreeSpaceTracker, new_file_length: u64) { + let mut inner = tracker.inner.lock(); + let aligned = (new_file_length + MB1 - 1) & !(MB1 - 1); + let new_bit_count = (aligned / MB1) as usize; + let old_bit_count = inner.free_space.bitmap.len(); + + if new_bit_count > old_bit_count { + // Grow by at least 125% to avoid O(n²) behavior. + let target_bits = (old_bit_count + old_bit_count / 4).max(new_bit_count); + inner.free_space.bitmap.resize(target_bits); + inner.anchored_space.bitmap.resize(target_bits); + } + + self.file_length = aligned; + } +} + +// --------------------------------------------------------------------------- +// Internal helpers (operate on FreeSpaceInner, called under lock) +// --------------------------------------------------------------------------- + +/// Round `value` up to the nearest multiple of `alignment`. +fn round_up(value: u64, alignment: u64) -> u64 { + (value + alignment - 1) & !(alignment - 1) +} + +/// Round `value` up to the nearest MB1 boundary. +fn round_up_mb1(value: u64) -> u64 { + round_up(value, MB1) +} + +impl FreeSpaceInner { + /// Mark a file range as in-use during parse (internal, no lock). + fn mark_range_in_use_inner( + &mut self, + eof: &mut EofState, + offset: u64, + length: u32, + ) -> Result<(), CorruptionType> { + debug_assert!(offset.is_multiple_of(MB1), "offset must be MB1-aligned"); + debug_assert!( + (length as u64).is_multiple_of(MB1), + "length must be MB1-aligned" + ); + + if length == 0 { + return Ok(()); + } + + // Check range is within file. + if eof.file_length < offset || eof.file_length - offset < length as u64 { + return Err(CorruptionType::RangeBeyondEof); + } + + let bit_base = (offset / MB1) as usize; + let bit_count = length as usize / MB1 as usize; + + // Overlap check: all bits must currently be SET (free). + if !self.free_space.bitmap.are_bits_set(bit_base, bit_count) { + return Err(CorruptionType::RangeCollision); + } + + // Mark as in-use (clear the bits). + self.free_space.bitmap.clear_range(bit_base, bit_count); + + // Update last_file_offset and zero_offset. 
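+        // `zero_offset` marks the start of the trailing region treated as
+        // zeroed "near-EOF free space" (separated out by
+        // complete_initialization); an in-use range that reaches past it
+        // pushes it forward so that tail never overlaps allocated data.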
+ let range_end = offset + length as u64; + if range_end > eof.last_file_offset { + eof.last_file_offset = range_end; + if eof.last_file_offset > eof.zero_offset { + eof.zero_offset = eof.last_file_offset; + } + } + + Ok(()) + } + + /// Mark a trimmed block as soft-anchored (internal, no lock). + fn mark_trimmed_block_inner( + &mut self, + block_number: u32, + file_offset: u64, + block_size: u32, + ) -> Result<(), CorruptionType> { + debug_assert!(block_number < self.data_block_count); + debug_assert!(block_size.is_multiple_of(MB1 as u32)); + + // Check: already marked as trimmed? + if self.trimmed_blocks.bitmap.check_bit(block_number as usize) { + return Err(CorruptionType::TrimmedRangeCollision); + } + + // Check: anchored space bits must be clear (no collision with another anchor). + let bit_base = (file_offset / MB1) as usize; + let bit_count = block_size as usize / MB1 as usize; + if !self + .anchored_space + .bitmap + .are_bits_clear(bit_base, bit_count) + { + return Err(CorruptionType::TrimmedRangeCollision); + } + + // Mark in trimmed block tracker. + self.trimmed_blocks.bitmap.set_bit(block_number as usize); + self.trimmed_blocks.num_trimmed_blocks += 1; + self.trimmed_blocks.lowest_block_number_hint = self + .trimmed_blocks + .lowest_block_number_hint + .min(block_number); + + // Mark in anchored space bitmap. + self.anchored_space.bitmap.set_range(bit_base, bit_count); + self.anchored_space.lowest_bit_hint = + self.anchored_space.lowest_bit_hint.min(bit_base as u32); + + Ok(()) + } + + /// Unmark a trimmed block (internal, no lock). + #[must_use] + fn unmark_trimmed_block_inner( + &mut self, + block_number: u32, + file_offset: u64, + block_size: u32, + ) -> bool { + debug_assert!(block_number < self.data_block_count); + debug_assert!(block_size.is_multiple_of(MB1 as u32)); + + // If not marked, someone else already claimed it. + if !self.trimmed_blocks.bitmap.check_bit(block_number as usize) { + return false; + } + + self.trimmed_blocks.bitmap.clear_bit(block_number as usize); + self.trimmed_blocks.num_trimmed_blocks -= 1; + + let bit_base = (file_offset / MB1) as usize; + let bit_count = block_size as usize / MB1 as usize; + debug_assert!( + self.anchored_space.bitmap.are_bits_set(bit_base, bit_count), + "anchored space bits must be set for trimmed block {block_number} at offset {file_offset:#x}" + ); + self.anchored_space.bitmap.clear_range(bit_base, bit_count); + + true + } + + /// Release space to the free pool (internal, no lock). + fn release_inner(&mut self, offset: u64, size: u32) { + debug_assert!(offset.is_multiple_of(MB1)); + debug_assert!((size as u64).is_multiple_of(MB1)); + + let bit_base = (offset / MB1) as usize; + let bit_count = size as usize / MB1 as usize; + + if bit_base + bit_count > self.free_space.bitmap.len() { + // Defensive: can't release beyond bitmap size. + return; + } + + debug_assert!(self.free_space.bitmap.are_bits_clear(bit_base, bit_count)); + self.free_space.bitmap.set_range(bit_base, bit_count); + self.free_space.no_free_blocks = false; + + if (bit_base as u32) < self.free_space.lowest_bit_hint { + self.free_space.lowest_bit_hint = bit_base as u32; + } + } + + /// Priority 1: free space pool allocation (internal, no lock). + fn free_space_pool_alloc(&mut self, eof: &mut EofState, length: u32) -> Option { + debug_assert!((length as u64).is_multiple_of(MB1)); + let bit_count = length as usize / MB1 as usize; + + // Fast-path skip for block-sized allocations. 
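+        // `no_free_blocks` is a sticky hint: it is set below when a
+        // block-sized search fails and cleared again by release_inner(), so
+        // this skip can never hide space that was later returned to the pool.
+        // Allocations larger than a block are rare; smaller ones always scan.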
+ if length >= self.block_size && self.free_space.no_free_blocks { + return None; + } + + let result = self + .free_space + .bitmap + .find_set_bits(bit_count, self.free_space.lowest_bit_hint as usize); + + match result { + Some(bit_base) => { + // Claim the space. + self.free_space.bitmap.clear_range(bit_base, bit_count); + self.free_space.lowest_bit_hint = (bit_base + bit_count) as u32; + let max_offset = (bit_base + bit_count) as u64 * MB1; + if eof.last_file_offset < max_offset { + eof.last_file_offset = max_offset; + } + Some(bit_base as u64 * MB1) + } + None => { + if length <= self.block_size { + self.free_space.no_free_blocks = true; + } + None + } + } + } + + /// Find and unanchor an in-memory-only soft-anchored block. + fn find_and_unanchor_in_memory_inner(&mut self, bat: &Bat) -> Option<(u64, u32)> { + if self.trimmed_blocks.num_trimmed_blocks == 0 { + return None; + } + + let block_size = self.block_size; + + // Try to find an in-memory-only soft-anchored block by scanning + // the TrimmedBlock bitmap. + let mut trimmed_found = 0u32; + let total_trimmed = self.trimmed_blocks.num_trimmed_blocks; + let mut hint = self.trimmed_blocks.lowest_block_number_hint as usize; + + while trimmed_found < total_trimmed { + let block_number = match self.trimmed_blocks.bitmap.find_set_bits(1, hint) { + Some(n) => n, + None => break, + }; + + trimmed_found += 1; + let mapping = bat.get_block_mapping(block_number as u32); + + // Block must be soft-anchored: unmapped/undefined state with non-zero file_megabyte. + let state = mapping.bat_state(); + let is_unmapped = state == BatEntryState::Unmapped + || state == BatEntryState::Undefined + || state == BatEntryState::Zero + || state == BatEntryState::NotPresent; + + debug_assert!( + is_unmapped && mapping.file_megabyte() != 0, + "trimmed block {block_number} is not soft-anchored" + ); + + // Check if it's in-memory only (not on-disk anchored). + // Only blocks in TrimmedBlockTracker are considered here, and + // those are only populated by flush() after WAL durability, so + // the on-disk BAT already reflects the trim. Cross-block reclaim + // is safe — the caller just needs to clear the old block's + // file_megabyte and write its BAT page to cache. + let file_offset = mapping.file_megabyte() as u64 * MB1; + + // Unmark the trimmed block. + if self.unmark_trimmed_block_inner(block_number as u32, file_offset, block_size) { + return Some((file_offset, block_number as u32)); + } + + hint = block_number + 1; + } + + None + } + + /// Compute excess block count (blocks that won't fit given current space). + fn compute_excess_block_count(&self, eof: &EofState, max_offset: u64) -> u32 { + // Count unallocated blocks. + let total = self.data_block_count; + // Available space: count of free bits in free space bitmap + anchored space + // + space from zero_offset to file_length. + let mut available_mb: u64 = 0; + + // Count free bits up to the bitmap. + for i in 0..self.free_space.bitmap.len() { + if self.free_space.bitmap.check_bit(i) { + available_mb += 1; + } + } + + // Count anchored bits. + for i in 0..self.anchored_space.bitmap.len() { + if self.anchored_space.bitmap.check_bit(i) { + available_mb += 1; + } + } + + // EOF space. 
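+        // The zeroed tail between min(zero_offset, max_offset) and the
+        // current file length also counts as available; it can still hold
+        // future blocks without growing the file.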
+        let zero = eof.zero_offset.min(max_offset);
+        if eof.file_length > zero {
+            available_mb += (eof.file_length - zero) / MB1;
+        }
+
+        let block_mb = self.block_size as u64 / MB1;
+        let available_blocks = available_mb / block_mb;
+        let needed = total as u64;
+        if needed > available_blocks {
+            (needed - available_blocks) as u32
+        } else {
+            0
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Deferred space releases
+// ---------------------------------------------------------------------------
+
+/// Maximum number of deferred entries before trim forces a flush.
+const DEFERRED_QUOTA: usize = 1024;
+
+/// A space release that is deferred until its BAT change is durable on disk.
+///
+/// Without deferral, a crash could "teleport" data: a new block's data
+/// appears at an old block's file offset because the old block's BAT
+/// reverts to FullyPresent on replay.
+struct DeferredRelease {
+    file_offset: u64,
+    size: u32,
+    anchor: bool,
+}
+
+/// Entry in the deferred releases tracker, with generation stamp.
+struct DeferredEntry {
+    release: DeferredRelease,
+    /// `None` = not yet committed to a WAL entry.
+    /// `Some(gen)` = committed in flush generation `gen`.
+    committed_gen: Option<u64>,
+}
+
+/// Tracks deferred space releases with generation-based promotion.
+///
+/// All state is behind a single `parking_lot::Mutex` — never held across
+/// `.await`. The generation counter ensures entries are only promoted
+/// after the flush that committed them reaches WAL durability.
+pub(crate) struct DeferredReleases {
+    inner: Mutex<DeferredInner>,
+}
+
+struct DeferredInner {
+    entries: HashMap<u32, DeferredEntry>,
+    /// Monotonically increasing. Bumped by each flush before commit.
+    generation: u64,
+}
+
+impl DeferredReleases {
+    pub fn new() -> Self {
+        Self {
+            inner: Mutex::new(DeferredInner {
+                entries: HashMap::new(),
+                generation: 0,
+            }),
+        }
+    }
+
+    /// Insert or replace a deferred release for a block.
+    /// The entry starts uncommitted (no generation stamp).
+    pub fn insert(&self, block_number: u32, file_offset: u64, size: u32, anchor: bool) {
+        self.inner.lock().entries.insert(
+            block_number,
+            DeferredEntry {
+                release: DeferredRelease {
+                    file_offset,
+                    size,
+                    anchor,
+                },
+                committed_gen: None,
+            },
+        );
+    }
+
+    /// Remove a deferred entry for same-block reclaim.
+    /// Returns the file offset if found. Same-block reclaim is always
+    /// safe (OwnStale) regardless of committed state.
+    pub fn remove(&self, block_number: u32) -> Option<u64> {
+        self.inner
+            .lock()
+            .entries
+            .remove(&block_number)
+            .map(|e| e.release.file_offset)
+    }
+
+    /// Check whether a deferred entry exists for a block, and remove
+    /// it if so. Returns true if removed.
+    pub fn cancel(&self, block_number: u32) -> bool {
+        self.inner.lock().entries.remove(&block_number).is_some()
+    }
+
+    /// Returns true if the number of deferred entries has reached the
+    /// quota and a flush should be triggered to free them.
+    pub fn needs_flush(&self) -> bool {
+        self.inner.lock().entries.len() >= DEFERRED_QUOTA
+    }
+
+    /// Stamp all uncommitted entries with the current generation and
+    /// bump the generation. Called at the start of flush(), before
+    /// `commit()`. Returns the generation that was stamped.
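The generation stamping described above is the crux of deferred promotion: a release may only reach the free pool after the flush that covered it is durable. A minimal, self-contained sketch of that ordering, using a hypothetical `MiniDeferred` type rather than the `DeferredReleases` type from this patch:

```rust
use std::collections::HashMap;

/// Hypothetical stand-in for generation-stamped deferral; not the patch's type.
struct MiniDeferred {
    // block number -> committed generation (None = not yet covered by a flush)
    entries: HashMap<u32, Option<u64>>,
    generation: u64,
}

impl MiniDeferred {
    fn new() -> Self {
        Self { entries: HashMap::new(), generation: 0 }
    }

    /// A trim records a pending release; its space is not yet reusable.
    fn insert(&mut self, block: u32) {
        self.entries.insert(block, None);
    }

    /// Start of flush: stamp everything still uncommitted with this generation.
    fn stamp_uncommitted(&mut self) -> u64 {
        self.generation += 1;
        let g = self.generation;
        for slot in self.entries.values_mut() {
            if slot.is_none() {
                *slot = Some(g);
            }
        }
        g
    }

    /// Once the WAL write for `up_to` is durable: promote the covered entries.
    fn drain_committed(&mut self, up_to: u64) -> Vec<u32> {
        let mut out = Vec::new();
        self.entries.retain(|&block, committed| {
            if committed.is_some_and(|g| g <= up_to) {
                out.push(block);
                false // promoted: drop from the map
            } else {
                true // still waiting for a later flush
            }
        });
        out
    }
}

fn main() {
    let mut d = MiniDeferred::new();
    d.insert(7); // trim of block 7 observed before the flush
    let g1 = d.stamp_uncommitted(); // flush begins and stamps it
    d.insert(9); // a later trim; not covered by this flush
    // ... WAL entry for generation g1 becomes durable ...
    assert_eq!(d.drain_committed(g1), vec![7]); // only block 7 is promoted
    assert_eq!(d.entries.len(), 1); // block 9 stays deferred until the next flush
}
```

The same two-phase shape is what the doc comments describe for `flush()`: stamp before the WAL commit, drain only after the commit is known durable, so a crash in between leaves the space deferred rather than prematurely reusable.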
+    pub fn stamp_uncommitted(&self) -> u64 {
+        let mut inner = self.inner.lock();
+        inner.generation += 1;
+        let flush_gen = inner.generation;
+        for entry in inner.entries.values_mut() {
+            if entry.committed_gen.is_none() {
+                entry.committed_gen = Some(flush_gen);
+            }
+        }
+        flush_gen
+    }
+
+    /// Drain all entries committed at or before the given generation.
+    /// Returns (block_number, file_offset, size, anchor) tuples for
+    /// promotion to the FreeSpaceTracker.
+    pub fn drain_committed(&self, up_to_gen: u64) -> Vec<(u32, u64, u32, bool)> {
+        let mut inner = self.inner.lock();
+        let mut drained = Vec::new();
+        inner.entries.retain(|&block, entry| {
+            if entry.committed_gen.is_some_and(|g| g <= up_to_gen) {
+                drained.push((
+                    block,
+                    entry.release.file_offset,
+                    entry.release.size,
+                    entry.release.anchor,
+                ));
+                false // remove from map
+            } else {
+                true // keep
+            }
+        });
+        drained
+    }
+}
+
+// ---------------------------------------------------------------------------
+// VhdxFile — space allocation
+// ---------------------------------------------------------------------------
+
+use crate::AsyncFile;
+use crate::bat::BlockMapping;
+use crate::bat::BlockType;
+use crate::open::VhdxFile;
+
+impl VhdxFile {
+    /// Allocate space for a new block. Async — may extend the file.
+    ///
+    /// Called under `allocation_lock` (the `FreeSpaceWorkerLock` equivalent).
+    /// The caller must pass `&mut EofState` obtained from locking
+    /// `allocation_lock`.
+    /// Tries pool → near-EOF → anchored, extends file and retries if needed.
+    ///
+    /// When `flags` includes [`AllocateFlags::ZERO`], the allocated region
+    /// is guaranteed to be zeroed on disk before returning. Near-EOF
+    /// allocations are inherently zero; pool/anchor allocations get an
+    /// explicit zero-write.
+    ///
+    /// When `flags` includes [`AllocateFlags::ALIGNED`], the allocation is
+    /// aligned to `block_alignment`.
+    pub(crate) async fn allocate_space(
+        &self,
+        eof: &mut EofState,
+        size: u32,
+        flags: AllocateFlags,
+    ) -> Result<AllocateResult, VhdxIoError> {
+        debug_assert!(
+            (size as u64).is_multiple_of(MB1),
+            "allocation size must be MB1-aligned"
+        );
+
+        loop {
+            // Try priorities 1–3 (pool, near-EOF, anchored).
+            let result =
+                self.free_space
+                    .try_allocate_with_bat(eof, size, flags.aligned(), &self.bat);
+
+            if let Some(alloc) = result {
+                // If this was a cross-block soft-anchor reclaim, clear the
+                // old block's file_megabyte in BatState and write its BAT
+                // page to cache. The old block's trim is already durable
+                // (TrimmedBlockTracker is only populated after flush), so
+                // no extra flush is needed — just BAT write ordering.
+                if let Some(old_block) = alloc.unanchored_block {
+                    let old_mapping = self.bat.get_block_mapping(old_block);
+                    let cleared_mapping = BlockMapping::new()
+                        .with_bat_state(old_mapping.bat_state())
+                        .with_transitioning_to_fully_present(false)
+                        .with_file_megabyte(0);
+                    self.bat.set_block_mapping(old_block, cleared_mapping);
+
+                    // Write old block's BAT page to cache (async).
+                    // LOCK AUDIT: allocation_lock held.
+                    self.bat
+                        .write_block_mapping(
+                            &self.cache,
+                            BlockType::Payload,
+                            old_block,
+                            cleared_mapping,
+                            None,
+                        )
+                        .await?;
+                }
+
+                if flags.zero() && !alloc.state.is_zero() {
+                    // Space from pool/anchor may contain stale data — zero it.
+                    self.file
+                        .zero_range(alloc.file_offset, size as u64)
+                        .await
+                        .map_err(|e| VhdxIoErrorInner::ZeroBlock {
+                            err: e,
+                            file_offset: alloc.file_offset,
+                        })?;
+                }
+                return Ok(alloc);
+            }
+
+            // Priority 4: extend EOF.
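+            // required_file_length grows the file by at least
+            // `eof_extension_length` beyond its current length, so the retry
+            // at the bottom of the loop is guaranteed to succeed from
+            // near-EOF space (priority 2).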
+ let block_alignment = self.free_space.block_alignment(); + let target = eof.required_file_length(block_alignment, size, flags.aligned()); + // LOCK AUDIT: bat_state read-lock dropped (end of block above). allocation_lock held (async Mutex — OK across .await). + self.file + .set_file_size(target) + .await + .map_err(|e| VhdxIoErrorInner::ExtendFile { + err: e, + target_file_size: target, + })?; + eof.complete_file_extend(&self.free_space, target); + // Retry — will succeed from near-EOF space. + } + } + + /// Truncate the file to reclaim unused trailing space. + /// + /// Shrinks the file to just past the highest in-use offset, rounded + /// up to MB1. For fully-allocated (fixed) disks, reserves extra + /// space for blocks that haven't been allocated yet. + /// + /// Called during [`close()`](Self::close) after all WAL entries are + /// drained. Must NOT be called while the log task is running. + pub(crate) async fn truncate_file(&self) -> Result<(), VhdxIoError> { + let mut eof = self.allocation_lock.lock().await; + let target = self + .free_space + .truncate_target(&eof, self.is_fully_allocated()); + + // Only shrink, never grow. And don't bother if the savings + // are less than the EOF extension length (avoids thrashing + // on files that are close to their minimum size). + if target < eof.file_length && eof.file_length - target >= eof.eof_extension_length as u64 { + // Round up to MB1. + let target_aligned = (target + MB1 - 1) & !(MB1 - 1); + self.file.set_file_size(target_aligned).await.map_err(|e| { + VhdxIoErrorInner::TruncateFile { + err: e, + target_file_size: target_aligned, + } + })?; + self.free_space.apply_truncate(&mut eof, target_aligned); + } + + Ok(()) + } + + /// Compute the cache [`PageKey`] for the BAT page containing the given + /// payload block's entry. + /// + /// Used by crash-consistency tests to inspect `pre_log_fsn` on BAT pages. + #[cfg(test)] + pub(crate) fn bat_page_key_for_block(&self, block_number: u32) -> crate::cache::PageKey { + use crate::bat::BAT_TAG; + use crate::format::CACHE_PAGE_SIZE; + + let entry_index = self.bat.payload_entry_index(block_number); + let page_offset = (entry_index as u64 * 8) & !(CACHE_PAGE_SIZE - 1); + crate::cache::PageKey { + tag: BAT_TAG, + offset: page_offset, + } + } +} + +// --------------------------------------------------------------------------- +// Test-only helpers on FreeSpaceTracker +// --------------------------------------------------------------------------- + +#[cfg(test)] +impl FreeSpaceTracker { + /// Try to allocate space using priorities 1–3 (pool, near-EOF, anchored) + /// without a BAT state (skips priority 3). + pub fn try_allocate( + &self, + eof: &mut EofState, + size: u32, + aligned: bool, + ) -> Option { + self.try_allocate_inner(eof, size, aligned, None) + } + + /// Find and unanchor a soft-anchored block (in-memory only anchors). + pub fn find_and_unanchor_in_memory(&self, bat: &Bat) -> Option<(u64, u32)> { + let mut inner = self.inner.lock(); + inner.find_and_unanchor_in_memory_inner(bat) + } + + /// Check if a range is in use (for debug/validation). 
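For reference, the page-key computation in `bat_page_key_for_block` above is just the entry's byte offset (8 bytes per BAT entry) rounded down to a cache-page boundary. A worked example, assuming a 4 KiB cache page purely for illustration (the real value is `crate::format::CACHE_PAGE_SIZE`, defined elsewhere in this patch):

```rust
fn main() {
    // Assumed page size for illustration only; the real constant is
    // crate::format::CACHE_PAGE_SIZE.
    const PAGE: u64 = 4096;

    // Each BAT entry is 8 bytes. Entry 600 lives at byte 4800 within the
    // BAT region, so its cache page starts at 4096.
    let entry_index: u64 = 600;
    let page_offset = (entry_index * 8) & !(PAGE - 1);
    assert_eq!(page_offset, 4096);

    // With a 4 KiB page, entries 0..=511 share page 0; entry 512 starts page 1.
    assert_eq!((511u64 * 8) & !(PAGE - 1), 0);
    assert_eq!((512u64 * 8) & !(PAGE - 1), 4096);
}
```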
+ pub fn is_range_in_use(&self, eof: &EofState, offset: u64, length: u32) -> bool { + let inner = self.inner.lock(); + debug_assert!(offset.is_multiple_of(MB1)); + debug_assert!((length as u64).is_multiple_of(MB1)); + + if eof.file_length < offset || eof.file_length - offset < length as u64 { + return true; + } + + let bit_base = (offset / MB1) as usize; + let bit_count = length as usize / MB1 as usize; + !inner.free_space.bitmap.are_bits_set(bit_base, bit_count) + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + use crate::bat::BlockMapping; + use crate::format::BatEntryState; + + // -- Bitmap unit tests -- + + #[test] + fn bitmap_set_clear_range() { + let mut bm = SpaceBitmap::new(128); + assert!(bm.are_bits_clear(0, 128)); + + bm.set_range(10, 20); + assert!(bm.are_bits_set(10, 20)); + assert!(bm.are_bits_clear(0, 10)); + assert!(bm.are_bits_clear(30, 98)); + + bm.clear_range(15, 5); + assert!(bm.are_bits_set(10, 5)); + assert!(bm.are_bits_clear(15, 5)); + assert!(bm.are_bits_set(20, 10)); + } + + #[test] + fn bitmap_find_set_bits() { + let mut bm = SpaceBitmap::new(64); + // Create a run of 8 set bits starting at index 20. + bm.set_range(20, 8); + + assert_eq!(bm.find_set_bits(8, 0), Some(20)); + assert_eq!(bm.find_set_bits(8, 20), Some(20)); + assert_eq!(bm.find_set_bits(9, 0), None); + assert_eq!(bm.find_set_bits(1, 25), Some(25)); + } + + #[test] + fn bitmap_find_set_bits_wraps_hint() { + let mut bm = SpaceBitmap::new(64); + // Run at the beginning. + bm.set_range(0, 4); + + // Hint past the run — should wrap and find it. + assert_eq!(bm.find_set_bits(4, 50), Some(0)); + } + + /// Regression: find_set_bits must find a valid non-wrapping run at + /// the bitmap start even when the scan first encounters a wrapping + /// candidate that spans the bitmap end→start boundary. + /// + /// Bitmap (8 bits): [1,1,1,1,0,0,1,1] + /// ^-------^ valid run of 4 at index 0 + /// ^--^ bits 6-7 set + /// + /// With hint=5, the scan visits: 5(0),6(1),7(1),0(1),1(1) — a run + /// of 4 starting at index 6, but it wraps (6+4=10>8). After + /// rejecting the wrap, bits 0-3 must still be found as a valid run. + #[test] + fn bitmap_find_set_bits_rejected_wrap_finds_later_run() { + let mut bm = SpaceBitmap::new(8); + bm.set_range(0, 4); // bits 0,1,2,3 + bm.set_range(6, 2); // bits 6,7 + + // Hint=5: scan starts at 5, wraps, should find run at 0. + assert_eq!( + bm.find_set_bits(4, 5), + Some(0), + "should find non-wrapping run [0..4) after rejecting wrap at 6" + ); + } + + #[test] + fn bitmap_are_bits_set_clear() { + let mut bm = SpaceBitmap::new(32); + bm.set_all(); + assert!(bm.are_bits_set(0, 32)); + assert!(!bm.are_bits_clear(0, 32)); + + bm.clear_bit(16); + assert!(!bm.are_bits_set(0, 32)); + assert!(!bm.are_bits_set(16, 1)); + assert!(bm.are_bits_clear(16, 1)); + } + + #[test] + fn bitmap_empty_and_full() { + let bm_empty = SpaceBitmap::new(0); + assert_eq!(bm_empty.len(), 0); + assert_eq!(bm_empty.find_set_bits(1, 0), None); + + let mut bm = SpaceBitmap::new(1); + assert!(bm.are_bits_clear(0, 1)); + bm.set_bit(0); + assert!(bm.are_bits_set(0, 1)); + } + + // -- FreeSpaceTracker initialization tests -- + + /// Helper: create a tracker for a small test file. 
+ fn make_test_tracker(file_mb: u64, block_size_mb: u32) -> (FreeSpaceTracker, EofState) { + make_test_tracker_aligned(file_mb, block_size_mb, 0) + } + + fn make_test_tracker_aligned( + file_mb: u64, + block_size_mb: u32, + block_alignment: u32, + ) -> (FreeSpaceTracker, EofState) { + let file_length = file_mb * MB1; + let block_size = block_size_mb * MB1 as u32; + let data_block_count = 16; // arbitrary for testing + + FreeSpaceTracker::new( + file_length, + block_size, + block_alignment, + MB1, // header_area_size = 1 MB + MB1, // log_offset = 1 MB + MB1 as u32, // log_length = 1 MB + 2 * MB1, // bat_offset = 2 MB + MB1 as u32, // bat_length = 1 MB + 3 * MB1, // metadata_offset = 3 MB + MB1 as u32, // metadata_length = 1 MB + data_block_count, + ) + .unwrap() + } + + #[test] + fn init_marks_header_in_use() { + let (tracker, eof) = make_test_tracker(10, 2); + // Header area (0..1MB) should be in-use. + assert!(tracker.is_range_in_use(&eof, 0, MB1 as u32)); + } + + #[test] + fn init_marks_regions_in_use() { + let (tracker, eof) = make_test_tracker(10, 2); + // Log (1..2MB), BAT (2..3MB), metadata (3..4MB) should be in-use. + assert!(tracker.is_range_in_use(&eof, MB1, MB1 as u32)); + assert!(tracker.is_range_in_use(&eof, 2 * MB1, MB1 as u32)); + assert!(tracker.is_range_in_use(&eof, 3 * MB1, MB1 as u32)); + } + + #[test] + fn overlap_detection() { + let (tracker, mut eof) = make_test_tracker(10, 2); + // Try to mark the header area again — should fail with RangeCollision. + let result = tracker.mark_range_in_use(&mut eof, 0, MB1 as u32); + assert!(matches!(result, Err(CorruptionType::RangeCollision))); + } + + #[test] + fn range_beyond_eof_detected() { + let (tracker, mut eof) = make_test_tracker(10, 2); + // Try to mark a range that extends beyond file length. + let result = tracker.mark_range_in_use(&mut eof, 9 * MB1, 2 * MB1 as u32); + assert!(matches!(result, Err(CorruptionType::RangeBeyondEof))); + } + + // -- Allocation priority tests -- + + #[test] + fn allocate_from_free_pool() { + let (tracker, mut eof) = make_test_tracker(10, 2); + // Mark offset 4MB in-use (simulating BAT parse finding a block there). + tracker + .mark_range_in_use(&mut eof, 4 * MB1, MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + // Now zero_offset = 5*MB. Near-EOF = 5..10 MB (5 MB). + // Bit 4 is in-use (cleared). Release it back to pool. + tracker.release(4 * MB1, MB1 as u32); + + // Priority 1: should find the released space. + let result = tracker.try_allocate(&mut eof, MB1 as u32, false); + assert!(result.is_some()); + let r = result.unwrap(); + assert_eq!(r.file_offset, 4 * MB1); + assert!(!r.state.is_safe()); + } + + #[test] + fn allocate_from_eof_space() { + let (tracker, mut eof) = make_test_tracker(10, 2); + tracker.complete_initialization(&eof); + + // After initialization, zero_offset = 4*MB, file_length = 10*MB. + // Near-EOF space = 6 MB. + let result = tracker.try_allocate(&mut eof, 2 * MB1 as u32, false); + assert!(result.is_some()); + let r = result.unwrap(); + assert_eq!(r.file_offset, 4 * MB1); + assert!(r.state.is_safe()); // Beyond old zero_offset. + } + + #[test] + fn allocate_extends_eof() { + // Create a tracker with only 4MB (all in-use by regions). + let (tracker, mut eof) = make_test_tracker(4, 2); + tracker.complete_initialization(&eof); + + // No free space, no near-EOF space. + let result = tracker.try_allocate(&mut eof, MB1 as u32, false); + assert!(result.is_none()); + + // Compute required length and extend. 
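+        // required_file_length adds at least DEFAULT_EOF_EXTENSION_LENGTH on
+        // top of the current 4 MB file, which is why the assertion below only
+        // checks that the target exceeds 4 * MB1.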
+ let target = eof.required_file_length(tracker.block_alignment(), MB1 as u32, false); + assert!(target > 4 * MB1); + + eof.complete_file_extend(&tracker, target); + + // Now retry — should succeed from near-EOF. + let result = tracker.try_allocate(&mut eof, MB1 as u32, false); + assert!(result.is_some()); + assert!(result.unwrap().state.is_safe()); + } + + #[test] + fn allocate_alignment() { + // 20MB file, 4MB block size, 4MB alignment. + let (tracker, mut eof) = make_test_tracker_aligned(20, 4, 4 * MB1 as u32); + tracker.complete_initialization(&eof); + + // zero_offset = 4MB (after regions). + // Aligned allocation from EOF: should be at 4MB (already aligned). + let result = tracker.try_allocate(&mut eof, 4 * MB1 as u32, true); + assert!(result.is_some()); + let r = result.unwrap(); + assert_eq!(r.file_offset % (4 * MB1), 0); + } + + #[test] + fn allocate_sets_no_free_blocks_flag() { + let (tracker, mut eof) = make_test_tracker(10, 2); + tracker.complete_initialization(&eof); + + // Exhaust near-EOF space with pool allocations — first exhaust pool. + // After init, pool is empty (regions fill 0..4MB, rest is EOF space). + // Try pool-only: allocate 1MB from pool (should fail, and set flag). + // But near-EOF will succeed before we get to that. + // + // Instead, fill up all space and verify the flag works. + // Allocate all 6 MB of EOF space. + for _ in 0..6 { + tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + } + // Now no space left. + let result = tracker.try_allocate(&mut eof, MB1 as u32, false); + assert!(result.is_none()); + } + + // -- Soft anchoring tests -- + + /// Create a minimal `Bat` for soft-anchor tests with one anchored block. + /// + /// Uses 2 MiB block size, 512-byte sectors, no parent. The + /// `data_block_count` parameter controls how many payload entries + /// the BAT has. + fn make_test_bat_with_anchored_block( + block_number: u32, + file_megabyte: u32, + data_block_count: u32, + ) -> Bat { + let block_size = 2 * MB1 as u32; + let disk_size = data_block_count as u64 * block_size as u64; + let mut bat = Bat::new(disk_size, block_size, 512, false, MB1 as u32).unwrap(); + bat.init_test_payload_mappings(); + bat.set_block_mapping( + block_number, + BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_file_megabyte(file_megabyte), + ); + bat + } + + #[test] + fn mark_and_find_anchored_block() { + let (tracker, _eof) = make_test_tracker(20, 2); + // Mark block 3 as trimmed at file offset 6*MB. + tracker + .mark_trimmed_block(3, 6 * MB1, 2 * MB1 as u32) + .unwrap(); + + // Verify anchored space bits are set. + let inner = tracker.inner.lock(); + assert!(inner.anchored_space.bitmap.are_bits_set(6, 2)); + assert!(inner.trimmed_blocks.bitmap.check_bit(3)); + assert_eq!(inner.trimmed_blocks.num_trimmed_blocks, 1); + } + + #[test] + fn unmark_trimmed_block() { + let (tracker, _eof) = make_test_tracker(20, 2); + tracker + .mark_trimmed_block(3, 6 * MB1, 2 * MB1 as u32) + .unwrap(); + assert!(tracker.unmark_trimmed_block(3, 6 * MB1, 2 * MB1 as u32)); + + let inner = tracker.inner.lock(); + assert!(inner.anchored_space.bitmap.are_bits_clear(6, 2)); + assert!(!inner.trimmed_blocks.bitmap.check_bit(3)); + assert_eq!(inner.trimmed_blocks.num_trimmed_blocks, 0); + } + + #[test] + fn find_and_unanchor_in_memory() { + let (tracker, _eof) = make_test_tracker(20, 2); + // Mark block 5 as trimmed at file offset 8*MB. 
+ tracker + .mark_trimmed_block(5, 8 * MB1, 2 * MB1 as u32) + .unwrap(); + + let bat = make_test_bat_with_anchored_block(5, 8, 16); + + let result = tracker.find_and_unanchor_in_memory(&bat); + assert!(result.is_some()); + let (offset, block_num) = result.unwrap(); + assert_eq!(offset, 8 * MB1); + assert_eq!(block_num, 5); + + // After unanchoring, the trimmed block should be unmarked. + let inner = tracker.inner.lock(); + assert!(!inner.trimmed_blocks.bitmap.check_bit(5)); + assert_eq!(inner.trimmed_blocks.num_trimmed_blocks, 0); + } + + #[test] + fn anchored_space_before_eof_extend() { + // Set up a full file with no free pool and no EOF space, + // but with a soft-anchored block. + let (tracker, mut eof) = make_test_tracker(10, 2); + + // Mark block 2 as trimmed at offset 6*MB. + tracker + .mark_trimmed_block(2, 6 * MB1, 2 * MB1 as u32) + .unwrap(); + // Mark remaining free space as in-use so pool is empty. + tracker + .mark_range_in_use(&mut eof, 4 * MB1, MB1 as u32) + .unwrap(); + tracker + .mark_range_in_use(&mut eof, 5 * MB1, MB1 as u32) + .unwrap(); + tracker + .mark_range_in_use(&mut eof, 8 * MB1, 2 * MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + + let bat = make_test_bat_with_anchored_block(2, 6, 16); + + // Should find anchored space (priority 3) instead of extending EOF. + let result = tracker.try_allocate_with_bat(&mut eof, 2 * MB1 as u32, false, &bat); + assert!(result.is_some()); + let r = result.unwrap(); + assert_eq!(r.file_offset, 6 * MB1); + } + + // -- Release tests -- + + #[test] + fn release_then_reallocate() { + let (tracker, mut eof) = make_test_tracker(10, 2); + tracker.complete_initialization(&eof); + + // Allocate from EOF space. + let r1 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + let offset = r1.file_offset; + + // Release it back to free pool. + tracker.release(offset, MB1 as u32); + + // Allocate again — should reuse the released space. + let r2 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r2.file_offset, offset); + } + + // -- Truncation test -- + + #[test] + fn truncate_shrinks_bitmaps() { + let (tracker, mut eof) = make_test_tracker(10, 2); + tracker.complete_initialization(&eof); + + assert_eq!(eof.file_length, 10 * MB1); + + tracker.apply_truncate(&mut eof, 6 * MB1); + assert_eq!(eof.file_length, 6 * MB1); + } + + // -- Bitmap resize test -- + + #[test] + fn bitmap_resize_preserves_data() { + let mut bm = SpaceBitmap::new(32); + bm.set_range(10, 10); + + bm.resize(64); + assert_eq!(bm.len(), 64); + assert!(bm.are_bits_set(10, 10)); + assert!(bm.are_bits_clear(20, 44)); + + bm.resize(16); + assert_eq!(bm.len(), 16); + assert!(bm.are_bits_set(10, 6)); // only 10..16 remains + } + + // -- Priority cascade test -- + + #[test] + fn priority_cascade_pool_then_eof_then_anchor_then_extend() { + // Walk through all 4 priorities in sequence. + let (tracker, mut eof) = make_test_tracker(10, 2); + + // Mark 4..5 MB in-use (a data block during BAT parse). + tracker + .mark_range_in_use(&mut eof, 4 * MB1, MB1 as u32) + .unwrap(); + // Mark 5..7 MB in-use, then mark as soft-anchored (trimmed block 1). + // Always mark in-use first, then mark as trimmed. + tracker + .mark_range_in_use(&mut eof, 5 * MB1, 2 * MB1 as u32) + .unwrap(); + tracker + .mark_trimmed_block(1, 5 * MB1, 2 * MB1 as u32) + .unwrap(); + // Mark 7..8 MB in-use. + tracker + .mark_range_in_use(&mut eof, 7 * MB1, MB1 as u32) + .unwrap(); + + tracker.complete_initialization(&eof); + // zero_offset = 8 MB, file_length = 10 MB. 
+ // Pool: empty (all bits 0..8 are cleared). Near-EOF: 8..10 (2 MB). + + // Release bit 4 back to pool. + tracker.release(4 * MB1, MB1 as u32); + + // Create BAT for soft-anchor lookup. + let bat = make_test_bat_with_anchored_block(1, 5, 16); + + // Priority 1: pool (offset 4 MB). + let r1 = tracker + .try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r1.file_offset, 4 * MB1); + assert!(!r1.state.is_safe()); + + // Pool now empty. Priority 2: near-EOF (offset 8 MB). + let r2 = tracker + .try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r2.file_offset, 8 * MB1); + assert!(r2.state.is_safe()); + + // Take the second EOF MB too. + let r3 = tracker + .try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r3.file_offset, 9 * MB1); + assert!(r3.state.is_safe()); + + // Pool and EOF exhausted. Priority 3: soft-anchored (offset 5 MB). + // The block is 2 MB but we only need 1 MB — excess goes to pool. + let r4 = tracker + .try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r4.file_offset, 5 * MB1); + assert!(!r4.state.is_safe()); + + // The excess 1 MB from the anchored block should now be in pool. + let r5 = tracker + .try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r5.file_offset, 6 * MB1); + assert!(!r5.state.is_safe()); + + // Everything exhausted. Priority 4: returns None. + let r6 = tracker.try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat); + assert!(r6.is_none()); + + // Extend EOF, then retry. + let target = eof.required_file_length(tracker.block_alignment(), MB1 as u32, false); + eof.complete_file_extend(&tracker, target); + let r7 = tracker + .try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat) + .unwrap(); + assert!(r7.state.is_safe()); + assert_eq!(r7.file_offset, 10 * MB1); + } + + // -- Aligned allocation from pool test -- + + #[test] + fn aligned_alloc_from_pool() { + // 20 MB file, 4 MB block size, 4 MB alignment. + let (tracker, mut eof) = make_test_tracker_aligned(20, 4, 4 * MB1 as u32); + + // Mark 4..8 MB in-use, then release to create a 4MB pool hole at an aligned offset. + tracker + .mark_range_in_use(&mut eof, 4 * MB1, 4 * MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + tracker.release(4 * MB1, 4 * MB1 as u32); + + // Pool allocation ignores alignment (alignment only applies to near-EOF). + let result = tracker + .try_allocate(&mut eof, 4 * MB1 as u32, true) + .unwrap(); + assert_eq!(result.file_offset, 4 * MB1); + assert!(!result.state.is_safe()); + } + + // -- Unaligned EOF skip test -- + + #[test] + fn aligned_alloc_skips_unaligned_eof_offset() { + // 20 MB file, 4 MB block size, 4 MB alignment. + let (tracker, mut eof) = make_test_tracker_aligned(20, 4, 4 * MB1 as u32); + + // Mark 4..5 MB in-use. This pushes zero_offset to 5 MB (not 4MB-aligned). + tracker + .mark_range_in_use(&mut eof, 4 * MB1, MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + // zero_offset = 5 MB. Aligned to 4 MB → round up to 8 MB. + // So the allocation should come from offset 8 MB (skipping 5..8). 
+ + let result = tracker + .try_allocate(&mut eof, 4 * MB1 as u32, true) + .unwrap(); + assert_eq!(result.file_offset, 8 * MB1); + assert!(result.state.is_safe()); + } + + // -- Bitmap resize on file extend -- + + #[test] + fn complete_file_extend_grows_bitmaps() { + let (tracker, mut eof) = make_test_tracker(4, 2); + tracker.complete_initialization(&eof); + + // Bitmap should be 4 bits (4 MB / 1 MB). + { + let inner = tracker.inner.lock(); + assert!(inner.free_space.bitmap.len() >= 4); + } + + // Extend to 100 MB. + eof.complete_file_extend(&tracker, 100 * MB1); + assert_eq!(eof.file_length, 100 * MB1); + + { + let inner = tracker.inner.lock(); + // Bitmap must have grown to at least 100 bits. + assert!(inner.free_space.bitmap.len() >= 100); + assert!(inner.anchored_space.bitmap.len() >= 100); + } + + // Near-EOF space should now be available. + let result = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert!(result.state.is_safe()); + } + + // -- no_free_blocks flag reset on release -- + + #[test] + fn no_free_blocks_flag_resets_on_release() { + let (tracker, mut eof) = make_test_tracker(6, 2); + tracker.complete_initialization(&eof); + + // Exhaust all space: 2 MB of near-EOF (6-4=2). + tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert!(tracker.try_allocate(&mut eof, MB1 as u32, false).is_none()); + + // The no_free_blocks flag should be set now. + { + let inner = tracker.inner.lock(); + assert!(inner.free_space.no_free_blocks); + } + + // Release 1 MB back. + tracker.release(4 * MB1, MB1 as u32); + + // Flag should be cleared. + { + let inner = tracker.inner.lock(); + assert!(!inner.free_space.no_free_blocks); + } + + // Should be able to allocate again. + let result = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(result.file_offset, 4 * MB1); + assert!(!result.state.is_safe()); + } + + // -- Fragmented pool test -- + + #[test] + fn fragmented_pool_allocates_from_lowest_hint() { + let (tracker, mut eof) = make_test_tracker(20, 2); + // Mark a contiguous range 4..11 MB in-use during BAT parse. + tracker + .mark_range_in_use(&mut eof, 4 * MB1, 7 * MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + // zero_offset = 11 MB. Near-EOF = 11..20 (9 MB). + // Pool: empty (bits 0..11 all cleared). + + // Release scattered 1MB blocks to create fragmentation. + tracker.release(10 * MB1, MB1 as u32); + tracker.release(8 * MB1, MB1 as u32); + tracker.release(6 * MB1, MB1 as u32); + tracker.release(4 * MB1, MB1 as u32); + + // Pool should find the lowest free bit first. + let r1 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r1.file_offset, 4 * MB1); + + let r2 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r2.file_offset, 6 * MB1); + + let r3 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r3.file_offset, 8 * MB1); + + let r4 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r4.file_offset, 10 * MB1); + + // Pool exhausted — next allocation comes from near-EOF. + let r5 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r5.file_offset, 11 * MB1); + assert!(r5.state.is_safe()); + } + + // -- Multi-MB allocation from pool -- + + #[test] + fn pool_allocates_contiguous_multi_mb() { + let (tracker, mut eof) = make_test_tracker(20, 2); + // Mark a contiguous 4 MB region (bits 4..8) in-use, then release. 
+ tracker + .mark_range_in_use(&mut eof, 4 * MB1, 4 * MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + tracker.release(4 * MB1, 4 * MB1 as u32); + + // Now request a 3 MB allocation from pool — should find the 4MB hole. + let result = tracker + .try_allocate(&mut eof, 3 * MB1 as u32, false) + .unwrap(); + assert_eq!(result.file_offset, 4 * MB1); + assert!(!result.state.is_safe()); + + // 1 MB of the hole (bit 7) is still in pool. + let r2 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r2.file_offset, 7 * MB1); + } + + // -- Truncation clamps zero_offset -- + + #[test] + fn truncate_clamps_zero_offset() { + let (tracker, mut eof) = make_test_tracker(10, 2); + tracker.complete_initialization(&eof); + // zero_offset = 4 MB, file_length = 10 MB. + + // Allocate some EOF space to advance zero_offset. + tracker + .try_allocate(&mut eof, 3 * MB1 as u32, false) + .unwrap(); + assert_eq!(eof.zero_offset, 7 * MB1); + + // Truncate file to 5 MB. + tracker.apply_truncate(&mut eof, 5 * MB1); + assert_eq!(eof.file_length, 5 * MB1); + // zero_offset should be clamped to file_length. + assert!(eof.zero_offset <= 5 * MB1); + } + + // -- Multiple anchored blocks: only one reclaimed per allocate -- + + #[test] + fn multiple_anchored_blocks_reclaimed_one_at_a_time() { + let (tracker, mut eof) = make_test_tracker(20, 2); + // Mark anchored regions in-use first (standard sequence), + // then mark as trimmed. + tracker + .mark_range_in_use(&mut eof, 6 * MB1, 2 * MB1 as u32) + .unwrap(); + tracker + .mark_trimmed_block(2, 6 * MB1, 2 * MB1 as u32) + .unwrap(); + tracker + .mark_range_in_use(&mut eof, 10 * MB1, 2 * MB1 as u32) + .unwrap(); + tracker + .mark_trimmed_block(5, 10 * MB1, 2 * MB1 as u32) + .unwrap(); + + // Fill all remaining space so pool + EOF are empty. + tracker + .mark_range_in_use(&mut eof, 4 * MB1, 2 * MB1 as u32) + .unwrap(); + tracker + .mark_range_in_use(&mut eof, 8 * MB1, 2 * MB1 as u32) + .unwrap(); + tracker + .mark_range_in_use(&mut eof, 12 * MB1, 8 * MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + + // BAT with both blocks anchored. + let bat = make_test_bat_with_anchored_block(2, 6, 16); + bat.set_block_mapping( + 5, + BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_file_megabyte(10), + ); + + // First allocate gets block 2 (lowest block number). + let r1 = tracker + .try_allocate_with_bat(&mut eof, 2 * MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r1.file_offset, 6 * MB1); + assert!(!r1.state.is_safe()); + + // Second allocate gets block 5. + let r2 = tracker + .try_allocate_with_bat(&mut eof, 2 * MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r2.file_offset, 10 * MB1); + assert!(!r2.state.is_safe()); + + // No more anchored blocks. + assert!( + tracker + .try_allocate_with_bat(&mut eof, 2 * MB1 as u32, false, &bat) + .is_none() + ); + } + + // -- Anchored block larger than requested: excess goes to pool -- + + #[test] + fn anchored_block_excess_released_to_pool() { + let (tracker, mut eof) = make_test_tracker(10, 2); // block_size = 2 MB + // Anchor block 0 at offset 4..6 MB. + tracker + .mark_trimmed_block(0, 4 * MB1, 2 * MB1 as u32) + .unwrap(); + // Fill the rest. + tracker + .mark_range_in_use(&mut eof, 6 * MB1, 4 * MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + + let bat = make_test_bat_with_anchored_block(0, 4, 16); + + // Request only 1 MB from a 2 MB anchored block. 
+ let r = tracker + .try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r.file_offset, 4 * MB1); + + // The excess 1 MB (at offset 5 MB) should now be in the free pool. + let r2 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r2.file_offset, 5 * MB1); + assert!(!r2.state.is_safe()); + } + + // -- required_file_length respects alignment -- + + #[test] + fn required_file_length_with_alignment() { + let (tracker, eof) = make_test_tracker_aligned(4, 4, 4 * MB1 as u32); + tracker.complete_initialization(&eof); + // zero_offset = 4 MB (already aligned). + + let target = eof.required_file_length(tracker.block_alignment(), 4 * MB1 as u32, true); + // Should be at least file_length + extension_length. + assert!(target >= 4 * MB1 + DEFAULT_EOF_EXTENSION_LENGTH as u64); + // And aligned target should fit the request. + assert!(target >= 4 * MB1 + 4 * MB1); + } + + // -- Zero-length mark is a no-op -- + + #[test] + fn mark_zero_length_is_noop() { + let (tracker, mut eof) = make_test_tracker(10, 2); + assert!(tracker.mark_range_in_use(&mut eof, 4 * MB1, 0).is_ok()); + // The range should still be free. + assert!(!tracker.is_range_in_use(&eof, 4 * MB1, MB1 as u32)); + } +} diff --git a/vm/devices/storage/vhdx/src/tests/crash_tests.rs b/vm/devices/storage/vhdx/src/tests/crash_tests.rs new file mode 100644 index 0000000000..60d0b2aefb --- /dev/null +++ b/vm/devices/storage/vhdx/src/tests/crash_tests.rs @@ -0,0 +1,1132 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Crash consistency tests for the VHDX write path. +//! +//! Tests are organized into three categories: +//! 1. **Structural tests** — verify `pre_log_fsn` values on BAT pages +//! after allocation decisions. +//! 2. **Ordering tests** — verify flush barriers between data writes and +//! WAL writes via the write log. +//! 3. **End-to-end crash recovery tests** — simulate crashes with +//! `CrashTestFile` and verify replay recovers correctly. + +use crate::AsyncFile; +use crate::AsyncFileExt; +use crate::format; +use crate::open::VhdxFile; +use crate::tests::support::CrashTestFile; +use crate::tests::support::InMemoryFile; +use pal_async::DefaultDriver; +use pal_async::async_test; + +/// Helper: write a data pattern via the write path. +async fn write_pattern(vhdx: &VhdxFile, offset: u64, len: usize, value: u8) { + let write_buf = vec![value; len]; + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(offset, len as u32, &mut ranges) + .await + .unwrap(); + for range in &ranges { + match range { + crate::WriteRange::Data { + file_offset, + length, + .. + } => { + vhdx.file + .write_at(*file_offset, &write_buf[..(*length as usize)]) + .await + .unwrap(); + } + crate::WriteRange::Zero { + file_offset, + length, + } => { + let zeros = vec![0u8; *length as usize]; + vhdx.file.write_at(*file_offset, &zeros).await.unwrap(); + } + } + } + guard.complete().await.unwrap(); +} + +/// Helper: read data at a guest offset via the read path. 
+async fn read_pattern(vhdx: &VhdxFile, offset: u64, len: usize) -> Vec { + let mut buf = vec![0u8; len]; + let mut ranges = Vec::new(); + let _guard = vhdx + .resolve_read(offset, len as u32, &mut ranges) + .await + .unwrap(); + for range in &ranges { + match range { + crate::ReadRange::Data { + guest_offset, + file_offset, + length, + } => { + let start = (*guest_offset - offset) as usize; + let end = start + *length as usize; + vhdx.file + .read_at(*file_offset, &mut buf[start..end]) + .await + .unwrap(); + } + crate::ReadRange::Zero { + guest_offset, + length, + } => { + let start = (*guest_offset - offset) as usize; + let end = start + *length as usize; + buf[start..end].fill(0); + } + crate::ReadRange::Unmapped { .. } => {} + } + } + buf +} + +// ============================================================================= +// Structural tests: verify pre_log_fsn values on BAT pages +// ============================================================================= + +/// Near-EOF allocation: SpaceState::Zero → no pre_log_fsn on BAT page. +/// +/// First write to a new VHDX allocates from near-EOF space, which is +/// already zeroed (durable zeros). No flush barrier is needed. +#[async_test] +async fn bat_page_no_fsn_safe_near_eof(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write one full block (block_size is 1 MiB by default). + let block_size = vhdx.block_size as usize; + write_pattern(&vhdx, 0, block_size, 0xAA).await; + + // Check that the BAT page for block 0 has NO pre_log_fsn. + let page_key = vhdx.bat_page_key_for_block(0); + let fsn = vhdx.cache.get_pre_log_fsn(page_key); + assert_eq!( + fsn, None, + "near-EOF allocation should NOT set pre_log_fsn (SpaceState::Zero)" + ); + + vhdx.close().await.unwrap(); +} + +/// Free-pool reuse: SpaceState::CrossStale → pre_log_fsn set. +/// +/// Allocate a block, trim it (release to free pool), then allocate again. +/// The second allocation reuses free-pool space, which is NOT safe (contains +/// old data from the previously trimmed block). +#[async_test] +async fn bat_page_has_fsn_unsafe_free_pool(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Allocate block 0 with data. + write_pattern(&vhdx, 0, block_size as usize, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Trim block 0 to release its space to the free pool. + let trim_request = + crate::trim::TrimRequest::new(crate::trim::TrimMode::FreeSpace, 0, block_size); + vhdx.trim(trim_request).await.unwrap(); + vhdx.flush().await.unwrap(); + + // Now write to block 1 — space should come from free pool. + write_pattern(&vhdx, block_size, block_size as usize, 0xBB).await; + + // The BAT page for block 1 should have a pre_log_fsn set because + // the allocation reused free-pool space (SpaceState::CrossStale (unsafe)). + let page_key = vhdx.bat_page_key_for_block(1); + let fsn = vhdx.cache.get_pre_log_fsn(page_key); + + assert!( + fsn.is_some(), + "free-pool reuse should set pre_log_fsn (SpaceState::CrossStale)" + ); + + vhdx.close().await.unwrap(); +} + +/// Overwrite existing FullyPresent block: no allocation → no pre_log_fsn. +/// +/// Writing to an already-allocated block should not set any FSN constraint +/// because the BAT entry doesn't change. 
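Taken together, these structural tests pin down one rule: a BAT page needs a `pre_log_fsn` barrier only when the allocation behind it reuses space that may still hold another block's stale data. A compact restatement of that rule, using hypothetical names rather than this patch's `SpaceState`:

```rust
/// Hypothetical stand-in for the allocation outcomes these tests exercise.
#[derive(Clone, Copy)]
enum AllocKind {
    NearEof,      // durable zeros past zero_offset
    OwnStale,     // the block's own previous space (soft-anchor reclaim)
    CrossStale,   // free-pool space last owned by a different block
    NoAllocation, // overwrite of an already-present block
}

/// The rule the structural tests encode: only cross-block stale space
/// forces a flush barrier before the BAT page may be written.
fn needs_pre_log_fsn(kind: AllocKind) -> bool {
    matches!(kind, AllocKind::CrossStale)
}

fn main() {
    assert!(!needs_pre_log_fsn(AllocKind::NearEof)); // bat_page_no_fsn_safe_near_eof
    assert!(needs_pre_log_fsn(AllocKind::CrossStale)); // bat_page_has_fsn_unsafe_free_pool
    assert!(!needs_pre_log_fsn(AllocKind::NoAllocation)); // bat_page_no_fsn_existing_block
    assert!(!needs_pre_log_fsn(AllocKind::OwnStale)); // bat_page_no_fsn_safe_soft_anchor
}
```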
+#[async_test] +async fn bat_page_no_fsn_existing_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as usize; + + // First write allocates the block. + write_pattern(&vhdx, 0, block_size, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Second write overwrites the same block — no new allocation. + write_pattern(&vhdx, 0, block_size, 0xBB).await; + + // BAT page should have no pre_log_fsn (no allocation happened). + let page_key = vhdx.bat_page_key_for_block(0); + let fsn = vhdx.cache.get_pre_log_fsn(page_key); + assert_eq!( + fsn, None, + "overwrite of existing block should NOT set pre_log_fsn" + ); + + vhdx.close().await.unwrap(); +} + +/// Soft-anchor reclaim: SpaceState::Zero → no pre_log_fsn. +/// +/// Trim a block with FileSpace mode (creates soft anchor), then write the +/// same block again. The allocation reclaims the soft-anchored space, which +/// is the block's own old data — SpaceState::OwnStale (safe). +#[async_test] +async fn bat_page_no_fsn_safe_soft_anchor(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Allocate block 0 with data. + write_pattern(&vhdx, 0, block_size as usize, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Trim block 0 with FileSpace mode — creates a soft anchor. + let trim_request = + crate::trim::TrimRequest::new(crate::trim::TrimMode::FileSpace, 0, block_size); + vhdx.trim(trim_request).await.unwrap(); + vhdx.flush().await.unwrap(); + + // Write to block 0 again — should reclaim the soft-anchored space. + write_pattern(&vhdx, 0, block_size as usize, 0xBB).await; + + // The BAT page for block 0 should have NO pre_log_fsn because the + // allocation reused the block's own old space (SpaceState::OwnStale (safe)). + let page_key = vhdx.bat_page_key_for_block(0); + let fsn = vhdx.cache.get_pre_log_fsn(page_key); + assert_eq!( + fsn, None, + "soft-anchor reclaim should NOT set pre_log_fsn (SpaceState::Zero)" + ); + + vhdx.close().await.unwrap(); +} + +/// Partial write triggering non-TFP allocation with unsafe space → +/// pre_log_fsn set. +/// +/// Write a partial block (less than full block) where the allocation +/// comes from the free pool. Since the space contains stale data from +/// another block, SpaceState::CrossStale (unsafe) → the non-TFP path sets pre_log_fsn. +#[async_test] +async fn bat_page_has_fsn_partial_unsafe(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Allocate block 0 with data. + write_pattern(&vhdx, 0, block_size as usize, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Trim block 0 to release its space to the free pool. + let trim_request = + crate::trim::TrimRequest::new(crate::trim::TrimMode::FreeSpace, 0, block_size); + vhdx.trim(trim_request).await.unwrap(); + vhdx.flush().await.unwrap(); + + // Write a partial block at block 1 (less than full block_size). This + // triggers the non-TFP allocation path. Space comes from free pool → + // SpaceState::CrossStale (unsafe). + let partial_size = 4096; + write_pattern(&vhdx, block_size, partial_size, 0xCC).await; + + // The BAT page for block 1 should have pre_log_fsn set. 
+ let page_key = vhdx.bat_page_key_for_block(1); + let fsn = vhdx.cache.get_pre_log_fsn(page_key); + assert!( + fsn.is_some(), + "partial write with unsafe free-pool space should set pre_log_fsn" + ); + + vhdx.close().await.unwrap(); +} + +/// Partial write with safe space → no pre_log_fsn. +/// +/// A partial write to a new block allocated from near-EOF (safe) space +/// should NOT set pre_log_fsn, because near-EOF space contains durable zeros. +#[async_test] +async fn bat_page_no_fsn_partial_safe(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Partial write to block 0 (less than full block). Space comes from + // near-EOF → SpaceState::OwnStale (safe). + let partial_size = 4096; + write_pattern(&vhdx, 0, partial_size, 0xDD).await; + + // The BAT page for block 0 should have NO pre_log_fsn. + let page_key = vhdx.bat_page_key_for_block(0); + let fsn = vhdx.cache.get_pre_log_fsn(page_key); + assert_eq!( + fsn, None, + "partial write with safe near-EOF space should NOT set pre_log_fsn" + ); + + vhdx.close().await.unwrap(); +} + +// ============================================================================= +// End-to-end crash recovery tests using CrashTestFile +// ============================================================================= + +/// Write + flush → crash → replay recovers data. +#[async_test] +async fn crash_after_flush_data_survives(driver: DefaultDriver) { + // Create a VHDX on InMemoryFile first, then transfer to CrashTestFile. + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + + // Write one block of data. + let block_size = vhdx.block_size as usize; + write_pattern(&vhdx, 0, block_size, 0xAB).await; + + // Flush to make data durable. + vhdx.flush().await.unwrap(); + + // Crash — get durable state, then abort (ensures log task exits). + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // Reopen from durable state (log replay will happen). + let recovered_file = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered_file) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Verify data survived. + let read_buf = read_pattern(&vhdx2, 0, block_size).await; + assert!( + read_buf.iter().all(|&b| b == 0xAB), + "data should survive crash after flush" + ); +} + +/// Write without flush → crash → data lost. +#[async_test] +async fn crash_no_flush_data_lost(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as usize; + + // Write but do NOT flush. + write_pattern(&vhdx, 0, block_size, 0xCD).await; + + // Crash — get durable state. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // Reopen from durable state. + let recovered_file = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered_file) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Data should be lost (read as zeros for unallocated block). 
+ let read_buf = read_pattern(&vhdx2, 0, block_size).await; + assert!( + read_buf.iter().all(|&b| b == 0), + "data should be lost without flush before crash" + ); +} + +/// Write + flush + close → reopen → clean (no replay needed). +#[async_test] +async fn clean_close_no_replay(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as usize; + + write_pattern(&vhdx, 0, block_size, 0xEE).await; + vhdx.flush().await.unwrap(); + + // Close cleanly. + let file_ref = vhdx.file.clone(); + vhdx.close().await.unwrap(); + let durable = file_ref.durable_snapshot(); + + // Reopen — should NOT need log replay. + let recovered_file = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered_file) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let read_buf = read_pattern(&vhdx2, 0, block_size).await; + assert!( + read_buf.iter().all(|&b| b == 0xEE), + "data should survive clean close + reopen" + ); +} + +/// Crash after flush, reopen, write more, flush, crash again → +/// both rounds of data survive. +#[async_test] +async fn crash_recovery_then_more_writes(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + let block_size: u64; + + // Round 1: write, flush, crash. + let durable1 = { + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + block_size = vhdx.block_size as u64; + + write_pattern(&vhdx, 0, block_size as usize, 0x11).await; + vhdx.flush().await.unwrap(); + + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + durable + }; + + // Round 2: recover, write more, flush, crash again. + let durable2 = { + let crash_file = CrashTestFile::from_durable(durable1); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + + write_pattern(&vhdx, block_size, block_size as usize, 0x22).await; + vhdx.flush().await.unwrap(); + + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + durable + }; + + // Verify both rounds of data survive. + let recovered_file = InMemoryFile::from_snapshot(durable2); + let vhdx = VhdxFile::open(recovered_file) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let buf0 = read_pattern(&vhdx, 0, block_size as usize).await; + assert!( + buf0.iter().all(|&b| b == 0x11), + "round 1 data should survive" + ); + + let buf1 = read_pattern(&vhdx, block_size, block_size as usize).await; + assert!( + buf1.iter().all(|&b| b == 0x22), + "round 2 data should survive" + ); +} + +/// Multiple blocks → flush → crash → all survive. +#[async_test] +async fn crash_multi_block_all_survive(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Write three blocks with different patterns. + write_pattern(&vhdx, 0, block_size as usize, 0xAA).await; + write_pattern(&vhdx, block_size, block_size as usize, 0xBB).await; + write_pattern(&vhdx, block_size * 2, block_size as usize, 0xCC).await; + + vhdx.flush().await.unwrap(); + + // Crash. 
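+    // (durable_snapshot() captures only the bytes CrashTestFile has seen
+    // flushed, and abort() stops the log task without writing a clean header,
+    // so together they model a power loss at this point.)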
+ let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // Verify all three blocks survived. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let buf0 = read_pattern(&vhdx, 0, block_size as usize).await; + assert!(buf0.iter().all(|&b| b == 0xAA)); + let buf1 = read_pattern(&vhdx, block_size, block_size as usize).await; + assert!(buf1.iter().all(|&b| b == 0xBB)); + let buf2 = read_pattern(&vhdx, block_size * 2, block_size as usize).await; + assert!(buf2.iter().all(|&b| b == 0xCC)); +} + +/// Sequential writes with flushes → crash after second flush → +/// first two blocks survive, third (unflushed) lost. +#[async_test] +async fn crash_interleaved_flush_partial(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Write block 0, flush. + write_pattern(&vhdx, 0, block_size as usize, 0x11).await; + vhdx.flush().await.unwrap(); + + // Write block 1, flush. + write_pattern(&vhdx, block_size, block_size as usize, 0x22).await; + vhdx.flush().await.unwrap(); + + // Write block 2, do NOT flush. + write_pattern(&vhdx, block_size * 2, block_size as usize, 0x33).await; + + // Crash. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Block 0 and 1 should survive. + let buf0 = read_pattern(&vhdx, 0, block_size as usize).await; + assert!(buf0.iter().all(|&b| b == 0x11), "block 0 should survive"); + let buf1 = read_pattern(&vhdx, block_size, block_size as usize).await; + assert!(buf1.iter().all(|&b| b == 0x22), "block 1 should survive"); + + // Block 2 should be lost (zeros). + let buf2 = read_pattern(&vhdx, block_size * 2, block_size as usize).await; + assert!( + buf2.iter().all(|&b| b == 0), + "block 2 (unflushed) should be lost" + ); +} + +/// Large write spanning multiple blocks → flush → crash → all survive. +#[async_test] +async fn crash_spanning_write_survives(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Write 3 blocks at once (spanning write). + let total_len = (block_size * 3) as usize; + write_pattern(&vhdx, 0, total_len, 0xDD).await; + vhdx.flush().await.unwrap(); + + // Crash. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let read_buf = read_pattern(&vhdx, 0, total_len).await; + assert!( + read_buf.iter().all(|&b| b == 0xDD), + "spanning write data should survive crash after flush" + ); +} + +/// Write → flush → apply completes → crash (log_guid still set) → +/// replay is idempotent. +/// +/// After flush, the log task writes WAL entries and applies them to the +/// BAT region. If we crash at that point, log_guid is still set (close +/// never ran), so the next open replays the log. 
Since the entries are +/// already applied, replay is idempotent — the data should be correct. +#[async_test] +async fn crash_after_apply_replay_idempotent(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as usize; + + // Write two blocks with distinct patterns. + write_pattern(&vhdx, 0, block_size, 0xA1).await; + write_pattern(&vhdx, block_size as u64, block_size, 0xA2).await; + vhdx.flush().await.unwrap(); + + // Take a durable snapshot — log entries are applied by now. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // First replay — should succeed. + let recovered1 = InMemoryFile::from_snapshot(durable.clone()); + let vhdx1 = VhdxFile::open(recovered1) + .allow_replay(true) + .read_only() + .await + .unwrap(); + let buf0 = read_pattern(&vhdx1, 0, block_size).await; + assert!( + buf0.iter().all(|&b| b == 0xA1), + "block 0 after first replay" + ); + let buf1 = read_pattern(&vhdx1, block_size as u64, block_size).await; + assert!( + buf1.iter().all(|&b| b == 0xA2), + "block 1 after first replay" + ); + + // Second replay from the same durable snapshot — should be idempotent. + let recovered2 = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered2) + .allow_replay(true) + .read_only() + .await + .unwrap(); + let buf0b = read_pattern(&vhdx2, 0, block_size).await; + assert!( + buf0b.iter().all(|&b| b == 0xA1), + "block 0 after second replay (idempotent)" + ); + let buf1b = read_pattern(&vhdx2, block_size as u64, block_size).await; + assert!( + buf1b.iter().all(|&b| b == 0xA2), + "block 1 after second replay (idempotent)" + ); +} + +/// Overwrite same block → flush → crash → latest data survives. +#[async_test] +async fn crash_overwrite_latest_wins(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as usize; + + // Write block 0 with 0xAA, flush. + write_pattern(&vhdx, 0, block_size, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Overwrite block 0 with 0xBB, flush. + write_pattern(&vhdx, 0, block_size, 0xBB).await; + vhdx.flush().await.unwrap(); + + // Crash. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let read_buf = read_pattern(&vhdx, 0, block_size).await; + assert!( + read_buf.iter().all(|&b| b == 0xBB), + "latest overwrite should survive" + ); +} + +/// Crash without close → reopen → log_guid set → replay → correct. +#[async_test] +async fn drop_without_close_triggers_replay(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as usize; + + write_pattern(&vhdx, 0, block_size, 0xEE).await; + vhdx.flush().await.unwrap(); + + // Abort without close (simulates unclean shutdown). 
+    let durable = vhdx.file.durable_snapshot();
+    vhdx.abort().await;
+
+    // The header still has log_guid set (dirty file).
+    let recovered = InMemoryFile::from_snapshot(durable.clone());
+    // Opening read-only when dirty should fail with LogReplayRequired.
+    let result = VhdxFile::open(InMemoryFile::from_snapshot(durable.clone()))
+        .read_only()
+        .await;
+    assert!(result.is_err(), "read-only open of dirty file should fail");
+
+    // Open again with replay allowed — log replay should happen.
+    let vhdx2 = VhdxFile::open(recovered)
+        .allow_replay(true)
+        .read_only()
+        .await
+        .unwrap();
+    let read_buf = read_pattern(&vhdx2, 0, block_size).await;
+    assert!(
+        read_buf.iter().all(|&b| b == 0xEE),
+        "data should survive after log replay"
+    );
+}
+
+// =============================================================================
+// Ordering tests: verify flush barrier placement
+// =============================================================================
+
+/// Core ordering test: unsafe allocation → flush barrier between
+/// data write and WAL write.
+///
+/// We verify that a flush after a free-pool reuse allocation produces
+/// additional backing-file flushes (the barrier between the data writes and
+/// the subsequent WAL writes), observed via the file's flush counter.
+#[async_test]
+async fn flush_between_data_and_wal_unsafe(driver: DefaultDriver) {
+    let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let snapshot = mem_file.snapshot();
+
+    let crash_file = CrashTestFile::from_durable(snapshot);
+    let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap();
+    let block_size = vhdx.block_size as u64;
+
+    // Allocate block 0 (near-EOF, safe).
+    write_pattern(&vhdx, 0, block_size as usize, 0xAA).await;
+    vhdx.flush().await.unwrap();
+
+    // Trim block 0 to release to free pool.
+    let trim_request =
+        crate::trim::TrimRequest::new(crate::trim::TrimMode::FreeSpace, 0, block_size);
+    vhdx.trim(trim_request).await.unwrap();
+    vhdx.flush().await.unwrap();
+
+    // Record the flush count before the unsafe allocation.
+    let pre_flush_count = vhdx.file.flush_count();
+
+    // Write block 1 — should reuse free-pool space (unsafe).
+    write_pattern(&vhdx, block_size, block_size as usize, 0xBB).await;
+
+    // Flush — this triggers the log path.
+    vhdx.flush().await.unwrap();
+
+    // Post-flush count should be greater (at least one flush for data + one for WAL).
+    let post_flush_count = vhdx.file.flush_count();
+    assert!(
+        post_flush_count > pre_flush_count,
+        "should have flushed after unsafe allocation: pre={}, post={}",
+        pre_flush_count,
+        post_flush_count
+    );
+
+    vhdx.close().await.unwrap();
+}
+
+/// Safe allocation → no extra flush barrier needed.
+///
+/// Near-EOF allocations produce SpaceState::Zero, so no pre_log_fsn
+/// is set. The number of flushes should be minimal.
+#[async_test]
+async fn no_extra_flush_safe_allocation(driver: DefaultDriver) {
+    let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let snapshot = mem_file.snapshot();
+
+    let crash_file = CrashTestFile::from_durable(snapshot);
+    let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap();
+    let block_size = vhdx.block_size as u64;
+
+    // All allocations are near-EOF (safe). No extra flushes needed.
+    write_pattern(&vhdx, 0, block_size as usize, 0xAA).await;
+
+    // The BAT page should have no pre_log_fsn constraint.
+ let page_key = vhdx.bat_page_key_for_block(0); + let fsn = vhdx.cache.get_pre_log_fsn(page_key); + assert_eq!( + fsn, None, + "safe near-EOF allocation should not set pre_log_fsn" + ); + + vhdx.close().await.unwrap(); +} + +/// Multiple blocks in one write, mixed safe/unsafe → barrier present +/// for the unsafe block. +/// +/// Allocate block 0 (safe), trim it to free pool, then write a spanning +/// write covering block 1 (unsafe, from free pool) and block 2 (safe, +/// near-EOF). The unsafe block should have pre_log_fsn set. +#[async_test] +async fn mixed_safe_unsafe_has_barrier(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Allocate block 0. + write_pattern(&vhdx, 0, block_size as usize, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Trim block 0 to release to free pool. + let trim_request = + crate::trim::TrimRequest::new(crate::trim::TrimMode::FreeSpace, 0, block_size); + vhdx.trim(trim_request).await.unwrap(); + vhdx.flush().await.unwrap(); + + // Write a spanning write covering blocks 1 and 2. + // Block 1's space may come from the free pool (unsafe). + // Block 2's space comes from near-EOF (safe). + let total_len = (block_size * 2) as usize; + write_pattern(&vhdx, block_size, total_len, 0xCC).await; + + // At least one of the BAT pages should have pre_log_fsn set (the + // unsafe block). Check that the write_guard's needs_flush_before_log + // was set by verifying the BAT page for the unsafe block has FSN. + let page_key_1 = vhdx.bat_page_key_for_block(1); + let page_key_2 = vhdx.bat_page_key_for_block(2); + let fsn_1 = vhdx.cache.get_pre_log_fsn(page_key_1); + let fsn_2 = vhdx.cache.get_pre_log_fsn(page_key_2); + + // At least one block should have FSN (the one allocated from free pool). + // Both blocks share the TFP path, so needs_flush_before_log is set for + // the entire WriteIoGuard. The FSN is applied to all TFP blocks in + // complete_write_inner. + assert!( + fsn_1.is_some() || fsn_2.is_some(), + "mixed safe/unsafe spanning write should set pre_log_fsn on at least one BAT page" + ); + + vhdx.close().await.unwrap(); +} + +// ============================================================================= +// Header update tests +// ============================================================================= + +/// After flush, header sequence_number has advanced. +#[async_test] +async fn flush_advances_header_sequence(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let seq_before = vhdx.header_state.sequence_number().await; + + write_pattern(&vhdx, 0, vhdx.block_size as usize, 0xAA).await; + vhdx.flush().await.unwrap(); + + // The enable_write_mode call during the first write bumps the sequence number. + let seq_after = vhdx.header_state.sequence_number().await; + assert!( + seq_after > seq_before, + "sequence number should advance after write: before={}, after={}", + seq_before, + seq_after + ); + + vhdx.close().await.unwrap(); +} + +/// close() writes clean header (log_guid = ZERO). 
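+///
+/// A zeroed log_guid tells the next opener there is nothing to replay, so the
+/// file can be opened read-only without requiring log replay.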
+#[async_test] +async fn close_header_is_clean(driver: DefaultDriver) { + use zerocopy::FromBytes; + + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let file_ref = vhdx.file.clone(); + + write_pattern(&vhdx, 0, vhdx.block_size as usize, 0xBB).await; + vhdx.flush().await.unwrap(); + vhdx.close().await.unwrap(); + + // Read both headers, find the current one (highest seq#). + let mut buf1 = vec![0u8; format::HEADER_SIZE as usize]; + file_ref + .read_at(format::HEADER_OFFSET_1, &mut buf1) + .await + .unwrap(); + let mut buf2 = vec![0u8; format::HEADER_SIZE as usize]; + file_ref + .read_at(format::HEADER_OFFSET_2, &mut buf2) + .await + .unwrap(); + + let h1 = format::Header::read_from_prefix(&buf1).ok().map(|(h, _)| h); + let h2 = format::Header::read_from_prefix(&buf2).ok().map(|(h, _)| h); + + let current = match (&h1, &h2) { + (Some(a), Some(b)) if b.sequence_number >= a.sequence_number => b, + (Some(a), _) => a, + (_, Some(b)) => b, + _ => panic!("no valid headers"), + }; + assert_eq!( + current.log_guid, + guid::Guid::ZERO, + "after close, current header should have log_guid = ZERO" + ); +} + +/// Header alternation: writes alternate between header slots 1 and 2. +#[async_test] +async fn headers_alternate_between_slots(driver: DefaultDriver) { + use zerocopy::FromBytes; + + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Read initial header state. + let mut buf1_init = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_1, &mut buf1_init) + .await + .unwrap(); + let h1_init = format::Header::read_from_prefix(&buf1_init) + .ok() + .map(|(h, _)| h); + let mut buf2_init = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_2, &mut buf2_init) + .await + .unwrap(); + let h2_init = format::Header::read_from_prefix(&buf2_init) + .ok() + .map(|(h, _)| h); + + let seq1_init = h1_init.as_ref().map_or(0, |h| h.sequence_number); + let seq2_init = h2_init.as_ref().map_or(0, |h| h.sequence_number); + + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let file_ref = vhdx.file.clone(); + + // After open_writable, one header slot was updated with log_guid. + let mut buf1_after = vec![0u8; format::HEADER_SIZE as usize]; + file_ref + .read_at(format::HEADER_OFFSET_1, &mut buf1_after) + .await + .unwrap(); + let h1_after = format::Header::read_from_prefix(&buf1_after) + .ok() + .map(|(h, _)| h); + + let mut buf2_after = vec![0u8; format::HEADER_SIZE as usize]; + file_ref + .read_at(format::HEADER_OFFSET_2, &mut buf2_after) + .await + .unwrap(); + let h2_after = format::Header::read_from_prefix(&buf2_after) + .ok() + .map(|(h, _)| h); + + let seq1_after = h1_after.as_ref().map_or(0, |h| h.sequence_number); + let seq2_after = h2_after.as_ref().map_or(0, |h| h.sequence_number); + + // One slot should have a higher sequence number than before. 
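+    // (VHDX keeps two header copies; an update overwrites the stale slot with
+    // a higher sequence number, so a torn header write can never clobber the
+    // current header.)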
+ let slot1_updated = seq1_after > seq1_init; + let slot2_updated = seq2_after > seq2_init; + assert!( + slot1_updated || slot2_updated, + "one header slot should be updated after open_writable" + ); + assert!( + !(slot1_updated && slot2_updated), + "only one header slot should be updated (alternation)" + ); + + vhdx.close().await.unwrap(); +} + +// ============================================================================= +// Deferred space reclaim tests +// ============================================================================= + +/// Trim block A (FileSpace), then write the same block again (same-block +/// reclaim from deferred list). The write should reuse A's offset without +/// needing a flush — OwnStale. +#[async_test] +async fn deferred_same_block_reclaim(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Write block 0. + write_pattern(&vhdx, 0, block_size as usize, 0xAA).await; + let original_offset = vhdx.bat.get_block_mapping(0).file_megabyte(); + assert!(original_offset > 0); + + // Trim block 0 with FileSpace (creates deferred anchor). + let trim_req = crate::trim::TrimRequest::new(crate::trim::TrimMode::FileSpace, 0, block_size); + vhdx.trim(trim_req).await.unwrap(); + + // Write block 0 again — should reclaim from deferred list (no flush). + write_pattern(&vhdx, 0, block_size as usize, 0xBB).await; + + // Block 0 should be FullyPresent at the same offset. + let new_offset = { + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), format::BatEntryState::FullyPresent); + mapping.file_megabyte() + }; + assert_eq!(original_offset, new_offset, "should reuse same offset"); + + // Verify data. + let buf = read_pattern(&vhdx, 0, block_size as usize).await; + assert!(buf.iter().all(|&b| b == 0xBB)); + + vhdx.close().await.unwrap(); +} + +/// Trim block A (FileSpace), crash before flush. On reopen, A should +/// still be FullyPresent with its data intact — the trim was never durable. +#[async_test] +async fn deferred_trim_crash_no_data_loss(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Write block 0. + write_pattern(&vhdx, 0, block_size as usize, 0xDD).await; + vhdx.flush().await.unwrap(); + + // Trim block 0 with FileSpace — deferred, NOT flushed. + let trim_req = crate::trim::TrimRequest::new(crate::trim::TrimMode::FileSpace, 0, block_size); + vhdx.trim(trim_req).await.unwrap(); + + // Crash — get durable state. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // Reopen from durable state. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Block 0 should still have its data (trim wasn't durable). + let buf = read_pattern(&vhdx2, 0, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == 0xDD), + "data should survive crash when trim wasn't flushed" + ); +} + +/// Trim block A (FileSpace), write block B using separate space, crash +/// before flush. A should keep its data, B's write should be lost. +/// No data teleportation. 
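+///
+/// ("Teleportation" would mean block B's BAT entry ends up pointing at space
+/// that still holds block A's bytes, so A's data reappears under B's address.)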
+#[async_test] +async fn deferred_no_teleportation_on_crash(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Write blocks 0 and 1. + write_pattern(&vhdx, 0, block_size as usize, 0x11).await; + write_pattern(&vhdx, block_size, block_size as usize, 0x22).await; + vhdx.flush().await.unwrap(); + + // Trim block 0 — deferred, not flushed. + let trim_req = crate::trim::TrimRequest::new(crate::trim::TrimMode::FileSpace, 0, block_size); + vhdx.trim(trim_req).await.unwrap(); + + // Write block 1 with new data — this uses block 1's existing offset + // (overwrite, no allocation needed). + write_pattern(&vhdx, block_size, block_size as usize, 0x33).await; + + // Do NOT flush. Crash. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // Reopen. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Block 0 should still have original data (trim wasn't durable). + let buf0 = read_pattern(&vhdx2, 0, block_size as usize).await; + assert!( + buf0.iter().all(|&b| b == 0x11), + "block 0 data should be intact after crash (trim not durable)" + ); + + // Block 1: may have old (0x22) or new (0x33) data depending on + // whether the overwrite was flushed. Either is acceptable. + // What is NOT acceptable: block 1 reading as 0x11 (block 0's data). + let buf1 = read_pattern(&vhdx2, block_size, block_size as usize).await; + assert!( + buf1.iter().all(|&b| b == 0x22) || buf1.iter().all(|&b| b == 0x33), + "block 1 should have its own data, not block 0's" + ); +} + +/// Trim + flush + write + flush + reopen: verify clean ownership. +#[async_test] +async fn deferred_trim_flush_write_flush_reopen(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Write block 0, flush. + write_pattern(&vhdx, 0, block_size as usize, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Trim block 0, flush (trim becomes durable). + let trim_req = crate::trim::TrimRequest::new(crate::trim::TrimMode::FileSpace, 0, block_size); + vhdx.trim(trim_req).await.unwrap(); + vhdx.flush().await.unwrap(); + + // Write block 0 again (same-block reclaim of durable anchor), flush. + write_pattern(&vhdx, 0, block_size as usize, 0xBB).await; + vhdx.flush().await.unwrap(); + + // Graceful close. + let durable = vhdx.file.durable_snapshot(); + vhdx.close().await.unwrap(); + + // Reopen and verify. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let buf = read_pattern(&vhdx2, 0, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == 0xBB), + "block 0 should have new data after trim+write+flush cycle" + ); +} diff --git a/vm/devices/storage/vhdx/src/tests/io_tests.rs b/vm/devices/storage/vhdx/src/tests/io_tests.rs new file mode 100644 index 0000000000..ec3746ad09 --- /dev/null +++ b/vm/devices/storage/vhdx/src/tests/io_tests.rs @@ -0,0 +1,3709 @@ +// Copyright (c) Microsoft Corporation. 
+// Licensed under the MIT License. + +use crate::AsyncFile; +use crate::AsyncFileExt; +use crate::create::{self, CreateParams}; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::format; +use crate::format::BatEntry; +use crate::format::BatEntryState; +use crate::format::MB1; +use crate::header::WriteMode; +use crate::io::ReadRange; +use crate::io::WriteRange; +use crate::open::VhdxFile; +use crate::region; +use crate::tests::support::InMemoryFile; +use crate::tests::support::IoInterceptor; +use guid::Guid; +use pal_async::DefaultDriver; +use pal_async::async_test; +use std::borrow::Borrow; +use std::future::Future; +use std::sync::Arc; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering; +use zerocopy::IntoBytes; + +#[async_test] +async fn read_empty_disk_returns_zero() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: 0, + length: 4096, + } + ); +} + +#[async_test] +async fn read_zero_length() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 0, &mut ranges).await.unwrap(); + + assert!(ranges.is_empty()); +} + +#[async_test] +async fn read_beyond_end_of_disk() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + // Read 512 bytes past the end (both offset and length are sector-aligned). + let result = vhdx + .resolve_read(format::GB1 - 512, 1024, &mut ranges) + .await; + assert!(matches!( + result, + Err(VhdxIoError(VhdxIoErrorInner::BeyondEndOfDisk)) + )); +} + +#[async_test] +async fn read_at_disk_end_exact() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx + .resolve_read(format::GB1 - 4096, 4096, &mut ranges) + .await + .unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: format::GB1 - 4096, + length: 4096, + } + ); +} + +#[async_test] +async fn read_fully_present_block() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + let bat_offset = regions.bat_offset; + + // Write a FullyPresent BAT entry for block 0 at file_offset_mb = 4. + let entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(4); + file.write_at(bat_offset, entry.as_bytes()).await.unwrap(); + + // Extend file to cover the allocated range. 
+ let needed = 4 * MB1 + format::DEFAULT_BLOCK_SIZE as u64; + file.set_file_size(needed).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Data { + guest_offset: 0, + length: 4096, + file_offset: 4 * MB1, + } + ); +} + +#[async_test] +async fn read_spanning_two_blocks() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let block_size = vhdx.block_size() as u64; + let mut ranges = Vec::new(); + // Read last 512 bytes of block 0 and first 512 bytes of block 1. + let _guard = vhdx + .resolve_read((block_size - 512) as u64, 1024, &mut ranges) + .await + .unwrap(); + + assert_eq!(ranges.len(), 2); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: block_size - 512, + length: 512, + } + ); + assert_eq!( + ranges[1], + ReadRange::Zero { + guest_offset: block_size, + length: 512, + } + ); +} + +#[async_test] +async fn read_spanning_multiple_blocks() { + // Use a small disk with 1 MiB blocks so spans are easier to test. + let file = InMemoryFile::new(0); + let block_size = MB1 as u32; + let mut params = CreateParams { + disk_size: 4 * MB1, + block_size, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + // Read across blocks 0, 1, 2: start at 512 KiB, length = 2 MiB. + // Block 0: 512 KiB remaining. Block 1: full 1 MiB. Block 2: 512 KiB. + let start = MB1 / 2; // middle of block 0 + let len = (2 * MB1) as u32; // spans 3 blocks + let _guard = vhdx.resolve_read(start, len, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 3); + // Block 0: remaining half + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: start, + length: (MB1 / 2) as u32, + } + ); + // Block 1: full block + assert_eq!( + ranges[1], + ReadRange::Zero { + guest_offset: MB1, + length: block_size, + } + ); + // Block 2: first half + assert_eq!( + ranges[2], + ReadRange::Zero { + guest_offset: 2 * MB1, + length: (MB1 / 2) as u32, + } + ); +} + +#[async_test] +async fn read_unaligned_within_block() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + // Set block 0 to FullyPresent at file_offset_mb = 4. + let entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(4); + file.write_at(regions.bat_offset, entry.as_bytes()) + .await + .unwrap(); + + // Extend file to cover the allocated range. + let needed = 4 * MB1 + format::DEFAULT_BLOCK_SIZE as u64; + file.set_file_size(needed).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let mut ranges = Vec::new(); + // Read 512 bytes starting at sector 10 (offset 5120). 
+ let _guard = vhdx.resolve_read(5120, 512, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Data { + guest_offset: 5120, + length: 512, + file_offset: 4 * MB1 + 5120, + } + ); +} + +#[async_test] +async fn read_differencing_not_present() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Unmapped { + guest_offset: 0, + length: 4096, + } + ); +} + +#[async_test] +async fn read_zero_state_block() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + // Set block 0 to Zero state. + let entry = BatEntry::new() + .with_state(BatEntryState::Zero as u8) + .with_file_offset_mb(0); + file.write_at(regions.bat_offset, entry.as_bytes()) + .await + .unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: 0, + length: 4096, + } + ); +} + +#[async_test] +async fn read_unmapped_block() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + // Set block 0 to Unmapped (trimmed) state. + let entry = BatEntry::new() + .with_state(BatEntryState::Unmapped as u8) + .with_file_offset_mb(0); + file.write_at(regions.bat_offset, entry.as_bytes()) + .await + .unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: 0, + length: 4096, + } + ); +} + +#[async_test] +async fn read_undefined_state_block() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + // Set block 0 to Undefined state (value 1). + let entry = BatEntry::new() + .with_state(BatEntryState::Undefined as u8) + .with_file_offset_mb(0); + file.write_at(regions.bat_offset, entry.as_bytes()) + .await + .unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: 0, + length: 4096, + } + ); +} + +#[async_test] +async fn read_entire_disk() { + // Small disk: 4 MiB with 2 MiB blocks = 2 blocks. + let disk_size = 4 * MB1; + let (file, _) = InMemoryFile::create_test_vhdx(disk_size).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx + .resolve_read(0, disk_size as u32, &mut ranges) + .await + .unwrap(); + + // 2 blocks, each produces one Zero range. 
+ assert_eq!(ranges.len(), 2); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: 0, + length: format::DEFAULT_BLOCK_SIZE, + } + ); + assert_eq!( + ranges[1], + ReadRange::Zero { + guest_offset: format::DEFAULT_BLOCK_SIZE as u64, + length: format::DEFAULT_BLOCK_SIZE, + } + ); +} + +#[async_test] +async fn read_4k_sector_disk() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + logical_sector_size: 4096, + physical_sector_size: 4096, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + // Read one 4K sector. + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: 0, + length: 4096, + } + ); + + // Unaligned read should fail. + let mut ranges2 = Vec::new(); + let result = vhdx.resolve_read(512, 4096, &mut ranges2).await; + assert!(matches!( + result, + Err(VhdxIoError(VhdxIoErrorInner::UnalignedIo)) + )); +} + +// ---- Write tests ---- + +#[async_test] +async fn write_to_empty_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_write(0, 4096, &mut ranges).await.unwrap(); + + // Should allocate a new block. With SpaceState::Zero (near-EOF + // extension space), zero padding is skipped — only Data emitted. + // Writing 4096 bytes at offset 0 in block: + // Data(0, 4096, file_offset) + assert!(!ranges.is_empty()); + // First should be Data + match ranges[0] { + WriteRange::Data { + guest_offset, + length, + file_offset, + } => { + assert_eq!(guest_offset, 0); + assert_eq!(length, 4096); + // file_offset should be MB-aligned. + assert!(file_offset > 0); + assert_eq!(file_offset % MB1, 0); + } + _ => panic!("expected Data range, got {:?}", ranges[0]), + } + // With safe data, trailing zero padding is skipped. + // If not safe, a trailing Zero range would follow. + if ranges.len() > 1 { + match ranges[1] { + WriteRange::Zero { + file_offset, + length, + } => { + assert_eq!(length, format::DEFAULT_BLOCK_SIZE - 4096); + assert!(file_offset > 0); + } + _ => panic!("expected Zero range, got {:?}", ranges[1]), + } + } +} + +#[async_test] +async fn write_to_fully_present_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + // Write a FullyPresent BAT entry for block 0 at file_offset_mb = 4. + let entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(4); + file.write_at(regions.bat_offset, entry.as_bytes()) + .await + .unwrap(); + + // Extend file to cover the allocated range. + let needed = 4 * MB1 + format::DEFAULT_BLOCK_SIZE as u64; + file.set_file_size(needed).await.unwrap(); + + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_write(0, 4096, &mut ranges).await.unwrap(); + + // Should write directly to the existing block — single Data range. 
+    assert_eq!(ranges.len(), 1);
+    assert_eq!(
+        ranges[0],
+        WriteRange::Data {
+            guest_offset: 0,
+            length: 4096,
+            file_offset: 4 * MB1,
+        }
+    );
+}
+
+#[async_test]
+async fn write_spanning_two_blocks(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    let block_size = vhdx.block_size() as u64;
+    let mut ranges = Vec::new();
+    // Write last 512 bytes of block 0 and first 512 bytes of block 1.
+    let _guard = vhdx
+        .resolve_write((block_size - 512) as u64, 1024, &mut ranges)
+        .await
+        .unwrap();
+
+    // Each block needs allocation. Filter out the data ranges.
+    let data_ranges: Vec<_> = ranges
+        .iter()
+        .filter(|r| matches!(r, WriteRange::Data { .. }))
+        .collect();
+    assert_eq!(data_ranges.len(), 2, "expected 2 Data ranges for 2 blocks");
+
+    // First Data: last 512 bytes of block 0.
+    match data_ranges[0] {
+        WriteRange::Data {
+            guest_offset,
+            length,
+            ..
+        } => {
+            assert_eq!(*guest_offset, block_size - 512);
+            assert_eq!(*length, 512);
+        }
+        _ => unreachable!(),
+    }
+    // Second Data: first 512 bytes of block 1.
+    match data_ranges[1] {
+        WriteRange::Data {
+            guest_offset,
+            length,
+            ..
+        } => {
+            assert_eq!(*guest_offset, block_size);
+            assert_eq!(*length, 512);
+        }
+        _ => unreachable!(),
+    }
+}
+
+#[async_test]
+async fn write_then_read_roundtrip(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    // Step 1: resolve_write to get file offsets.
+    let mut write_ranges = Vec::new();
+    let guard = vhdx.resolve_write(0, 512, &mut write_ranges).await.unwrap();
+
+    // Step 2: Write actual data at the returned Data offsets.
+    let pattern: Vec<u8> = (0..512u16).map(|i| (i % 256) as u8).collect();
+    for wr in &write_ranges {
+        match wr {
+            WriteRange::Data {
+                file_offset,
+                length,
+                ..
+            } => {
+                vhdx.file
+                    .write_at(*file_offset, &pattern[..(*length as usize)])
+                    .await
+                    .unwrap();
+            }
+            WriteRange::Zero {
+                file_offset,
+                length,
+            } => {
+                let zeros = vec![0u8; *length as usize];
+                vhdx.file.write_at(*file_offset, &zeros).await.unwrap();
+            }
+        }
+    }
+
+    // Step 3: complete via guard.
+    guard.complete().await.unwrap();
+
+    // Step 4: resolve_read at the same offset.
+    let mut read_ranges = Vec::new();
+    let _guard = vhdx.resolve_read(0, 512, &mut read_ranges).await.unwrap();
+
+    // Should now be Data (block was allocated).
+    assert_eq!(read_ranges.len(), 1);
+    match &read_ranges[0] {
+        ReadRange::Data {
+            file_offset,
+            length,
+            ..
+        } => {
+            assert_eq!(*length, 512);
+            let mut buf = vec![0u8; 512];
+            vhdx.file.read_at(*file_offset, &mut buf).await.unwrap();
+            assert_eq!(buf, pattern);
+        }
+        other => panic!("expected Data read range, got {:?}", other),
+    }
+}
+
+#[async_test]
+async fn write_partial_block(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    // Write 512 bytes at offset 4096 within block 0.
+    let mut ranges = Vec::new();
+    let _guard = vhdx.resolve_write(4096, 512, &mut ranges).await.unwrap();
+
+    // With safe data (near-EOF or extension space), Zero padding is
+    // skipped. Only expect the Data range.
+    // Without safe data, we'd see: Zero(leading 4096), Data(512), Zero(trailing).
+    assert!(!ranges.is_empty());
+    // Find the Data range.
+    let data_range = ranges
+        .iter()
+        .find(|r| matches!(r, WriteRange::Data { ..
})) + .expect("expected at least one Data range"); + match data_range { + WriteRange::Data { + guest_offset, + length, + .. + } => { + assert_eq!(*guest_offset, 4096); + assert_eq!(*length, 512); + } + _ => unreachable!(), + } +} + +#[async_test] +async fn write_full_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write exactly one full block (no padding needed). + let mut ranges = Vec::new(); + let _guard = vhdx + .resolve_write(0, format::DEFAULT_BLOCK_SIZE, &mut ranges) + .await + .unwrap(); + + // Should be exactly one Data range — no zero padding. + assert_eq!(ranges.len(), 1); + match ranges[0] { + WriteRange::Data { + guest_offset, + length, + file_offset, + } => { + assert_eq!(guest_offset, 0); + assert_eq!(length, format::DEFAULT_BLOCK_SIZE); + assert!(file_offset > 0); + assert_eq!(file_offset % MB1, 0); + } + _ => panic!("expected Data range, got {:?}", ranges[0]), + } +} + +#[async_test] +async fn write_zero_length(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_write(0, 0, &mut ranges).await.unwrap(); + assert!(ranges.is_empty()); +} + +#[async_test] +async fn write_beyond_end_of_disk(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let mut ranges = Vec::new(); + let result = vhdx + .resolve_write(format::GB1 - 512, 1024, &mut ranges) + .await; + assert!(matches!( + result, + Err(VhdxIoError(VhdxIoErrorInner::BeyondEndOfDisk)) + )); +} + +#[async_test] +async fn write_read_only() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + let result = vhdx.resolve_write(0, 4096, &mut ranges).await; + assert!(matches!( + result, + Err(VhdxIoError(VhdxIoErrorInner::ReadOnly)) + )); +} + +#[async_test] +async fn write_large_spanning_many_blocks(driver: DefaultDriver) { + // 4 MiB disk with 1 MiB blocks → 4 blocks. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: 4 * MB1, + block_size: MB1 as u32, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write 3 MiB starting at offset 512 KiB (spans blocks 0,1,2,3). + let start = MB1 / 2; + let length = (3 * MB1) as u32; + let mut ranges = Vec::new(); + let _guard = vhdx + .resolve_write(start, length, &mut ranges) + .await + .unwrap(); + + let data_ranges: Vec<_> = ranges + .iter() + .filter(|r| matches!(r, WriteRange::Data { .. })) + .collect(); + // Should span 4 blocks: partial block 0, full block 1, full block 2, partial block 3. + assert_eq!(data_ranges.len(), 4); + + // Verify guest offsets and lengths. + let block_size = MB1; + match data_ranges[0] { + WriteRange::Data { + guest_offset, + length, + .. + } => { + assert_eq!(*guest_offset, start); + assert_eq!(*length as u64, block_size - start); + } + _ => unreachable!(), + } + match data_ranges[1] { + WriteRange::Data { + guest_offset, + length, + .. 
+ } => { + assert_eq!(*guest_offset, block_size); + assert_eq!(*length as u64, block_size); + } + _ => unreachable!(), + } + match data_ranges[2] { + WriteRange::Data { + guest_offset, + length, + .. + } => { + assert_eq!(*guest_offset, 2 * block_size); + assert_eq!(*length as u64, block_size); + } + _ => unreachable!(), + } + match data_ranges[3] { + WriteRange::Data { + guest_offset, + length, + .. + } => { + assert_eq!(*guest_offset, 3 * block_size); + assert_eq!(*length as u64, start); // remaining half of last block + } + _ => unreachable!(), + } +} + +#[async_test] +async fn first_write_updates_header(driver: DefaultDriver) { + let (file, params) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let original_data_guid = params.data_write_guid; + assert_eq!(vhdx.data_write_guid(), original_data_guid); + + // Perform a write — this triggers enable_write_mode. + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_write(0, 512, &mut ranges).await.unwrap(); + + // data_write_guid should have changed. + let new_data_guid = vhdx.data_write_guid(); + assert_ne!(new_data_guid, original_data_guid); + assert_ne!(new_data_guid, Guid::ZERO); +} + +#[async_test] +async fn second_write_no_header_update(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // First write — triggers header update. + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_write(0, 512, &mut ranges).await.unwrap(); + let guid_after_first = vhdx.data_write_guid(); + + // Second write — should NOT update header again. + let mut ranges2 = Vec::new(); + let _guard2 = vhdx.resolve_write(512, 512, &mut ranges2).await.unwrap(); + let guid_after_second = vhdx.data_write_guid(); + + assert_eq!(guid_after_first, guid_after_second); +} + +#[async_test] +async fn file_writable_only_does_not_change_data_guid(driver: DefaultDriver) { + let (file, params) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let original_data_guid = params.data_write_guid; + + // Enable FileWritable mode (metadata-only modification). + vhdx.enable_write_mode(WriteMode::FileWritable) + .await + .unwrap(); + + // data_write_guid should NOT have changed. + assert_eq!(vhdx.data_write_guid(), original_data_guid); + + // But the write mode should be set (subsequent DataWritable will escalate). + assert_eq!( + vhdx.header_state.write_mode(), + Some(WriteMode::FileWritable) + ); +} + +// --- TFP mechanics, write integration, and error path tests --- + +/// Interceptor with toggleable failure for mid-test fault injection. 
+struct ToggleableInterceptor {
+    fail_writes: Arc<AtomicBool>,
+    fail_set_file_size: Arc<AtomicBool>,
+}
+
+impl IoInterceptor for ToggleableInterceptor {
+    fn before_write(&self, _offset: u64, _data: &[u8]) -> Result<(), std::io::Error> {
+        if self.fail_writes.load(Ordering::SeqCst) {
+            return Err(std::io::Error::other("injected write failure"));
+        }
+        Ok(())
+    }
+
+    fn before_set_file_size(&self, _size: u64) -> Result<(), std::io::Error> {
+        if self.fail_set_file_size.load(Ordering::SeqCst) {
+            return Err(std::io::Error::other("injected set_file_size failure"));
+        }
+        Ok(())
+    }
+}
+
+#[async_test]
+async fn resolve_write_sets_tfp_on_full_block(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+    let block_size = vhdx.block_size();
+
+    let mut ranges = Vec::new();
+    let _guard = vhdx
+        .resolve_write(0, block_size, &mut ranges)
+        .await
+        .unwrap();
+
+    // Full-block write should set TFP on block 0.
+    let mapping = vhdx.bat.get_block_mapping(0);
+    assert!(
+        mapping.transitioning_to_fully_present(),
+        "full-block resolve_write should set TFP"
+    );
+    assert!(
+        mapping.file_megabyte() > 0,
+        "allocated block should have non-zero file offset"
+    );
+}
+
+#[async_test]
+async fn resolve_write_no_tfp_on_partial_block(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    let mut ranges = Vec::new();
+    let _guard = vhdx.resolve_write(0, 512, &mut ranges).await.unwrap();
+
+    // Partial-block write should NOT set TFP — BAT committed immediately.
+    let mapping = vhdx.bat.get_block_mapping(0);
+    assert!(
+        !mapping.transitioning_to_fully_present(),
+        "partial-block resolve_write should not set TFP"
+    );
+    assert_eq!(
+        mapping.bat_state(),
+        BatEntryState::FullyPresent,
+        "partial allocation should set FullyPresent immediately"
+    );
+}
+
+#[async_test]
+async fn write_read_roundtrip_multi_block(driver: DefaultDriver) {
+    let file = InMemoryFile::new(0);
+    let mut params = CreateParams {
+        disk_size: 4 * MB1,
+        block_size: MB1 as u32,
+        ..Default::default()
+    };
+    create::create(&file, &mut params).await.unwrap();
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    let block_size = vhdx.block_size() as u64;
+    // Write 2 full blocks starting at offset 0.
+    let length = (2 * block_size) as u32;
+    let mut write_ranges = Vec::new();
+    let guard = vhdx
+        .resolve_write(0, length, &mut write_ranges)
+        .await
+        .unwrap();
+
+    // Write recognizable pattern to each Data range.
+    for wr in &write_ranges {
+        match wr {
+            WriteRange::Data {
+                guest_offset,
+                length,
+                file_offset,
+            } => {
+                let pattern: Vec<u8> = (0..*length)
+                    .map(|i| ((guest_offset + i as u64) % 251) as u8)
+                    .collect();
+                vhdx.file.write_at(*file_offset, &pattern).await.unwrap();
+            }
+            WriteRange::Zero {
+                file_offset,
+                length,
+            } => {
+                let zeros = vec![0u8; *length as usize];
+                vhdx.file.write_at(*file_offset, &zeros).await.unwrap();
+            }
+        }
+    }
+    guard.complete().await.unwrap();
+
+    // Read back both blocks.
+    let mut read_ranges = Vec::new();
+    let _guard = vhdx
+        .resolve_read(0, length, &mut read_ranges)
+        .await
+        .unwrap();
+
+    for rr in &read_ranges {
+        match rr {
+            ReadRange::Data {
+                guest_offset,
+                length,
+                file_offset,
+            } => {
+                let mut buf = vec![0u8; *length as usize];
+                vhdx.file.read_at(*file_offset, &mut buf).await.unwrap();
+                let expected: Vec<u8> = (0..*length)
+                    .map(|i| ((guest_offset + i as u64) % 251) as u8)
+                    .collect();
+                assert_eq!(
+                    buf, expected,
+                    "data mismatch at guest offset {guest_offset}"
+                );
+            }
+            ReadRange::Zero { .. } => {
+                panic!("expected Data range after write, got Zero");
+            }
+            ReadRange::Unmapped { .. } => {
+                panic!("expected Data range after write, got Unmapped");
+            }
+        }
+    }
+}
+
+#[async_test]
+async fn write_to_already_allocated_no_growth(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let regions = region::parse_region_tables(&file).await.unwrap();
+
+    // Pre-allocate block 0 as FullyPresent at offset 100 MB.
+    let entry = BatEntry::new()
+        .with_state(BatEntryState::FullyPresent as u8)
+        .with_file_offset_mb(100);
+    file.write_at(regions.bat_offset, entry.as_bytes())
+        .await
+        .unwrap();
+
+    // Ensure file is big enough to cover that offset.
+    let needed_size = 100 * MB1 + format::DEFAULT_BLOCK_SIZE as u64;
+    file.set_file_size(needed_size).await.unwrap();
+
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+    let eof_before = vhdx.allocation_lock.lock().await.file_length;
+
+    let mut ranges = Vec::new();
+    let _guard = vhdx.resolve_write(0, 4096, &mut ranges).await.unwrap();
+
+    // No new allocation should occur — verify file length unchanged.
+    let eof_after = vhdx.allocation_lock.lock().await.file_length;
+    assert_eq!(
+        eof_before, eof_after,
+        "eof should not change for existing block"
+    );
+
+    // Should point to the existing block.
+    assert_eq!(ranges.len(), 1);
+    match ranges[0] {
+        WriteRange::Data { file_offset, .. } => {
+            assert_eq!(file_offset, 100 * MB1);
+        }
+        _ => panic!("expected Data range"),
+    }
+}
+
+#[async_test]
+async fn write_flush_persists_bat(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    // Write and complete a full block.
+    let block_size = vhdx.block_size();
+    let mut ranges = Vec::new();
+    let guard = vhdx
+        .resolve_write(0, block_size, &mut ranges)
+        .await
+        .unwrap();
+    guard.complete().await.unwrap();
+    vhdx.flush().await.unwrap();
+
+    // Snapshot immediately after flush — proves flush persisted the BAT.
+    // Log GUID is still set, so reopen will do log replay.
+    let snapshot = vhdx.file.snapshot();
+
+    // Reopen from snapshot (log replay recovers the state).
+    let recovered = InMemoryFile::from_snapshot(snapshot);
+    let vhdx2 = VhdxFile::open(recovered).writable(&driver).await.unwrap();
+    let mapping = vhdx2.bat.get_block_mapping(0);
+    assert_eq!(
+        mapping.bat_state(),
+        BatEntryState::FullyPresent,
+        "BAT should show FullyPresent after flush + reopen"
+    );
+    assert!(
+        mapping.file_megabyte() > 0,
+        "BAT should have non-zero offset after flush + reopen"
+    );
+}
+
+#[async_test]
+async fn complete_write_clears_tfp(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+    let block_size = vhdx.block_size();
+
+    // resolve_write should set TFP.
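+    // (TFP = transitioning-to-fully-present: the allocation is visible in the
+    // in-memory BAT, but the entry only becomes FullyPresent when the write
+    // guard completes; dropping the guard reverts it.)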
+ let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + + { + let mapping = vhdx.bat.get_block_mapping(0); + assert!(mapping.transitioning_to_fully_present()); + } + + // guard.complete() should clear TFP. + guard.complete().await.unwrap(); + + { + let mapping = vhdx.bat.get_block_mapping(0); + assert!( + !mapping.transitioning_to_fully_present(), + "TFP should be cleared after complete_write" + ); + assert_eq!( + mapping.bat_state(), + BatEntryState::FullyPresent, + "block should be FullyPresent after complete" + ); + } +} + +#[async_test] +async fn complete_write_writes_bat_to_disk(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + + // Get the allocated offset from in-memory BAT. + let expected_mb = vhdx.bat.get_block_mapping(0).file_megabyte(); + + guard.complete().await.unwrap(); + vhdx.flush().await.unwrap(); + + // Snapshot after flush — proves complete + flush persisted the BAT. + let snapshot = vhdx.file.snapshot(); + + // Reopen from snapshot (log replay recovers the state). + let recovered = InMemoryFile::from_snapshot(snapshot); + let vhdx2 = VhdxFile::open(recovered).writable(&driver).await.unwrap(); + let mapping = vhdx2.bat.get_block_mapping(0); + assert_eq!( + mapping.bat_state(), + BatEntryState::FullyPresent, + "BAT should be FullyPresent after flush + reopen" + ); + assert_eq!( + mapping.file_megabyte(), + expected_mb, + "BAT file offset should match after flush + reopen" + ); +} + +#[async_test] +async fn resolve_write_extends_file(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let size_before = vhdx.file.file_size().await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_write(0, 512, &mut ranges).await.unwrap(); + + let size_after = vhdx.file.file_size().await.unwrap(); + assert!( + size_after > size_before, + "file should grow after allocating a new block \ + (before={size_before}, after={size_after})" + ); +} + +#[async_test] +async fn abort_write_reverts_bat(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // resolve_write for a full block → sets TFP. + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + + // Abort (drop guard without complete) → reverts in-memory BAT. + drop(guard); + + // Block should be back to NotPresent with zero offset. + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::NotPresent); + assert_eq!(mapping.file_offset(), 0); + + vhdx.close().await.unwrap(); +} + +#[async_test] +async fn abort_write_clears_tfp(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + + // TFP should be set. 
+ { + assert!( + vhdx.bat + .get_block_mapping(0) + .transitioning_to_fully_present() + ); + } + + // Abort (drop guard without complete). + drop(guard); + + // TFP should be cleared and state reverted to NotPresent. + { + let mapping = vhdx.bat.get_block_mapping(0); + assert!( + !mapping.transitioning_to_fully_present(), + "TFP should be cleared after abort" + ); + assert_eq!( + mapping.bat_state(), + BatEntryState::NotPresent, + "should revert to original NotPresent state" + ); + assert_eq!( + mapping.file_megabyte(), + 0, + "should revert file_megabyte to 0" + ); + } +} + +#[async_test] +async fn abort_write_allows_subsequent_write(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // First write: allocate and abort. + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + drop(guard); + + // Second write: should succeed (no TFP blocking). + let mut ranges2 = Vec::new(); + let guard2 = vhdx + .resolve_write(0, block_size, &mut ranges2) + .await + .unwrap(); + guard2.complete().await.unwrap(); + + // Block should be FullyPresent now. + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::FullyPresent); + assert!(!mapping.transitioning_to_fully_present()); +} + +#[async_test] +async fn complete_write_notifies_on_cache_failure(driver: DefaultDriver) { + // With write-back mode (no write-through), cache writes during + // complete() only mark pages dirty in the cache. The actual disk + // write happens on flush through the log task. So complete() + // itself should succeed even with write failures enabled. + let (orig_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let data = orig_file.snapshot(); + + let fail_writes = Arc::new(AtomicBool::new(false)); + let interceptor = Arc::new(ToggleableInterceptor { + fail_writes: fail_writes.clone(), + fail_set_file_size: Arc::new(AtomicBool::new(false)), + }); + let file = InMemoryFile::with_interceptor(0, interceptor); + file.write_at(0, &data).await.unwrap(); + + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // resolve_write succeeds (writes for header update, set_file_size). + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + + // Enable write failure. + fail_writes.store(true, Ordering::SeqCst); + + // complete() should succeed — commit() is a no-op in write-back mode, + // and dirty pages are marked in cache without file I/O. + let result = guard.complete().await; + assert!( + result.is_ok(), + "complete() should succeed in write-back mode even with write failures" + ); + + // TFP should be cleared and state set to FullyPresent. + { + let mapping = vhdx.bat.get_block_mapping(0); + assert!( + !mapping.transitioning_to_fully_present(), + "TFP should be cleared after complete" + ); + assert_eq!( + mapping.bat_state(), + BatEntryState::FullyPresent, + "state should be FullyPresent after complete" + ); + } + + // Re-enable writes. + fail_writes.store(false, Ordering::SeqCst); + + // A subsequent resolve_write should work (not hang on TFP). 
+ let mut ranges2 = Vec::new(); + let _guard2 = vhdx + .resolve_write(0, block_size, &mut ranges2) + .await + .unwrap(); +} + +#[async_test] +async fn resolve_write_error_reverts_tfp(driver: DefaultDriver) { + // Create VHDX normally, then snapshot to new file with toggleable interceptor. + let (orig_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let data = orig_file.snapshot(); + + let fail_set_file_size = Arc::new(AtomicBool::new(false)); + let interceptor = Arc::new(ToggleableInterceptor { + fail_writes: Arc::new(AtomicBool::new(false)), + fail_set_file_size: fail_set_file_size.clone(), + }); + let file = InMemoryFile::with_interceptor(0, interceptor); + file.write_at(0, &data).await.unwrap(); + + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // Enable set_file_size failure. + fail_set_file_size.store(true, Ordering::SeqCst); + + // resolve_write should fail when set_file_size fails during allocation. + let mut ranges = Vec::new(); + let result = vhdx.resolve_write(0, block_size, &mut ranges).await; + assert!( + result.is_err(), + "resolve_write should fail when set_file_size fails" + ); + + // TFP should be reverted. + { + let mapping = vhdx.bat.get_block_mapping(0); + assert!( + !mapping.transitioning_to_fully_present(), + "TFP should be reverted on resolve_write error" + ); + } + + // Disable failure, retry should succeed. + fail_set_file_size.store(false, Ordering::SeqCst); + + let mut ranges2 = Vec::new(); + let _guard = vhdx + .resolve_write(0, block_size, &mut ranges2) + .await + .unwrap(); +} + +/// Verify that a new allocation from near-EOF (safe data) omits zero +/// padding, while an allocation from the free pool does emit zero padding. +#[async_test] +async fn safe_data_skips_zero_padding(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size() as u64; + + // Step 1: Partial write to block 0 at guest_offset=0, len=512. + // Allocation comes from near-EOF → SpaceState::Zero → no zero ranges. + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_write(0, 512, &mut ranges).await.unwrap(); + + let zero_ranges: Vec<_> = ranges + .iter() + .filter(|r| matches!(r, WriteRange::Zero { .. })) + .collect(); + assert!( + zero_ranges.is_empty(), + "near-EOF allocation should skip zero padding, but got {} Zero ranges", + zero_ranges.len(), + ); + + // Extract the block base offset (block_offset=0 since guest_offset=0). + let allocated_offset = match ranges[0] { + WriteRange::Data { file_offset, .. } => file_offset, + _ => panic!("expected Data range"), + }; + + // Step 2: Release the allocated space back to pool. + // (Intentionally creating an inconsistency for testing purposes.) + vhdx.free_space + .release(allocated_offset, vhdx.block_size() as u32); + + // Step 3: Partial write to block 1 at block-aligned guest offset. + // Should allocate from pool (unsafe data) → zero ranges emitted. + let mut ranges2 = Vec::new(); + let _guard2 = vhdx + .resolve_write(block_size, 512, &mut ranges2) + .await + .unwrap(); + + let zero_ranges2: Vec<_> = ranges2 + .iter() + .filter(|r| matches!(r, WriteRange::Zero { .. })) + .collect(); + assert!( + !zero_ranges2.is_empty(), + "pool allocation should emit zero padding, but got 0 Zero ranges", + ); +} + +// ---- Concurrent I/O stress tests ---- + +/// Wrapper around `InMemoryFile` that yields once on `set_file_size`. 
+///
+/// `InMemoryFile`'s async methods are synchronous (return Ready
+/// immediately), so `futures::join!` won't interleave two
+/// `resolve_write` calls. This wrapper yields once inside
+/// `set_file_size` (waking itself before returning `Pending`), creating
+/// a yield point during `allocate_space` while the `allocation_lock` is
+/// held.
+struct YieldingFile {
+    inner: InMemoryFile,
+}
+
+impl AsyncFile for YieldingFile {
+    type Buffer = Vec<u8>;
+
+    fn alloc_buffer(&self, len: usize) -> Vec<u8> {
+        self.inner.alloc_buffer(len)
+    }
+
+    async fn read_into(&self, offset: u64, buf: Vec<u8>) -> Result<Vec<u8>, std::io::Error> {
+        self.inner.read_into(offset, buf).await
+    }
+
+    async fn write_from(
+        &self,
+        offset: u64,
+        buf: impl Borrow<Vec<u8>> + Send + 'static,
+    ) -> Result<(), std::io::Error> {
+        self.inner.write_from(offset, buf).await
+    }
+
+    async fn flush(&self) -> Result<(), std::io::Error> {
+        self.inner.flush().await
+    }
+    async fn file_size(&self) -> Result<u64, std::io::Error> {
+        self.inner.file_size().await
+    }
+    async fn set_file_size(&self, size: u64) -> Result<(), std::io::Error> {
+        // Yield once to allow other futures to run, then resume.
+        // We must wake ourselves before returning Pending, otherwise
+        // the executor won't re-poll us (deadlock).
+        let mut yielded = false;
+        std::future::poll_fn(|cx| {
+            if !yielded {
+                yielded = true;
+                cx.waker().wake_by_ref();
+                std::task::Poll::Pending
+            } else {
+                std::task::Poll::Ready(())
+            }
+        })
+        .await;
+        self.inner.set_file_size(size).await
+    }
+}
+
+/// Helper: create a VHDX with custom block size on an `InMemoryFile`,
+/// returning the file and params.
+async fn create_vhdx_with_block_size(
+    disk_size: u64,
+    block_size: u32,
+) -> (InMemoryFile, CreateParams) {
+    let file = InMemoryFile::new(0);
+    let mut params = CreateParams {
+        disk_size,
+        block_size,
+        ..Default::default()
+    };
+    create::create(&file, &mut params).await.unwrap();
+    (file, params)
+}
+
+/// Helper: perform a full write-complete cycle on a single block.
+async fn write_block(
+    vhdx: &VhdxFile,
+    guest_offset: u64,
+    length: u32,
+    pattern_byte: u8,
+) {
+    let mut ranges = Vec::new();
+    let guard = vhdx
+        .resolve_write(guest_offset, length, &mut ranges)
+        .await
+        .unwrap();
+
+    // Write pattern data at each Data range, zero at each Zero range.
+    for wr in &ranges {
+        match wr {
+            WriteRange::Data {
+                file_offset,
+                length,
+                ..
+            } => {
+                let data = vec![pattern_byte; *length as usize];
+                vhdx.file.write_at(*file_offset, &data).await.unwrap();
+            }
+            WriteRange::Zero {
+                file_offset,
+                length,
+            } => {
+                let zeros = vec![0u8; *length as usize];
+                vhdx.file.write_at(*file_offset, &zeros).await.unwrap();
+            }
+        }
+    }
+
+    guard.complete().await.unwrap();
+}
+
+/// Helper: read a block and verify the pattern byte.
+async fn verify_block_pattern(
+    vhdx: &VhdxFile,
+    guest_offset: u64,
+    length: u32,
+    expected_byte: u8,
+) {
+    let mut ranges = Vec::new();
+    let _guard = vhdx
+        .resolve_read(guest_offset, length, &mut ranges)
+        .await
+        .unwrap();
+
+    for rr in &ranges {
+        match rr {
+            ReadRange::Data {
+                file_offset,
+                length,
+                ..
+            } => {
+                let mut buf = vec![0u8; *length as usize];
+                vhdx.file.read_at(*file_offset, &mut buf).await.unwrap();
+                assert!(
+                    buf.iter().all(|&b| b == expected_byte),
+                    "expected all bytes to be 0x{:02x} at file_offset {}, \
+                     but found mismatch",
+                    expected_byte,
+                    file_offset,
+                );
+            }
+            ReadRange::Zero { .. } => {
+                assert_eq!(expected_byte, 0, "expected data but got Zero range");
+            }
+            ReadRange::Unmapped { ..
} => { + panic!("unexpected Unmapped range in non-differencing disk"); + } + } + } +} + +#[async_test] +async fn concurrent_reads_same_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Pre-allocate block 0 with known data. + write_block(&*vhdx, 0, block_size, 0xAA).await; + + // Spawn 10 concurrent reads to the same block. + let futures: Vec<_> = (0..10) + .map(|_| { + let vhdx = vhdx.clone(); + async move { + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, block_size, &mut ranges).await.unwrap(); + assert_eq!(ranges.len(), 1); + match &ranges[0] { + ReadRange::Data { + guest_offset, + length, + file_offset, + } => { + assert_eq!(*guest_offset, 0); + assert_eq!(*length, block_size); + assert!(*file_offset > 0); + } + other => panic!("expected Data range, got {:?}", other), + } + ranges + } + }) + .collect(); + + let results = futures::future::join_all(futures).await; + + // All results should be identical. + let first = &results[0]; + for result in &results[1..] { + assert_eq!(first, result); + } +} + +#[async_test] +async fn concurrent_reads_different_blocks(driver: DefaultDriver) { + let (file, _) = create_vhdx_with_block_size(4 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Pre-allocate blocks 0, 1, 2. + for i in 0..3u8 { + write_block(&*vhdx, i as u64 * block_size as u64, block_size, 0x10 + i).await; + } + + // Spawn 3 concurrent reads, one per block. + let futures: Vec<_> = (0..3u32) + .map(|i| { + let vhdx = vhdx.clone(); + let bs = block_size; + async move { + let mut ranges = Vec::new(); + let _guard = vhdx + .resolve_read(i as u64 * bs as u64, bs, &mut ranges) + .await + .unwrap(); + assert_eq!(ranges.len(), 1); + match &ranges[0] { + ReadRange::Data { file_offset, .. } => { + assert!(*file_offset > 0); + } + other => panic!("expected Data range for block {}, got {:?}", i, other), + } + } + }) + .collect(); + + futures::future::join_all(futures).await; +} + +#[async_test] +async fn concurrent_writes_different_blocks(driver: DefaultDriver) { + // 8 MiB disk with 1 MiB blocks → 8 blocks. + let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Spawn 4 concurrent tasks, each writing to a unique block. + let futures: Vec<_> = (0..4u8) + .map(|i| { + let vhdx = vhdx.clone(); + let bs = block_size; + async move { + let offset = i as u64 * bs as u64; + let pattern = 0x40 + i; + write_block(&*vhdx, offset, bs, pattern).await; + } + }) + .collect(); + + futures::future::join_all(futures).await; + + // Verify each block reads back the correct pattern. + for i in 0..4u8 { + let offset = i as u64 * block_size as u64; + verify_block_pattern(&*vhdx, offset, block_size, 0x40 + i).await; + } +} + +#[async_test] +async fn concurrent_writes_same_block(driver: DefaultDriver) { + // This test exercises concurrent writes to the same unallocated block. + // The correct behavior is serialization: + // 1. task_a: resolve_write → acquires allocation lock → allocates + // → sets TFP → returns ranges + // 2. task_a: complete_write → clears TFP → FullyPresent → notifies + // 3. 
task_b: resolve_write → was waiting for TFP to clear (either + // in the read phase or after acquiring the lock). Once cleared, + // sees FullyPresent → emits Data range → returns. + // + // Uses YieldingFile to force a yield during set_file_size (inside + // allocate_space), creating the interleaving where task_b's read + // phase may see NotPresent before task_a sets TFP. + + let (inner_file, _) = create_vhdx_with_block_size(4 * MB1, MB1 as u32).await; + let data = inner_file.snapshot(); + + let yielding_file = YieldingFile { + inner: InMemoryFile::new(0), + }; + yielding_file.inner.write_at(0, &data).await.unwrap(); + + let vhdx = Arc::new( + VhdxFile::open(yielding_file) + .writable(&driver) + .await + .unwrap(), + ); + let block_size = vhdx.block_size(); + + // Both tasks write to block 0 (offset 0, full block). + // task_a does resolve + complete as a unit so TFP clears and + // task_b (serialized behind task_a) can proceed. + let vhdx_a = vhdx.clone(); + let vhdx_b = vhdx.clone(); + + let task_a = async { + let mut ranges = Vec::new(); + let guard = vhdx_a + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + guard.complete().await.unwrap(); + ranges + }; + + let task_b = async { + let mut ranges = Vec::new(); + let _guard = vhdx_b + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + ranges + }; + + let (ranges_a, ranges_b) = futures::join!(task_a, task_b); + + // Both should have produced data ranges. + assert!(!ranges_a.is_empty(), "task_a produced no ranges"); + assert!(!ranges_b.is_empty(), "task_b produced no ranges"); + + // Block should be FullyPresent. + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::FullyPresent); + assert!(!mapping.transitioning_to_fully_present()); +} + +#[async_test] +async fn concurrent_flush_requests(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Write to a block, complete. + write_block(&*vhdx, 0, block_size, 0xBB).await; + + // Spawn 5 concurrent flush calls. + let futures: Vec<_> = (0..5) + .map(|_| { + let vhdx = vhdx.clone(); + async move { + vhdx.flush().await.unwrap(); + } + }) + .collect(); + + futures::future::join_all(futures).await; +} + +#[async_test] +async fn stress_random_writes_no_corruption(driver: DefaultDriver) { + // 8 MiB disk with 1 MiB blocks → 8 blocks. + let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Spawn 8 tasks, each claiming a unique block. + let futures: Vec<_> = (0..8u8) + .map(|i| { + let vhdx = vhdx.clone(); + let bs = block_size; + async move { + let offset = i as u64 * bs as u64; + let pattern = 0x80 + i; + write_block(&*vhdx, offset, bs, pattern).await; + vhdx.flush().await.unwrap(); + } + }) + .collect(); + + futures::future::join_all(futures).await; + + // Verify all blocks. 
+ for i in 0..8u8 { + let offset = i as u64 * block_size as u64; + verify_block_pattern(&*vhdx, offset, block_size, 0x80 + i).await; + } +} + +#[async_test] +async fn concurrent_read_and_write_same_block(driver: DefaultDriver) { + let (file, _) = create_vhdx_with_block_size(4 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Pre-allocate block 0 with known data. + write_block(&*vhdx, 0, block_size, 0xCC).await; + + // Concurrent: read block 0, write block 1. + let vhdx_r = vhdx.clone(); + let vhdx_w = vhdx.clone(); + + let read_task = async move { + let mut ranges = Vec::new(); + let _guard = vhdx_r + .resolve_read(0, block_size, &mut ranges) + .await + .unwrap(); + assert_eq!(ranges.len(), 1); + match &ranges[0] { + ReadRange::Data { .. } => {} + other => panic!("expected Data range, got {:?}", other), + } + }; + + let write_task = async move { + let offset = block_size as u64; + write_block(&*vhdx_w, offset, block_size, 0xDD).await; + }; + + futures::join!(read_task, write_task); + + // Verify block 0 still has original data. + verify_block_pattern(&*vhdx, 0, block_size, 0xCC).await; + // Verify block 1 has new data. + verify_block_pattern(&*vhdx, block_size as u64, block_size, 0xDD).await; +} + +// ---- IoGuard refcount tracking tests ---- + +#[async_test] +async fn read_guard_increments_refcount(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // Pre-allocate block 0 so it's FullyPresent. + write_block(&vhdx, 0, block_size, 0xAA).await; + + // Resolve a read — refcount should be 1 while guard is alive. + let mut ranges = Vec::new(); + let guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(vhdx.bat.io_refcount(0), 1); + + // Drop the guard — refcount should go back to 0. + drop(guard); + + assert_eq!(vhdx.bat.io_refcount(0), 0); +} + +#[async_test] +async fn read_guard_drop_decrements_refcount(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // Pre-allocate block 0. + write_block(&vhdx, 0, block_size, 0xBB).await; + + let mut ranges = Vec::new(); + let guard = vhdx.resolve_read(0, block_size, &mut ranges).await.unwrap(); + + // Refcount is 1 while guard is held. + assert_eq!(vhdx.bat.io_refcount(0), 1); + + // Drop explicitly. + drop(guard); + + // Refcount back to 0. + assert_eq!(vhdx.bat.io_refcount(0), 0); +} + +#[async_test] +async fn read_guard_multiple_blocks(driver: DefaultDriver) { + let (file, _) = create_vhdx_with_block_size(4 * MB1, MB1 as u32).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // Write 3 blocks. + write_block(&vhdx, 0, block_size, 0x11).await; + write_block(&vhdx, block_size as u64, block_size, 0x22).await; + write_block(&vhdx, 2 * block_size as u64, block_size, 0x33).await; + + // Read spanning all 3 blocks. 
+ let mut ranges = Vec::new(); + let guard = vhdx + .resolve_read(0, 3 * block_size, &mut ranges) + .await + .unwrap(); + + assert_eq!(vhdx.bat.io_refcount(0), 1); + assert_eq!(vhdx.bat.io_refcount(1), 1); + assert_eq!(vhdx.bat.io_refcount(2), 1); + + drop(guard); + + assert_eq!(vhdx.bat.io_refcount(0), 0); + assert_eq!(vhdx.bat.io_refcount(1), 0); + assert_eq!(vhdx.bat.io_refcount(2), 0); +} + +#[async_test] +async fn read_guard_zero_range_has_refcount() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + // Read an unallocated (Zero) block — refcount is still incremented + // (harmless, since trim won't touch unallocated blocks). + let mut ranges = Vec::new(); + let guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(vhdx.bat.io_refcount(0), 1); + + drop(guard); + + assert_eq!(vhdx.bat.io_refcount(0), 0); +} + +#[async_test] +async fn write_guard_complete_drops_refcount(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + + // Refcount should be 1. + assert_eq!(vhdx.bat.io_refcount(0), 1); + + // Write data and complete. + for wr in &ranges { + match wr { + WriteRange::Data { + file_offset, + length, + .. + } => { + let data = vec![0xEE; *length as usize]; + vhdx.file.write_at(*file_offset, &data).await.unwrap(); + } + WriteRange::Zero { + file_offset, + length, + } => { + let zeros = vec![0u8; *length as usize]; + vhdx.file.write_at(*file_offset, &zeros).await.unwrap(); + } + } + } + + guard.complete().await.unwrap(); + + // After complete + drop, refcount should be 0 and block should be FullyPresent. + assert_eq!(vhdx.bat.io_refcount(0), 0); + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::FullyPresent); +} + +#[async_test] +async fn write_guard_drop_aborts_and_decrements_refcount(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + + // Refcount should be 1. + assert_eq!(vhdx.bat.io_refcount(0), 1); + + // Drop without calling complete() — abort. + drop(guard); + + // Refcount should be 0, block should be back to NotPresent. + assert_eq!(vhdx.bat.io_refcount(0), 0); + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::NotPresent); +} + +#[async_test] +async fn concurrent_read_guards_same_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // Pre-allocate block 0. + write_block(&vhdx, 0, block_size, 0xFF).await; + + // Two concurrent reads on the same block. + let mut ranges1 = Vec::new(); + let mut ranges2 = Vec::new(); + let guard1 = vhdx.resolve_read(0, 4096, &mut ranges1).await.unwrap(); + let guard2 = vhdx.resolve_read(0, 4096, &mut ranges2).await.unwrap(); + + // Refcount should be 2. + assert_eq!(vhdx.bat.io_refcount(0), 2); + + // Drop first guard — refcount should be 1. 
+ drop(guard1); + assert_eq!(vhdx.bat.io_refcount(0), 1); + + // Drop second guard — refcount should be 0. + drop(guard2); + assert_eq!(vhdx.bat.io_refcount(0), 0); +} + +// ---- Concurrent write+trim and mixed-workload stress tests ---- + +use crate::trim::TrimMode; +use crate::trim::TrimRequest; + +#[async_test] +async fn concurrent_write_and_trim_same_block(driver: DefaultDriver) { + // Setup: 8 MiB disk, 1 MiB blocks. + let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Write block 0 with pattern 0xAA, complete. + write_block(&*vhdx, 0, block_size, 0xAA).await; + + // Concurrently: write block 0 with 0xEE + trim block 0 (FileSpace). + let vhdx_w = vhdx.clone(); + let vhdx_t = vhdx.clone(); + + let (write_result, trim_result) = futures::join!( + async { + write_block(&*vhdx_w, 0, block_size, 0xEE).await; + Ok::<(), VhdxIoError>(()) + }, + async { + vhdx_t + .trim(TrimRequest::new(TrimMode::FileSpace, 0, block_size as u64)) + .await + } + ); + + write_result.unwrap(); + trim_result.unwrap(); + + // Check what actually happened by examining block state. + let mapping = vhdx.bat.get_block_mapping(0); + match mapping.bat_state() { + BatEntryState::Unmapped => { + // Trim won — read should return zeros. + verify_block_pattern(&*vhdx, 0, block_size, 0x00).await; + } + BatEntryState::FullyPresent => { + // Write won — read should return 0xEE. + verify_block_pattern(&*vhdx, 0, block_size, 0xEE).await; + } + other => panic!("unexpected state: {other:?}"), + } +} + +#[async_test] +async fn concurrent_trim_then_rewrite(driver: DefaultDriver) { + // Setup: 8 MiB disk, 1 MiB blocks. + let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Write block 0 with pattern 0xAA. + write_block(&*vhdx, 0, block_size, 0xAA).await; + + // Sequential: trim → rewrite. Verify the trim→re-allocate path. + vhdx.trim(TrimRequest::new(TrimMode::FileSpace, 0, block_size as u64)) + .await + .unwrap(); + + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!( + mapping.bat_state(), + BatEntryState::Unmapped, + "block should be Unmapped after trim" + ); + + // Re-write with pattern 0xBB. + write_block(&*vhdx, 0, block_size, 0xBB).await; + + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::FullyPresent); + verify_block_pattern(&*vhdx, 0, block_size, 0xBB).await; + + // Now do trim + write concurrently. + let vhdx_t = vhdx.clone(); + let vhdx_w = vhdx.clone(); + + let (trim_result, write_result) = futures::join!( + async { + vhdx_t + .trim(TrimRequest::new(TrimMode::FileSpace, 0, block_size as u64)) + .await + }, + async { + write_block(&*vhdx_w, 0, block_size, 0xCC).await; + Ok::<(), VhdxIoError>(()) + } + ); + + trim_result.unwrap(); + write_result.unwrap(); + + // Verify no panics and data is consistent. + let mapping = vhdx.bat.get_block_mapping(0); + match mapping.bat_state() { + BatEntryState::Unmapped => { + verify_block_pattern(&*vhdx, 0, block_size, 0x00).await; + } + BatEntryState::FullyPresent => { + verify_block_pattern(&*vhdx, 0, block_size, 0xCC).await; + } + other => panic!("unexpected state: {other:?}"), + } +} + +#[async_test] +async fn mixed_workload_stress(driver: DefaultDriver) { + // 8 MiB disk with 1 MiB blocks → 8 blocks. 
+    let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await;
+    let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap());
+    let block_size = vhdx.block_size();
+    let num_blocks: u32 = 8;
+
+    // Shadow state: None = unwritten/trimmed (expect zeros), Some(pattern) = last written pattern.
+    let shadow: Arc<parking_lot::Mutex<Vec<Option<u8>>>> =
+        Arc::new(parking_lot::Mutex::new(vec![None; num_blocks as usize]));
+
+    let num_tasks: u32 = 8;
+    let iters_per_task: u8 = 16;
+
+    let tasks: Vec<_> = (0..num_tasks)
+        .map(|task_id| {
+            let vhdx = vhdx.clone();
+            let shadow = shadow.clone();
+            let bs = block_size;
+
+            async move {
+                for iter in 0..iters_per_task {
+                    let block = (task_id.wrapping_mul(3).wrapping_add(iter as u32)) % num_blocks;
+                    let pattern = ((task_id as u16 * 16 + iter as u16) as u8) | 0x01; // always nonzero
+                    let block_offset = block as u64 * bs as u64;
+
+                    let op = (task_id as u8).wrapping_add(iter) % 10;
+                    match op {
+                        0..=4 => {
+                            // Write (50%)
+                            write_block(&*vhdx, block_offset, bs, pattern).await;
+                            shadow.lock()[block as usize] = Some(pattern);
+                        }
+                        5..=7 => {
+                            // Read + verify (30%)
+                            let expected = shadow.lock()[block as usize];
+                            let mut ranges = Vec::new();
+                            let guard = vhdx
+                                .resolve_read(block_offset, bs, &mut ranges)
+                                .await
+                                .unwrap();
+                            for rr in &ranges {
+                                match rr {
+                                    ReadRange::Data {
+                                        file_offset,
+                                        length,
+                                        ..
+                                    } => {
+                                        let mut buf = vec![0u8; *length as usize];
+                                        vhdx.file.read_at(*file_offset, &mut buf).await.unwrap();
+                                        let exp = expected.unwrap_or_else(|| {
+                                            panic!(
+                                                "task {task_id} iter {iter}: shadow says \
+                                                 None but got Data range"
+                                            )
+                                        });
+                                        assert!(
+                                            buf.iter().all(|&b| b == exp),
+                                            "task {task_id} iter {iter}: expected \
+                                             0x{exp:02x}, got mismatch"
+                                        );
+                                    }
+                                    ReadRange::Zero { .. } => {
+                                        assert!(
+                                            expected.is_none(),
+                                            "task {task_id} iter {iter}: got Zero but \
+                                             expected Some({:02x})",
+                                            expected.unwrap()
+                                        );
+                                    }
+                                    ReadRange::Unmapped { .. } => {
+                                        panic!("unexpected Unmapped on non-differencing disk");
+                                    }
+                                }
+                            }
+                            drop(guard);
+                        }
+                        8 => {
+                            // Trim (10%)
+                            vhdx.trim(TrimRequest::new(
+                                TrimMode::FileSpace,
+                                block_offset,
+                                bs as u64,
+                            ))
+                            .await
+                            .unwrap();
+                            shadow.lock()[block as usize] = None;
+                        }
+                        9 => {
+                            // Flush (10%)
+                            vhdx.flush().await.unwrap();
+                        }
+                        _ => unreachable!(),
+                    }
+                }
+            }
+        })
+        .collect();
+
+    futures::future::join_all(tasks).await;
+
+    // Post-check: verify every block against final shadow state.
+    let final_shadow = shadow.lock().clone();
+    for block in 0..num_blocks {
+        let block_offset = block as u64 * block_size as u64;
+        let expected = final_shadow[block as usize];
+        match expected {
+            Some(pattern) => {
+                verify_block_pattern(&*vhdx, block_offset, block_size, pattern).await;
+            }
+            None => {
+                // Should be zeros.
+                let mut ranges = Vec::new();
+                let _guard = vhdx
+                    .resolve_read(block_offset, block_size, &mut ranges)
+                    .await
+                    .unwrap();
+                for rr in &ranges {
+                    match rr {
+                        ReadRange::Zero { .. } => {}
+                        ReadRange::Data {
+                            file_offset,
+                            length,
+                            ..
+                        } => {
+                            let mut buf = vec![0u8; *length as usize];
+                            vhdx.file.read_at(*file_offset, &mut buf).await.unwrap();
+                            assert!(
+                                buf.iter().all(|&b| b == 0),
+                                "block {block}: shadow says None but data is non-zero"
+                            );
+                        }
+                        ReadRange::Unmapped { .. } => {
+                            panic!("unexpected Unmapped on non-differencing disk");
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+#[async_test]
+async fn concurrent_partial_writes_same_block(driver: DefaultDriver) {
+    // 8 MiB disk, 1 MiB blocks.
+ let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Pre-allocate block 0 with pattern 0xAA. + write_block(&*vhdx, 0, block_size, 0xAA).await; + + let half = block_size / 2; + let vhdx_a = vhdx.clone(); + let vhdx_b = vhdx.clone(); + + // Concurrently write first half with 0xBB, second half with 0xCC. + let ((), ()) = futures::join!( + async { + // Task A: write first half. + let mut ranges = Vec::new(); + let guard = vhdx_a.resolve_write(0, half, &mut ranges).await.unwrap(); + for wr in &ranges { + match wr { + WriteRange::Data { + file_offset, + length, + .. + } => { + let data = vec![0xBB; *length as usize]; + vhdx_a.file.write_at(*file_offset, &data).await.unwrap(); + } + WriteRange::Zero { + file_offset, + length, + } => { + let zeros = vec![0u8; *length as usize]; + vhdx_a.file.write_at(*file_offset, &zeros).await.unwrap(); + } + } + } + guard.complete().await.unwrap(); + }, + async { + // Task B: write second half. + let mut ranges = Vec::new(); + let guard = vhdx_b + .resolve_write(half as u64, half, &mut ranges) + .await + .unwrap(); + for wr in &ranges { + match wr { + WriteRange::Data { + file_offset, + length, + .. + } => { + let data = vec![0xCC; *length as usize]; + vhdx_b.file.write_at(*file_offset, &data).await.unwrap(); + } + WriteRange::Zero { + file_offset, + length, + } => { + let zeros = vec![0u8; *length as usize]; + vhdx_b.file.write_at(*file_offset, &zeros).await.unwrap(); + } + } + } + guard.complete().await.unwrap(); + } + ); + + // Read back full block: first half should be 0xBB, second half 0xCC. + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, block_size, &mut ranges).await.unwrap(); + + for rr in &ranges { + match rr { + ReadRange::Data { + guest_offset, + length, + file_offset, + } => { + let mut buf = vec![0u8; *length as usize]; + vhdx.file.read_at(*file_offset, &mut buf).await.unwrap(); + + // Determine expected pattern based on position within block. + for (i, &byte) in buf.iter().enumerate() { + let pos = (*guest_offset as usize) + i; + let expected = if pos < half as usize { 0xBB } else { 0xCC }; + assert_eq!( + byte, expected, + "byte at guest offset {pos}: expected 0x{expected:02x}, got 0x{byte:02x}" + ); + } + } + other => panic!("expected Data range, got {other:?}"), + } + } +} + +#[async_test] +async fn concurrent_write_flush_trim_interleaved(driver: DefaultDriver) { + // Setup: 8 MiB disk, 1 MiB blocks. + let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Write block 0 with 0xDD, complete. + write_block(&*vhdx, 0, block_size, 0xDD).await; + + let vhdx_f = vhdx.clone(); + let vhdx_t = vhdx.clone(); + let vhdx_r = vhdx.clone(); + + // Concurrently: flush + trim block 0 + read block 1 (unallocated → zeros). + let (flush_result, trim_result, read_result) = futures::join!( + async { vhdx_f.flush().await }, + async { + vhdx_t + .trim(TrimRequest::new(TrimMode::FileSpace, 0, block_size as u64)) + .await + }, + async { + let mut ranges = Vec::new(); + let _guard = vhdx_r + .resolve_read(block_size as u64, block_size, &mut ranges) + .await + .unwrap(); + // Block 1 is unallocated → should be Zero. + for rr in &ranges { + assert!( + matches!(rr, ReadRange::Zero { .. 
}),
+                    "block 1 should be Zero, got {rr:?}"
+                );
+            }
+            Ok::<(), VhdxIoError>(())
+        }
+    );
+
+    flush_result.unwrap();
+    trim_result.unwrap();
+    read_result.unwrap();
+
+    // Verify block 0 state is consistent.
+    let mapping = vhdx.bat.get_block_mapping(0);
+    match mapping.bat_state() {
+        BatEntryState::Unmapped => {
+            // Trim completed — read should return zeros.
+            verify_block_pattern(&*vhdx, 0, block_size, 0x00).await;
+        }
+        BatEntryState::FullyPresent => {
+            // Flush completed before trim could run — data preserved.
+            verify_block_pattern(&*vhdx, 0, block_size, 0xDD).await;
+        }
+        other => panic!("unexpected state: {other:?}"),
+    }
+}
+
+#[async_test]
+async fn stress_write_trim_cycle(driver: DefaultDriver) {
+    // 8 MiB disk with 1 MiB blocks → 8 blocks.
+    let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await;
+    let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap());
+    let block_size = vhdx.block_size();
+
+    let num_writer_tasks: u32 = 4;
+    let num_reader_tasks: u32 = 2;
+    let iters_per_writer: u8 = 8;
+
+    // Shadow state: None = unwritten/trimmed (zeros), Some(pattern) = last written.
+    let shadow: Arc<parking_lot::Mutex<Vec<Option<u8>>>> =
+        Arc::new(parking_lot::Mutex::new(vec![None; 4]));
+
+    // Writer tasks: write → trim → write again on block `task_id`.
+    let writer_tasks: Vec<_> = (0..num_writer_tasks)
+        .map(|task_id| {
+            let vhdx = vhdx.clone();
+            let shadow = shadow.clone();
+            let bs = block_size;
+
+            async move {
+                for iter in 0..iters_per_writer {
+                    let block_offset = task_id as u64 * bs as u64;
+                    let pattern_a = ((task_id as u16 * 32 + iter as u16 * 2) as u8) | 0x01;
+                    let pattern_b = ((task_id as u16 * 32 + iter as u16 * 2 + 1) as u8) | 0x01;
+
+                    // Write with pattern_a.
+                    write_block(&*vhdx, block_offset, bs, pattern_a).await;
+                    shadow.lock()[task_id as usize] = Some(pattern_a);
+
+                    // Trim.
+                    vhdx.trim(TrimRequest::new(
+                        TrimMode::FileSpace,
+                        block_offset,
+                        bs as u64,
+                    ))
+                    .await
+                    .unwrap();
+                    shadow.lock()[task_id as usize] = None;
+
+                    // Write with pattern_b.
+                    write_block(&*vhdx, block_offset, bs, pattern_b).await;
+                    shadow.lock()[task_id as usize] = Some(pattern_b);
+                }
+            }
+        })
+        .collect();
+
+    // Reader tasks: continuously read all 4 blocks, verify consistency.
+    let reader_tasks: Vec<_> = (0..num_reader_tasks)
+        .map(|_reader_id| {
+            let vhdx = vhdx.clone();
+            let shadow = shadow.clone();
+            let bs = block_size;
+
+            async move {
+                // Read all 4 blocks multiple times.
+                for _round in 0..16 {
+                    for block in 0..4u32 {
+                        let block_offset = block as u64 * bs as u64;
+                        let expected = shadow.lock()[block as usize];
+
+                        let mut ranges = Vec::new();
+                        let guard = vhdx
+                            .resolve_read(block_offset, bs, &mut ranges)
+                            .await
+                            .unwrap();
+                        for rr in &ranges {
+                            match rr {
+                                ReadRange::Data {
+                                    file_offset,
+                                    length,
+                                    ..
+                                } => {
+                                    let mut buf = vec![0u8; *length as usize];
+                                    vhdx.file.read_at(*file_offset, &mut buf).await.unwrap();
+                                    match expected {
+                                        Some(exp) => {
+                                            assert!(
+                                                buf.iter().all(|&b| b == exp),
+                                                "reader block {block}: expected \
+                                                 0x{exp:02x}, got mismatch"
+                                            );
+                                        }
+                                        None => {
+                                            assert!(
+                                                buf.iter().all(|&b| b == 0),
+                                                "reader block {block}: expected zeros, \
+                                                 got non-zero data"
+                                            );
+                                        }
+                                    }
+                                }
+                                ReadRange::Zero { .. } => {
+                                    assert!(
+                                        expected.is_none(),
+                                        "reader block {block}: got Zero but expected \
+                                         Some({:02x})",
+                                        expected.unwrap()
+                                    );
+                                }
+                                ReadRange::Unmapped { .. } => {
+                                    panic!("unexpected Unmapped on non-differencing disk");
+                                }
+                            }
+                        }
+                        drop(guard);
+                    }
+                }
+            }
+        })
+        .collect();
+
+    // Run all tasks concurrently.
+    let all_tasks: Vec<_> = writer_tasks
+        .into_iter()
+        .map(|t| Box::pin(t) as std::pin::Pin<Box<dyn std::future::Future<Output = ()>>>)
+        .chain(
+            reader_tasks
+                .into_iter()
+                .map(|t| Box::pin(t) as std::pin::Pin<Box<dyn std::future::Future<Output = ()>>>),
+        )
+        .collect();
+
+    futures::future::join_all(all_tasks).await;
+
+    // Post-check: verify final state of all 4 blocks.
+    let final_shadow = shadow.lock().clone();
+    for block in 0..4u32 {
+        let block_offset = block as u64 * block_size as u64;
+        match final_shadow[block as usize] {
+            Some(pattern) => {
+                verify_block_pattern(&*vhdx, block_offset, block_size, pattern).await;
+            }
+            None => {
+                verify_block_pattern(&*vhdx, block_offset, block_size, 0x00).await;
+            }
+        }
+    }
+}
+
+// ---- SBM allocation tests ----
+
+#[async_test]
+async fn partial_write_diff_disk_allocates_sbm(driver: DefaultDriver) {
+    // A sub-block write to a NotPresent block in a differencing disk
+    // should allocate the SBM block and set the payload to PartiallyPresent.
+    let file = InMemoryFile::new(0);
+    let mut params = CreateParams {
+        disk_size: format::GB1,
+        has_parent: true,
+        ..Default::default()
+    };
+    create::create(&file, &mut params).await.unwrap();
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    // Partial write: 4096 bytes at offset 0 (sub-block).
+    write_block(&vhdx, 0, 4096, 0xAB).await;
+
+    // Block 0 should be PartiallyPresent.
+    let mapping = vhdx.bat.get_block_mapping(0);
+    assert_eq!(mapping.bat_state(), BatEntryState::PartiallyPresent);
+
+    // SBM block for chunk 0 should be FullyPresent (allocated).
+    let sbm_mapping = vhdx.bat.get_sector_bitmap_mapping(0);
+    assert_eq!(sbm_mapping.bat_state(), BatEntryState::FullyPresent);
+
+    // Read the written range — should return Data.
+    let mut ranges = Vec::new();
+    let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap();
+    let has_data = ranges.iter().any(|r| matches!(r, ReadRange::Data { .. }));
+    assert!(has_data, "written sectors should return Data");
+
+    // Read an unwritten range in the same block — should return Unmapped.
+    let mut ranges2 = Vec::new();
+    let _guard2 = vhdx.resolve_read(4096, 512, &mut ranges2).await.unwrap();
+    assert_eq!(ranges2.len(), 1);
+    assert!(
+        matches!(ranges2[0], ReadRange::Unmapped { .. }),
+        "unwritten sectors in diff disk should return Unmapped"
+    );
+}
+
+#[async_test]
+async fn partial_write_diff_disk_sbm_bits_set_correctly(driver: DefaultDriver) {
+    // Write 4096 bytes (sectors 0-7 for 512-byte sectors) to a diff disk.
+    // Verify that the written sectors read as Data and unwritten ones as Unmapped.
+    let file = InMemoryFile::new(0);
+    let mut params = CreateParams {
+        disk_size: format::GB1,
+        has_parent: true,
+        ..Default::default()
+    };
+    create::create(&file, &mut params).await.unwrap();
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    write_block(&vhdx, 0, 4096, 0xCD).await;
+
+    // Sectors 0-7 should be Data.
+    let mut ranges = Vec::new();
+    let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap();
+    assert_eq!(ranges.len(), 1);
+    match &ranges[0] {
+        ReadRange::Data {
+            guest_offset,
+            length,
+            ..
+        } => {
+            assert_eq!(*guest_offset, 0);
+            assert_eq!(*length, 4096);
+        }
+        other => panic!("expected Data, got {:?}", other),
+    }
+
+    // Sector 8 onward should be Unmapped (transparent to parent).
+ let mut ranges2 = Vec::new(); + let _guard2 = vhdx.resolve_read(4096, 512, &mut ranges2).await.unwrap(); + assert_eq!(ranges2.len(), 1); + assert_eq!( + ranges2[0], + ReadRange::Unmapped { + guest_offset: 4096, + length: 512, + } + ); +} + +#[async_test] +async fn full_block_write_diff_disk_no_sbm(driver: DefaultDriver) { + // A full-block write to a diff disk should set FullyPresent, not allocate SBM. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // Full-block write. + write_block(&vhdx, 0, block_size, 0xEE).await; + + // Block 0 should be FullyPresent (TFP path). + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::FullyPresent); + + // SBM block for chunk 0 should NOT be allocated. + let sbm_mapping = vhdx.bat.get_sector_bitmap_mapping(0); + assert_ne!( + sbm_mapping.bat_state(), + BatEntryState::FullyPresent, + "full-block write should not allocate SBM" + ); +} + +#[async_test] +async fn second_partial_write_same_chunk_reuses_sbm(driver: DefaultDriver) { + // Two partial writes to different blocks in the same chunk should + // reuse the same SBM block. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size() as u64; + + // First partial write to block 0. + write_block(&vhdx, 0, 4096, 0x11).await; + + let sbm_mapping_1 = vhdx.bat.get_sector_bitmap_mapping(0); + assert_eq!(sbm_mapping_1.bat_state(), BatEntryState::FullyPresent); + let sbm_offset_1 = sbm_mapping_1.file_offset(); + + // Second partial write to block 1 (same chunk). + write_block(&vhdx, block_size, 4096, 0x22).await; + + let sbm_mapping_2 = vhdx.bat.get_sector_bitmap_mapping(0); + assert_eq!(sbm_mapping_2.bat_state(), BatEntryState::FullyPresent); + let sbm_offset_2 = sbm_mapping_2.file_offset(); + + // SBM should be reused (same file offset). + assert_eq!( + sbm_offset_1, sbm_offset_2, + "SBM block should be reused, not reallocated" + ); + + // Both blocks should read back correctly. + let mut ranges0 = Vec::new(); + let _g0 = vhdx.resolve_read(0, 4096, &mut ranges0).await.unwrap(); + assert!(matches!(ranges0[0], ReadRange::Data { .. })); + + let mut ranges1 = Vec::new(); + let _g1 = vhdx + .resolve_read(block_size, 4096, &mut ranges1) + .await + .unwrap(); + assert!(matches!(ranges1[0], ReadRange::Data { .. })); +} + +#[async_test] +async fn partial_write_non_diff_disk_no_sbm(driver: DefaultDriver) { + // A sub-block write to a non-differencing disk should set FullyPresent + // and NOT allocate any SBM block. + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Partial write: 4096 bytes at offset 0. + write_block(&vhdx, 0, 4096, 0x77).await; + + // Block should be FullyPresent (not PartiallyPresent). + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::FullyPresent); + + // SBM should NOT be allocated. + // For non-diff disks, sector_bitmap_block_count may be 0, + // so we check via bat_state directly. 
+    let sbm_count = vhdx.bat.sector_bitmap_block_count;
+    if sbm_count > 0 {
+        let sbm_mapping = vhdx.bat.get_sector_bitmap_mapping(0);
+        assert_ne!(
+            sbm_mapping.bat_state(),
+            BatEntryState::FullyPresent,
+            "non-diff disk should not allocate SBM"
+        );
+    }
+
+    // Unwritten sectors within the block should read as Zero (not Unmapped).
+    let mut ranges = Vec::new();
+    let _guard = vhdx.resolve_read(4096, 512, &mut ranges).await.unwrap();
+    assert_eq!(ranges.len(), 1);
+    match &ranges[0] {
+        ReadRange::Data { .. } => {
+            // Data range is fine — zero-padded data within an allocated block.
+        }
+        ReadRange::Zero { .. } => {
+            // Zero range is also acceptable (block may be zero-padded).
+        }
+        ReadRange::Unmapped { .. } => {
+            panic!("non-diff disk should never return Unmapped for allocated block");
+        }
+    }
+}
+
+// -----------------------------------------------------------------------
+// File poisoning tests
+// -----------------------------------------------------------------------
+
+/// Interceptor with atomic flags for runtime fault injection.
+struct DynamicFailInterceptor {
+    fail_writes: AtomicBool,
+    fail_flushes: AtomicBool,
+}
+
+impl DynamicFailInterceptor {
+    fn new() -> Self {
+        Self {
+            fail_writes: AtomicBool::new(false),
+            fail_flushes: AtomicBool::new(false),
+        }
+    }
+}
+
+impl IoInterceptor for DynamicFailInterceptor {
+    fn before_write(&self, _offset: u64, _data: &[u8]) -> Result<(), std::io::Error> {
+        if self.fail_writes.load(Ordering::Relaxed) {
+            return Err(std::io::Error::other("injected write failure"));
+        }
+        Ok(())
+    }
+
+    fn before_flush(&self) -> Result<(), std::io::Error> {
+        if self.fail_flushes.load(Ordering::Relaxed) {
+            return Err(std::io::Error::other("injected flush failure"));
+        }
+        Ok(())
+    }
+}
+
+/// Helper: create a writable VHDX with a dynamic fault interceptor.
+async fn create_writable_with_faults(
+    driver: &DefaultDriver,
+) -> (VhdxFile, Arc<DynamicFailInterceptor>) {
+    // Create a clean VHDX, snapshot it, reopen with interceptor.
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let snapshot = file.snapshot();
+
+    let interceptor = Arc::new(DynamicFailInterceptor::new());
+    let file2 = InMemoryFile::with_interceptor(snapshot.len() as u64, interceptor.clone());
+    file2.write_at(0, &snapshot).await.unwrap();
+
+    let vhdx = VhdxFile::open(file2).writable(driver).await.unwrap();
+    (vhdx, interceptor)
+}
+
+#[async_test]
+async fn flush_io_error_poisons_file(driver: DefaultDriver) {
+    let (vhdx, interceptor) = create_writable_with_faults(&driver).await;
+
+    // Write some data successfully.
+    let data = [0xAAu8; 4096];
+    let mut ranges = Vec::new();
+    let guard = vhdx.resolve_write(0, 4096, &mut ranges).await.unwrap();
+    for range in &ranges {
+        if let WriteRange::Data {
+            file_offset,
+            length,
+            ..
+        } = range
+        {
+            vhdx.file
+                .write_at(*file_offset, &data[..*length as usize])
+                .await
+                .unwrap();
+        }
+    }
+    guard.complete().await.unwrap();
+
+    // Now inject flush failure.
+    interceptor.fail_flushes.store(true, Ordering::Relaxed);
+
+    // Flush should fail.
+    let result = vhdx.flush().await;
+    assert!(result.is_err(), "flush should fail with injected error");
+
+    // Disable the fault — shouldn't matter, file is poisoned.
+    interceptor.fail_flushes.store(false, Ordering::Relaxed);
+
+    // Subsequent writes should be rejected with Failed.
+ { + let mut ranges = Vec::new(); + let result = vhdx.resolve_write(0, 4096, &mut ranges).await; + assert!( + matches!(result, Err(VhdxIoError(VhdxIoErrorInner::Failed(_)))), + "write after poison should return Failed" + ); + } + + // Reads should also be rejected. + { + let mut ranges = Vec::new(); + let result = vhdx.resolve_read(0, 4096, &mut ranges).await; + assert!( + matches!(result, Err(VhdxIoError(VhdxIoErrorInner::Failed(_)))), + "read after poison should return Failed" + ); + } + + vhdx.abort().await; +} + +#[async_test] +async fn apply_write_error_poisons_file(driver: DefaultDriver) { + let (vhdx, interceptor) = create_writable_with_faults(&driver).await; + + // Write one block successfully and flush to ensure the pipeline works. + let data = [0xBBu8; 4096]; + let mut ranges = Vec::new(); + let guard = vhdx.resolve_write(0, 4096, &mut ranges).await.unwrap(); + for range in &ranges { + if let WriteRange::Data { + file_offset, + length, + .. + } = range + { + vhdx.file + .write_at(*file_offset, &data[..*length as usize]) + .await + .unwrap(); + } + } + guard.complete().await.unwrap(); + vhdx.flush().await.unwrap(); + + // Now inject write failures — this will hit the log task when + // it tries to write the WAL entry, and/or the apply task when + // it tries to write pages to their final file offsets. + interceptor.fail_writes.store(true, Ordering::Relaxed); + + // Write to a different block to generate new dirty BAT pages. + let block_size = vhdx.block_size() as u64; + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(block_size, 4096, &mut ranges) + .await + .unwrap(); + for range in &ranges { + if let WriteRange::Data { + file_offset, + length, + .. + } = range + { + let _ = vhdx + .file + .write_at(*file_offset, &data[..*length as usize]) + .await; + } + } + guard.complete().await.unwrap(); + + // Flush sends to the log pipeline. The log task's WAL write + // will hit the injected failure and poison the file. + let _ = vhdx.flush().await; + + // Clear the fault — the file should stay poisoned regardless. + interceptor.fail_writes.store(false, Ordering::Relaxed); + + // A second flush attempt synchronizes with the poisoned pipeline + // and ensures the error has propagated. + let _ = vhdx.flush().await; + + // The file should now be poisoned. Try an operation. + { + let mut ranges = Vec::new(); + let result = vhdx.resolve_write(0, 4096, &mut ranges).await; + assert!( + matches!(result, Err(VhdxIoError(VhdxIoErrorInner::Failed(_)))), + "write after apply failure should return Failed" + ); + } + + vhdx.abort().await; +} + +#[async_test] +async fn poison_error_message_preserved(driver: DefaultDriver) { + let (vhdx, interceptor) = create_writable_with_faults(&driver).await; + + // Write data. + let data = [0xCCu8; 4096]; + let mut ranges = Vec::new(); + let guard = vhdx.resolve_write(0, 4096, &mut ranges).await.unwrap(); + for range in &ranges { + if let WriteRange::Data { + file_offset, + length, + .. + } = range + { + vhdx.file + .write_at(*file_offset, &data[..*length as usize]) + .await + .unwrap(); + } + } + guard.complete().await.unwrap(); + + // Inject flush failure and flush. + interceptor.fail_flushes.store(true, Ordering::Relaxed); + let _ = vhdx.flush().await; + + // The error message should contain something useful. 
+    let result = vhdx.failed.check();
+    match result {
+        Err(VhdxIoError(VhdxIoErrorInner::Failed(pf))) => {
+            assert!(
+                !pf.to_string().is_empty(),
+                "poison error message should not be empty"
+            );
+        }
+        other => panic!("expected Failed, got: {other:?}"),
+    }
+
+    vhdx.abort().await;
+}
+
+// ---- Post-Log Crash Consistency Tests ----
+//
+// These tests exercise crash recovery scenarios that aren't covered by
+// the basic crash tests or concurrent tests. They focus on:
+// 1. Unsafe (free-pool) allocation → flush → crash → no data teleportation
+// 2. High-volume log pipeline saturation → crash → replay
+// 3. Repeated crash-recovery cycles with writable reopen
+
+use crate::tests::support::CrashTestFile;
+
+/// Helper: write a data pattern via the write path.
+async fn write_pattern_p16(vhdx: &VhdxFile, offset: u64, len: usize, value: u8) {
+    let write_buf = vec![value; len];
+    let mut ranges = Vec::new();
+    let guard = vhdx
+        .resolve_write(offset, len as u32, &mut ranges)
+        .await
+        .unwrap();
+    for range in &ranges {
+        match range {
+            WriteRange::Data {
+                file_offset,
+                length,
+                ..
+            } => {
+                vhdx.file
+                    .write_at(*file_offset, &write_buf[..(*length as usize)])
+                    .await
+                    .unwrap();
+            }
+            WriteRange::Zero {
+                file_offset,
+                length,
+            } => {
+                let zeros = vec![0u8; *length as usize];
+                vhdx.file.write_at(*file_offset, &zeros).await.unwrap();
+            }
+        }
+    }
+    guard.complete().await.unwrap();
+}
+
+/// Helper: read data at a guest offset via the read path.
+async fn read_pattern_p16(vhdx: &VhdxFile, offset: u64, len: usize) -> Vec<u8> {
+    let mut buf = vec![0u8; len];
+    let mut ranges = Vec::new();
+    let _guard = vhdx
+        .resolve_read(offset, len as u32, &mut ranges)
+        .await
+        .unwrap();
+    for range in &ranges {
+        match range {
+            ReadRange::Data {
+                guest_offset,
+                file_offset,
+                length,
+            } => {
+                let start = (*guest_offset - offset) as usize;
+                let end = start + *length as usize;
+                vhdx.file
+                    .read_at(*file_offset, &mut buf[start..end])
+                    .await
+                    .unwrap();
+            }
+            ReadRange::Zero {
+                guest_offset,
+                length,
+            } => {
+                let start = (*guest_offset - offset) as usize;
+                let end = start + *length as usize;
+                buf[start..end].fill(0);
+            }
+            ReadRange::Unmapped { .. } => {}
+        }
+    }
+    buf
+}
+
+/// Unsafe (free-pool) allocation → flush → crash → no data teleportation.
+///
+/// Allocate block A, trim it to the free pool, then write block B which
+/// reuses A's freed space. Flush (so the WAL + FSN barrier are exercised),
+/// then crash and replay. Verify:
+/// - Block B has its own data (not A's old data)
+/// - Block A reads as zeros (trimmed)
+/// - No data from A "teleports" to B via stale on-disk content
+///
+/// This is the end-to-end crash test for the pre_log_fsn barrier mechanism.
+/// Existing tests verify the barrier is *set* (bat_page_has_fsn_unsafe_free_pool)
+/// and that a flush *occurs* (flush_between_data_and_wal_unsafe), but no
+/// existing test verifies that data is correct after crash+replay when
+/// the barrier was needed.
+#[async_test]
+async fn crash_unsafe_reuse_no_teleportation(driver: DefaultDriver) {
+    let (mem_file, _) = create_vhdx_with_block_size(4 * MB1, MB1 as u32).await;
+    let snapshot = mem_file.snapshot();
+
+    let crash_file = CrashTestFile::from_durable(snapshot);
+    let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap();
+    let block_size = vhdx.block_size() as u64;
+
+    // Step 1: Allocate block 0 with pattern 0xAA (near-EOF, safe).
+ write_pattern_p16(&vhdx, 0, block_size as usize, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Step 2: Trim block 0 with FreeSpace mode → releases to free pool. + let trim_req = TrimRequest::new(TrimMode::FreeSpace, 0, block_size); + vhdx.trim(trim_req).await.unwrap(); + vhdx.flush().await.unwrap(); + + // Step 3: Write block 1 — should reuse block 0's freed space. + // This is the unsafe allocation (SpaceState::CrossStale) that + // requires a pre_log_fsn barrier. + write_pattern_p16(&vhdx, block_size, block_size as usize, 0xBB).await; + vhdx.flush().await.unwrap(); + + // Crash. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // Recover and verify. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Block 0 should be zeros (trimmed with FreeSpace → Unmapped/Zero). + let buf0 = read_pattern_p16(&vhdx2, 0, block_size as usize).await; + assert!( + buf0.iter().all(|&b| b == 0), + "block 0 should be zeros after FreeSpace trim + crash" + ); + + // Block 1 should have 0xBB (not 0xAA — no teleportation). + let buf1 = read_pattern_p16(&vhdx2, block_size, block_size as usize).await; + assert!( + buf1.iter().all(|&b| b == 0xBB), + "block 1 should have 0xBB, not stale data from block 0" + ); +} + +/// High-volume log pipeline stress + crash + replay. +/// +/// Writes many blocks through the full commit→log→apply pipeline (enough +/// to trigger LogFull retry and circular buffer wrapping), then crashes +/// and replays. Verifies all flushed data survives and the log replays +/// correctly even after heavy use. +/// +/// This combines the load profile of `log_pipeline_stress` (500 blocks) +/// with CrashTestFile crash semantics, which no existing test does. +#[async_test] +async fn crash_high_volume_pipeline(driver: DefaultDriver) { + const BLOCK_COUNT: usize = 100; + const BLOCK_SIZE: u64 = 2 * MB1; + const WRITE_LEN: usize = 4096; + + let disk_size = BLOCK_SIZE * (BLOCK_COUNT as u64 + 1); + let (mem_file, _) = create_vhdx_with_block_size(disk_size, BLOCK_SIZE as u32).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + + // Write 100 distinct blocks. The cache will trigger batch-full commits + // as dirty pages accumulate, and the log task will hit LogFull and + // retry as the circular buffer fills. + for i in 0..BLOCK_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let pattern = (i & 0xFF) as u8; + write_pattern_p16(&vhdx, offset, WRITE_LEN, pattern).await; + } + + // Flush everything — drives all batches through commit→log→apply. + vhdx.flush().await.unwrap(); + + // Crash (no clean close — log_guid remains set). + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // Recover with log replay. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Verify every block survived. + for i in 0..BLOCK_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let expected = (i & 0xFF) as u8; + let buf = read_pattern_p16(&vhdx2, offset, WRITE_LEN).await; + assert!( + buf.iter().all(|&b| b == expected), + "block {i}: expected 0x{expected:02X}, got 0x{:02X}", + buf[0], + ); + } +} + +/// Repeated crash-recovery cycles with writable reopen. +/// +/// Each cycle: open writable → write new data → flush → crash → verify. 
+/// The next cycle reopens writable from the crashed state. This tests +/// that log replay produces a file that can be opened writable again +/// (new log set up, new sequence numbers, etc.) without corruption +/// accumulating over multiple cycles. +/// +/// Existing tests (`crash_recovery_then_more_writes`) do 2 rounds but +/// always reopen read-only for verification. This test reopens writable +/// for 5 consecutive cycles, verifying the full open-write-crash-recover +/// lifecycle. +#[async_test] +async fn crash_repeated_writable_recovery_cycles(driver: DefaultDriver) { + const CYCLES: usize = 5; + + let (mem_file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let mut durable = mem_file.snapshot(); + let block_size = MB1; + + for cycle in 0..CYCLES { + // Open writable from the (possibly crashed) durable state. + let crash_file = CrashTestFile::from_durable(durable); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + + // Write to a different block each cycle. + let offset = cycle as u64 * block_size; + let pattern = (0x10 + cycle as u8) | 0x01; // nonzero + write_pattern_p16(&vhdx, offset, block_size as usize, pattern).await; + vhdx.flush().await.unwrap(); + + // Verify all blocks from this and previous cycles are correct. + for prev in 0..=cycle { + let prev_offset = prev as u64 * block_size; + let prev_pattern = (0x10 + prev as u8) | 0x01; + let buf = read_pattern_p16(&vhdx, prev_offset, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == prev_pattern), + "cycle {cycle}, block {prev}: expected 0x{prev_pattern:02x}, \ + got 0x{:02x}", + buf[0] + ); + } + + // Crash. + durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + } + + // Final verification: open read-only from the last crash, verify + // all 5 blocks from all cycles survived. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + for cycle in 0..CYCLES { + let offset = cycle as u64 * block_size; + let pattern = (0x10 + cycle as u8) | 0x01; + let buf = read_pattern_p16(&vhdx, offset, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == pattern), + "final verify block {cycle}: expected 0x{pattern:02x}, got 0x{:02x}", + buf[0] + ); + } +} + +// ---- Concurrent crash tests using YieldingCrashFile ---- +// +// These tests use YieldingCrashFile to create genuine interleaving between +// the log task, apply task, and user write tasks. The yield points cause +// the apply task to yield during write_at, allowing other tasks to make +// progress. Crash snapshots taken at these interleaving points exercise +// the recovery path under partial-apply conditions. + +use crate::tests::support::YieldingCrashFile; + +/// Concurrent writers with interleaved apply + crash + replay. +/// +/// Two tasks write to different blocks concurrently while the apply task +/// yields between its writes (via `yield_on_write`). This creates a +/// genuine interleaving: one task's data may be at its final offset while +/// another task's WAL entry exists but hasn't been applied yet. After +/// crash + replay, all flushed data must be present. +#[async_test] +async fn concurrent_writes_interleaved_apply_crash(driver: DefaultDriver) { + let (mem_file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let snapshot = mem_file.snapshot(); + + // yield_on_write=true: apply task yields before each page write, + // allowing the log task to process another batch mid-apply. 
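+    // (Parameter order for the helper is from_durable(data, yield_on_write,
+    // yield_on_flush) — see tests/support.rs — so `true, false` below enables
+    // only the write-side yield point.)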
+ let file = YieldingCrashFile::from_durable(snapshot, true, false); + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Two concurrent writers to different blocks. + { + let vhdx_a = vhdx.clone(); + let vhdx_b = vhdx.clone(); + let bs = block_size; + + let ((), ()) = futures::join!( + async { + write_block(&*vhdx_a, 0, bs, 0xAA).await; + }, + async { + write_block(&*vhdx_b, bs as u64, bs, 0xBB).await; + } + ); + } + + // Flush to make everything durable. + vhdx.flush().await.unwrap(); + + // Take durable snapshot and crash. + let durable = vhdx.file.durable_snapshot(); + Arc::into_inner(vhdx).expect("no other refs").abort().await; + + // Recover and verify both blocks survived. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let buf0 = read_pattern_p16(&vhdx2, 0, block_size as usize).await; + assert!( + buf0.iter().all(|&b| b == 0xAA), + "block 0 should have 0xAA after interleaved apply + crash" + ); + + let buf1 = read_pattern_p16(&vhdx2, block_size as u64, block_size as usize).await; + assert!( + buf1.iter().all(|&b| b == 0xBB), + "block 1 should have 0xBB after interleaved apply + crash" + ); +} + +/// Interleaved flush + write + crash. +/// +/// `yield_on_flush=true` causes flush to yield, allowing a concurrent +/// writer to make progress (its write reaches the log task) before the +/// flush's file-level flush completes. After crash, the pre-flush data +/// must be durable; the concurrent write may or may not survive. +#[async_test] +async fn interleaved_flush_and_write_crash(driver: DefaultDriver) { + let (mem_file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let snapshot = mem_file.snapshot(); + + // yield_on_flush=true: flush yields, allowing concurrent writer to run. + let file = YieldingCrashFile::from_durable(snapshot, false, true); + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Write block 0 — this data must survive the flush. + write_block(&*vhdx, 0, block_size, 0xCC).await; + + // Concurrent: flush (yields during file flush) + write block 1. + { + let vhdx_f = vhdx.clone(); + let vhdx_w = vhdx.clone(); + let bs = block_size; + + let ((), ()) = futures::join!( + async { + vhdx_f.flush().await.unwrap(); + }, + async { + write_block(&*vhdx_w, bs as u64, bs, 0xDD).await; + } + ); + } + + // Final flush to ensure the concurrent write is also durable. + vhdx.flush().await.unwrap(); + + // Crash. + let durable = vhdx.file.durable_snapshot(); + Arc::into_inner(vhdx).expect("no other refs").abort().await; + + // Recover. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Block 0 must survive (was written before the first flush). + let buf0 = read_pattern_p16(&vhdx2, 0, block_size as usize).await; + assert!( + buf0.iter().all(|&b| b == 0xCC), + "block 0 should have 0xCC (pre-flush data must survive)" + ); + + // Block 1 should also survive (final flush made it durable). + let buf1 = read_pattern_p16(&vhdx2, block_size as u64, block_size as usize).await; + assert!( + buf1.iter().all(|&b| b == 0xDD), + "block 1 should have 0xDD after final flush" + ); +} + +/// Stress test: many interleaved writers with yielding apply + crash. 
+/// +/// 8 tasks each write to a unique block with `yield_on_write=true`, +/// creating maximum interleaving between the apply task and log task. +/// After flush + crash + replay, all data must be intact. +#[async_test] +async fn stress_interleaved_apply_crash(driver: DefaultDriver) { + let (mem_file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let snapshot = mem_file.snapshot(); + + let file = YieldingCrashFile::from_durable(snapshot, true, false); + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // 8 concurrent writers, each to a unique block. + let write_futures: Vec<_> = (0..8u8) + .map(|i| { + let vhdx = vhdx.clone(); + let bs = block_size; + async move { + let offset = i as u64 * bs as u64; + let pattern = 0x50 + i; + write_block(&*vhdx, offset, bs, pattern).await; + } + }) + .collect(); + + futures::future::join_all(write_futures).await; + + vhdx.flush().await.unwrap(); + + // Crash. + let durable = vhdx.file.durable_snapshot(); + let vhdx = Arc::into_inner(vhdx).expect("no other refs"); + vhdx.abort().await; + + // Recover and verify all 8 blocks. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + for i in 0..8u8 { + let offset = i as u64 * block_size as u64; + let expected = 0x50 + i; + let buf = read_pattern_p16(&vhdx2, offset, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == expected), + "block {i}: expected 0x{expected:02x}, got 0x{:02x}", + buf[0] + ); + } +} + +/// Interleaved trim + write + crash with yield points. +/// +/// Write all blocks, flush. Then concurrently trim some blocks and write +/// others with `yield_on_write=true`. Flush, crash, and verify the +/// expected state (trimmed blocks are zeros, written blocks have data). +#[async_test] +async fn interleaved_trim_write_crash(driver: DefaultDriver) { + let (mem_file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let snapshot = mem_file.snapshot(); + + let file = YieldingCrashFile::from_durable(snapshot, true, false); + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Step 1: Write all 8 blocks with initial data. + for i in 0..8u8 { + let offset = i as u64 * block_size as u64; + write_block(&*vhdx, offset, block_size, 0x10 + i).await; + } + vhdx.flush().await.unwrap(); + + // Step 2: Concurrently trim blocks 0-3 and write blocks 4-7. + let trim_futures: Vec<_> = (0..4u8) + .map(|i| { + let vhdx = vhdx.clone(); + let bs = block_size; + async move { + let offset = i as u64 * bs as u64; + vhdx.trim(TrimRequest::new(TrimMode::FileSpace, offset, bs as u64)) + .await + .unwrap(); + } + }) + .collect(); + + let write_futures: Vec<_> = (4..8u8) + .map(|i| { + let vhdx = vhdx.clone(); + let bs = block_size; + async move { + let offset = i as u64 * bs as u64; + write_block(&*vhdx, offset, bs, 0x90 + i).await; + } + }) + .collect(); + + let ((), ()) = futures::join!( + async { + futures::future::join_all(trim_futures).await; + }, + async { + futures::future::join_all(write_futures).await; + } + ); + + vhdx.flush().await.unwrap(); + + // Crash. + let durable = vhdx.file.durable_snapshot(); + let vhdx = Arc::into_inner(vhdx).expect("no other refs"); + vhdx.abort().await; + + // Recover. 
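+    // (abort() skips the clean-close path, so the on-disk log_guid is still
+    // set; the read-only reopen below therefore needs allow_replay(true).)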
+ let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Blocks 0-3: trimmed → zeros. + for i in 0..4u8 { + let offset = i as u64 * block_size as u64; + let buf = read_pattern_p16(&vhdx2, offset, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == 0), + "block {i}: expected zeros (trimmed), got 0x{:02x}", + buf[0] + ); + } + + // Blocks 4-7: overwritten with new data. + for i in 4..8u8 { + let offset = i as u64 * block_size as u64; + let expected = 0x90 + i; + let buf = read_pattern_p16(&vhdx2, offset, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == expected), + "block {i}: expected 0x{expected:02x}, got 0x{:02x}", + buf[0] + ); + } +} + +// ---- Selective durability crash tests ---- +// +// These tests use CrashAfterFlushFile to crash at specific points in +// the WAL pipeline. Unlike CrashTestFile (where flush is all-or-nothing), +// CrashAfterFlushFile can be armed to fail after N more flushes, +// simulating crashes between the WAL flush and the apply flush. + +use crate::tests::support::CrashAfterFlushFile; + +/// Write + flush with crash armed after 1 flush. +/// +/// The VhdxFile::flush() path does: commit → log task writes WAL → +/// flush_sequencer.flush() (1 file.flush()) → apply task writes BAT. +/// +/// With arm(1), the flush_sequencer's flush succeeds (WAL + user data +/// durable), but a subsequent flush (or the apply write itself) fails. +/// The apply task's BAT write may or may not succeed in volatile, but +/// the BAT is NOT durable. On recovery, WAL replay must restore the +/// BAT page. +#[async_test] +async fn crash_wal_durable_apply_lost(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashAfterFlushFile::new(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size() as usize; + + // Write one block. + write_pattern_p16(&vhdx, 0, block_size, 0xAB).await; + + // Arm the crash: allow 1 more flush (the flush_sequencer's flush + // that makes WAL + user data durable), then fail everything. + vhdx.file.arm(1); + + // Flush — the WAL flush succeeds; subsequent ops fail. + // This may return Ok (if the crash hits after the sequencer flush) + // or Err (if the apply task races and triggers the error). + let _ = vhdx.flush().await; + + // Take the durable snapshot. The WAL entry and user data should be + // durable. The BAT page may NOT be at its final offset. + let durable = vhdx.file.durable_snapshot(); + + // Don't call abort() — the file is poisoned, tasks may be in error state. + // Just drop everything and recover from durable state. + drop(vhdx); + + // Recover: open with replay. The WAL should restore the BAT page. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Verify the data survived via WAL replay. + let buf = read_pattern_p16(&vhdx2, 0, block_size).await; + assert!( + buf.iter().all(|&b| b == 0xAB), + "data should survive via WAL replay when apply is lost: got 0x{:02x}", + buf[0] + ); +} + +/// Write + flush with crash armed after 0 flushes. +/// +/// The next flush fails immediately. Nothing new is durable. +/// Recovery should see the original empty state. 
+#[async_test] +async fn crash_before_wal_flush_data_lost(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashAfterFlushFile::new(snapshot.clone()); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size() as usize; + + // Write a block. + write_pattern_p16(&vhdx, 0, block_size, 0xCD).await; + + // Arm: next flush fails. The WAL flush won't succeed, so nothing + // new is durable. + vhdx.file.arm(0); + + // Flush will fail. + let result = vhdx.flush().await; + assert!(result.is_err(), "flush should fail with armed crash"); + + // Durable state should be the pre-write state. + let durable = vhdx.file.durable_snapshot(); + drop(vhdx); + + // Recover and verify data is NOT present (was never durable). + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let buf = read_pattern_p16(&vhdx2, 0, block_size).await; + assert!( + buf.iter().all(|&b| b == 0), + "data should be lost when WAL flush fails: got 0x{:02x}", + buf[0] + ); +} + +/// Multiple writes, flush, then arm and write more. +/// +/// First batch: write blocks 0-2, flush (all durable). Second batch: +/// write blocks 3-4, arm(1), flush (WAL durable, apply may fail). +/// Recovery should see blocks 0-2 (clean) and blocks 3-4 (via replay). +#[async_test] +async fn crash_partial_pipeline_multi_batch(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashAfterFlushFile::new(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size() as u64; + + // Batch 1: write blocks 0-2, flush normally (unarmed). + for i in 0..3u8 { + let offset = i as u64 * block_size; + write_pattern_p16(&vhdx, offset, block_size as usize, 0x10 + i).await; + } + vhdx.flush().await.unwrap(); + + // Batch 2: write blocks 3-4, arm, flush. + for i in 3..5u8 { + let offset = i as u64 * block_size; + write_pattern_p16(&vhdx, offset, block_size as usize, 0x20 + i).await; + } + + // Arm: 1 more flush (WAL flush succeeds), then crash. + vhdx.file.arm(1); + let _ = vhdx.flush().await; + + let durable = vhdx.file.durable_snapshot(); + drop(vhdx); + + // Recover. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Blocks 0-2: from batch 1 (fully durable before arm). + for i in 0..3u8 { + let offset = i as u64 * block_size; + let expected = 0x10 + i; + let buf = read_pattern_p16(&vhdx2, offset, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == expected), + "batch 1 block {i}: expected 0x{expected:02x}, got 0x{:02x}", + buf[0] + ); + } + + // Blocks 3-4: from batch 2 (WAL durable, may need replay). 
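+    // (Batch 2 used pattern 0x20 + i, so blocks 3 and 4 should read back as
+    // 0x23 and 0x24 respectively.)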
+ for i in 3..5u8 { + let offset = i as u64 * block_size; + let expected = 0x20 + i; + let buf = read_pattern_p16(&vhdx2, offset, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == expected), + "batch 2 block {i}: expected 0x{expected:02x}, got 0x{:02x}", + buf[0] + ); + } +} diff --git a/vm/devices/storage/vhdx/src/tests/mod.rs b/vm/devices/storage/vhdx/src/tests/mod.rs new file mode 100644 index 0000000000..2bfba82243 --- /dev/null +++ b/vm/devices/storage/vhdx/src/tests/mod.rs @@ -0,0 +1,547 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +pub mod support; + +#[cfg(test)] +mod crash_tests; +#[cfg(test)] +mod io_tests; +#[cfg(test)] +mod trim_tests; + +#[cfg(test)] +mod integration { + use crate::AsyncFile; + use crate::cache::PageCache; + use crate::create::{self, CreateParams}; + use crate::format; + use crate::header; + use crate::known_meta; + use crate::metadata::METADATA_TAG; + use crate::metadata::MetadataTable; + use crate::region; + use crate::tests::support::InMemoryFile; + use guid::Guid; + use pal_async::async_test; + use std::sync::Arc; + + fn metadata_cache(file: InMemoryFile, metadata_offset: u64) -> PageCache { + let mut cache = PageCache::new(Arc::new(file), None, None, 0); + cache.register_tag(METADATA_TAG, metadata_offset); + cache + } + + #[async_test] + async fn create_then_parse_full_roundtrip() { + let disk_size = 2 * format::GB1; + let mut params = CreateParams { + disk_size, + block_size: 2 * format::MB1 as u32, + logical_sector_size: 512, + physical_sector_size: 4096, + ..CreateParams::default() + }; + let file = InMemoryFile::new(0); + create::create(&file, &mut params).await.unwrap(); + let file_length = file.file_size().await.unwrap(); + + // 1. Parse headers. + let parsed_header = header::parse_headers(&file, file_length).await.unwrap(); + assert_eq!(parsed_header.log_guid, Guid::ZERO); + assert_ne!(parsed_header.file_write_guid, Guid::ZERO); + assert_ne!(parsed_header.data_write_guid, Guid::ZERO); + + // 2. Parse region tables. + let regions = region::parse_region_tables(&file).await.unwrap(); + assert!(regions.rewrite_data.is_none()); + assert!(regions.bat_offset > 0); + assert!(regions.metadata_offset > 0); + + // 3. Read metadata table. + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + + // 4. Verify known metadata. + known_meta::verify_known_metadata(&table, false).unwrap(); + + // 5. Read known metadata. 
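+        // (Known metadata items are fetched through a PageCache with
+        // METADATA_TAG registered at the metadata region's offset — see the
+        // metadata_cache() helper above.)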
+        let cache = metadata_cache(file, regions.metadata_offset);
+        let meta = known_meta::read_known_metadata(&cache, &table)
+            .await
+            .unwrap();
+
+        assert_eq!(meta.disk_size, disk_size);
+        assert_eq!(meta.block_size, 2 * format::MB1 as u32);
+        assert_eq!(meta.logical_sector_size, 512);
+        assert_eq!(meta.physical_sector_size, 4096);
+        assert!(!meta.has_parent);
+        assert!(!meta.leave_blocks_allocated);
+        assert_ne!(meta.page_83_data, Guid::ZERO);
+    }
+
+    #[async_test]
+    async fn create_differencing_then_parse() {
+        let mut params = CreateParams {
+            disk_size: format::GB1,
+            has_parent: true,
+            ..CreateParams::default()
+        };
+        let file = InMemoryFile::new(0);
+        create::create(&file, &mut params).await.unwrap();
+        let file_length = file.file_size().await.unwrap();
+
+        let _header = header::parse_headers(&file, file_length).await.unwrap();
+        let regions = region::parse_region_tables(&file).await.unwrap();
+        let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length)
+            .await
+            .unwrap();
+
+        known_meta::verify_known_metadata(&table, false).unwrap();
+        let cache = metadata_cache(file, regions.metadata_offset);
+        let meta = known_meta::read_known_metadata(&cache, &table)
+            .await
+            .unwrap();
+
+        assert!(meta.has_parent);
+    }
+}
+
+#[cfg(test)]
+mod log_task_integration {
+    use crate::format;
+    use crate::open::VhdxFile;
+    use crate::tests::support::InMemoryFile;
+    use crate::{AsyncFile, AsyncFileExt};
+    use pal_async::DefaultDriver;
+    use pal_async::async_test;
+    use zerocopy::FromBytes;
+
+    /// Helper: create a test VHDX file with default parameters.
+    async fn create_test_vhdx_file(disk_size: u64) -> InMemoryFile {
+        let (file, _) = InMemoryFile::create_test_vhdx(disk_size).await;
+        file
+    }
+
+    /// Helper: write a data pattern via the write path.
+    async fn write_pattern(vhdx: &VhdxFile, offset: u64, len: usize, value: u8) {
+        let write_buf = vec![value; len];
+        let mut ranges = Vec::new();
+        let guard = vhdx
+            .resolve_write(offset, len as u32, &mut ranges)
+            .await
+            .unwrap();
+        for range in &ranges {
+            match range {
+                crate::WriteRange::Data {
+                    file_offset,
+                    length,
+                    ..
+                } => {
+                    vhdx.file
+                        .write_at(*file_offset, &write_buf[..(*length as usize)])
+                        .await
+                        .unwrap();
+                }
+                crate::WriteRange::Zero {
+                    file_offset,
+                    length,
+                } => {
+                    let zeros = vec![0u8; *length as usize];
+                    vhdx.file.write_at(*file_offset, &zeros).await.unwrap();
+                }
+            }
+        }
+        guard.complete().await.unwrap();
+    }
+
+    /// Helper: read data at a guest offset via the read path.
+    async fn read_pattern(vhdx: &VhdxFile, offset: u64, len: usize) -> Vec<u8> {
+        let mut buf = vec![0u8; len];
+        let mut ranges = Vec::new();
+        let _guard = vhdx
+            .resolve_read(offset, len as u32, &mut ranges)
+            .await
+            .unwrap();
+        for range in &ranges {
+            match range {
+                crate::ReadRange::Data {
+                    guest_offset,
+                    file_offset,
+                    length,
+                } => {
+                    let start = (*guest_offset - offset) as usize;
+                    let end = start + *length as usize;
+                    vhdx.file
+                        .read_at(*file_offset, &mut buf[start..end])
+                        .await
+                        .unwrap();
+                }
+                crate::ReadRange::Zero {
+                    guest_offset,
+                    length,
+                } => {
+                    let start = (*guest_offset - offset) as usize;
+                    let end = start + *length as usize;
+                    buf[start..end].fill(0);
+                }
+                crate::ReadRange::Unmapped { .. } => {}
+            }
+        }
+        buf
+    }
+
+    #[async_test]
+    async fn open_writable_and_close(driver: DefaultDriver) {
+        let file = create_test_vhdx_file(format::GB1).await;
+        let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+        // Verify the file is opened in writable mode with a log task.
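+        // (log_state is expected to be Some(..) only for writable opens;
+        // compare open_read_only_no_spawner below, which asserts it is None.)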
+ assert!(!vhdx.read_only); + assert!(vhdx.log_state.is_some()); + + // Close should succeed cleanly. + vhdx.close().await.unwrap(); + } + + #[async_test] + async fn open_writable_sets_log_guid(driver: DefaultDriver) { + let file = create_test_vhdx_file(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // The file should have log_guid set (the header was written during open). + // We verify by reading the header from the file. + let mut buf = vec![0u8; format::HEADER_SIZE as usize]; + // Read both headers and check at least one has log_guid != 0. + vhdx.file + .read_at(format::HEADER_OFFSET_1, &mut buf) + .await + .unwrap(); + let h1 = format::Header::read_from_prefix(&buf).ok().map(|(h, _)| h); + vhdx.file + .read_at(format::HEADER_OFFSET_2, &mut buf) + .await + .unwrap(); + let h2 = format::Header::read_from_prefix(&buf).ok().map(|(h, _)| h); + + let has_log_guid = h1.as_ref().is_some_and(|h| h.log_guid != guid::Guid::ZERO) + || h2.as_ref().is_some_and(|h| h.log_guid != guid::Guid::ZERO); + assert!(has_log_guid, "log_guid should be set after open_writable"); + + vhdx.close().await.unwrap(); + } + + #[async_test] + async fn close_clears_log_guid(driver: DefaultDriver) { + let file = create_test_vhdx_file(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let file_ref = vhdx.file.clone(); + + // Close the file. + vhdx.close().await.unwrap(); + + // After close, both headers should have log_guid == ZERO + // (at least the current one). + let mut buf1 = vec![0u8; format::HEADER_SIZE as usize]; + file_ref + .read_at(format::HEADER_OFFSET_1, &mut buf1) + .await + .unwrap(); + let mut buf2 = vec![0u8; format::HEADER_SIZE as usize]; + file_ref + .read_at(format::HEADER_OFFSET_2, &mut buf2) + .await + .unwrap(); + + use zerocopy::FromBytes as _; + let h1 = format::Header::read_from_prefix(&buf1).ok().map(|(h, _)| h); + let h2 = format::Header::read_from_prefix(&buf2).ok().map(|(h, _)| h); + + // The current header (highest sequence_number) should have ZERO log_guid. + let current = match (&h1, &h2) { + (Some(a), Some(b)) if b.sequence_number >= a.sequence_number => b, + (Some(a), _) => a, + (_, Some(b)) => b, + _ => panic!("no valid headers"), + }; + assert_eq!(current.log_guid, guid::Guid::ZERO); + } + + #[async_test] + async fn write_flush_close_reopen(driver: DefaultDriver) { + let file = create_test_vhdx_file(format::GB1).await; + + // Open with log, write data, flush, close. + let file_arc = { + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + write_pattern(&vhdx, 0, 4096, 0xAB).await; + vhdx.flush().await.unwrap(); + let file_arc = vhdx.file.clone(); + vhdx.close().await.unwrap(); + file_arc + }; + + // Reopen (no log needed since we closed cleanly) and verify data. + { + let vhdx = VhdxFile::open(InMemoryFile::from_snapshot(file_arc.snapshot())) + .read_only() + .await + .unwrap(); + let read_buf = read_pattern(&vhdx, 0, 4096).await; + assert!(read_buf.iter().all(|&b| b == 0xAB)); + } + } + + #[async_test] + async fn close_then_reopen_is_clean(driver: DefaultDriver) { + let file = create_test_vhdx_file(format::GB1).await; + + // Open with log, do nothing, close. + let file_arc = { + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let file_arc = vhdx.file.clone(); + vhdx.close().await.unwrap(); + file_arc + }; + + // Reopen — should succeed without log replay. 
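+        // (allow_replay(true) is deliberately not passed here: a clean close()
+        // should have cleared log_guid, leaving nothing to replay — see
+        // close_clears_log_guid above.)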
+ let vhdx = VhdxFile::open(InMemoryFile::from_snapshot(file_arc.snapshot())) + .read_only() + .await + .unwrap(); + assert!(vhdx.read_only); + } + + #[async_test] + async fn open_read_only_no_spawner() { + let file = create_test_vhdx_file(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert!(vhdx.read_only); + assert!(vhdx.log_state.is_none()); + } + + #[async_test] + async fn flush_returns_fsn(driver: DefaultDriver) { + let file = create_test_vhdx_file(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write data to dirty some cache pages. + write_pattern(&vhdx, 0, 4096, 0xEE).await; + + // Commit should return a valid FSN via the cache. + let _fsn = vhdx.cache.commit().unwrap(); + // FSN can be 0 if no dirty pages (BAT may or may not be dirty depending + // on cache state). Just verify no errors. + + vhdx.close().await.unwrap(); + } + + #[async_test] + async fn multiple_writes_single_flush(driver: DefaultDriver) { + let file = create_test_vhdx_file(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Multiple writes at different offsets. + write_pattern(&vhdx, 0, 4096, 0x11).await; + write_pattern(&vhdx, 4096, 4096, 0x22).await; + write_pattern(&vhdx, 8192, 4096, 0x33).await; + + // Single flush should handle all dirty pages. + vhdx.flush().await.unwrap(); + let file_arc = vhdx.file.clone(); + vhdx.close().await.unwrap(); + + // Reopen and verify. + let vhdx2 = VhdxFile::open(InMemoryFile::from_snapshot(file_arc.snapshot())) + .read_only() + .await + .unwrap(); + + let buf0 = read_pattern(&vhdx2, 0, 4096).await; + assert!(buf0.iter().all(|&b| b == 0x11), "first write mismatch"); + let buf1 = read_pattern(&vhdx2, 4096, 4096).await; + assert!(buf1.iter().all(|&b| b == 0x22), "second write mismatch"); + let buf2 = read_pattern(&vhdx2, 8192, 4096).await; + assert!(buf2.iter().all(|&b| b == 0x33), "third write mismatch"); + } + + /// Write to 200+ distinct data blocks, flush, close, reopen, and verify. + /// + /// This exercises the eager commit path: with 200+ distinct block writes + /// the cache will accumulate more dirty BAT pages than `MAX_COMMIT_PAGES`, + /// triggering automatic intermediate commits before the final flush. + #[async_test] + async fn large_write_survives_close_reopen(driver: DefaultDriver) { + const BLOCK_COUNT: usize = 200; + // Default block size is 2 MiB; place one 4 KiB write in each block. + const BLOCK_SIZE: u64 = 2 * format::MB1; + const WRITE_LEN: usize = 4096; + + let disk_size = BLOCK_SIZE * (BLOCK_COUNT as u64 + 1); + let file = create_test_vhdx_file(disk_size).await; + + // Open with log, write a distinct pattern into each of 200 blocks. + let file_arc = { + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + for i in 0..BLOCK_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let pattern = (i & 0xFF) as u8; + write_pattern(&vhdx, offset, WRITE_LEN, pattern).await; + } + vhdx.flush().await.unwrap(); + let file_arc = vhdx.file.clone(); + vhdx.close().await.unwrap(); + file_arc + }; + + // Reopen from snapshot and verify every block. 
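+        // (The snapshot is taken after close() completed, so it reflects a
+        // flushed, cleanly closed file; the expected fill byte for block i is
+        // (i & 0xFF), matching the write loop above.)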
+ { + let vhdx = VhdxFile::open(InMemoryFile::from_snapshot(file_arc.snapshot())) + .read_only() + .await + .unwrap(); + for i in 0..BLOCK_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let expected = (i & 0xFF) as u8; + let buf = read_pattern(&vhdx, offset, WRITE_LEN).await; + assert!( + buf.iter().all(|&b| b == expected), + "block {} mismatch: expected 0x{:02X}, got 0x{:02X}", + i, + expected, + buf[0], + ); + } + } + } + + /// After flush + close, all permits should have been released. + /// Verifies the apply task releases permits (not commit). + #[async_test] + async fn permits_released_after_apply(driver: DefaultDriver) { + let file = create_test_vhdx_file(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write to several distinct blocks so multiple BAT pages are dirtied. + for i in 0..10u64 { + let offset = i * 2 * format::MB1; // each in a different block + write_pattern(&vhdx, offset, 4096, (i & 0xFF) as u8).await; + } + + // Flush commits dirty pages → log → apply → permits released. + vhdx.flush().await.unwrap(); + + // Write 10 more blocks. If permits weren't released, this would + // eventually block (deadlock). The fact that it completes proves + // permits are flowing back from the apply task. + for i in 10..20u64 { + let offset = i * 2 * format::MB1; + write_pattern(&vhdx, offset, 4096, (i & 0xFF) as u8).await; + } + vhdx.flush().await.unwrap(); + + // Verify all data survived. + for i in 0..20u64 { + let offset = i * 2 * format::MB1; + let expected = (i & 0xFF) as u8; + let buf = read_pattern(&vhdx, offset, 4096).await; + assert!( + buf.iter().all(|&b| b == expected), + "block {i} mismatch after permit recycling" + ); + } + + vhdx.close().await.unwrap(); + } + + /// Pump many batches through the full pipeline, exceeding the log's + /// circular buffer capacity. The log task must handle LogFull by + /// waiting for the apply task to drain, advancing tails, and retrying. + #[async_test] + async fn many_commits_forward_progress(driver: DefaultDriver) { + const BATCH_COUNT: usize = 20; + const BLOCK_SIZE: u64 = 2 * format::MB1; + + let disk_size = BLOCK_SIZE * (BATCH_COUNT as u64 + 1); + let file = create_test_vhdx_file(disk_size).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Each iteration writes to a new block (dirtying its BAT page), + // then flushes. This forces commit → log → apply for each batch. + // With a 1 MiB log the circular buffer will fill up, exercising + // the LogFull retry path in handle_commit. + for i in 0..BATCH_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let pattern = (i & 0xFF) as u8; + write_pattern(&vhdx, offset, 4096, pattern).await; + vhdx.flush().await.unwrap(); + } + + let file_arc = vhdx.file.clone(); + vhdx.close().await.unwrap(); + + // Reopen read-only and verify every block. + let vhdx2 = VhdxFile::open(InMemoryFile::from_snapshot(file_arc.snapshot())) + .read_only() + .await + .unwrap(); + for i in 0..BATCH_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let expected = (i & 0xFF) as u8; + let buf = read_pattern(&vhdx2, offset, 4096).await; + assert!( + buf.iter().all(|&b| b == expected), + "block {i} mismatch: expected 0x{expected:02X}, got 0x{:02X}", + buf[0], + ); + } + } + + /// Stress test: pump a large number of writes through the pipeline + /// without individual flushes, then flush once at the end. 
This + /// exercises batch-full commit (automatic commits when dirty_count + /// reaches MAX_COMMIT_PAGES) combined with the log task's LogFull + /// retry and permit backpressure from the apply task. + #[async_test] + async fn log_pipeline_stress(driver: DefaultDriver) { + const BLOCK_COUNT: usize = 500; + const BLOCK_SIZE: u64 = 2 * format::MB1; + const WRITE_LEN: usize = 4096; + + let disk_size = BLOCK_SIZE * (BLOCK_COUNT as u64 + 1); + let file = create_test_vhdx_file(disk_size).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write to 500 distinct blocks without flushing. The cache will + // trigger batch-full commits as dirty pages accumulate, and the + // log task will hit LogFull and retry as the circular buffer + // fills. The apply task must release permits to keep the + // pipeline moving. + for i in 0..BLOCK_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let pattern = (i & 0xFF) as u8; + write_pattern(&vhdx, offset, WRITE_LEN, pattern).await; + } + + vhdx.flush().await.unwrap(); + let file_arc = vhdx.file.clone(); + vhdx.close().await.unwrap(); + + // Reopen and verify. + let vhdx2 = VhdxFile::open(InMemoryFile::from_snapshot(file_arc.snapshot())) + .read_only() + .await + .unwrap(); + for i in 0..BLOCK_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let expected = (i & 0xFF) as u8; + let buf = read_pattern(&vhdx2, offset, WRITE_LEN).await; + assert!( + buf.iter().all(|&b| b == expected), + "block {i} mismatch: expected 0x{expected:02X}, got 0x{:02X}", + buf[0], + ); + } + } +} diff --git a/vm/devices/storage/vhdx/src/tests/support.rs b/vm/devices/storage/vhdx/src/tests/support.rs new file mode 100644 index 0000000000..7deaccdc58 --- /dev/null +++ b/vm/devices/storage/vhdx/src/tests/support.rs @@ -0,0 +1,813 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Test support utilities: in-memory file backing store and I/O interceptors. + +use crate::{AsyncFile, AsyncFileExt}; +use parking_lot::Mutex; +use std::borrow::Borrow; +use std::sync::Arc; + +/// Trait for intercepting I/O operations in tests. +/// +/// Default implementations return `Ok(())` (no interception). +pub trait IoInterceptor: Send + Sync { + /// Called before a read operation. + fn before_read(&self, offset: u64, len: usize) -> Result<(), std::io::Error> { + let _ = (offset, len); + Ok(()) + } + + /// Called before a write operation. + fn before_write(&self, offset: u64, data: &[u8]) -> Result<(), std::io::Error> { + let _ = (offset, data); + Ok(()) + } + + /// Called before a flush operation. + fn before_flush(&self) -> Result<(), std::io::Error> { + Ok(()) + } + + /// Called before a set_file_size operation. + fn before_set_file_size(&self, size: u64) -> Result<(), std::io::Error> { + let _ = size; + Ok(()) + } + + /// Returns `true` if the write should be silently discarded (data not + /// written). The default is `false`. + fn should_discard_write(&self, offset: u64, data: &[u8]) -> bool { + let _ = (offset, data); + false + } +} + +/// An interceptor that fails all I/O operations of specified types. +pub struct FailingInterceptor { + /// Whether reads should fail. + pub fail_reads: bool, + /// Whether writes should fail. + pub fail_writes: bool, + /// Whether flushes should fail. + pub fail_flushes: bool, + /// Whether set_file_size should fail. 
+    pub fail_set_file_size: bool,
+}
+
+impl IoInterceptor for FailingInterceptor {
+    fn before_read(&self, _offset: u64, _len: usize) -> Result<(), std::io::Error> {
+        if self.fail_reads {
+            return Err(std::io::Error::other("injected I/O failure"));
+        }
+        Ok(())
+    }
+
+    fn before_write(&self, _offset: u64, _data: &[u8]) -> Result<(), std::io::Error> {
+        if self.fail_writes {
+            return Err(std::io::Error::other("injected I/O failure"));
+        }
+        Ok(())
+    }
+
+    fn before_flush(&self) -> Result<(), std::io::Error> {
+        if self.fail_flushes {
+            return Err(std::io::Error::other("injected I/O failure"));
+        }
+        Ok(())
+    }
+
+    fn before_set_file_size(&self, _size: u64) -> Result<(), std::io::Error> {
+        if self.fail_set_file_size {
+            return Err(std::io::Error::other("injected I/O failure"));
+        }
+        Ok(())
+    }
+}
+
+/// An interceptor that silently discards writes.
+///
+/// Reads and flushes pass through normally. Writes appear to succeed
+/// but the underlying data is not modified. This simulates a crash
+/// where writes were in flight but not persisted.
+pub struct DiscardWritesInterceptor;
+
+impl IoInterceptor for DiscardWritesInterceptor {
+    fn should_discard_write(&self, _offset: u64, _data: &[u8]) -> bool {
+        true
+    }
+}
+
+/// In-memory file backing store for tests.
+///
+/// Supports optional I/O interception for failure injection and write
+/// discarding (used in crash tests).
+pub struct InMemoryFile {
+    inner: Mutex<InMemoryFileInner>,
+    interceptor: Option<Arc<dyn IoInterceptor>>,
+}
+
+struct InMemoryFileInner {
+    data: Vec<u8>,
+}
+
+impl InMemoryFile {
+    /// Creates a zero-filled file of the given size.
+    pub fn new(size: u64) -> Self {
+        Self {
+            inner: Mutex::new(InMemoryFileInner {
+                data: vec![0u8; size as usize],
+            }),
+            interceptor: None,
+        }
+    }
+
+    /// Creates a zero-filled file with an I/O interceptor.
+    pub fn with_interceptor(size: u64, interceptor: Arc<dyn IoInterceptor>) -> Self {
+        Self {
+            inner: Mutex::new(InMemoryFileInner {
+                data: vec![0u8; size as usize],
+            }),
+            interceptor: Some(interceptor),
+        }
+    }
+
+    /// Returns a clone of the current file contents.
+    pub fn snapshot(&self) -> Vec<u8> {
+        self.inner.lock().data.clone()
+    }
+
+    /// Create an `InMemoryFile` from existing data (e.g. a snapshot).
+    pub fn from_snapshot(data: Vec<u8>) -> InMemoryFile {
+        InMemoryFile {
+            inner: Mutex::new(InMemoryFileInner { data }),
+            interceptor: None,
+        }
+    }
+
+    /// Create a VHDX file in memory with the given disk size and default parameters.
+    ///
+    /// Returns the `InMemoryFile` and the validated `CreateParams`.
+    pub async fn create_test_vhdx(disk_size: u64) -> (InMemoryFile, crate::create::CreateParams) {
+        let file = InMemoryFile::new(0);
+        let mut params = crate::create::CreateParams {
+            disk_size,
+            ..Default::default()
+        };
+        crate::create::create(&file, &mut params).await.unwrap();
+        (file, params)
+    }
+}
+
+impl AsyncFile for InMemoryFile {
+    type Buffer = Vec<u8>;
+
+    fn alloc_buffer(&self, len: usize) -> Vec<u8> {
+        vec![0u8; len]
+    }
+
+    async fn read_into(&self, offset: u64, mut buf: Vec<u8>) -> Result<Vec<u8>, std::io::Error> {
+        if let Some(interceptor) = &self.interceptor {
+            interceptor.before_read(offset, buf.len())?;
+        }
+        let inner = self.inner.lock();
+        let offset = offset as usize;
+        let file_len = inner.data.len();
+        for (i, byte) in buf.iter_mut().enumerate() {
+            let pos = offset + i;
+            *byte = if pos < file_len { inner.data[pos] } else { 0 };
+        }
+        Ok(buf)
+    }
+
+    async fn write_from(
+        &self,
+        offset: u64,
+        buf: impl Borrow<Vec<u8>> + Send + 'static,
+    ) -> Result<(), std::io::Error> {
+        let buf = buf.borrow();
+        if let Some(interceptor) = &self.interceptor {
+            interceptor.before_write(offset, buf.as_ref())?;
+            if interceptor.should_discard_write(offset, buf.as_ref()) {
+                return Ok(());
+            }
+        }
+        let mut inner = self.inner.lock();
+        let offset = offset as usize;
+        let end = offset + buf.len();
+        if end > inner.data.len() {
+            inner.data.resize(end, 0);
+        }
+        inner.data[offset..end].copy_from_slice(buf.as_ref());
+        Ok(())
+    }
+
+    async fn flush(&self) -> Result<(), std::io::Error> {
+        if let Some(interceptor) = &self.interceptor {
+            interceptor.before_flush()?;
+        }
+        Ok(())
+    }
+
+    async fn file_size(&self) -> Result<u64, std::io::Error> {
+        Ok(self.inner.lock().data.len() as u64)
+    }
+
+    async fn set_file_size(&self, size: u64) -> Result<(), std::io::Error> {
+        if let Some(interceptor) = &self.interceptor {
+            interceptor.before_set_file_size(size)?;
+        }
+        let mut inner = self.inner.lock();
+        inner.data.resize(size as usize, 0);
+        Ok(())
+    }
+}
+
+/// A file implementation that separates volatile and durable state,
+/// with a write log for verifying operation ordering.
+///
+/// - `write_at()` → writes to volatile only (reads see it, but it won't
+///   survive a crash).
+/// - `flush()` → copies volatile to durable (survives crash).
+/// - `crash()` → returns durable state; volatile-only writes are lost.
+/// - `from_durable(data)` → creates a new file from a crash snapshot.
+///
+/// The write log records every `write_at`, `flush`, and `set_file_size`
+/// call, enabling ordering tests that verify flush barriers exist between
+/// data writes and WAL writes.
+pub struct CrashTestFile {
+    inner: Mutex<CrashTestFileInner>,
+}
+
+impl std::fmt::Debug for CrashTestFile {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let inner = self.inner.lock();
+        f.debug_struct("CrashTestFile")
+            .field("durable_len", &inner.durable.len())
+            .field("volatile_len", &inner.volatile.len())
+            .field("flush_count", &inner.flush_count)
+            .finish()
+    }
+}
+
+struct CrashTestFileInner {
+    /// Data that has survived flush — survives power failure.
+    durable: Vec<u8>,
+    /// Data as seen by reads — includes unflushed writes.
+    volatile: Vec<u8>,
+    /// How many flush() calls have occurred.
+    flush_count: u64,
+}
+
+impl CrashTestFile {
+    /// Create a CrashTestFile from existing durable data (e.g. from a crash snapshot).
+    pub fn from_durable(data: Vec<u8>) -> Self {
+        Self {
+            inner: Mutex::new(CrashTestFileInner {
+                volatile: data.clone(),
+                durable: data,
+                flush_count: 0,
+            }),
+        }
+    }
+
+    /// Snapshot durable state without consuming the file.
+    pub fn durable_snapshot(&self) -> Vec<u8> {
+        self.inner.lock().durable.clone()
+    }
+
+    /// How many flushes have occurred.
+    pub fn flush_count(&self) -> u64 {
+        self.inner.lock().flush_count
+    }
+}
+
+/// A crash-test file that yields during `write_at` and/or `flush`,
+/// allowing other tasks to interleave.
+///
+/// This combines `CrashTestFile`'s durable/volatile split with
+/// `YieldingFile`'s yield-point mechanism. When a yield is configured,
+/// the file yields (returns Pending once) at the start of the operation,
+/// allowing other spawned tasks to run. This creates genuine interleaving
+/// between the log task, apply task, and user write tasks.
+///
+/// # Use cases
+///
+/// - **`yield_on_write = true`**: The apply task yields before each
+///   `write_at`, allowing the log task to process another commit. This
+///   creates a crash point where one batch's applies are in progress
+///   while another batch is being logged.
+///
+/// - **`yield_on_flush = true`**: The flush path yields, allowing
+///   concurrent writes to reach the log task before the flush completes.
+pub struct YieldingCrashFile {
+    inner: Mutex<CrashTestFileYieldInner>,
+}
+
+struct CrashTestFileYieldInner {
+    durable: Vec<u8>,
+    volatile: Vec<u8>,
+    flush_count: u64,
+    yield_on_write: bool,
+    yield_on_flush: bool,
+}
+
+impl YieldingCrashFile {
+    /// Create a `YieldingCrashFile` from existing durable data.
+    pub fn from_durable(data: Vec<u8>, yield_on_write: bool, yield_on_flush: bool) -> Self {
+        Self {
+            inner: Mutex::new(CrashTestFileYieldInner {
+                volatile: data.clone(),
+                durable: data,
+                flush_count: 0,
+                yield_on_write,
+                yield_on_flush,
+            }),
+        }
+    }
+
+    /// Snapshot durable state without consuming the file.
+    pub fn durable_snapshot(&self) -> Vec<u8> {
+        self.inner.lock().durable.clone()
+    }
+}
+
+/// A crash-test file where the crash point is armed dynamically.
+///
+/// Before arming, the file behaves like a normal `CrashTestFile`: writes
+/// go to volatile, flush copies volatile→durable.
+///
+/// After [`arm(n)`](Self::arm) is called, the file will allow exactly `n`
+/// more flushes to succeed (making data durable), then start failing all
+/// writes and flushes with I/O errors. The durable state is frozen at
+/// the last successful flush.
+///
+/// # Typical usage
+///
+/// ```ignore
+/// // Create and open writable (flushes during open are unaffected).
+/// let file = CrashAfterFlushFile::new(snapshot);
+/// let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+///
+/// // Do some writes.
+/// write_block(&vhdx, 0, bs, 0xAA).await;
+///
+/// // Arm: allow 1 more flush (the WAL flush), then crash.
+/// vhdx.file.arm(1);
+///
+/// // This flush will: commit → log task writes WAL → flush_sequencer
+/// // calls file.flush() (succeeds, armed count decrements to 0) →
+/// // apply task tries to write → I/O error → file poisoned.
+/// let _ = vhdx.flush().await; // may fail if apply races
+/// ```
+pub struct CrashAfterFlushFile {
+    inner: Mutex<CrashAfterFlushInner>,
+}
+
+struct CrashAfterFlushInner {
+    /// Data that has survived flush — survives power failure.
+    durable: Vec<u8>,
+    /// Data as seen by reads — includes unflushed writes.
+    volatile: Vec<u8>,
+    /// How many flushes have occurred.
+    flush_count: u64,
+    /// When Some(n), allow n more flushes then crash. None = not armed.
+    remaining_flushes: Option<u64>,
+    /// Whether the crash has been triggered.
+    crashed: bool,
+}
+
+impl CrashAfterFlushFile {
+    /// Create a new crash-armed file from existing data.
+    /// The file starts unarmed; call [`arm()`](Self::arm) to set the crash point.
+    pub fn new(data: Vec<u8>) -> Self {
+        Self {
+            inner: Mutex::new(CrashAfterFlushInner {
+                volatile: data.clone(),
+                durable: data,
+                flush_count: 0,
+                remaining_flushes: None,
+                crashed: false,
+            }),
+        }
+    }
+
+    /// Arm the crash: allow `n` more successful flushes, then fail.
+    ///
+    /// - `arm(0)` — the next flush fails immediately.
+    /// - `arm(1)` — the next flush succeeds (makes data durable), then
+    ///   the one after that fails.
+    pub fn arm(&self, remaining_flushes: u64) {
+        let mut inner = self.inner.lock();
+        inner.remaining_flushes = Some(remaining_flushes);
+    }
+
+    /// Snapshot durable state without consuming the file.
+    pub fn durable_snapshot(&self) -> Vec<u8> {
+        self.inner.lock().durable.clone()
+    }
+}
+
+impl AsyncFile for CrashAfterFlushFile {
+    type Buffer = Vec<u8>;
+
+    fn alloc_buffer(&self, len: usize) -> Vec<u8> {
+        vec![0u8; len]
+    }
+
+    async fn read_into(&self, offset: u64, mut buf: Vec<u8>) -> Result<Vec<u8>, std::io::Error> {
+        let inner = self.inner.lock();
+        let offset = offset as usize;
+        let file_len = inner.volatile.len();
+        for (i, byte) in buf.iter_mut().enumerate() {
+            let pos = offset + i;
+            *byte = if pos < file_len {
+                inner.volatile[pos]
+            } else {
+                0
+            };
+        }
+        Ok(buf)
+    }
+
+    async fn write_from(
+        &self,
+        offset: u64,
+        buf: impl Borrow<Vec<u8>> + Send + 'static,
+    ) -> Result<(), std::io::Error> {
+        let buf = buf.borrow();
+        let mut inner = self.inner.lock();
+        if inner.crashed {
+            return Err(std::io::Error::other("crash: disk unavailable"));
+        }
+        let off = offset as usize;
+        let end = off + buf.len();
+        if end > inner.volatile.len() {
+            inner.volatile.resize(end, 0);
+        }
+        inner.volatile[off..end].copy_from_slice(buf.as_ref());
+        Ok(())
+    }
+
+    async fn flush(&self) -> Result<(), std::io::Error> {
+        let mut inner = self.inner.lock();
+        if inner.crashed {
+            return Err(std::io::Error::other("crash: disk unavailable"));
+        }
+        // Check if armed and out of remaining flushes.
+        if let Some(ref remaining) = inner.remaining_flushes {
+            if *remaining == 0 {
+                // Crash NOW — don't make data durable, fail the flush.
+                inner.crashed = true;
+                return Err(std::io::Error::other("crash: disk unavailable"));
+            }
+        }
+        // Make data durable.
+        inner.durable = inner.volatile.clone();
+        inner.flush_count += 1;
+        // Decrement remaining flushes.
+        if let Some(ref mut remaining) = inner.remaining_flushes {
+            *remaining -= 1;
+        }
+        Ok(())
+    }
+
+    async fn file_size(&self) -> Result<u64, std::io::Error> {
+        Ok(self.inner.lock().volatile.len() as u64)
+    }
+
+    async fn set_file_size(&self, size: u64) -> Result<(), std::io::Error> {
+        let mut inner = self.inner.lock();
+        if inner.crashed {
+            return Err(std::io::Error::other("crash: disk unavailable"));
+        }
+        inner.volatile.resize(size as usize, 0);
+        inner.durable.resize(size as usize, 0);
+        Ok(())
+    }
+}
+
+/// Yield once to allow other tasks to run, then resume.
+async fn yield_once() {
+    let mut yielded = false;
+    std::future::poll_fn(|cx| {
+        if !yielded {
+            yielded = true;
+            cx.waker().wake_by_ref();
+            std::task::Poll::Pending
+        } else {
+            std::task::Poll::Ready(())
+        }
+    })
+    .await;
+}
+
+impl AsyncFile for YieldingCrashFile {
+    type Buffer = Vec<u8>;
+
+    fn alloc_buffer(&self, len: usize) -> Vec<u8> {
+        vec![0u8; len]
+    }
+
+    async fn read_into(&self, offset: u64, mut buf: Vec<u8>) -> Result<Vec<u8>, std::io::Error> {
+        let inner = self.inner.lock();
+        let offset = offset as usize;
+        let file_len = inner.volatile.len();
+        for (i, byte) in buf.iter_mut().enumerate() {
+            let pos = offset + i;
+            *byte = if pos < file_len {
+                inner.volatile[pos]
+            } else {
+                0
+            };
+        }
+        Ok(buf)
+    }
+
+    async fn write_from(
+        &self,
+        offset: u64,
+        buf: impl Borrow<Vec<u8>> + Send + 'static,
+    ) -> Result<(), std::io::Error> {
+        let should_yield = self.inner.lock().yield_on_write;
+        if should_yield {
+            yield_once().await;
+        }
+        let buf = buf.borrow();
+        let mut inner = self.inner.lock();
+        let off = offset as usize;
+        let end = off + buf.len();
+        if end > inner.volatile.len() {
+            inner.volatile.resize(end, 0);
+        }
+        inner.volatile[off..end].copy_from_slice(buf.as_ref());
+        Ok(())
+    }
+
+    async fn flush(&self) -> Result<(), std::io::Error> {
+        let should_yield = self.inner.lock().yield_on_flush;
+        if should_yield {
+            yield_once().await;
+        }
+
+        let mut inner = self.inner.lock();
+        inner.durable = inner.volatile.clone();
+        inner.flush_count += 1;
+        Ok(())
+    }
+
+    async fn file_size(&self) -> Result<u64, std::io::Error> {
+        Ok(self.inner.lock().volatile.len() as u64)
+    }
+
+    async fn set_file_size(&self, size: u64) -> Result<(), std::io::Error> {
+        let mut inner = self.inner.lock();
+        inner.volatile.resize(size as usize, 0);
+        inner.durable.resize(size as usize, 0);
+        Ok(())
+    }
+}
+
+impl AsyncFile for CrashTestFile {
+    type Buffer = Vec<u8>;
+
+    fn alloc_buffer(&self, len: usize) -> Vec<u8> {
+        vec![0u8; len]
+    }
+
+    async fn read_into(&self, offset: u64, mut buf: Vec<u8>) -> Result<Vec<u8>, std::io::Error> {
+        let inner = self.inner.lock();
+        let offset = offset as usize;
+        let file_len = inner.volatile.len();
+        for (i, byte) in buf.iter_mut().enumerate() {
+            let pos = offset + i;
+            *byte = if pos < file_len {
+                inner.volatile[pos]
+            } else {
+                0
+            };
+        }
+        Ok(buf)
+    }
+
+    async fn write_from(
+        &self,
+        offset: u64,
+        buf: impl Borrow<Vec<u8>> + Send + 'static,
+    ) -> Result<(), std::io::Error> {
+        let buf = buf.borrow();
+        let mut inner = self.inner.lock();
+        let off = offset as usize;
+        let end = off + buf.len();
+        if end > inner.volatile.len() {
+            inner.volatile.resize(end, 0);
+        }
+        inner.volatile[off..end].copy_from_slice(buf.as_ref());
+        Ok(())
+    }
+
+    async fn flush(&self) -> Result<(), std::io::Error> {
+        let mut inner = self.inner.lock();
+        // Copy volatile to durable (all unflushed writes become durable).
+        inner.durable = inner.volatile.clone();
+        inner.flush_count += 1;
+        Ok(())
+    }
+
+    async fn file_size(&self) -> Result<u64, std::io::Error> {
+        // Return volatile size (latest state as seen by reads).
+        Ok(self.inner.lock().volatile.len() as u64)
+    }
+
+    async fn set_file_size(&self, size: u64) -> Result<(), std::io::Error> {
+        let mut inner = self.inner.lock();
+        // File size changes are immediately durable (metadata is sync).
+ inner.volatile.resize(size as usize, 0); + inner.durable.resize(size as usize, 0); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pal_async::async_test; + + #[async_test] + async fn write_then_read() { + let file = InMemoryFile::new(1024); + let data = b"hello, vhdx!"; + file.write_at(100, data).await.unwrap(); + + let mut buf = vec![0u8; data.len()]; + file.read_at(100, &mut buf).await.unwrap(); + assert_eq!(&buf, data); + } + + #[async_test] + async fn read_zeros_on_new_file() { + let file = InMemoryFile::new(256); + let mut buf = vec![0xFFu8; 256]; + file.read_at(0, &mut buf).await.unwrap(); + assert!(buf.iter().all(|&b| b == 0)); + } + + #[async_test] + async fn read_beyond_eof_zero_fills() { + let file = InMemoryFile::new(8); + // Write known data to the entire file. + file.write_at(0, &[1, 2, 3, 4, 5, 6, 7, 8]).await.unwrap(); + + // Read a range that extends 4 bytes past EOF. + let mut buf = vec![0xFFu8; 12]; + file.read_at(0, &mut buf).await.unwrap(); + assert_eq!(&buf[..8], &[1, 2, 3, 4, 5, 6, 7, 8]); + assert_eq!(&buf[8..], &[0, 0, 0, 0]); + + // Read entirely beyond EOF. + let mut buf2 = vec![0xFFu8; 4]; + file.read_at(100, &mut buf2).await.unwrap(); + assert!(buf2.iter().all(|&b| b == 0)); + } + + #[async_test] + async fn write_beyond_eof_grows() { + let file = InMemoryFile::new(4); + assert_eq!(file.file_size().await.unwrap(), 4); + + file.write_at(8, b"hi").await.unwrap(); + assert_eq!(file.file_size().await.unwrap(), 10); + + // Gap between old EOF (4) and write offset (8) should be zeros. + let mut gap = vec![0xFFu8; 4]; + file.read_at(4, &mut gap).await.unwrap(); + assert!(gap.iter().all(|&b| b == 0)); + + // Written data should be present. + let mut buf = vec![0u8; 2]; + file.read_at(8, &mut buf).await.unwrap(); + assert_eq!(&buf, b"hi"); + } + + #[async_test] + async fn set_file_size_grow() { + let file = InMemoryFile::new(4); + file.write_at(0, &[1, 2, 3, 4]).await.unwrap(); + + file.set_file_size(8).await.unwrap(); + assert_eq!(file.file_size().await.unwrap(), 8); + + let mut buf = vec![0xFFu8; 8]; + file.read_at(0, &mut buf).await.unwrap(); + assert_eq!(&buf, &[1, 2, 3, 4, 0, 0, 0, 0]); + } + + #[async_test] + async fn set_file_size_shrink() { + let file = InMemoryFile::new(8); + file.write_at(0, &[1, 2, 3, 4, 5, 6, 7, 8]).await.unwrap(); + + file.set_file_size(4).await.unwrap(); + assert_eq!(file.file_size().await.unwrap(), 4); + + let snapshot = file.snapshot(); + assert_eq!(&snapshot, &[1, 2, 3, 4]); + } + + #[async_test] + async fn file_size_reports_correctly() { + let file = InMemoryFile::new(100); + assert_eq!(file.file_size().await.unwrap(), 100); + + file.set_file_size(200).await.unwrap(); + assert_eq!(file.file_size().await.unwrap(), 200); + + file.set_file_size(50).await.unwrap(); + assert_eq!(file.file_size().await.unwrap(), 50); + } + + #[async_test] + async fn snapshot_returns_copy() { + let file = InMemoryFile::new(4); + file.write_at(0, &[1, 2, 3, 4]).await.unwrap(); + + let snap = file.snapshot(); + assert_eq!(&snap, &[1, 2, 3, 4]); + + // Subsequent write should not affect the snapshot. 
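+        // (snapshot() clones the backing Vec<u8>, so the buffer returned
+        // above cannot alias the file's live contents.)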
+ file.write_at(0, &[9, 9, 9, 9]).await.unwrap(); + assert_eq!(&snap, &[1, 2, 3, 4]); + } + + #[async_test] + async fn failing_interceptor_read() { + let file = InMemoryFile::with_interceptor( + 64, + Arc::new(FailingInterceptor { + fail_reads: true, + fail_writes: false, + fail_flushes: false, + fail_set_file_size: false, + }), + ); + + let mut buf = vec![0u8; 8]; + let result = file.read_at(0, &mut buf).await; + assert!(result.is_err()); + } + + #[async_test] + async fn failing_interceptor_write() { + let file = InMemoryFile::with_interceptor( + 64, + Arc::new(FailingInterceptor { + fail_reads: false, + fail_writes: true, + fail_flushes: false, + fail_set_file_size: false, + }), + ); + + let result = file.write_at(0, &[1, 2, 3, 4]).await; + assert!(result.is_err()); + + // File should not be modified. + let snapshot = file.snapshot(); + assert!(snapshot.iter().all(|&b| b == 0)); + } + + #[async_test] + async fn failing_interceptor_flush() { + let file = InMemoryFile::with_interceptor( + 64, + Arc::new(FailingInterceptor { + fail_reads: false, + fail_writes: false, + fail_flushes: true, + fail_set_file_size: false, + }), + ); + + let result = file.flush().await; + assert!(result.is_err()); + } + + #[async_test] + async fn discard_writes_interceptor() { + let file = InMemoryFile::with_interceptor(8, Arc::new(DiscardWritesInterceptor)); + + // Write should appear to succeed. + file.write_at(0, &[1, 2, 3, 4]).await.unwrap(); + + // But the data should not actually be written. + let mut buf = vec![0xFFu8; 4]; + file.read_at(0, &mut buf).await.unwrap(); + assert!(buf.iter().all(|&b| b == 0)); + } + + #[async_test] + async fn flush_is_noop() { + let file = InMemoryFile::new(64); + file.flush().await.unwrap(); + } +} diff --git a/vm/devices/storage/vhdx/src/tests/trim_tests.rs b/vm/devices/storage/vhdx/src/tests/trim_tests.rs new file mode 100644 index 0000000000..f2cb201ace --- /dev/null +++ b/vm/devices/storage/vhdx/src/tests/trim_tests.rs @@ -0,0 +1,710 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +use crate::AsyncFileExt; +use crate::create::{self, CreateParams}; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::format; +use crate::format::BatEntryState; +use crate::format::MB1; +use crate::io::ReadRange; +use crate::io::WriteRange; +use crate::open::VhdxFile; +use crate::tests::support::InMemoryFile; +use crate::trim::{TrimMode, TrimRequest}; +use pal_async::DefaultDriver; +use pal_async::async_test; + +/// Helper to create a disk and write a full block, returning the VhdxFile. +async fn create_and_write_block( + disk_size: u64, + block_number: u32, + driver: &DefaultDriver, +) -> VhdxFile { + let (file, _) = InMemoryFile::create_test_vhdx(disk_size).await; + let vhdx = VhdxFile::open(file).writable(driver).await.unwrap(); + let block_offset = block_number as u64 * vhdx.block_size() as u64; + let block_size = vhdx.block_size(); + + // Write a full block of data. + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(block_offset, block_size, &mut ranges) + .await + .unwrap(); + + // Perform the writes (we don't actually need to write data for BAT testing). + for range in &ranges { + match range { + WriteRange::Data { + file_offset, + length, + .. 
+ } => { + let buf = vec![0xAA; *length as usize]; + vhdx.file.write_at(*file_offset, &buf).await.unwrap(); + } + WriteRange::Zero { + file_offset, + length, + } => { + let buf = vec![0u8; *length as usize]; + vhdx.file.write_at(*file_offset, &buf).await.unwrap(); + } + } + } + guard.complete().await.unwrap(); + vhdx +} + +/// Helper to verify a block's BAT state. +fn assert_block_state(vhdx: &VhdxFile, block_number: u32, expected: BatEntryState) { + let mapping = vhdx.bat.get_block_mapping(block_number); + let actual = mapping.bat_state(); + assert_eq!( + actual, expected, + "block {block_number}: expected {expected:?}, got {actual:?}" + ); +} + +/// Helper to check if a block has a non-zero file megabyte (soft anchor). +fn block_has_file_offset(vhdx: &VhdxFile, block_number: u32) -> bool { + vhdx.bat.get_block_mapping(block_number).file_megabyte() != 0 +} + +/// Helper to write data to ranges returned by resolve_write. +async fn write_ranges(vhdx: &VhdxFile, ranges: &[WriteRange], pattern: u8) { + for range in ranges { + match range { + WriteRange::Data { + file_offset, + length, + .. + } => { + let buf = vec![pattern; *length as usize]; + vhdx.file.write_at(*file_offset, &buf).await.unwrap(); + } + WriteRange::Zero { + file_offset, + length, + } => { + let buf = vec![0u8; *length as usize]; + vhdx.file.write_at(*file_offset, &buf).await.unwrap(); + } + } + } +} + +// ---- Basic Trim Tests ---- + +#[async_test] +async fn trim_full_block_file_space(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + assert_block_state(&vhdx, 0, BatEntryState::FullyPresent); + assert!(block_has_file_offset(&vhdx, 0)); + + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); + // Soft anchor preserved. + assert!(block_has_file_offset(&vhdx, 0)); +} + +#[async_test] +async fn trim_full_block_free_space(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + vhdx.trim(TrimRequest::new( + TrimMode::FreeSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + + assert_block_state(&vhdx, 0, BatEntryState::Undefined); + // FreeSpace on FullyPresent clears file offset (releases space). + assert!(!block_has_file_offset(&vhdx, 0)); +} + +#[async_test] +async fn trim_full_block_zero(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + vhdx.trim(TrimRequest::new( + TrimMode::Zero, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + + assert_block_state(&vhdx, 0, BatEntryState::Zero); + assert!(!block_has_file_offset(&vhdx, 0)); +} + +#[async_test] +async fn trim_full_block_make_transparent(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + vhdx.trim(TrimRequest::new( + TrimMode::MakeTransparent, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + + assert_block_state(&vhdx, 0, BatEntryState::NotPresent); + assert!(!block_has_file_offset(&vhdx, 0)); +} + +#[async_test] +async fn trim_remove_soft_anchors(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + // First trim with FileSpace to create a soft anchor. + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); + assert!(block_has_file_offset(&vhdx, 0)); + + // Now remove the soft anchor. 
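+    // RemoveSoftAnchors is documented in trim.rs as clearing the stale file offset
+    // without changing the BAT state, so the entry should remain Unmapped afterwards.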
+ vhdx.trim(TrimRequest::new( + TrimMode::RemoveSoftAnchors, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); + assert!(!block_has_file_offset(&vhdx, 0)); +} + +#[async_test] +async fn trim_already_trimmed_idempotent(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); + + // Second trim with FileSpace → no-op. + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); +} + +#[async_test] +async fn trim_undefined_block_file_space_noop(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Block 0 starts as NotPresent on a fresh non-differencing disk. + assert_block_state(&vhdx, 0, BatEntryState::NotPresent); + + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + + // FileSpace is a no-op for NotPresent → should still be NotPresent. + assert_block_state(&vhdx, 0, BatEntryState::NotPresent); +} + +#[async_test] +async fn trim_zero_block_noop(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // First: write and trim to Zero to get a Zero block. + let block_size = vhdx.block_size(); + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + write_ranges(&vhdx, &ranges, 0).await; + guard.complete().await.unwrap(); + + vhdx.trim(TrimRequest::new(TrimMode::Zero, 0, block_size as u64)) + .await + .unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Zero); + + // Second Zero trim → no-op. + vhdx.trim(TrimRequest::new(TrimMode::Zero, 0, block_size as u64)) + .await + .unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Zero); +} + +// ---- Range Tests ---- + +#[async_test] +async fn trim_cross_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let bs = vhdx.block_size(); + + // Write blocks 0, 1, 2. + for block in 0..3u32 { + let offset = block as u64 * bs as u64; + let mut ranges = Vec::new(); + let guard = vhdx.resolve_write(offset, bs, &mut ranges).await.unwrap(); + write_ranges(&vhdx, &ranges, 0xBB).await; + guard.complete().await.unwrap(); + } + + // Trim all 3 blocks at once. + vhdx.trim(TrimRequest::new(TrimMode::FileSpace, 0, 3 * bs as u64)) + .await + .unwrap(); + + for block in 0..3u32 { + assert_block_state(&vhdx, block, BatEntryState::Unmapped); + assert!(block_has_file_offset(&vhdx, block)); + } +} + +#[async_test] +async fn trim_partial_range_skips_edges(driver: DefaultDriver) { + let file = InMemoryFile::new(0); + let bs = MB1 as u32; // Use 1 MiB blocks for easier testing. + let mut params = CreateParams { + disk_size: 4 * MB1, + block_size: bs, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write blocks 0, 1, 2. 
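+    // Writing a full block through resolve_write allocates backing space for it,
+    // so blocks 0-2 should all be FullyPresent before the partial-range trim.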
+ for block in 0..3u32 { + let offset = block as u64 * bs as u64; + let mut ranges = Vec::new(); + let guard = vhdx.resolve_write(offset, bs, &mut ranges).await.unwrap(); + write_ranges(&vhdx, &ranges, 0xCC).await; + guard.complete().await.unwrap(); + } + + // Trim from mid-block-0 through mid-block-2 → only block 1 is trimmed. + let trim_offset = MB1 / 2; // mid-block-0 + let trim_length = 2 * MB1; // covers block 1 fully, partial block 0 and 2 + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + trim_offset, + trim_length, + )) + .await + .unwrap(); + + assert_block_state(&vhdx, 0, BatEntryState::FullyPresent); // partial → not trimmed + assert_block_state(&vhdx, 1, BatEntryState::Unmapped); // fully covered → trimmed + assert_block_state(&vhdx, 2, BatEntryState::FullyPresent); // partial → not trimmed +} + +#[async_test] +async fn trim_entire_disk(driver: DefaultDriver) { + let file = InMemoryFile::new(0); + let bs = MB1 as u32; + let mut params = CreateParams { + disk_size: 4 * MB1, + block_size: bs, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write all 4 blocks. + for block in 0..4u32 { + let offset = block as u64 * bs as u64; + let mut ranges = Vec::new(); + let guard = vhdx.resolve_write(offset, bs, &mut ranges).await.unwrap(); + write_ranges(&vhdx, &ranges, 0xDD).await; + guard.complete().await.unwrap(); + } + + // Trim the entire disk. + vhdx.trim(TrimRequest::new(TrimMode::FileSpace, 0, 4 * MB1)) + .await + .unwrap(); + + for block in 0..4u32 { + assert_block_state(&vhdx, block, BatEntryState::Unmapped); + } +} + +#[async_test] +async fn trim_at_disk_end_rounds_up(driver: DefaultDriver) { + // The disk size may not be an exact multiple of block size. + // If trim range ends exactly at disk_size, we round up. + let file = InMemoryFile::new(0); + let bs = MB1 as u32; + // 3.5 MiB disk with 1 MiB blocks → 4 blocks (last block is partial). + let disk_size = 3 * MB1 + MB1 / 2; + let mut params = CreateParams { + disk_size, + block_size: bs, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write block 3 (the last, partial block). + let block3_offset = 3 * MB1; + // Write less than a full block (only the valid portion). + let write_size = (MB1 / 2) as u32; + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(block3_offset, write_size, &mut ranges) + .await + .unwrap(); + write_ranges(&vhdx, &ranges, 0xEE).await; + guard.complete().await.unwrap(); + + // Trim from block 3 to end of disk. + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + block3_offset, + disk_size - block3_offset, + )) + .await + .unwrap(); + + // Block 3 should be trimmed (disk end rounding kicks in). + assert_block_state(&vhdx, 3, BatEntryState::Unmapped); +} + +// ---- Read-After-Trim Tests ---- + +#[async_test] +async fn read_after_trim_returns_zeros(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + // Verify data is present. + let mut ranges = Vec::new(); + let guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + assert!(matches!(ranges[0], ReadRange::Data { .. })); + drop(guard); + + // Trim. + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + + // Read after trim → zeros. 
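+    // After the trim, resolve_read should describe the range as ReadRange::Zero:
+    // no file I/O is issued and the caller zero-fills the guest buffer itself.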
+ let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + assert_eq!(ranges.len(), 1); + assert!( + matches!(ranges[0], ReadRange::Zero { .. }), + "expected Zero range after trim, got {:?}", + ranges[0] + ); +} + +#[async_test] +async fn trim_then_write_reallocates(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + // Trim with FileSpace (soft anchor). + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); + + // Write again — should reallocate (possibly reusing soft anchor). + let bs = vhdx.block_size(); + let mut ranges = Vec::new(); + let guard = vhdx.resolve_write(0, bs, &mut ranges).await.unwrap(); + write_ranges(&vhdx, &ranges, 0xFF).await; + guard.complete().await.unwrap(); + + assert_block_state(&vhdx, 0, BatEntryState::FullyPresent); +} + +// ---- Fully-Allocated Disk Tests ---- + +#[async_test] +async fn trim_fixed_disk_file_space_noop(driver: DefaultDriver) { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: 4 * MB1, + block_size: MB1 as u32, + is_fully_allocated: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // FileSpace trim on fixed → no-op. + vhdx.trim(TrimRequest::new(TrimMode::FileSpace, 0, 4 * MB1)) + .await + .unwrap(); + + // Blocks should be unchanged. + let mapping = vhdx.bat.get_block_mapping(0); + let state = mapping.bat_state(); + // On a fully-allocated disk, blocks start as Undefined (not yet written). + // The FileSpace mode is a no-op, so they stay the same. + assert_ne!(state, BatEntryState::Unmapped); +} + +#[async_test] +async fn trim_fixed_disk_make_transparent_allowed(driver: DefaultDriver) { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: 4 * MB1, + block_size: MB1 as u32, + is_fully_allocated: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // MakeTransparent on fixed → allowed. + vhdx.trim(TrimRequest::new(TrimMode::MakeTransparent, 0, 4 * MB1)) + .await + .unwrap(); + + // Blocks should be NotPresent (MakeTransparent succeeded). + assert_block_state(&vhdx, 0, BatEntryState::NotPresent); +} + +// ---- Concurrent Safety Tests ---- + +#[async_test] +async fn trim_waits_for_in_flight_read(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + // Acquire a read guard on block 0 to hold its refcount. + let mut ranges = Vec::new(); + let read_guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + // Spawn trim concurrently. It should block until the guard is dropped. + let trim_done = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); + let trim_done2 = trim_done.clone(); + + let (trim_result, _) = futures::join!( + async { + let r = vhdx + .trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await; + trim_done2.store(true, std::sync::atomic::Ordering::SeqCst); + r + }, + async { + // After a yield, drop the read guard. + // The trim should be able to see the refcount eventually. + // Yield to let the trim task run. 
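+            // futures::join! polls its branches in order, so the trim branch has
+            // already been polled and is parked waiting on block 0's refcount by
+            // the time this branch runs.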
+ std::future::poll_fn(|cx| { + cx.waker().wake_by_ref(); + std::task::Poll::Ready(()) + }) + .await; + assert!( + !trim_done.load(std::sync::atomic::Ordering::SeqCst), + "trim should not complete while read guard is held" + ); + drop(read_guard); + } + ); + + trim_result.unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); +} + +#[async_test] +async fn trim_waits_for_in_flight_write(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + // Acquire a write guard on block 0. + let mut ranges = Vec::new(); + let write_guard = vhdx + .resolve_write(0, vhdx.block_size(), &mut ranges) + .await + .unwrap(); + + let trim_done = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); + let trim_done2 = trim_done.clone(); + + let (trim_result, _) = futures::join!( + async { + let r = vhdx + .trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await; + trim_done2.store(true, std::sync::atomic::Ordering::SeqCst); + r + }, + async { + // Yield to let the trim task run. + std::future::poll_fn(|cx| { + cx.waker().wake_by_ref(); + std::task::Poll::Ready(()) + }) + .await; + assert!( + !trim_done.load(std::sync::atomic::Ordering::SeqCst), + "trim should not complete while write guard is held" + ); + // Complete the write so the guard drops after. + write_guard.complete().await.unwrap(); + } + ); + + trim_result.unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); +} + +#[async_test] +async fn trim_concurrent_different_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let bs = vhdx.block_size(); + + // Write blocks 0 and 1. + for block in 0..2u32 { + let offset = block as u64 * bs as u64; + let mut ranges = Vec::new(); + let guard = vhdx.resolve_write(offset, bs, &mut ranges).await.unwrap(); + write_ranges(&vhdx, &ranges, 0xAA).await; + guard.complete().await.unwrap(); + } + + // Trim block 0, read block 1 concurrently. + let (trim_result, read_result) = futures::join!( + vhdx.trim(TrimRequest::new(TrimMode::FileSpace, 0, bs as u64)), + async { + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_read(bs as u64, 4096, &mut ranges) + .await + .unwrap(); + let result = ranges.clone(); + drop(guard); + result + } + ); + + trim_result.unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); + assert!(matches!(read_result[0], ReadRange::Data { .. 
})); +} + +// ---- Validation Tests ---- + +#[async_test] +async fn trim_read_only_fails() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let result = vhdx + .trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await; + assert!(matches!( + result, + Err(VhdxIoError(VhdxIoErrorInner::ReadOnly)) + )); +} + +#[async_test] +async fn trim_unaligned_offset_fails(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let result = vhdx + .trim(TrimRequest::new(TrimMode::FileSpace, 1, 512)) + .await; + assert!(matches!( + result, + Err(VhdxIoError(VhdxIoErrorInner::UnalignedIo)) + )); +} + +#[async_test] +async fn trim_beyond_disk_fails(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let result = vhdx + .trim(TrimRequest::new( + TrimMode::FileSpace, + format::GB1 - 512, + 1024, + )) + .await; + assert!(matches!( + result, + Err(VhdxIoError(VhdxIoErrorInner::BeyondEndOfDisk)) + )); +} + +#[async_test] +async fn trim_beyond_disk_ok_with_skip(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // With skip_disk_size_check, goes beyond but computes no included blocks → ok. + let result = vhdx + .trim( + TrimRequest::new(TrimMode::FileSpace, format::GB1 - 512, 1024) + .skip_disk_size_check(true), + ) + .await; + assert!(result.is_ok()); +} + +#[async_test] +async fn trim_zero_length_noop(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let result = vhdx.trim(TrimRequest::new(TrimMode::FileSpace, 0, 0)).await; + assert!(result.is_ok()); +} diff --git a/vm/devices/storage/vhdx/src/trim.rs b/vm/devices/storage/vhdx/src/trim.rs new file mode 100644 index 0000000000..7ad7962967 --- /dev/null +++ b/vm/devices/storage/vhdx/src/trim.rs @@ -0,0 +1,578 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! VHDX block trim (unmap) support. +//! +//! Implements the `VhdxFile::trim()` method that transitions blocks to +//! unmapped states, releasing file space back to the free pool or +//! soft-anchoring it for later reuse. + +use crate::AsyncFile; +use crate::bat::BlockMapping; +use crate::bat::BlockType; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::format::BatEntryState; +use crate::format::MB1; +use crate::header::WriteMode; +use crate::open::VhdxFile; + +/// Trim mode determining the target block state. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TrimMode { + /// Move blocks to the Unmapped (trimmed) state, keeping file offset + /// as a soft anchor for potential reuse. + /// + /// Denied for Undefined blocks: a block that was never written + /// should stay Undefined (preserves backup semantics — backup tools + /// skip Undefined blocks, but not Unmapped ones). + FileSpace, + + /// Move blocks to the Undefined state. Soft anchor may be kept or + /// cleared depending on the original state. + FreeSpace, + + /// Move blocks to the Zero state, clearing the file offset. 
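+    /// Reads of a Zero block return zeros directly, without consulting the data
+    /// region or (for differencing disks) the parent.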
+ Zero, + + /// Move blocks to the NotPresent (transparent) state, clearing the + /// file offset. For differencing disks, reads fall through to parent. + /// + /// Allowed on fully-allocated (fixed) disks. + MakeTransparent, + + /// Remove soft anchors from trimmed/undefined blocks without changing + /// their state. Clears file_megabyte if the block is soft-anchored. + /// + /// Allowed on fully-allocated (fixed) disks. Does not change data + /// content, so DataWriteGuid is not updated. + RemoveSoftAnchors, +} + +/// Builder for a trim operation on a VHDX file. +/// +/// Created via [`VhdxFile::trim`]. Required parameters (`mode`, `offset`, +/// `length`) are provided at construction; optional flags default to the +/// safe/common values and can be overridden with builder methods. +#[derive(Debug, Clone)] +pub struct TrimRequest { + mode: TrimMode, + offset: u64, + length: u64, + skip_disk_size_check: bool, + skip_write_guid_change: bool, +} + +impl TrimRequest { + /// Create a new trim request. + /// + /// * `mode` - Determines the target block state. + /// * `offset` - Virtual disk byte offset (must be sector-aligned). + /// * `length` - Length in bytes (must be sector-aligned). + pub fn new(mode: TrimMode, offset: u64, length: u64) -> Self { + Self { + mode, + offset, + length, + skip_disk_size_check: false, + skip_write_guid_change: false, + } + } + + /// Skip bounds checking against the virtual disk size. + pub fn skip_disk_size_check(mut self, skip: bool) -> Self { + self.skip_disk_size_check = skip; + self + } + + /// Don't update DataWriteGuid when trimming. + pub fn skip_write_guid_change(mut self, skip: bool) -> Self { + self.skip_write_guid_change = skip; + self + } +} + +/// Returns true if the given trim mode is allowed on fully-allocated (fixed) disks. +fn mode_allowed_on_fixed(mode: TrimMode) -> bool { + matches!( + mode, + TrimMode::MakeTransparent | TrimMode::RemoveSoftAnchors + ) +} + +/// Returns true if this trim mode should skip the DataWriteGuid update. +fn mode_skips_write_guid(mode: TrimMode) -> bool { + matches!(mode, TrimMode::RemoveSoftAnchors) +} + +/// Convert a block mapping according to the trim mode. +/// +/// Returns the new mapping, which may be identical to `old` (no-op). +fn convert_mapping(mode: TrimMode, old: BlockMapping) -> BlockMapping { + let state = old.bat_state(); + match mode { + TrimMode::FileSpace => convert_file_space(state, old), + TrimMode::FreeSpace => convert_free_space(state, old), + TrimMode::Zero => convert_zero(state, old), + TrimMode::MakeTransparent => convert_make_transparent(state, old), + TrimMode::RemoveSoftAnchors => convert_remove_soft_anchors(old), + } +} + +/// FileSpace: FullyPresent/PartiallyPresent → Unmapped (keep soft anchor). +/// All other states are no-ops. +fn convert_file_space(state: BatEntryState, old: BlockMapping) -> BlockMapping { + match state { + BatEntryState::FullyPresent | BatEntryState::PartiallyPresent => { + BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(old.file_megabyte()) // keep as soft anchor + } + _ => old, // NotPresent, Undefined, Zero, Unmapped → no change + } +} + +/// FreeSpace: FullyPresent/PartiallyPresent → Undefined (clear offset, release space). +/// Zero → Undefined (clear offset). +/// Unmapped → Undefined (keep soft anchor). +/// Others → no change. 
+fn convert_free_space(state: BatEntryState, old: BlockMapping) -> BlockMapping { + match state { + BatEntryState::FullyPresent | BatEntryState::PartiallyPresent => { + // Release space — clear file offset. + BlockMapping::new() + .with_bat_state(BatEntryState::Undefined) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(0) + } + BatEntryState::Zero => BlockMapping::new() + .with_bat_state(BatEntryState::Undefined) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(0), + BatEntryState::Unmapped => { + // Keep soft anchor if present. + BlockMapping::new() + .with_bat_state(BatEntryState::Undefined) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(old.file_megabyte()) + } + _ => old, // NotPresent, Undefined → no change + } +} + +/// Zero: any state → Zero (clear file offset). +fn convert_zero(state: BatEntryState, old: BlockMapping) -> BlockMapping { + match state { + BatEntryState::Zero if old.file_megabyte() == 0 => old, // already Zero with no offset + _ => { + debug_assert!( + !old.transitioning_to_fully_present(), + "cannot trim TFP block to Zero" + ); + BlockMapping::new() + .with_bat_state(BatEntryState::Zero) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(0) + } + } +} + +/// MakeTransparent: any state → NotPresent (clear file offset). +fn convert_make_transparent(state: BatEntryState, old: BlockMapping) -> BlockMapping { + match state { + BatEntryState::NotPresent if old.file_megabyte() == 0 => old, // already NotPresent + _ => BlockMapping::new() + .with_bat_state(BatEntryState::NotPresent) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(0), + } +} + +/// RemoveSoftAnchors: clear file offset if soft-anchored, otherwise no-op. +fn convert_remove_soft_anchors(old: BlockMapping) -> BlockMapping { + if old.is_soft_anchored() { + BlockMapping::new() + .with_bat_state(old.bat_state()) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(0) + } else { + old + } +} + +/// Compute the block range fully included in a byte range. +/// +/// Returns `(start_block, block_count)`. Only blocks whose entire extent +/// falls within `[offset..offset+length)` are included. Leading and +/// trailing partial blocks are skipped. +fn included_blocks(offset: u64, length: u64, block_size: u64) -> (u32, u32) { + if length == 0 { + return (0, 0); + } + // First fully-included block: round UP to next block boundary. + let start = offset.div_ceil(block_size) as u32; + // First block NOT included: round DOWN. + let end = ((offset + length) / block_size) as u32; + if end <= start { + (start, 0) + } else { + (start, end - start) + } +} + +impl VhdxFile { + /// Trim (unmap) a range of virtual disk blocks. + /// + /// Transitions blocks to unmapped/zero/transparent state depending on + /// the mode specified in `request`. Only blocks fully covered by the + /// range are trimmed. + /// + /// # Errors + /// + /// Returns an error if: + /// - The file is read-only + /// - Offset or length is not aligned to the logical sector size + /// - The range extends beyond the disk size (unless `skip_disk_size_check`) + /// - The file is in a permanently failed state + pub async fn trim(&self, request: TrimRequest) -> Result<(), VhdxIoError> { + self.failed.check()?; + + let TrimRequest { + mode, + offset, + length, + skip_disk_size_check, + skip_write_guid_change, + } = request; + + // 1. Check read-only. + if self.read_only { + return Err(VhdxIoErrorInner::ReadOnly.into()); + } + + // 2. 
Zero-length — immediate success. + if length == 0 { + return Ok(()); + } + + // 3. Validate alignment to logical sector size. + if !offset.is_multiple_of(self.logical_sector_size as u64) + || !length.is_multiple_of(self.logical_sector_size as u64) + { + return Err(VhdxIoErrorInner::UnalignedIo.into()); + } + + // 4. Validate bounds (unless skipped). + if !skip_disk_size_check { + if offset + .checked_add(length) + .is_none_or(|end| end > self.disk_size) + { + return Err(VhdxIoErrorInner::BeyondEndOfDisk.into()); + } + } + + // 5. If fully-allocated (fixed) disk and mode doesn't allow it: no-op. + if self.is_fully_allocated() && !mode_allowed_on_fixed(mode) { + return Ok(()); + } + + // 6. Enable write mode. + // All trim modes modify the file (BAT entries), so FileWritable + // is always needed. DataWritable is additionally needed when the + // mode changes user-visible data (everything except + // RemoveSoftAnchors) and the caller hasn't opted out. + if !skip_write_guid_change && !mode_skips_write_guid(mode) { + self.enable_write_mode(WriteMode::DataWritable) + .await + .map_err(VhdxIoErrorInner::WriteHeader)?; + } else { + self.enable_write_mode(WriteMode::FileWritable) + .await + .map_err(VhdxIoErrorInner::WriteHeader)?; + } + + // 7. Compute effective length: if trim extends to exactly disk_size, + // round up to cover the full last block. + let effective_length = if !skip_disk_size_check && offset + length == self.disk_size { + let block_size = self.block_size as u64; + let full_disk_size = crate::create::round_up(self.disk_size, block_size); + full_disk_size - offset + } else { + length + }; + + // 8. Compute included blocks. + let (start_block, block_count) = + included_blocks(offset, effective_length, self.block_size as u64); + if block_count == 0 { + return Ok(()); + } + let end_block = start_block + block_count; + + // 9. Main trim loop. + // + // For each block, we atomically claim it (CAS 0 → SENTINEL), + // preventing any new I/O from reading stale mappings. Then we + // read + convert the mapping, write the BAT, handle space + // management, and release the claim. + let mut current_block = start_block; + loop { + if current_block >= end_block { + return Ok(()); + } + + // 9a. Claim the block: set trim-pending to block new I/O, + // wait for in-flight I/Os to drain, then take exclusive + // ownership. + let claim = self.bat.claim_for_trim(current_block).await; + + // 9b. Block is claimed — no new I/O can start on it. + // Read the mapping and compute the trim conversion. + let old_mapping = self.bat.get_block_mapping(current_block); + let new_mapping = convert_mapping(mode, old_mapping); + + if old_mapping == new_mapping { + // No-op — release claim and advance. + current_block += 1; + continue; + } + + // 9c. Write BAT entry to cache (also updates in-memory BAT atomically). + // LOCK AUDIT: Trim claim held (not a sync lock). Safe to await. + self.bat + .write_block_mapping( + &self.cache, + BlockType::Payload, + current_block, + new_mapping, + None, + ) + .await?; + + // 9e. Handle space management based on old→new transition. + // + // Space releases are deferred until the BAT change is durable + // on disk. Without deferral, a crash could teleport data from + // a new block into the old block's offset. 
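+            // Concretely: the freed megabyte is only handed back to the allocator
+            // once flush() has persisted the new BAT entry, so a stale on-disk
+            // entry can never point at space that has been reassigned elsewhere.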
+ let old_anchored = old_mapping.is_soft_anchored(); + let new_anchored = new_mapping.is_soft_anchored(); + let old_file_mb = old_mapping.file_megabyte(); + let new_file_mb = new_mapping.file_megabyte(); + let old_file_offset = old_file_mb as u64 * MB1; + let block_size = self.block_size; + + if old_anchored && new_anchored { + // Same anchor — assert same file offset, no space management. + debug_assert_eq!(old_file_mb, new_file_mb); + } else if old_anchored && !new_anchored { + // Was soft-anchored → no longer: unmark/cancel + defer release. + let was_deferred = self.deferred_releases.cancel(current_block); + if !was_deferred { + assert!( + self.free_space.unmark_trimmed_block( + current_block, + old_file_offset, + block_size, + ), + "soft-anchored block {current_block} not tracked as trimmed" + ); + } + self.deferred_releases + .insert(current_block, old_file_offset, block_size, false); + } else if !old_anchored && new_anchored { + // Was not anchored → now soft-anchored: defer the anchor. + self.deferred_releases + .insert(current_block, old_file_offset, block_size, true); + } else { + // Neither was nor becomes anchored. + if old_file_mb != 0 { + self.deferred_releases.insert( + current_block, + old_file_offset, + block_size, + false, + ); + } + } + + // 9f. Release the trim claim — I/O can resume on this block. + drop(claim); + + // Quota check: force flush if too many deferred releases. + if self.deferred_releases.needs_flush() { + self.flush().await?; + } + + current_block += 1; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // ---- included_blocks unit tests ---- + + #[test] + fn included_blocks_full_coverage() { + // Range exactly covers blocks 0..3 (3 blocks). + let block_size = 2 * MB1; + let (start, count) = included_blocks(0, 3 * block_size, block_size); + assert_eq!(start, 0); + assert_eq!(count, 3); + } + + #[test] + fn included_blocks_partial_edges() { + // Start mid-block-0, end mid-block-2 → only block 1 included. + let block_size = 2 * MB1; + let (start, count) = included_blocks(MB1, 2 * block_size, block_size); + assert_eq!(start, 1); // block 0 is partial + assert_eq!(count, 1); // only block 1 fully covered + } + + #[test] + fn included_blocks_zero_length() { + let (start, count) = included_blocks(0, 0, 2 * MB1); + assert_eq!(start, 0); + assert_eq!(count, 0); + } + + #[test] + fn included_blocks_too_small() { + // Range is less than one block → no blocks included. 
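+        // start = ceil(1 MiB / 2 MiB) = 1 and end = floor((1 MiB + 1 MiB) / 2 MiB) = 1,
+        // so end <= start and the returned count is 0.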
+ let block_size = 2 * MB1; + let (_start, count) = included_blocks(MB1, MB1, block_size); + assert_eq!(count, 0); + } + + // ---- Conversion function unit tests ---- + + #[test] + fn convert_file_space_mappings() { + // FullyPresent → Unmapped (keep offset) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_file_megabyte(4); + let r = convert_mapping(TrimMode::FileSpace, m); + assert_eq!(r.bat_state(), BatEntryState::Unmapped); + assert_eq!(r.file_megabyte(), 4); + + // Undefined → Undefined (no change) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::Undefined) + .with_file_megabyte(0); + let r = convert_mapping(TrimMode::FileSpace, m); + assert_eq!(r, m); + + // Unmapped → Unmapped (no change) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_file_megabyte(5); + let r = convert_mapping(TrimMode::FileSpace, m); + assert_eq!(r, m); + } + + #[test] + fn convert_free_space_mappings() { + // FullyPresent → Undefined (clear offset) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_file_megabyte(4); + let r = convert_mapping(TrimMode::FreeSpace, m); + assert_eq!(r.bat_state(), BatEntryState::Undefined); + assert_eq!(r.file_megabyte(), 0); + + // Unmapped → Undefined (keep anchor) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_file_megabyte(5); + let r = convert_mapping(TrimMode::FreeSpace, m); + assert_eq!(r.bat_state(), BatEntryState::Undefined); + assert_eq!(r.file_megabyte(), 5); + } + + #[test] + fn convert_zero_mappings() { + // FullyPresent → Zero (clear offset) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_file_megabyte(4); + let r = convert_mapping(TrimMode::Zero, m); + assert_eq!(r.bat_state(), BatEntryState::Zero); + assert_eq!(r.file_megabyte(), 0); + + // Zero (no offset) → Zero (no change) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::Zero) + .with_file_megabyte(0); + let r = convert_mapping(TrimMode::Zero, m); + assert_eq!(r, m); + } + + #[test] + fn convert_make_transparent_mappings() { + // FullyPresent → NotPresent + let m = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_file_megabyte(4); + let r = convert_mapping(TrimMode::MakeTransparent, m); + assert_eq!(r.bat_state(), BatEntryState::NotPresent); + assert_eq!(r.file_megabyte(), 0); + + // NotPresent → NotPresent (no change) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::NotPresent) + .with_file_megabyte(0); + let r = convert_mapping(TrimMode::MakeTransparent, m); + assert_eq!(r, m); + } + + #[test] + fn convert_remove_soft_anchors_mappings() { + // Unmapped with offset → clear offset + let m = BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_file_megabyte(5); + let r = convert_mapping(TrimMode::RemoveSoftAnchors, m); + assert_eq!(r.bat_state(), BatEntryState::Unmapped); + assert_eq!(r.file_megabyte(), 0); + + // FullyPresent → no change (not soft-anchored) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_file_megabyte(4); + let r = convert_mapping(TrimMode::RemoveSoftAnchors, m); + assert_eq!(r, m); + } + + #[test] + fn is_soft_anchored_checks() { + // Unmapped with offset → anchored + let m = BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_file_megabyte(5); + assert!(m.is_soft_anchored()); + + // Undefined with offset → anchored + let m = BlockMapping::new() + 
.with_bat_state(BatEntryState::Undefined) + .with_file_megabyte(3); + assert!(m.is_soft_anchored()); + + // Unmapped with no offset → not anchored + let m = BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_file_megabyte(0); + assert!(!m.is_soft_anchored()); + + // FullyPresent with offset → not anchored (wrong state) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_file_megabyte(4); + assert!(!m.is_soft_anchored()); + } +} diff --git a/vm/devices/storage/vhdx/tests/native_cross_validation.rs b/vm/devices/storage/vhdx/tests/native_cross_validation.rs new file mode 100644 index 0000000000..a71566d4b9 --- /dev/null +++ b/vm/devices/storage/vhdx/tests/native_cross_validation.rs @@ -0,0 +1,1920 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Native API cross-validation smoke tests. +//! +//! These tests exercise the first interaction between the Rust VHDX parser +//! and the Windows native VHD stack. They are deliberately limited in scope +//! to surface format-level bugs that need to be diagnosed and fixed before +//! writing a full test suite. +//! +//! **All tests are gated with `#[cfg(windows)]`.** +//! +//! ## Format bugs discovered and fixed +//! +//! (Updated as bugs are found during cross-validation.) + +#![cfg(windows)] +// UNSAFETY: Windows FFI calls for virtual disk APIs and raw disk I/O. +#![expect(unsafe_code)] + +use pal_async::DefaultDriver; +use parking_lot::Mutex; +use std::borrow::Borrow; +use std::io; +use std::path::Path; +use std::sync::Arc; +use vhdx::AsyncFile; +use vhdx::ReadRange; +use vhdx::TrimMode; +use vhdx::TrimRequest; +use vhdx::WriteRange; + +use windows::Win32::Foundation::CloseHandle; +use windows::Win32::Foundation::HANDLE; +use windows::Win32::Storage::FileSystem::ReadFile; +use windows::Win32::Storage::FileSystem::WriteFile; +use windows::Win32::Storage::Vhd::ATTACH_VIRTUAL_DISK_FLAG; +use windows::Win32::Storage::Vhd::ATTACH_VIRTUAL_DISK_FLAG_NO_DRIVE_LETTER; +use windows::Win32::Storage::Vhd::ATTACH_VIRTUAL_DISK_FLAG_NO_LOCAL_HOST; +use windows::Win32::Storage::Vhd::AttachVirtualDisk; +use windows::Win32::Storage::Vhd::CREATE_VIRTUAL_DISK_FLAG_NONE; +use windows::Win32::Storage::Vhd::CREATE_VIRTUAL_DISK_PARAMETERS; +use windows::Win32::Storage::Vhd::CREATE_VIRTUAL_DISK_VERSION_2; +use windows::Win32::Storage::Vhd::CreateVirtualDisk; +use windows::Win32::Storage::Vhd::DetachVirtualDisk; +use windows::Win32::Storage::Vhd::OPEN_VIRTUAL_DISK_FLAG_NONE; +use windows::Win32::Storage::Vhd::OPEN_VIRTUAL_DISK_PARAMETERS; +use windows::Win32::Storage::Vhd::OPEN_VIRTUAL_DISK_VERSION_2; +use windows::Win32::Storage::Vhd::OpenVirtualDisk; +use windows::Win32::Storage::Vhd::VIRTUAL_DISK_ACCESS_MASK; +use windows::Win32::Storage::Vhd::VIRTUAL_STORAGE_TYPE; +use windows::Win32::System::IO::GetOverlappedResult; +use windows::Win32::System::IO::OVERLAPPED; +use windows::Win32::System::Threading::CreateEventW; +use windows::core::PCWSTR; + +// --------------------------------------------------------------------- +// StdFile — blocking AsyncFile adapter for integration tests +// --------------------------------------------------------------------- + +/// Blocking `AsyncFile` impl backed by `std::fs::File`. +/// Suitable for tests only — all operations block the current thread. 
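+/// A real consumer would supply a genuinely asynchronous `AsyncFile`; this
+/// adapter simply funnels every operation through a mutex-guarded `std::fs::File`.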
+struct StdFile {
+    file: Mutex<std::fs::File>,
+}
+
+impl StdFile {
+    fn open(path: &Path, read_only: bool) -> io::Result<Self> {
+        let file = std::fs::OpenOptions::new()
+            .read(true)
+            .write(!read_only)
+            .open(path)?;
+        Ok(Self {
+            file: Mutex::new(file),
+        })
+    }
+
+    fn create(path: &Path) -> io::Result<Self> {
+        let file = std::fs::OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create(true)
+            .truncate(true)
+            .open(path)?;
+        Ok(Self {
+            file: Mutex::new(file),
+        })
+    }
+}
+
+impl AsyncFile for StdFile {
+    type Buffer = Vec<u8>;
+
+    fn alloc_buffer(&self, len: usize) -> Vec<u8> {
+        vec![0u8; len]
+    }
+
+    async fn read_into(&self, offset: u64, mut buf: Vec<u8>) -> Result<Vec<u8>, io::Error> {
+        use std::io::Read;
+        use std::io::Seek;
+        use std::io::SeekFrom;
+        let mut file = self.file.lock();
+        file.seek(SeekFrom::Start(offset))?;
+        file.read_exact(buf.as_mut())?;
+        Ok(buf)
+    }
+
+    async fn write_from(
+        &self,
+        offset: u64,
+        buf: impl Borrow<Vec<u8>> + Send + 'static,
+    ) -> Result<(), io::Error> {
+        use std::io::Seek;
+        use std::io::SeekFrom;
+        use std::io::Write;
+        let buf = buf.borrow();
+        let mut file = self.file.lock();
+        file.seek(SeekFrom::Start(offset))?;
+        file.write_all(buf.as_ref())
+    }
+
+    async fn flush(&self) -> Result<(), io::Error> {
+        use std::io::Write;
+        let mut file = self.file.lock();
+        file.flush()
+    }
+
+    async fn file_size(&self) -> Result<u64, io::Error> {
+        let file = self.file.lock();
+        file.metadata().map(|m| m.len())
+    }
+
+    async fn set_file_size(&self, size: u64) -> Result<(), io::Error> {
+        let file = self.file.lock();
+        file.set_len(size)
+    }
+}
+
+// ---------------------------------------------------------------------
+// Windows Virtual Disk Type Constants
+// ---------------------------------------------------------------------
+
+const VIRTUAL_STORAGE_TYPE_DEVICE_VHDX: u32 = 3;
+
+// Microsoft vendor GUID: {EC984AEC-A0F9-47e9-901F-71415A66345B}
+const VIRTUAL_STORAGE_TYPE_VENDOR_MICROSOFT: windows::core::GUID = windows::core::GUID {
+    data1: 0xEC984AEC,
+    data2: 0xA0F9,
+    data3: 0x47e9,
+    data4: [0x90, 0x1F, 0x71, 0x41, 0x5A, 0x66, 0x34, 0x5B],
+};
+
+// ---------------------------------------------------------------------
+// Path helper
+// ---------------------------------------------------------------------
+
+fn to_wide(path: &Path) -> Vec<u16> {
+    use std::os::windows::ffi::OsStrExt;
+    path.as_os_str()
+        .encode_wide()
+        .chain(std::iter::once(0))
+        .collect()
+}
+
+// ---------------------------------------------------------------------
+// NativeVhdx — RAII wrapper around Windows virtual disk APIs
+// ---------------------------------------------------------------------
+
+struct NativeVhdx {
+    handle: HANDLE,
+    attached: bool,
+}
+
+impl NativeVhdx {
+    /// Create a new dynamic VHDX via CreateVirtualDisk.
+    fn create_dynamic(path: &Path, size_bytes: u64, block_size: u32, sector_size: u32) -> Self {
+        let storage_type = VIRTUAL_STORAGE_TYPE {
+            DeviceId: VIRTUAL_STORAGE_TYPE_DEVICE_VHDX,
+            VendorId: VIRTUAL_STORAGE_TYPE_VENDOR_MICROSOFT,
+        };
+
+        let wide = to_wide(path);
+
+        let mut params = CREATE_VIRTUAL_DISK_PARAMETERS {
+            Version: CREATE_VIRTUAL_DISK_VERSION_2,
+            ..Default::default()
+        };
+        params.Anonymous.Version2.MaximumSize = size_bytes;
+        params.Anonymous.Version2.BlockSizeInBytes = block_size;
+        params.Anonymous.Version2.SectorSizeInBytes = sector_size;
+
+        let mut handle = HANDLE::default();
+
+        // SAFETY: All parameters are correctly initialized, wide path is
+        // null-terminated, and handle is written by the API on success.
+ let result = unsafe { + CreateVirtualDisk( + &storage_type, + PCWSTR(wide.as_ptr()), + VIRTUAL_DISK_ACCESS_MASK(0), + None, + CREATE_VIRTUAL_DISK_FLAG_NONE, + 0, + ¶ms, + None, + &mut handle, + ) + }; + assert!(result.is_ok(), "CreateVirtualDisk failed: {result:?}"); + + NativeVhdx { + handle, + attached: false, + } + } + + /// Open an existing VHDX via OpenVirtualDisk. + fn open(path: &Path, _read_only: bool) -> Self { + let storage_type = VIRTUAL_STORAGE_TYPE { + DeviceId: VIRTUAL_STORAGE_TYPE_DEVICE_VHDX, + VendorId: VIRTUAL_STORAGE_TYPE_VENDOR_MICROSOFT, + }; + + let wide = to_wide(path); + + let params = OPEN_VIRTUAL_DISK_PARAMETERS { + Version: OPEN_VIRTUAL_DISK_VERSION_2, + ..Default::default() + }; + + let mut handle = HANDLE::default(); + + // SAFETY: All parameters are correctly initialized, wide path is + // null-terminated, and handle is written by the API on success. + let result = unsafe { + OpenVirtualDisk( + &storage_type, + PCWSTR(wide.as_ptr()), + VIRTUAL_DISK_ACCESS_MASK(0), + OPEN_VIRTUAL_DISK_FLAG_NONE, + Some(¶ms), + &mut handle, + ) + }; + assert!(result.is_ok(), "OpenVirtualDisk failed: {result:?}"); + + NativeVhdx { + handle, + attached: false, + } + } + + /// Attach with NO_LOCAL_HOST for raw byte-level I/O. + /// With NO_LOCAL_HOST, no PhysicalDrive device is surfaced — instead, + /// ReadFile/WriteFile work directly on the virtual disk handle. + fn attach_raw(&mut self) -> RawDiskHandle { + let flags = ATTACH_VIRTUAL_DISK_FLAG( + ATTACH_VIRTUAL_DISK_FLAG_NO_LOCAL_HOST.0 | ATTACH_VIRTUAL_DISK_FLAG_NO_DRIVE_LETTER.0, + ); + + // SAFETY: Handle is valid (from Create/OpenVirtualDisk). Flags are valid. + let result = unsafe { AttachVirtualDisk(self.handle, None, flags, 0, None, None) }; + assert!(result.is_ok(), "AttachVirtualDisk failed: {result:?}"); + self.attached = true; + + // With NO_LOCAL_HOST the virtual disk handle itself supports + // ReadFile/WriteFile at virtual-disk offsets. No PhysicalDrive path. + RawDiskHandle { + handle: self.handle, + owned: false, + } + } +} + +impl Drop for NativeVhdx { + fn drop(&mut self) { + if self.attached { + // SAFETY: Handle is valid and was successfully attached. + let _ = unsafe { DetachVirtualDisk(self.handle, Default::default(), 0) }; + self.attached = false; + } + if !self.handle.is_invalid() { + // SAFETY: Handle is valid (from Create/OpenVirtualDisk). + let _ = unsafe { CloseHandle(self.handle) }; + } + } +} + +// --------------------------------------------------------------------- +// RawDiskHandle — read/write at byte offsets on attached virtual disk +// --------------------------------------------------------------------- + +struct RawDiskHandle { + handle: HANDLE, + /// Whether this handle is owned (should be closed on drop). + /// When borrowed from NativeVhdx (NO_LOCAL_HOST attach), this is false. + owned: bool, +} + +impl RawDiskHandle { + /// Read `buf.len()` bytes from the raw disk at the given byte offset. + /// Offset and length must be sector-aligned (multiples of 512). + fn read_at(&self, offset: u64, buf: &mut [u8]) -> io::Result { + // SAFETY: Creating a manual-reset event for overlapped I/O. 
+ let event = unsafe { CreateEventW(None, true, false, None) } + .map_err(|e| io::Error::from_raw_os_error(e.code().0))?; + let mut overlapped: OVERLAPPED = Default::default(); + overlapped.Anonymous.Anonymous.Offset = (offset & 0xFFFF_FFFF) as u32; + overlapped.Anonymous.Anonymous.OffsetHigh = (offset >> 32) as u32; + overlapped.hEvent = event; + + let mut bytes_read = 0u32; + // SAFETY: Handle is valid, buf is valid for buf.len() bytes, + // overlapped is correctly initialized with event and offset. + let result = unsafe { + ReadFile( + self.handle, + Some(buf), + Some(&mut bytes_read), + Some(&mut overlapped), + ) + }; + match result { + Ok(()) => {} + Err(e) if e.code() == windows::Win32::Foundation::ERROR_IO_PENDING.into() => { + // ERROR_IO_PENDING — wait for completion. + // SAFETY: Handle and overlapped are valid; bWait=true blocks. + unsafe { GetOverlappedResult(self.handle, &overlapped, &mut bytes_read, true) } + .map_err(|e| io::Error::from_raw_os_error(e.code().0))?; + } + Err(e) => { + // SAFETY: Event handle is valid. + let _ = unsafe { CloseHandle(event) }; + return Err(io::Error::from_raw_os_error(e.code().0)); + } + } + // SAFETY: Event handle is valid. + let _ = unsafe { CloseHandle(event) }; + Ok(bytes_read as usize) + } + + /// Write `data.len()` bytes to the raw disk at the given byte offset. + /// Offset and length must be sector-aligned (multiples of 512). + fn write_at(&self, offset: u64, data: &[u8]) -> io::Result { + // SAFETY: Creating a manual-reset event for overlapped I/O. + let event = unsafe { CreateEventW(None, true, false, None) } + .map_err(|e| io::Error::from_raw_os_error(e.code().0))?; + let mut overlapped: OVERLAPPED = Default::default(); + overlapped.Anonymous.Anonymous.Offset = (offset & 0xFFFF_FFFF) as u32; + overlapped.Anonymous.Anonymous.OffsetHigh = (offset >> 32) as u32; + overlapped.hEvent = event; + + let mut bytes_written = 0u32; + // SAFETY: Handle is valid, data is valid for data.len() bytes, + // overlapped is correctly initialized with event and offset. + let result = unsafe { + WriteFile( + self.handle, + Some(data), + Some(&mut bytes_written), + Some(&mut overlapped), + ) + }; + match result { + Ok(()) => {} + Err(e) if e.code() == windows::Win32::Foundation::ERROR_IO_PENDING.into() => { + // ERROR_IO_PENDING — wait for completion. + // SAFETY: Handle and overlapped are valid; bWait=true blocks. + unsafe { GetOverlappedResult(self.handle, &overlapped, &mut bytes_written, true) } + .map_err(|e| io::Error::from_raw_os_error(e.code().0))?; + } + Err(e) => { + // SAFETY: Event handle is valid. + let _ = unsafe { CloseHandle(event) }; + return Err(io::Error::from_raw_os_error(e.code().0)); + } + } + // SAFETY: Event handle is valid. + let _ = unsafe { CloseHandle(event) }; + Ok(bytes_written as usize) + } +} + +impl Drop for RawDiskHandle { + fn drop(&mut self) { + if self.owned && !self.handle.is_invalid() { + // SAFETY: Handle is valid and owned by this struct. + let _ = unsafe { CloseHandle(self.handle) }; + } + } +} + +// --------------------------------------------------------------------- +// RustVhdx — helper wrapping the Rust VHDX API for test scenarios +// --------------------------------------------------------------------- + +struct RustVhdx { + vhdx: vhdx::VhdxFile, + /// Separate file handle for data I/O (shared backing path). 
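+    /// The vhdx crate only resolves virtual offsets into file ranges; the test
+    /// issues the actual data reads and writes itself through this handle.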
+    io_file: Arc<StdFile>,
+}
+
+impl RustVhdx {
+    async fn create(path: &Path, disk_size: u64, block_size: u32, driver: &DefaultDriver) -> Self {
+        let file = StdFile::create(path).expect("create backing file");
+        let mut params = vhdx::CreateParams {
+            disk_size,
+            block_size,
+            ..Default::default()
+        };
+        vhdx::create(&file, &mut params).await.expect("vhdx create");
+        drop(file);
+
+        // Re-open for use with log task.
+        Self::open(path, false, Some(driver)).await
+    }
+
+    async fn open(path: &Path, read_only: bool, driver: Option<&DefaultDriver>) -> Self {
+        let file = StdFile::open(path, read_only).expect("open backing file");
+        let io_file = Arc::new(StdFile::open(path, read_only).expect("open io file"));
+        let vhdx = if read_only {
+            vhdx::VhdxFile::open(file)
+                .read_only()
+                .await
+                .expect("vhdx open")
+        } else {
+            let driver = driver.expect("writable open requires a driver/spawner");
+            vhdx::VhdxFile::open(file)
+                .writable(driver)
+                .await
+                .expect("vhdx open_writable")
+        };
+        RustVhdx { vhdx, io_file }
+    }
+
+    /// Read data at a virtual offset. Returns a Vec of `len` bytes.
+    async fn read_data(&self, offset: u64, len: u32) -> Vec<u8> {
+        let mut ranges = Vec::new();
+        let guard = self
+            .vhdx
+            .resolve_read(offset, len, &mut ranges)
+            .await
+            .expect("resolve_read");
+
+        let mut result = vec![0u8; len as usize];
+
+        for range in &ranges {
+            match range {
+                ReadRange::Data {
+                    guest_offset,
+                    length,
+                    file_offset,
+                } => {
+                    let buf_offset = (*guest_offset - offset) as usize;
+                    let buf_len = *length as usize;
+                    self.io_file
+                        .read_at(*file_offset, &mut result[buf_offset..buf_offset + buf_len])
+                        .await
+                        .expect("read data from file");
+                }
+                ReadRange::Zero { .. } | ReadRange::Unmapped { .. } => {
+                    // Already zero-initialized.
+                }
+            }
+        }
+
+        drop(guard);
+        result
+    }
+
+    /// Write data at a virtual offset.
+    async fn write_data(&self, offset: u64, data: &[u8]) {
+        let mut ranges = Vec::new();
+        let guard = self
+            .vhdx
+            .resolve_write(offset, data.len() as u32, &mut ranges)
+            .await
+            .expect("resolve_write");
+
+        for range in &ranges {
+            match range {
+                WriteRange::Data {
+                    guest_offset,
+                    length,
+                    file_offset,
+                } => {
+                    let buf_offset = (*guest_offset - offset) as usize;
+                    let buf_len = *length as usize;
+                    self.io_file
+                        .write_at(*file_offset, &data[buf_offset..buf_offset + buf_len])
+                        .await
+                        .expect("write data to file");
+                }
+                WriteRange::Zero {
+                    file_offset,
+                    length,
+                } => {
+                    let zeros = vec![0u8; *length as usize];
+                    self.io_file
+                        .write_at(*file_offset, &zeros)
+                        .await
+                        .expect("zero-fill file range");
+                }
+            }
+        }
+
+        guard.complete().await.expect("write complete");
+    }
+
+    /// Flush the VHDX file.
+    async fn flush(&self) {
+        self.vhdx.flush().await.expect("flush");
+    }
+
+    /// Trim a range of the virtual disk.
+    async fn trim_range(&self, offset: u64, length: u64) {
+        self.vhdx
+            .trim(TrimRequest::new(TrimMode::Zero, offset, length))
+            .await
+            .expect("trim");
+    }
+
+    /// Close the VHDX (consume self).
+    async fn close(self) {
+        self.vhdx.close().await.expect("close");
+    }
+
+    /// Abort (crash) the VHDX — drops without clean close, leaving a dirty log.
+    async fn abort(self) {
+        self.vhdx.abort().await;
+    }
+}
+
+// =====================================================================
+// Test Data Pattern
+// =====================================================================
+
+/// Generate a test pattern for a given offset: the pattern byte
+/// is derived from the offset so each location has unique data.
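+/// A mismatch between what one implementation wrote and what the other reads
+/// back therefore shows up as a byte-level difference rather than a missing block.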
+fn test_pattern(offset: u64, len: usize) -> Vec<u8> {
+    (0..len)
+        .map(|i| ((offset as usize + i) % 251) as u8) // prime modulus so the pattern never repeats on power-of-two boundaries
+        .collect()
+}
+
+// =====================================================================
+// Test Cases
+// =====================================================================
+
+/// Test 1: Native-Create → Rust-Open (Metadata Check)
+///
+/// Native creates a dynamic VHDX (1 GiB) → close → Rust opens → verify
+/// disk geometry matches.
+#[pal_async::async_test]
+async fn native_create_rust_open_metadata() {
+    let dir = tempfile::tempdir().unwrap();
+    let vhdx_path = dir.path().join("test.vhdx");
+
+    // Native create: 1 GiB, default block/sector sizes (pass 0 for defaults).
+    {
+        let _native = NativeVhdx::create_dynamic(&vhdx_path, 1024 * 1024 * 1024, 0, 0);
+        // Drop closes the handle.
+    }
+
+    // Rust open and verify metadata.
+    let rust = RustVhdx::open(&vhdx_path, true, None).await;
+
+    // Native defaults: 1 GiB disk, typically 32 MiB block size, 512-byte logical sectors.
+    assert_eq!(rust.vhdx.disk_size(), 1024 * 1024 * 1024, "disk_size");
+    // The native default block size is typically 32 MiB, but may vary.
+    // Just assert it's a power of 2 and > 0.
+    let block_size = rust.vhdx.block_size();
+    assert!(block_size > 0 && block_size.is_power_of_two(), "block_size");
+    // Sector sizes: native defaults to 512 logical, 4096 physical.
+    assert_eq!(rust.vhdx.logical_sector_size(), 512, "logical_sector_size");
+    assert_eq!(
+        rust.vhdx.physical_sector_size(),
+        4096,
+        "physical_sector_size"
+    );
+
+    rust.close().await;
+}
+
+/// Test 2: Rust-Create → Native-Open (Open Succeeds)
+///
+/// Rust creates a dynamic VHDX (1 GiB) → close → native OpenVirtualDisk
+/// succeeds.
+#[pal_async::async_test]
+async fn rust_create_native_open(driver: DefaultDriver) {
+    let dir = tempfile::tempdir().unwrap();
+    let vhdx_path = dir.path().join("test.vhdx");
+
+    // Rust create: 1 GiB, 2 MiB block size (Rust default), 512-byte sectors.
+    {
+        let rust = RustVhdx::create(&vhdx_path, 1024 * 1024 * 1024, 0, &driver).await;
+        rust.close().await;
+    }
+
+    // Native open — this is the most likely test to fail.
+    let _native = NativeVhdx::open(&vhdx_path, true);
+    // If we get here, the native stack accepted the Rust-created file.
+}
+
+/// Test 3: Rust-Create → Native-Attach → Raw-Read Zeros
+///
+/// Rust creates a small dynamic VHDX (4 MiB, 2 MiB blocks) → close →
+/// native opens → attach → raw-read first sector → verify all zeros.
+#[pal_async::async_test]
+async fn rust_create_native_attach_read_zeros(driver: DefaultDriver) {
+    let dir = tempfile::tempdir().unwrap();
+    let vhdx_path = dir.path().join("test.vhdx");
+
+    // Rust create: 4 MiB disk, 2 MiB block size.
+    {
+        let rust = RustVhdx::create(&vhdx_path, 4 * 1024 * 1024, 2 * 1024 * 1024, &driver).await;
+        rust.flush().await;
+        rust.close().await;
+    }
+
+    // Native open + attach.
+    let mut native = NativeVhdx::open(&vhdx_path, false);
+    let raw = native.attach_raw();
+
+    // Read the first sector (512 bytes) at offset 0.
+    let mut buf = vec![0xCCu8; 512];
+    let bytes_read = raw.read_at(0, &mut buf).expect("raw read at offset 0");
+    assert_eq!(bytes_read, 512, "expected 512 bytes read");
+
+    // A freshly-created, never-written VHDX should return all zeros.
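+    // (Block 0 was never written, so its BAT entry should still be unallocated and
+    // the native stack is expected to synthesize zeros for it.)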
+ assert!(buf.iter().all(|&b| b == 0), "first sector should be zeros"); +} + +/// Test 4: Native-Create → Native-Write → Rust-Read (Data) +/// +/// Native creates dynamic VHDX (1 GiB, default sizes) → attach → write +/// known patterns at 3 offsets across different blocks → detach → close → +/// Rust opens → reads at each offset → data matches. +#[pal_async::async_test] +async fn native_create_rust_read_data() { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + // Determine block size after native create (typically 32 MiB). + let block_size: u64; + + // Native create + write. + { + let mut native = NativeVhdx::create_dynamic(&vhdx_path, 1024 * 1024 * 1024, 0, 0); + let raw = native.attach_raw(); + + // We need to know the block size to write across blocks. + // Native defaults to 32 MiB blocks. + block_size = 32 * 1024 * 1024; + + let offsets = [0u64, block_size, 2 * block_size]; + for &off in &offsets { + let pattern = test_pattern(off, 512); + let written = raw.write_at(off, &pattern).expect("native write"); + assert_eq!(written, 512); + } + // Drop detaches and closes. + } + + // Rust open + read + verify. + let rust = RustVhdx::open(&vhdx_path, true, None).await; + + let offsets = [0u64, block_size, 2 * block_size]; + for &off in &offsets { + let expected = test_pattern(off, 512); + let actual = rust.read_data(off, 512).await; + assert_eq!(actual, expected, "data mismatch at offset {off:#x}"); + } + + rust.close().await; +} + +/// Test 5: Native-Create → Rust-Read (Custom 32 MiB Block Size) +/// +/// Native creates with explicit 32 MiB block size → Rust opens → +/// `block_size()` == 32 MiB. +#[pal_async::async_test] +async fn native_create_custom_block_size() { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + { + let _native = + NativeVhdx::create_dynamic(&vhdx_path, 1024 * 1024 * 1024, 32 * 1024 * 1024, 0); + } + + let rust = RustVhdx::open(&vhdx_path, true, None).await; + assert_eq!( + rust.vhdx.block_size(), + 33554432, + "block_size should be 32 MiB" + ); + rust.close().await; +} + +/// Test 6: Native-Create → Rust-Read (4K Logical Sector) +/// +/// Native creates with 4096 logical sector size → Rust opens → +/// `logical_sector_size()` == 4096. +#[pal_async::async_test] +async fn native_create_4k_sector() { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + { + let _native = NativeVhdx::create_dynamic(&vhdx_path, 1024 * 1024 * 1024, 0, 4096); + } + + let rust = RustVhdx::open(&vhdx_path, true, None).await; + assert_eq!( + rust.vhdx.logical_sector_size(), + 4096, + "logical_sector_size should be 4096" + ); + rust.close().await; +} + +/// Test 7: Rust-Create → Native-Read (Data) +/// +/// Rust creates + writes data at multiple offsets across block boundaries → +/// flush → close → native opens → attach → raw-read at each offset → +/// data matches. +#[pal_async::async_test] +async fn rust_create_native_read_data(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; // 2 MiB + + // Rust create + write. + { + let rust = RustVhdx::create(&vhdx_path, 32 * 1024 * 1024, block_size as u32, &driver).await; + + // Write to 3 different blocks (blocks 1, 3, 5 — skip block 0 since test 3 uses it). 
+ let offsets = [block_size, 3 * block_size, 5 * block_size]; + for &off in &offsets { + let pattern = test_pattern(off, 512); + rust.write_data(off, &pattern).await; + } + rust.flush().await; + rust.close().await; + } + + // Native open + attach + read + verify. + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + let offsets = [block_size, 3 * block_size, 5 * block_size]; + for &off in &offsets { + let expected = test_pattern(off, 512); + let mut buf = vec![0u8; 512]; + let bytes_read = raw.read_at(off, &mut buf).expect("native read"); + assert_eq!(bytes_read, 512); + assert_eq!(buf, expected, "data mismatch at offset {off:#x}"); + } +} + +/// Test 8: Rust-Create → Native-Open (Various Block Sizes) +/// +/// Rust creates VHDX files with 2 MiB, 4 MiB, and 32 MiB block sizes → +/// native opens each → open succeeds without error. +#[pal_async::async_test] +async fn rust_create_various_block_sizes(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let block_sizes: &[u32] = &[2 * 1024 * 1024, 4 * 1024 * 1024, 32 * 1024 * 1024]; + + for &bs in block_sizes { + let name = format!("test_bs_{bs}.vhdx"); + let vhdx_path = dir.path().join(&name); + + { + let rust = RustVhdx::create(&vhdx_path, 64 * 1024 * 1024, bs, &driver).await; + rust.close().await; + } + + let _native = NativeVhdx::open(&vhdx_path, true); + // If we get here, the native stack accepted the file. + } +} + +/// Test 9: Interleaved — Native-Write Then Rust-Write +/// +/// Native creates → attach → write region A (offset 0) → detach → close → +/// Rust opens → writes region B (second block) → flush → close → +/// native opens → attach → reads both regions → both intact. +#[pal_async::async_test] +async fn interleaved_native_then_rust(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 32 * 1024 * 1024; // native default + + // Step 1: Native create + write region A at offset 0. + { + let mut native = NativeVhdx::create_dynamic(&vhdx_path, 1024 * 1024 * 1024, 0, 0); + let raw = native.attach_raw(); + let pattern_a = test_pattern(0, 512); + let written = raw.write_at(0, &pattern_a).expect("native write region A"); + assert_eq!(written, 512); + } + + // Step 2: Rust opens → writes region B at block_size offset. + { + let rust = RustVhdx::open(&vhdx_path, false, Some(&driver)).await; + let pattern_b = test_pattern(block_size, 512); + rust.write_data(block_size, &pattern_b).await; + rust.flush().await; + rust.close().await; + } + + // Step 3: Native opens → reads both regions → verifies. + { + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + let expected_a = test_pattern(0, 512); + let mut buf_a = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf_a).expect("read region A"); + assert_eq!(bytes, 512); + assert_eq!(buf_a, expected_a, "region A corrupted"); + + let expected_b = test_pattern(block_size, 512); + let mut buf_b = vec![0u8; 512]; + let bytes = raw.read_at(block_size, &mut buf_b).expect("read region B"); + assert_eq!(bytes, 512); + assert_eq!(buf_b, expected_b, "region B corrupted"); + } +} + +/// Test 10: Interleaved — Rust-Write Then Native-Write +/// +/// Rust creates → writes blocks 0, 2, 4 → flush → close → +/// native opens → attach → writes blocks 1, 3 → detach → close → +/// Rust opens → reads all blocks → all data intact. 
+#[pal_async::async_test] +async fn interleaved_rust_then_native(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; // 2 MiB + + // Step 1: Rust create + write blocks 0, 2, 4. + let rust_offsets = [0u64, 2 * block_size, 4 * block_size]; + { + let rust = RustVhdx::create(&vhdx_path, 32 * 1024 * 1024, block_size as u32, &driver).await; + for &off in &rust_offsets { + rust.write_data(off, &test_pattern(off, 512)).await; + } + rust.flush().await; + rust.close().await; + } + + // Step 2: Native opens → writes blocks 1, 3. + let native_offsets = [block_size, 3 * block_size]; + { + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + for &off in &native_offsets { + let pattern = test_pattern(off, 512); + let written = raw.write_at(off, &pattern).expect("native write"); + assert_eq!(written, 512); + } + } + + // Step 3: Rust opens → reads all blocks → verifies. + { + let rust = RustVhdx::open(&vhdx_path, true, None).await; + + for &off in rust_offsets.iter().chain(native_offsets.iter()) { + let expected = test_pattern(off, 512); + let actual = rust.read_data(off, 512).await; + assert_eq!(actual, expected, "data mismatch at offset {off:#x}"); + } + + rust.close().await; + } +} + +/// Test 11: Three-Way Round-Trip +/// +/// Rust creates → writes block 0 → flush → close → native opens → attach → +/// writes block 1 → detach → close → Rust opens → reads blocks 0 and 1 → +/// both correct. +#[pal_async::async_test] +async fn three_way_round_trip(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Step 1: Rust creates and writes block 0. + { + let rust = RustVhdx::create(&vhdx_path, 16 * 1024 * 1024, block_size as u32, &driver).await; + rust.write_data(0, &test_pattern(0, 512)).await; + rust.flush().await; + rust.close().await; + } + + // Step 2: Native opens and writes block 1. + { + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + let pattern = test_pattern(block_size, 512); + let written = raw + .write_at(block_size, &pattern) + .expect("native write block 1"); + assert_eq!(written, 512); + } + + // Step 3: Rust opens → reads blocks 0 and 1 → verifies. + { + let rust = RustVhdx::open(&vhdx_path, true, None).await; + + let data0 = rust.read_data(0, 512).await; + assert_eq!(data0, test_pattern(0, 512), "block 0 data mismatch"); + + let data1 = rust.read_data(block_size, 512).await; + assert_eq!( + data1, + test_pattern(block_size, 512), + "block 1 data mismatch" + ); + + rust.close().await; + } +} + +/// Test 12: Trim — Rust-Trim → Native-Read +/// +/// Rust creates small disk (4 MiB, 2 MiB blocks) → writes all blocks → +/// trims block 1 → flush → close → native opens → attach → +/// raw-read block 0 (data intact) → raw-read block 1 (zeros). +#[pal_async::async_test] +async fn trim_rust_trim_native_read(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Rust create + write both blocks + trim block 1. + { + let rust = RustVhdx::create(&vhdx_path, 4 * 1024 * 1024, block_size as u32, &driver).await; + + // Write block 0 and block 1. 
+        rust.write_data(0, &test_pattern(0, 512)).await;
+        rust.write_data(block_size, &test_pattern(block_size, 512))
+            .await;
+        rust.flush().await;
+
+        // Trim block 1 entirely.
+        rust.trim_range(block_size, block_size).await;
+        rust.flush().await;
+        rust.close().await;
+    }
+
+    // Native open + attach + verify.
+    let mut native = NativeVhdx::open(&vhdx_path, false);
+    let raw = native.attach_raw();
+
+    // Block 0 should still have data.
+    let mut buf0 = vec![0u8; 512];
+    let bytes = raw.read_at(0, &mut buf0).expect("read block 0");
+    assert_eq!(bytes, 512);
+    assert_eq!(buf0, test_pattern(0, 512), "block 0 should be intact");
+
+    // Block 1 should be zeros after trim.
+    let mut buf1 = vec![0u8; 512];
+    let bytes = raw.read_at(block_size, &mut buf1).expect("read block 1");
+    assert_eq!(bytes, 512);
+    assert!(
+        buf1.iter().all(|&b| b == 0),
+        "block 1 should be zeros after trim"
+    );
+}
+
+/// Test 13: Trim — Native-Write → Rust-Trim → Native-Read
+///
+/// Rust creates (small block size) → native opens → attach → writes blocks
+/// 0 and 1 → detach → close → Rust opens → trims block 1 → flush → close →
+/// native opens → attach → raw-read block 0 (intact) → raw-read block 1
+/// (zeros).
+#[pal_async::async_test]
+async fn trim_native_write_rust_trim_native_read(driver: DefaultDriver) {
+    let dir = tempfile::tempdir().unwrap();
+    let vhdx_path = dir.path().join("test.vhdx");
+
+    // Native default block size is 32 MiB. Use a smaller Rust-created disk
+    // so trim covers a full block efficiently.
+    let block_size: u64 = 2 * 1024 * 1024;
+
+    // Step 1: Rust creates to control block size, then close.
+    {
+        let rust = RustVhdx::create(&vhdx_path, 8 * 1024 * 1024, block_size as u32, &driver).await;
+        rust.close().await;
+    }
+
+    // Step 2: Native writes blocks 0 and 1.
+    {
+        let mut native = NativeVhdx::open(&vhdx_path, false);
+        let raw = native.attach_raw();
+
+        let written = raw
+            .write_at(0, &test_pattern(0, 512))
+            .expect("native write block 0");
+        assert_eq!(written, 512);
+
+        let written = raw
+            .write_at(block_size, &test_pattern(block_size, 512))
+            .expect("native write block 1");
+        assert_eq!(written, 512);
+    }
+
+    // Step 3: Rust opens → trims block 1 → flush → close.
+    {
+        let rust = RustVhdx::open(&vhdx_path, false, Some(&driver)).await;
+        rust.trim_range(block_size, block_size).await;
+        rust.flush().await;
+        rust.close().await;
+    }
+
+    // Step 4: Native opens → reads → verifies.
+    {
+        let mut native = NativeVhdx::open(&vhdx_path, false);
+        let raw = native.attach_raw();
+
+        // Block 0 intact.
+        let mut buf0 = vec![0u8; 512];
+        let bytes = raw.read_at(0, &mut buf0).expect("read block 0");
+        assert_eq!(bytes, 512);
+        assert_eq!(buf0, test_pattern(0, 512), "block 0 should be intact");
+
+        // Block 1 zeros.
+        let mut buf1 = vec![0u8; 512];
+        let bytes = raw.read_at(block_size, &mut buf1).expect("read block 1");
+        assert_eq!(bytes, 512);
+        assert!(
+            buf1.iter().all(|&b| b == 0),
+            "block 1 should be zeros after trim"
+        );
+    }
+}
+
+// =====================================================================
+// Differencing Disk Helpers
+// =====================================================================
+
+impl NativeVhdx {
+    /// Create a differencing VHDX child (parent must already exist).
+    fn create_differencing(path: &Path, parent_path: &Path) -> Self {
+        let storage_type = VIRTUAL_STORAGE_TYPE {
+            DeviceId: VIRTUAL_STORAGE_TYPE_DEVICE_VHDX,
+            VendorId: VIRTUAL_STORAGE_TYPE_VENDOR_MICROSOFT,
+        };
+
+        let wide = to_wide(path);
+        let parent_wide = to_wide(parent_path);
+
+        let mut params = CREATE_VIRTUAL_DISK_PARAMETERS {
+            Version: CREATE_VIRTUAL_DISK_VERSION_2,
+            ..Default::default()
+        };
+        // ParentPath tells CreateVirtualDisk to create a differencing child.
+        // MaximumSize, BlockSizeInBytes, and SectorSizeInBytes are inherited
+        // from the parent (set to 0 / left default).
+        params.Anonymous.Version2.ParentPath = PCWSTR(parent_wide.as_ptr());
+
+        let mut handle = HANDLE::default();
+
+        // SAFETY: All parameters are correctly initialized, paths are
+        // null-terminated, and handle is written by the API on success.
+        // `parent_wide` is alive for the duration of this call.
+        let result = unsafe {
+            CreateVirtualDisk(
+                &storage_type,
+                PCWSTR(wide.as_ptr()),
+                VIRTUAL_DISK_ACCESS_MASK(0),
+                None,
+                CREATE_VIRTUAL_DISK_FLAG_NONE,
+                0,
+                &params,
+                None,
+                &mut handle,
+            )
+        };
+        assert!(
+            result.is_ok(),
+            "CreateVirtualDisk (differencing) failed: {result:?}"
+        );
+
+        NativeVhdx {
+            handle,
+            attached: false,
+        }
+    }
+}
+
+impl RustVhdx {
+    /// Create a differencing VHDX via the Rust API (`has_parent: true`).
+    ///
+    /// No parent locator is written — this is sufficient for Rust-only
+    /// chained reads but NOT for native-open.
+    async fn create_diff(
+        path: &Path,
+        disk_size: u64,
+        block_size: u32,
+        driver: &DefaultDriver,
+    ) -> Self {
+        let file = StdFile::create(path).expect("create backing file");
+        let mut params = vhdx::CreateParams {
+            disk_size,
+            block_size,
+            has_parent: true,
+            ..Default::default()
+        };
+        vhdx::create(&file, &mut params)
+            .await
+            .expect("vhdx create diff");
+        drop(file);
+
+        Self::open(path, false, Some(driver)).await
+    }
+}
+
+/// Read data from a child, resolving Unmapped ranges from the parent.
+///
+/// For each `ReadRange::Unmapped` in the child's read resolution,
+/// reads the corresponding range from the parent. `Data` and `Zero`
+/// ranges are handled normally from the child.
+async fn chained_read(child: &RustVhdx, parent: &RustVhdx, offset: u64, len: u32) -> Vec<u8> {
+    let mut ranges = Vec::new();
+    let guard = child
+        .vhdx
+        .resolve_read(offset, len, &mut ranges)
+        .await
+        .expect("child resolve_read");
+
+    let mut result = vec![0u8; len as usize];
+
+    for range in &ranges {
+        match range {
+            ReadRange::Data {
+                guest_offset,
+                length,
+                file_offset,
+            } => {
+                let buf_offset = (*guest_offset - offset) as usize;
+                let buf_len = *length as usize;
+                child
+                    .io_file
+                    .read_at(*file_offset, &mut result[buf_offset..buf_offset + buf_len])
+                    .await
+                    .expect("read child data");
+            }
+            ReadRange::Zero {
+                guest_offset,
+                length,
+            } => {
+                // Already zero-initialized in result.
+                let _ = (guest_offset, length);
+            }
+            ReadRange::Unmapped {
+                guest_offset,
+                length,
+            } => {
+                // Fall through to parent.
+ let parent_data = parent.read_data(*guest_offset, *length).await; + let buf_offset = (*guest_offset - offset) as usize; + result[buf_offset..buf_offset + parent_data.len()].copy_from_slice(&parent_data); + } + } + } + + drop(guard); + result +} + +// ===================================================================== +// Differencing Disk Test Cases +// ===================================================================== + +/// Test 14: Rust-Only Chained Read — Unwritten Child +/// +/// Rust creates parent + writes data → Rust creates diff child → +/// child read returns zeros (Unmapped) → chained_read falls through +/// to parent → data matches. +#[pal_async::async_test] +async fn diff_rust_chained_read_unwritten_child(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let parent_path = dir.path().join("parent.vhdx"); + let child_path = dir.path().join("child.vhdx"); + + let block_size: u32 = 2 * 1024 * 1024; + let disk_size: u64 = 4 * 1024 * 1024; + + // Step 1: Rust-create parent, write test_pattern at offset 0. + { + let parent = RustVhdx::create(&parent_path, disk_size, block_size, &driver).await; + parent.write_data(0, &test_pattern(0, 512)).await; + parent.flush().await; + parent.close().await; + } + + // Step 2: Rust-create diff child (has_parent: true). + let child = RustVhdx::create_diff(&child_path, disk_size, block_size, &driver).await; + + // Step 3: child.read_data returns zeros (Unmapped treated as zero). + let child_data = child.read_data(0, 512).await; + assert!( + child_data.iter().all(|&b| b == 0), + "unwritten child should return zeros" + ); + + // Step 4: chained_read falls through to parent. + let parent = RustVhdx::open(&parent_path, true, None).await; + let chained = chained_read(&child, &parent, 0, 512).await; + assert_eq!( + chained, + test_pattern(0, 512), + "chained read should return parent data" + ); + + // Step 5: Verify child is a differencing disk. + assert!(child.vhdx.has_parent(), "child should have has_parent set"); + + child.close().await; + parent.close().await; +} + +/// Test 15: Rust-Only Chained Read — Partial Block Write +/// +/// Rust creates parent + writes 2 sectors → Rust creates diff child → +/// writes 1 sector to child with different data → chained_read returns +/// child data for written sector, parent data for unwritten sector. +/// +/// This exercises PartiallyPresent block handling: the Rust write allocates +/// the block as PartiallyPresent (not FullyPresent), and the sector bitmap +/// tracks which sectors are present in the child vs. transparent to parent. +#[pal_async::async_test] +async fn diff_rust_chained_read_partial_block(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let parent_path = dir.path().join("parent.vhdx"); + let child_path = dir.path().join("child.vhdx"); + + let block_size: u32 = 2 * 1024 * 1024; + let disk_size: u64 = 4 * 1024 * 1024; + + // Step 1: Rust-create parent, write 2 sectors at offset 0. + { + let parent = RustVhdx::create(&parent_path, disk_size, block_size, &driver).await; + parent.write_data(0, &test_pattern(0, 1024)).await; + parent.flush().await; + parent.close().await; + } + + // Step 2: Rust-create diff child. + let child = RustVhdx::create_diff(&child_path, disk_size, block_size, &driver).await; + + // Step 3: Write only sector 0 in child with a distinguishable pattern. + // The block should become PartiallyPresent with SBM bit 0 set. 
+ let child_pattern = vec![0xAA; 512]; + child.write_data(0, &child_pattern).await; + child.flush().await; + + // Step 4: chained_read should return child data for sector 0, + // parent data for sector 1. + let parent = RustVhdx::open(&parent_path, true, None).await; + let chained = chained_read(&child, &parent, 0, 1024).await; + + // Sector 0 (bytes 0..512): from child → [0xAA; 512] + assert_eq!( + &chained[..512], + &child_pattern[..], + "sector 0 should come from child" + ); + // Sector 1 (bytes 512..1024): from parent → test_pattern(512, 512) + assert_eq!( + &chained[512..1024], + &test_pattern(512, 512)[..], + "sector 1 should come from parent" + ); + + child.close().await; + parent.close().await; +} + +/// Test 16: Native-Create Diff → Rust Reads +/// +/// Native creates parent + writes data → native creates diff child → +/// writes different data to child block 0 → Rust opens child → reads +/// child data for block 0 + Unmapped for block 1 → chained_read resolves +/// parent data for block 1. +#[pal_async::async_test] +async fn diff_native_create_rust_reads() { + let dir = tempfile::tempdir().unwrap(); + let parent_path = dir.path().join("parent.vhdx"); + let child_path = dir.path().join("child.vhdx"); + + // Native default: 32 MiB blocks. + let block_size: u64 = 32 * 1024 * 1024; + + // Step 1: Native-create parent (1 GiB). + // Write test_pattern at offset 0 and offset block_size. + { + let mut native = NativeVhdx::create_dynamic(&parent_path, 1024 * 1024 * 1024, 0, 0); + let raw = native.attach_raw(); + + let written = raw + .write_at(0, &test_pattern(0, 512)) + .expect("write parent block 0"); + assert_eq!(written, 512); + + let written = raw + .write_at(block_size, &test_pattern(block_size, 512)) + .expect("write parent block 1"); + assert_eq!(written, 512); + } + + // Step 2: Native-create differencing child. + // Write [0xBB; 512] at offset 0 (overwrites parent's block 0). + { + let mut native = NativeVhdx::create_differencing(&child_path, &parent_path); + let raw = native.attach_raw(); + + let child_data = vec![0xBBu8; 512]; + let written = raw.write_at(0, &child_data).expect("write child block 0"); + assert_eq!(written, 512); + } + + // Step 3: Rust opens child (read-only). + let child = RustVhdx::open(&child_path, true, None).await; + + // Block 0, sector 0: child has data → should be [0xBB; 512]. + let data_block0 = child.read_data(0, 512).await; + assert_eq!( + data_block0, + vec![0xBBu8; 512], + "child block 0 sector 0 should be 0xBB" + ); + + // Block 1: Unmapped in child → read_data returns zeros. + let data_block1 = child.read_data(block_size, 512).await; + assert!( + data_block1.iter().all(|&b| b == 0), + "child block 1 should be zeros (Unmapped)" + ); + + // Step 4: Rust opens parent (read-only). + let parent = RustVhdx::open(&parent_path, true, None).await; + + // Verify parent block 1 data directly. + let parent_block1 = parent.read_data(block_size, 512).await; + assert_eq!( + parent_block1, + test_pattern(block_size, 512), + "parent block 1 should have original data" + ); + + // Step 5: chained_read for block 1 → falls through to parent. 
+ let chained = chained_read(&child, &parent, block_size, 512).await; + assert_eq!( + chained, + test_pattern(block_size, 512), + "chained read block 1 should return parent data" + ); + + child.close().await; + parent.close().await; +} + +/// Test 17: Native-Create Diff → Rust Reads Empty Child +/// +/// Native creates parent + writes data → native creates diff child → +/// no writes to child → Rust reads child → all Unmapped → chained read +/// falls through to parent. +#[pal_async::async_test] +async fn diff_native_create_empty_child_rust_reads() { + let dir = tempfile::tempdir().unwrap(); + let parent_path = dir.path().join("parent.vhdx"); + let child_path = dir.path().join("child.vhdx"); + + // Step 1: Native-create parent (1 GiB), write data at offset 0. + { + let mut native = NativeVhdx::create_dynamic(&parent_path, 1024 * 1024 * 1024, 0, 0); + let raw = native.attach_raw(); + + let written = raw + .write_at(0, &test_pattern(0, 512)) + .expect("write parent"); + assert_eq!(written, 512); + } + + // Step 2: Native-create differencing child (no writes). + { + let _native = NativeVhdx::create_differencing(&child_path, &parent_path); + } + + // Step 3: Rust opens child. + let child = RustVhdx::open(&child_path, true, None).await; + + // Child has_parent should be true. + assert!(child.vhdx.has_parent(), "child should be a diff disk"); + + // read_data returns zeros (Unmapped). + let child_data = child.read_data(0, 512).await; + assert!( + child_data.iter().all(|&b| b == 0), + "empty child should return zeros" + ); + + // Step 4: Rust opens parent; chained_read falls through. + let parent = RustVhdx::open(&parent_path, true, None).await; + let chained = chained_read(&child, &parent, 0, 512).await; + assert_eq!( + chained, + test_pattern(0, 512), + "chained read should return parent data" + ); + + child.close().await; + parent.close().await; +} + +/// Test 18: Rust Writes to Native-Created Diff +/// +/// Native creates parent → writes data at offsets 0 and 512 → native creates +/// diff child → Rust opens child writable → writes sector 0 with different +/// data → close → native opens child (with parent chain) → attach → +/// raw-read → child data present at sector 0, parent data for sector 1 +/// (unwritten in child, falls through via native chain and SBM resolution). +#[pal_async::async_test] +async fn diff_rust_writes_to_native_diff(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let parent_path = dir.path().join("parent.vhdx"); + let child_path = dir.path().join("child.vhdx"); + + // Step 1: Native-create parent (1 GiB), write data at offsets 0 and 512. + { + let mut native = NativeVhdx::create_dynamic(&parent_path, 1024 * 1024 * 1024, 0, 0); + let raw = native.attach_raw(); + + let written = raw + .write_at(0, &test_pattern(0, 512)) + .expect("write parent sector 0"); + assert_eq!(written, 512); + + let written = raw + .write_at(512, &test_pattern(512, 512)) + .expect("write parent sector 1"); + assert_eq!(written, 512); + } + + // Step 2: Native-create diff child (no writes yet). + { + let _native = NativeVhdx::create_differencing(&child_path, &parent_path); + } + + // Step 3: Rust opens child writable, writes only sector 0. + // The block should become PartiallyPresent with SBM bit 0 set. + { + let child = RustVhdx::open(&child_path, false, Some(&driver)).await; + let child_data = vec![0xCCu8; 512]; + child.write_data(0, &child_data).await; + child.flush().await; + child.close().await; + } + + // Step 4: Native opens child (chain resolves automatically). 
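+    // (The child carries a parent locator written by CreateVirtualDisk, so the
+    // native stack can resolve the parent chain without being told the path.)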
+ // Sector 0: from child (SBM bit set) → [0xCC; 512] + // Sector 1: from parent (SBM bit clear, falls through) → test_pattern(512, 512) + { + let mut native = NativeVhdx::open(&child_path, false); + let raw = native.attach_raw(); + + let mut buf0 = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf0).expect("read child sector 0"); + assert_eq!(bytes, 512); + assert_eq!(buf0, vec![0xCCu8; 512], "sector 0 should be child's data"); + + let mut buf1 = vec![0u8; 512]; + let bytes = raw.read_at(512, &mut buf1).expect("read child sector 1"); + assert_eq!(bytes, 512); + assert_eq!( + buf1, + test_pattern(512, 512), + "sector 1 should come from parent via chain" + ); + } +} + +/// Test 19: Rust Writes + Trims in Diff Child +/// +/// Rust-create parent → write data to blocks 0 and 1 → native-create diff +/// child → Rust writes to child blocks 0 and 1 → Rust trims block 1 → +/// native reads → block 0 has child data, block 1 is zeros. +#[pal_async::async_test] +async fn diff_rust_writes_and_trims(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let parent_path = dir.path().join("parent.vhdx"); + let child_path = dir.path().join("child.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Step 1: Rust-create parent (to control block size), write blocks 0 and 1. + { + let parent = + RustVhdx::create(&parent_path, 8 * 1024 * 1024, block_size as u32, &driver).await; + parent.write_data(0, &test_pattern(0, 512)).await; + parent + .write_data(block_size, &test_pattern(block_size, 512)) + .await; + parent.flush().await; + parent.close().await; + } + + // Step 2: Native-create diff child. + { + let _native = NativeVhdx::create_differencing(&child_path, &parent_path); + } + + // Step 3: Rust opens child writable. + // - Write [0xDD; 512] at offset 0 (block 0, sector 0) + // - Write [0xEE; 512] at offset block_size (block 1, sector 0) + // - Trim block 1 entirely + { + let child = RustVhdx::open(&child_path, false, Some(&driver)).await; + child.write_data(0, &vec![0xDDu8; 512]).await; + child.write_data(block_size, &vec![0xEEu8; 512]).await; + child.trim_range(block_size, block_size).await; + child.flush().await; + child.close().await; + } + + // Step 4: Native opens child (chain). Attach + read. + { + let mut native = NativeVhdx::open(&child_path, false); + let raw = native.attach_raw(); + + // Block 0: child's write → [0xDD; 512] + let mut buf0 = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf0).expect("read block 0"); + assert_eq!(bytes, 512); + assert_eq!(buf0, vec![0xDDu8; 512], "block 0 should be child's data"); + + // Block 1: trimmed → zeros (TrimMode::Zero makes block Zero state; + // through native chain, Zero means zeros). + let mut buf1 = vec![0u8; 512]; + let bytes = raw.read_at(block_size, &mut buf1).expect("read block 1"); + assert_eq!(bytes, 512); + assert!( + buf1.iter().all(|&b| b == 0), + "block 1 should be zeros after trim" + ); + } +} + +// ===================================================================== +// Log Replay Cross-Validation +// ===================================================================== +// +// These tests exercise crash recovery scenarios where the Rust stack +// writes data with a dirty log (via `abort()`), and the native Windows +// VHD stack replays the log on open — or vice versa. +// +// The key API for simulating a crash in the Rust stack is `VhdxFile::abort()`: +// it drops the log channel without flushing, leaving the log GUID set in +// the header. 
The next open (by either stack) must replay the log before +// the file is usable. +// +// The native Windows VHD stack always performs a clean close on handle drop +// (it flushes the log and clears the log GUID), so we cannot easily create +// a dirty log via native. Tests focus on Rust-crash → Native-replay and +// full lifecycle interleaving scenarios. + +/// Test 20: Rust Crash → Native Replay +/// +/// Rust opens writable → writes data to two blocks → flush → abort +/// (simulated crash, log stays dirty) → native opens (replays log) → +/// attach → raw-read → data is present and correct. +#[pal_async::async_test] +async fn log_replay_rust_crash_native_reads(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Rust create + write + flush + abort (crash). + { + let rust = RustVhdx::create(&vhdx_path, 8 * 1024 * 1024, block_size as u32, &driver).await; + + // Write to blocks 0 and 1. + rust.write_data(0, &test_pattern(0, 512)).await; + rust.write_data(block_size, &test_pattern(block_size, 512)) + .await; + rust.flush().await; + + // Abort — leaves dirty log (log_guid is set in header). + rust.abort().await; + } + + // Native opens — should replay the dirty log automatically. + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + // Block 0: should have Rust's data after log replay. + let mut buf0 = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf0).expect("read block 0"); + assert_eq!(bytes, 512); + assert_eq!(buf0, test_pattern(0, 512), "block 0 data after log replay"); + + // Block 1: should have Rust's data after log replay. + let mut buf1 = vec![0u8; 512]; + let bytes = raw.read_at(block_size, &mut buf1).expect("read block 1"); + assert_eq!(bytes, 512); + assert_eq!( + buf1, + test_pattern(block_size, 512), + "block 1 data after log replay" + ); +} + +/// Test 21: Rust Crash (Multiple Blocks) → Native Replay +/// +/// Rust opens writable → writes data to many blocks across the disk +/// (enough to exercise multiple log entries / batch commits) → flush → +/// abort → native opens (replays all log entries) → all data intact. +#[pal_async::async_test] +async fn log_replay_rust_crash_many_blocks_native_reads(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + let block_count = 8u64; + let disk_size = block_size * (block_count + 1); + + // Rust create + write all blocks + flush + abort. + { + let rust = RustVhdx::create(&vhdx_path, disk_size, block_size as u32, &driver).await; + + for i in 0..block_count { + let offset = i * block_size; + rust.write_data(offset, &test_pattern(offset, 512)).await; + } + rust.flush().await; + rust.abort().await; + } + + // Native opens (replays log) → attach → read all blocks. + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + for i in 0..block_count { + let offset = i * block_size; + let expected = test_pattern(offset, 512); + let mut buf = vec![0u8; 512]; + let bytes = raw.read_at(offset, &mut buf).expect("native read"); + assert_eq!(bytes, 512); + assert_eq!( + buf, expected, + "data mismatch at block {i} (offset {offset:#x})" + ); + } +} + +/// Test 22: Rust Crash → Rust Replay → Native Reads +/// +/// Rust writes → flush → abort → Rust reopens writable (replays log) → +/// clean close → native opens → data intact. 
This verifies Rust's own +/// log replay produces a file the native stack accepts. +#[pal_async::async_test] +async fn log_replay_rust_crash_rust_replay_native_reads(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Rust create + write + flush + abort. + { + let rust = RustVhdx::create(&vhdx_path, 8 * 1024 * 1024, block_size as u32, &driver).await; + rust.write_data(0, &test_pattern(0, 512)).await; + rust.write_data(block_size, &test_pattern(block_size, 512)) + .await; + rust.flush().await; + rust.abort().await; + } + + // Rust reopens writable (replays log) → verify data → clean close. + { + let rust = RustVhdx::open(&vhdx_path, false, Some(&driver)).await; + let data0 = rust.read_data(0, 512).await; + assert_eq!(data0, test_pattern(0, 512), "block 0 after Rust replay"); + let data1 = rust.read_data(block_size, 512).await; + assert_eq!( + data1, + test_pattern(block_size, 512), + "block 1 after Rust replay" + ); + rust.close().await; + } + + // Native opens the cleanly-closed file → data intact. + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + let mut buf0 = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf0).expect("read block 0"); + assert_eq!(bytes, 512); + assert_eq!(buf0, test_pattern(0, 512), "block 0 via native"); + + let mut buf1 = vec![0u8; 512]; + let bytes = raw.read_at(block_size, &mut buf1).expect("read block 1"); + assert_eq!(bytes, 512); + assert_eq!(buf1, test_pattern(block_size, 512), "block 1 via native"); +} + +/// Test 23: Rust Crash → Native Replay → Native Writes More → Rust Reads +/// +/// Full lifecycle: Rust writes block 0 → abort (crash) → native opens +/// (replays log) → native writes block 1 → close → Rust opens → reads +/// both blocks → both correct. +#[pal_async::async_test] +async fn log_replay_lifecycle_crash_replay_more_writes(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Step 1: Rust create + write block 0 + flush + abort. + { + let rust = RustVhdx::create(&vhdx_path, 16 * 1024 * 1024, block_size as u32, &driver).await; + rust.write_data(0, &test_pattern(0, 512)).await; + rust.flush().await; + rust.abort().await; + } + + // Step 2: Native opens (replays dirty log) → writes block 1 → closes. + { + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + // Verify block 0 survived replay. + let mut buf = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf).expect("read block 0 after replay"); + assert_eq!(bytes, 512); + assert_eq!(buf, test_pattern(0, 512), "block 0 after native replay"); + + // Write block 1. + let pattern = test_pattern(block_size, 512); + let written = raw + .write_at(block_size, &pattern) + .expect("native write block 1"); + assert_eq!(written, 512); + } + + // Step 3: Rust opens → reads both blocks → verifies. 
+ { + let rust = RustVhdx::open(&vhdx_path, true, None).await; + + let data0 = rust.read_data(0, 512).await; + assert_eq!(data0, test_pattern(0, 512), "block 0 via Rust"); + + let data1 = rust.read_data(block_size, 512).await; + assert_eq!(data1, test_pattern(block_size, 512), "block 1 via Rust"); + + rust.close().await; + } +} + +/// Test 24: Rust Crash With Trim → Native Replay +/// +/// Rust creates → writes blocks 0 and 1 → trims block 1 → flush → abort → +/// native opens (replays log) → block 0 intact, block 1 is zeros. +/// Verifies that trim state is correctly captured in the WAL and replayed. +#[pal_async::async_test] +async fn log_replay_rust_crash_with_trim_native_reads(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Rust create + write both blocks + trim block 1 + flush + abort. + { + let rust = RustVhdx::create(&vhdx_path, 8 * 1024 * 1024, block_size as u32, &driver).await; + rust.write_data(0, &test_pattern(0, 512)).await; + rust.write_data(block_size, &test_pattern(block_size, 512)) + .await; + rust.flush().await; + + // Trim block 1 → BAT state change (Zero or Unmapped). + rust.trim_range(block_size, block_size).await; + rust.flush().await; + + rust.abort().await; + } + + // Native opens (replays log including the trim BAT update). + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + // Block 0: should have data. + let mut buf0 = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf0).expect("read block 0"); + assert_eq!(bytes, 512); + assert_eq!(buf0, test_pattern(0, 512), "block 0 should be intact"); + + // Block 1: should be zeros (trimmed). + let mut buf1 = vec![0u8; 512]; + let bytes = raw.read_at(block_size, &mut buf1).expect("read block 1"); + assert_eq!(bytes, 512); + assert!( + buf1.iter().all(|&b| b == 0), + "block 1 zeros after trim + crash + log replay" + ); +} + +/// Test 25: Multiple Crash-Recovery Cycles via Native +/// +/// Rust writes → crash → native opens (replays) → writes more → close → +/// Rust writes → crash → native opens (replays) → all data intact. +/// Verifies that the log replay leaves the file in a clean state that +/// supports another full write-crash-recovery cycle. +#[pal_async::async_test] +async fn log_replay_repeated_crash_cycles(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Cycle 1: Rust writes block 0 → crash. + { + let rust = RustVhdx::create(&vhdx_path, 16 * 1024 * 1024, block_size as u32, &driver).await; + rust.write_data(0, &test_pattern(0, 512)).await; + rust.flush().await; + rust.abort().await; + } + + // Cycle 1 recovery: Native opens (replays) → writes block 1 → closes. + { + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + // Verify block 0 survived. + let mut buf = vec![0u8; 512]; + raw.read_at(0, &mut buf).expect("read block 0"); + assert_eq!(buf, test_pattern(0, 512), "cycle 1: block 0"); + + // Write block 1. + let written = raw + .write_at(block_size, &test_pattern(block_size, 512)) + .expect("native write block 1"); + assert_eq!(written, 512); + } + + // Cycle 2: Rust opens (clean file now) → writes block 2 → crash. 
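+    // (The native handle drop above performed a clean close, flushing the log
+    // and clearing the log GUID, so this Rust open has nothing to replay.)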
+ { + let rust = RustVhdx::open(&vhdx_path, false, Some(&driver)).await; + rust.write_data(2 * block_size, &test_pattern(2 * block_size, 512)) + .await; + rust.flush().await; + rust.abort().await; + } + + // Cycle 2 recovery: Native opens (replays) → reads all 3 blocks. + { + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + let mut buf0 = vec![0u8; 512]; + raw.read_at(0, &mut buf0).expect("read block 0"); + assert_eq!(buf0, test_pattern(0, 512), "cycle 2: block 0"); + + let mut buf1 = vec![0u8; 512]; + raw.read_at(block_size, &mut buf1).expect("read block 1"); + assert_eq!(buf1, test_pattern(block_size, 512), "cycle 2: block 1"); + + let mut buf2 = vec![0u8; 512]; + raw.read_at(2 * block_size, &mut buf2) + .expect("read block 2"); + assert_eq!(buf2, test_pattern(2 * block_size, 512), "cycle 2: block 2"); + } +} + +/// Test 26: Clean Rust File → Native Opens Without Replay +/// +/// Rust creates → writes → flush → close (clean shutdown) → native opens → +/// data intact. A cleanly-closed file should not trigger log replay. +#[pal_async::async_test] +async fn log_replay_clean_close_no_replay_needed(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Rust create + write + flush + clean close. + { + let rust = RustVhdx::create(&vhdx_path, 8 * 1024 * 1024, block_size as u32, &driver).await; + rust.write_data(0, &test_pattern(0, 512)).await; + rust.write_data(block_size, &test_pattern(block_size, 512)) + .await; + rust.flush().await; + rust.close().await; + } + + // Native opens — should succeed without needing log replay. + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + let mut buf0 = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf0).expect("read block 0"); + assert_eq!(bytes, 512); + assert_eq!(buf0, test_pattern(0, 512), "block 0"); + + let mut buf1 = vec![0u8; 512]; + let bytes = raw.read_at(block_size, &mut buf1).expect("read block 1"); + assert_eq!(bytes, 512); + assert_eq!(buf1, test_pattern(block_size, 512), "block 1"); +}