diff --git a/Cargo.lock b/Cargo.lock index 025a246b82..81264e4bc6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5893,6 +5893,7 @@ dependencies = [ "memory_range", "mesh", "pal_async", + "pal_event", "parking_lot", "pci_bus", "pci_core", diff --git a/Guide/src/reference/devices/firmware/linux_direct.md b/Guide/src/reference/devices/firmware/linux_direct.md index b7954297ca..3151db3aa5 100644 --- a/Guide/src/reference/devices/firmware/linux_direct.md +++ b/Guide/src/reference/devices/firmware/linux_direct.md @@ -55,9 +55,9 @@ OpenVMM synthesizes a minimal set of EFI structures in guest memory: 2. **EFI Memory Map** — describes the EFI metadata region, ACPI tables, and conventional RAM. 3. **ACPI Tables** — FADT (with `HW_REDUCED_ACPI`), MADT (GIC distributor, GICv3 - redistributors or GICv2 CPU interfaces, optional v2m MSI frame), GTDT + redistributors or GICv2 CPU interfaces, GICv3 ITS or v2m MSI frame), GTDT (virtual timer), DSDT (VMBus, serial UARTs), and optionally MCFG/SSDT for - PCIe. + PCIe and IORT for PCIe interrupt routing via the ITS. A **stub device tree** is then built. Unlike a full device tree, it contains no hardware nodes — no CPUs, GIC, timer, or devices. Its only purpose is a diff --git a/Guide/src/reference/emulated/pcie/overview.md b/Guide/src/reference/emulated/pcie/overview.md index 50e8a6d13f..84b19f2da7 100644 --- a/Guide/src/reference/emulated/pcie/overview.md +++ b/Guide/src/reference/emulated/pcie/overview.md @@ -77,6 +77,26 @@ hotplug, PME, AER, and other PCIe features rather than ACPI-based fallbacks. Linux assumes native control regardless, but Windows requires `_OSC` to enable native hotplug. +### MSI Interrupt Routing (aarch64) + +On aarch64, PCIe MSI/MSI-X interrupts are routed through either +a GICv3 ITS or a GICv2m MSI frame, depending on the platform: + +- **GICv3 ITS** (default on KVM with GICv3): The VMM creates a + KVM in-kernel ITS device. 
Each PCIe device gets a 32-bit + device ID composed as `(segment << 16) | BDF`, injected + transparently by per-device wrappers in the interrupt path. + ACPI boots emit an IORT with an ITS Group node and per-root- + complex ID mappings. The device tree includes an `its` child + node under the GIC with `msi-controller`. + +- **GICv2m**: MSI writes map to a fixed pool of 64 SPIs via + a v2m doorbell register. The MADT includes a GICv2m MSI + frame entry. + +The MSI controller can be overridden with the `--gic-msi` +CLI option (`auto`, `its`, or `v2m`). + ### Implementation notes ```admonish note title="No Command Completed support" diff --git a/openhcl/bootloader_fdt_parser/src/lib.rs b/openhcl/bootloader_fdt_parser/src/lib.rs index 3623f84013..302292b48b 100644 --- a/openhcl/bootloader_fdt_parser/src/lib.rs +++ b/openhcl/bootloader_fdt_parser/src/lib.rs @@ -537,7 +537,7 @@ fn parse_gic(node: &Node<'_>) -> anyhow::Result { gic_version: vm_topology::processor::aarch64::GicVersion::V3 { redistributors_base: reg[2], }, - gic_v2m: None, + gic_msi: vm_topology::processor::aarch64::GicMsiController::None, pmu_gsiv: None, // TODO: parse from the DT timer node instead of hardcoding. 
virt_timer_ppi: 20, @@ -1078,7 +1078,7 @@ mod tests { gic_version: vm_topology::processor::aarch64::GicVersion::V3 { redistributors_base: 0x20000, }, - gic_v2m: None, + gic_msi: vm_topology::processor::aarch64::GicMsiController::None, pmu_gsiv: Some(0x17), virt_timer_ppi: 20, gic_nr_irqs: 992, diff --git a/openhcl/virt_mshv_vtl/src/lib.rs b/openhcl/virt_mshv_vtl/src/lib.rs index 60ffabb807..aa6b3d77a1 100755 --- a/openhcl/virt_mshv_vtl/src/lib.rs +++ b/openhcl/virt_mshv_vtl/src/lib.rs @@ -1314,7 +1314,7 @@ struct UhInterruptTarget { } impl pci_core::msi::SignalMsi for UhInterruptTarget { - fn signal_msi(&self, _rid: u32, address: u64, data: u32) { + fn signal_msi(&self, _devid: Option, address: u64, data: u32) { self.partition .request_msi(self.vtl, MsiRequest { address, data }); } diff --git a/openvmm/openvmm_core/src/worker/dispatch.rs b/openvmm/openvmm_core/src/worker/dispatch.rs index 079a7273d4..644a478c0b 100644 --- a/openvmm/openvmm_core/src/worker/dispatch.rs +++ b/openvmm/openvmm_core/src/worker/dispatch.rs @@ -492,6 +492,7 @@ impl ExtractTopologyConfig for ProcessorTopology { Some(gsiv) => PmuGsivConfig::Gsiv(gsiv), None => PmuGsivConfig::Disabled, }, + gic_msi: Default::default(), })), } } @@ -504,6 +505,8 @@ impl BuildTopology for ProcessorTopologyConfig { platform_info: &virt::PlatformInfo, ) -> anyhow::Result> { use vm_topology::processor::aarch64::Aarch64PlatformConfig; + use vm_topology::processor::aarch64::GicItsInfo; + use vm_topology::processor::aarch64::GicMsiController; use vm_topology::processor::aarch64::GicV2mInfo; let arch = match &self.arch { @@ -511,11 +514,7 @@ impl BuildTopology for ProcessorTopologyConfig { Some(ArchTopologyConfig::Aarch64(arch)) => arch.clone(), _ => anyhow::bail!("invalid architecture config"), }; - let gic_v2m = Some(GicV2mInfo { - frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE, - spi_base: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_BASE, - spi_count: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT, 
- }); + let pmu_gsiv = match arch.pmu_gsiv { PmuGsivConfig::Disabled => None, PmuGsivConfig::Gsiv(gsiv) => Some(gsiv), @@ -585,10 +584,39 @@ impl BuildTopology for ProcessorTopologyConfig { } }; + // Use the ITS for MSI delivery when the backend supports it + // (KVM with GICv3). Otherwise fall back to GICv2m (SPI-based MSIs). + use openvmm_defs::config::GicMsiConfig; + let is_gicv2 = matches!(gic_version, GicVersion::V2 { .. }); + let use_its = match arch.gic_msi { + GicMsiConfig::Auto => platform_info.supports_its && !is_gicv2, + GicMsiConfig::Its => { + if is_gicv2 { + anyhow::bail!("ITS is incompatible with GICv2"); + } + if !platform_info.supports_its { + anyhow::bail!("ITS requested but the hypervisor does not support it"); + } + true + } + GicMsiConfig::V2m => false, + }; + let gic_msi = if use_its { + GicMsiController::Its(GicItsInfo { + its_base: openvmm_defs::config::DEFAULT_GIC_ITS_BASE, + }) + } else { + GicMsiController::V2m(GicV2mInfo { + frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE, + spi_base: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_BASE, + spi_count: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT, + }) + }; + let platform = Aarch64PlatformConfig { gic_distributor_base, gic_version, - gic_v2m, + gic_msi, pmu_gsiv, virt_timer_ppi: openvmm_defs::config::DEFAULT_VIRT_TIMER_PPI, gic_nr_irqs: openvmm_defs::config::DEFAULT_GIC_NR_IRQS, @@ -1810,8 +1838,30 @@ impl InitializedVm { (pcie_host_bridges, pcie_root_complexes) }; + // Build a port-name→segment map covering all ports in the PCIe + // topology (root complex ports and switch downstream ports). This + // is used for ITS device ID composition where the segment is part + // of the 32-bit device ID. 
+ let mut port_segments: std::collections::HashMap, u16> = pcie_host_bridges + .iter() + .zip(pcie_root_complexes.iter()) + .flat_map(|(hb, rc)| { + rc.lock() + .downstream_ports() + .into_iter() + .map(move |(_, name)| (name, hb.segment)) + }) + .collect(); + for switch in cfg.pcie_switches { let device_name = format!("pcie-switch:{}", switch.name); + + // Inherit the segment from the switch's parent port. + let parent_segment = port_segments + .get(switch.parent_port.as_str()) + .copied() + .expect("switch parent port must be a known downstream port"); + let switch_device = chipset_builder .arc_mutex_device(device_name) .on_pcie_port(vmotherboard::BusId::new(&switch.parent_port)) @@ -1824,6 +1874,12 @@ impl InitializedVm { GenericPcieSwitch::new(definition) })?; + // Query the switch's actual downstream port names instead of + // reconstructing them from the naming convention. + for (_, name) in switch_device.lock().downstream_ports() { + port_segments.insert(name, parent_segment); + } + let bus_id = vmotherboard::BusId::new(&switch.name); chipset_builder.register_weak_mutex_pcie_enumerator(bus_id, Box::new(switch_device)); } @@ -1846,7 +1902,23 @@ impl InitializedVm { Some(handle) }; + // Determine whether ITS wrappers are needed for PCIe MSI delivery. + // Only aarch64 VMs configured with a GICv3 ITS need device ID + // injection; all other configurations pass through directly. + #[cfg(guest_arch = "aarch64")] + let use_its = matches!( + processor_topology.gic_msi(), + vm_topology::processor::aarch64::GicMsiController::Its(_) + ); + #[cfg(not(guest_arch = "aarch64"))] + let use_its = false; + // Resolve PCIe devices concurrently. + // + // Each device gets an AssignedBusRange that the root port updates when + // the guest programs the secondary and subordinate bus numbers. When + // ITS is configured, wrappers compose the RID at + // interrupt delivery time. 
try_join_all(cfg.pcie_devices.into_iter().map(|dev_cfg| { let chipset_builder = &chipset_builder; let driver_source = &driver_source; @@ -1854,18 +1926,47 @@ impl InitializedVm { let gm = &gm; let partition = &partition; let mapper = &mapper; + let port_segments = &port_segments; async move { + let port_name: Arc = dev_cfg.port_name.into(); + let bus_range = pcie::bus_range::AssignedBusRange::new(); + let segment = port_segments + .get(&port_name) + .copied() + .expect("device port must be a known downstream port"); + + // When ITS is active, wrap the partition's SignalMsi + // and IrqFd to inject the device identity. Otherwise + // pass through directly. + let signal_msi = partition.as_signal_msi(Vtl::Vtl0).map(|s| { + if use_its { + Arc::new(pcie::its::ItsSignalMsi::new(s, bus_range.clone(), segment)) + as Arc + } else { + s + } + }); + let irqfd = partition.irqfd().map(|fd| { + if use_its { + Arc::new(pcie::its::ItsIrqFd::new(fd, bus_range.clone(), segment)) + as Arc + } else { + fd + } + }); + vmm_core::device_builder::build_pcie_device( chipset_builder, - dev_cfg.port_name.into(), + port_name.clone(), driver_source, resolver, gm, dev_cfg.resource, partition.clone().into_doorbell_registration(Vtl::Vtl0), Some(mapper), - partition.as_signal_msi(Vtl::Vtl0), - partition.irqfd(), + signal_msi, + irqfd, + Some(bus_range), ) .await } @@ -2872,19 +2973,53 @@ impl LoadedVm { } VmRpc::AddPcieDevice(rpc) => { rpc.handle_failable(async |(port_name, resource)| { - // Validate the port exists before creating the device - // to avoid leaking a DynamicDeviceUnit on error. - let rc = self.inner.pcie_root_complexes.iter() - .find(|rc| { + // Find the root complex and its index for the named port. 
+ let (rc_idx, rc) = self.inner.pcie_root_complexes.iter() + .enumerate() + .find(|(_, rc)| { rc.lock().downstream_ports().iter().any(|(_, name)| name.as_ref() == port_name.as_str()) }) .ok_or_else(|| anyhow::anyhow!("port '{}' not found in any root complex", port_name))?; - let msi_conn = match self.inner.partition.irqfd() { + #[cfg(guest_arch = "aarch64")] + let use_its = matches!( + self.inner.processor_topology.gic_msi(), + vm_topology::processor::aarch64::GicMsiController::Its(_) + ); + #[cfg(not(guest_arch = "aarch64"))] + let use_its = false; + + let bus_range = pcie::bus_range::AssignedBusRange::new(); + + let signal_msi = self.inner.partition.as_signal_msi(Vtl::Vtl0).map(|s| { + if use_its { + let segment = self.inner.pcie_host_bridges[rc_idx].segment; + Arc::new(pcie::its::ItsSignalMsi::new( + s, + bus_range.clone(), + segment, + )) as Arc + } else { + s + } + }); + let irqfd = self.inner.partition.irqfd().map(|fd| { + if use_its { + let segment = self.inner.pcie_host_bridges[rc_idx].segment; + Arc::new(pcie::its::ItsIrqFd::new( + fd, + bus_range.clone(), + segment, + )) as Arc + } else { + fd + } + }); + + let msi_conn = match irqfd { Some(fd) => pci_core::msi::MsiConnection::with_irqfd(fd), None => pci_core::msi::MsiConnection::new(), }; - let signal_msi = self.inner.partition.as_signal_msi(Vtl::Vtl0); let (unit, device) = self.inner.chipset_devices.add_dyn_device( &self.inner.driver_source, @@ -2927,10 +3062,13 @@ impl LoadedVm { self.state_units.start_stopped_units().await; // Now attach the device and notify the guest. + // The assigned bus range is passed so the port can track + // the device's RID for ITS/SMMU. 
if let Err(e) = rc.lock().hotplug_add_device( &port_name, "hotplug-device", bus_device, + Some(bus_range), ) { // Clean up the device unit on failure let (_, unit, _) = self.inner.pcie_hotplug_devices.pop().unwrap(); diff --git a/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs b/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs index 8b6253ee4c..83589086e0 100644 --- a/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs +++ b/openvmm/openvmm_core/src/worker/vm_loaders/linux.rs @@ -161,6 +161,7 @@ fn build_dt( let num_cpus = processor_topology.vps().len(); + use vm_topology::processor::aarch64::GicMsiController; use vm_topology::processor::aarch64::GicVersion; let gic_dist_base: u64 = processor_topology.gic_distributor_base(); @@ -237,6 +238,7 @@ fn build_dt( const PHANDLE_GIC: u32 = 1; const PHANDLE_APB_PCLK: u32 = 2; const PHANDLE_V2M: u32 = 3; + const PHANDLE_ITS: u32 = 4; const GIC_SPI: u32 = 0; const GIC_PPI: u32 = 1; @@ -311,8 +313,9 @@ fn build_dt( // ARM64 Generic Interrupt Controller. // GICv3 uses "arm,gic-v3"; GICv2 uses "arm,cortex-a15-gic". - // Both versions can have a v2m child for SPI-based MSIs (PCIe). - let v2m_info = processor_topology.gic_v2m(); + // GICv3 can have an ITS child for LPI-based MSIs; v2m is the + // fallback for SPI-based MSIs (GICv2 or GICv3 without ITS). + let gic_msi = processor_topology.gic_msi(); let gic_compatible = match processor_topology.gic_version() { GicVersion::V3 { .. } => "arm,gic-v3", GicVersion::V2 { .. } => "arm,cortex-a15-gic", @@ -335,8 +338,16 @@ fn build_dt( .add_null(p_interrupt_controller)? .add_u32(p_phandle, PHANDLE_GIC)? .add_null(p_ranges)?; - root_builder = if let Some(v2m) = v2m_info { - gic_node + root_builder = match gic_msi { + GicMsiController::Its(its) => gic_node + .start_node(format!("its@{:x}", its.its_base).as_str())? + .add_str(p_compatible, "arm,gic-v3-its")? + .add_null(p_msi_controller)? + .add_u64_array(p_reg, &[its.its_base, openvmm_defs::config::GIC_ITS_SIZE])? 
+ .add_u32(p_phandle, PHANDLE_ITS)? + .end_node()? + .end_node()?, + GicMsiController::V2m(v2m) => gic_node .start_node(format!("v2m@{:x}", v2m.frame_base).as_str())? .add_str(p_compatible, "arm,gic-v2m-frame")? .add_null(p_msi_controller)? @@ -348,9 +359,8 @@ fn build_dt( .add_u32(p_arm_msi_num_spis, v2m.spi_count)? .add_u32(p_phandle, PHANDLE_V2M)? .end_node()? - .end_node()? - } else { - gic_node.end_node()? + .end_node()?, + GicMsiController::None => gic_node.end_node()?, }; // ARM64 Architectural Timer. @@ -424,7 +434,7 @@ fn build_dt( } // No interrupt-map is provided because all devices use MSIs via the - // v2m frame; legacy INTx routing is not supported. + // ITS or v2m frame; legacy INTx routing is not supported. let mut node = root_builder .start_node(name.as_str())? .add_str(p_compatible, "pci-host-ecam-generic")? @@ -439,8 +449,14 @@ fn build_dt( .add_u32(p_size_cells, 2)? .add_u32(p_interrupt_parent, PHANDLE_GIC)? .add_u32_array(p_ranges, &ranges)?; - if v2m_info.is_some() { - node = node.add_u32(p_msi_parent, PHANDLE_V2M)?; + match gic_msi { + GicMsiController::Its(_) => { + node = node.add_u32(p_msi_parent, PHANDLE_ITS)?; + } + GicMsiController::V2m(_) => { + node = node.add_u32(p_msi_parent, PHANDLE_V2M)?; + } + GicMsiController::None => {} } root_builder = node.end_node()?; } diff --git a/openvmm/openvmm_defs/src/config.rs b/openvmm/openvmm_defs/src/config.rs index 749301c0bf..c4ebf45e43 100644 --- a/openvmm/openvmm_defs/src/config.rs +++ b/openvmm/openvmm_defs/src/config.rs @@ -113,6 +113,13 @@ pub const DEFAULT_GIC_V2M_SPI_BASE: u32 = 512; /// Number of SPIs reserved for PCIe MSIs. pub const DEFAULT_GIC_V2M_SPI_COUNT: u32 = 64; +/// Base address of the GICv3 ITS MMIO region. Must be 64 KiB aligned, +/// below the v2m frame address, and not overlap other devices. +/// The region extends from this base to base + GIC_ITS_SIZE (128 KiB). 
+pub const DEFAULT_GIC_ITS_BASE: u64 = 0xEFFC_0000; +/// Size of the ITS MMIO region (control frame + translation frame, 2×64 KiB). +pub const GIC_ITS_SIZE: u64 = 0x2_0000; + /// Default virtual timer PPI (GIC INTID). PPI 4 = INTID 16 + 4 = 20. /// This is the EL1 virtual timer interrupt used across Hyper-V, KVM, and HVF. pub const DEFAULT_VIRT_TIMER_PPI: u32 = 20; @@ -291,10 +298,24 @@ pub enum PmuGsivConfig { Disabled, } +/// MSI controller selection for aarch64 PCIe interrupt delivery. +#[derive(Debug, Protobuf, Default, Clone)] +pub enum GicMsiConfig { + /// Automatically select the best available MSI controller: + /// ITS when the hypervisor supports it, otherwise GICv2m. + #[default] + Auto, + /// Force GICv3 ITS for MSI delivery via LPIs. + Its, + /// Force GICv2m for MSI delivery via SPIs. + V2m, +} + #[derive(Debug, Protobuf, Default, Clone)] pub struct Aarch64TopologyConfig { pub gic_config: Option, pub pmu_gsiv: PmuGsivConfig, + pub gic_msi: GicMsiConfig, } /// GIC configuration for the virtual machine. diff --git a/openvmm/openvmm_entry/src/cli_args.rs b/openvmm/openvmm_entry/src/cli_args.rs index f060fd3a8c..45803fc8ba 100644 --- a/openvmm/openvmm_entry/src/cli_args.rs +++ b/openvmm/openvmm_entry/src/cli_args.rs @@ -386,6 +386,11 @@ options: #[clap(long, default_value = "auto", value_parser = parse_x2apic)] pub x2apic: X2ApicConfig, + /// configure PCIe MSI controller for aarch64 (auto | its | v2m) + #[cfg(guest_arch = "aarch64")] + #[clap(long, default_value = "auto")] + pub gic_msi: GicMsiCli, + /// COM1 binding (console | stderr | listen=\ | file=\ (overwrites) | listen=tcp:\:\ | term[=\]\[,name=\\] | none) #[clap(long, value_name = "SERIAL")] pub com1: Option, @@ -2064,6 +2069,18 @@ pub enum Vtl0LateMapPolicyCli { Exception, } +/// PCIe MSI controller selection for aarch64. +#[derive(Debug, Copy, Clone, Default, ValueEnum)] +pub enum GicMsiCli { + /// Use ITS when available, fall back to GICv2m. 
+ #[default] + Auto, + /// Force GICv3 ITS (LPI-based MSIs). + Its, + /// Force GICv2m (SPI-based MSIs). + V2m, +} + #[derive(Debug, Copy, Clone, ValueEnum)] pub enum IsolationCli { Vbs, diff --git a/openvmm/openvmm_entry/src/lib.rs b/openvmm/openvmm_entry/src/lib.rs index c83aa291d1..d32a4b7442 100644 --- a/openvmm/openvmm_entry/src/lib.rs +++ b/openvmm/openvmm_entry/src/lib.rs @@ -1316,6 +1316,11 @@ async fn vm_config_from_command_line( // TODO: allow this to be configured from the command line gic_config: None, pmu_gsiv: openvmm_defs::config::PmuGsivConfig::Platform, + gic_msi: match opt.gic_msi { + cli_args::GicMsiCli::Auto => openvmm_defs::config::GicMsiConfig::Auto, + cli_args::GicMsiCli::Its => openvmm_defs::config::GicMsiConfig::Its, + cli_args::GicMsiCli::V2m => openvmm_defs::config::GicMsiConfig::V2m, + }, }, ); #[cfg(guest_arch = "x86_64")] diff --git a/tmk/tmk_vmm/src/run.rs b/tmk/tmk_vmm/src/run.rs index 6839606568..5576e18e0e 100644 --- a/tmk/tmk_vmm/src/run.rs +++ b/tmk/tmk_vmm/src/run.rs @@ -65,7 +65,7 @@ impl CommonState { gic_version: vm_topology::processor::aarch64::GicVersion::V3 { redistributors_base: 0xff020000, }, - gic_v2m: None, + gic_msi: vm_topology::processor::aarch64::GicMsiController::None, pmu_gsiv: None, virt_timer_ppi: 20, // DEFAULT_VIRT_TIMER_PPI gic_nr_irqs: 256, diff --git a/vm/acpi_spec/src/iort.rs b/vm/acpi_spec/src/iort.rs new file mode 100644 index 0000000000..d8000f1bec --- /dev/null +++ b/vm/acpi_spec/src/iort.rs @@ -0,0 +1,205 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! IORT (IO Remapping Table) types for aarch64 PCIe topology. 
+ +use super::Table; +use crate::packed_nums::*; +use core::mem::size_of; +use static_assertions::const_assert_eq; +use zerocopy::FromBytes; +use zerocopy::Immutable; +use zerocopy::IntoBytes; +use zerocopy::KnownLayout; +use zerocopy::Unaligned; + +pub const IORT_REVISION: u8 = 5; +pub const IORT_NODE_OFFSET: u32 = size_of::() as u32 + size_of::() as u32; + +pub const IORT_NODE_TYPE_ITS_GROUP: u8 = 0x00; +pub const IORT_NODE_TYPE_PCI_ROOT_COMPLEX: u8 = 0x02; + +pub const IORT_PCI_ROOT_COMPLEX_REVISION: u8 = 3; +pub const IORT_ITS_GROUP_REVISION: u8 = 1; + +pub const IORT_NODE_COHERENT: u32 = 0x00000001; +pub const IORT_MEMORY_ACCESS_COHERENCY: u8 = 1 << 0; +pub const IORT_MEMORY_ACCESS_ATTRIBUTES: u8 = 1 << 1; +pub const IORT_ID_SINGLE_MAPPING: u32 = 1 << 0; + +#[repr(C)] +#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes, Unaligned)] +pub struct Iort { + pub node_count: u32_ne, + pub node_offset: u32_ne, + pub reserved: u32_ne, +} + +impl Iort { + pub fn new(node_count: u32) -> Self { + Self { + node_count: node_count.into(), + node_offset: IORT_NODE_OFFSET.into(), + reserved: 0.into(), + } + } +} + +impl Table for Iort { + const SIGNATURE: [u8; 4] = *b"IORT"; +} + +const_assert_eq!(size_of::(), 12); +const_assert_eq!(IORT_NODE_OFFSET as usize, 48); + +#[repr(C, packed)] +#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes, Unaligned)] +pub struct IortNodeHeader { + pub node_type: u8, + pub length: u16_ne, + pub revision: u8, + pub identifier: u32_ne, + pub mapping_count: u32_ne, + pub mapping_offset: u32_ne, +} + +impl IortNodeHeader { + pub fn new(node_type: u8, revision: u8, identifier: u32, mapping_count: u32) -> Self { + Self { + node_type, + length: (size_of::() as u16).into(), + revision, + identifier: identifier.into(), + mapping_count: mapping_count.into(), + mapping_offset: if mapping_count == 0 { + 0.into() + } else { + (size_of::() as u32).into() + }, + } + } +} + +const_assert_eq!(size_of::(), 16); + 
+#[repr(C)] +#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes, Unaligned)] +pub struct IortMemoryAccessProperties { + pub cache_coherency: u32_ne, + pub hints: u8, + pub reserved: u16_ne, + pub memory_flags: u8, +} + +impl IortMemoryAccessProperties { + pub fn coherent() -> Self { + Self { + cache_coherency: IORT_NODE_COHERENT.into(), + hints: 0, + reserved: 0.into(), + memory_flags: IORT_MEMORY_ACCESS_COHERENCY | IORT_MEMORY_ACCESS_ATTRIBUTES, + } + } +} + +const_assert_eq!(size_of::(), 8); + +#[repr(C, packed)] +#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes, Unaligned)] +pub struct IortPciRootComplex { + pub header: IortNodeHeader, + pub memory_properties: IortMemoryAccessProperties, + pub ats_attribute: u32_ne, + pub pci_segment_number: u32_ne, + pub memory_address_limit: u8, + pub reserved: [u8; 3], +} + +impl IortPciRootComplex { + /// Create a PCI Root Complex node. The `length` field in the header + /// includes space for `mapping_count` trailing `IortIdMapping` entries, + /// which must be appended separately after serializing this struct. + pub fn new(identifier: u32, pci_segment_number: u16, mapping_count: u32) -> Self { + let mut header = IortNodeHeader::new::( + IORT_NODE_TYPE_PCI_ROOT_COMPLEX, + IORT_PCI_ROOT_COMPLEX_REVISION, + identifier, + mapping_count, + ); + // The node length must include the variable-length ID mapping array. 
+ let total = + size_of::() as u16 + (mapping_count as u16) * size_of::() as u16; + header.length = total.into(); + Self { + header, + memory_properties: IortMemoryAccessProperties::coherent(), + ats_attribute: 0.into(), + pci_segment_number: u32::from(pci_segment_number).into(), + memory_address_limit: 64, + reserved: [0; 3], + } + } +} + +const_assert_eq!(size_of::(), 36); + +#[repr(C, packed)] +#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes, Unaligned)] +pub struct IortIdMapping { + pub input_base: u32_ne, + pub id_count: u32_ne, + pub output_base: u32_ne, + pub output_reference: u32_ne, + pub flags: u32_ne, +} + +impl IortIdMapping { + pub fn new( + input_base: u32, + id_count: u32, + output_base: u32, + output_reference: u32, + flags: u32, + ) -> Self { + Self { + input_base: input_base.into(), + id_count: id_count.into(), + output_base: output_base.into(), + output_reference: output_reference.into(), + flags: flags.into(), + } + } +} + +const_assert_eq!(size_of::(), 20); + +/// ITS Group node. Followed by `its_count` u32 ITS identifiers. +#[repr(C, packed)] +#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes, Unaligned)] +pub struct IortItsGroup { + pub header: IortNodeHeader, + pub its_count: u32_ne, +} + +impl IortItsGroup { + /// Create an ITS Group node. The `length` field in the header includes + /// space for `its_count` trailing u32 ITS identifiers, which must be + /// appended separately after serializing this struct. + pub fn new(identifier: u32, its_count: u32) -> Self { + let mut header = IortNodeHeader::new::( + IORT_NODE_TYPE_ITS_GROUP, + IORT_ITS_GROUP_REVISION, + identifier, + 0, + ); + // The node length must include the variable-length ITS ID array. 
+ let total = size_of::() as u16 + (its_count as u16) * 4; + header.length = total.into(); + Self { + header, + its_count: its_count.into(), + } + } +} + +const_assert_eq!(size_of::(), 20); diff --git a/vm/acpi_spec/src/lib.rs b/vm/acpi_spec/src/lib.rs index 0a881b984d..3c4933fe13 100644 --- a/vm/acpi_spec/src/lib.rs +++ b/vm/acpi_spec/src/lib.rs @@ -13,6 +13,7 @@ extern crate alloc; pub mod aspt; pub mod fadt; pub mod gtdt; +pub mod iort; pub mod madt; pub mod mcfg; pub mod pptt; diff --git a/vm/acpi_spec/src/madt.rs b/vm/acpi_spec/src/madt.rs index 63574a5bf5..3c140b1d4a 100644 --- a/vm/acpi_spec/src/madt.rs +++ b/vm/acpi_spec/src/madt.rs @@ -43,6 +43,7 @@ open_enum! { GICC = 0xb, GICD = 0xc, GIC_MSI_FRAME = 0xd, + GIC_ITS = 0xf, } } @@ -269,6 +270,33 @@ impl MadtGicMsiFrame { } } +/// ACPI 6.5 MADT GIC ITS structure (Table 5-68). +#[repr(C, packed)] +#[derive(Copy, Clone, Debug, IntoBytes, Immutable, KnownLayout, FromBytes, Unaligned)] +pub struct MadtGicIts { + pub typ: MadtType, + pub length: u8, + pub reserved: u16, + pub gic_its_id: u32, + pub base_address: u64, + pub reserved2: u32, +} + +const_assert_eq!(size_of::(), 20); + +impl MadtGicIts { + pub fn new(gic_its_id: u32, base_address: u64) -> Self { + Self { + typ: MadtType::GIC_ITS, + length: size_of::() as u8, + reserved: 0, + gic_its_id, + base_address, + reserved2: 0, + } + } +} + // TODO: use LE types everywhere, as here, to avoid #[repr(packed)] and to be // specific about endianness (which the ACPI spec dictates is always LE). 
#[repr(C)] diff --git a/vm/devices/pci/pci_core/src/capabilities/msix.rs b/vm/devices/pci/pci_core/src/capabilities/msix.rs index a5c8ac3440..5618cb96a9 100644 --- a/vm/devices/pci/pci_core/src/capabilities/msix.rs +++ b/vm/devices/pci/pci_core/src/capabilities/msix.rs @@ -174,7 +174,7 @@ impl MsiInterrupt { } if state.pending { - state.target.signal_msi(0, address, data); + state.target.signal_msi(address, data); state.pending = false; } } @@ -210,7 +210,7 @@ impl InterruptTarget for MsiInterruptTarget { fn deliver(&self) { let mut state = self.0.lock(); if state.enabled { - state.target.signal_msi(0, state.address, state.data); + state.target.signal_msi(state.address, state.data); } else { state.pending = true; } @@ -675,7 +675,7 @@ mod tests { &self.event } - fn enable(&self, address: u64, data: u32) { + fn enable(&self, address: u64, data: u32, _devid: Option) { self.calls.lock().push(RouteCall::SetMsi { address, data }); } diff --git a/vm/devices/pci/pci_core/src/msi.rs b/vm/devices/pci/pci_core/src/msi.rs index 92ccfdb83a..df73b35d1b 100644 --- a/vm/devices/pci/pci_core/src/msi.rs +++ b/vm/devices/pci/pci_core/src/msi.rs @@ -13,8 +13,11 @@ use vmcore::irqfd::IrqFdRoute; pub trait SignalMsi: Send + Sync { /// Signals a message-signaled interrupt at the specified address with the specified data. /// - /// `rid` is the requester ID of the PCI device sending the interrupt. - fn signal_msi(&self, rid: u32, address: u64, data: u32); + /// `devid` is an optional device identity. Its meaning is layer-dependent: + /// at the device layer it is a BDF for multi-function devices (`None` for + /// single-function); at the ITS wrapper layer it is the fully composed ITS + /// device ID; backends that don't need it ignore it. + fn signal_msi(&self, devid: Option, address: u64, data: u32); } /// A kernel-mediated MSI interrupt route for a single vector. 
@@ -43,7 +46,16 @@ impl MsiRoute { /// `address` and `data` are the MSI address and data values that /// the hypervisor will use when injecting the interrupt. pub fn enable(&self, address: u64, data: u32) { - self.0.enable(address, data) + self.0.enable(address, data, None) + } + + /// Configures the MSI address and data for this route. + /// + /// `rid` is the PCIe requester ID (RID) of the device that will signal the + /// interrupt. `address` and `data` are the MSI address and data values that + /// the hypervisor will use when injecting the interrupt. + pub fn enable_with_rid(&self, address: u64, data: u32, rid: u16) { + self.0.enable(address, data, Some(rid.into())) } /// Disables the MSI route. Interrupts that arrive while disabled @@ -64,7 +76,7 @@ impl MsiRoute { struct DisconnectedMsiTarget; impl SignalMsi for DisconnectedMsiTarget { - fn signal_msi(&self, _rid: u32, _address: u64, _data: u32) { + fn signal_msi(&self, _devid: Option, _address: u64, _data: u32) { tracelimit::warn_ratelimited!("dropped MSI interrupt to disconnected target"); } } @@ -143,12 +155,16 @@ impl MsiConnection { } impl MsiTarget { - /// Signals an MSI interrupt to this target from the specified RID. - /// - /// A single-RID device should use `0` as the RID. - pub fn signal_msi(&self, rid: u32, address: u64, data: u32) { + /// Signals an MSI interrupt to this target. + pub fn signal_msi(&self, address: u64, data: u32) { + let inner = self.inner.read(); + inner.signal_msi.signal_msi(None, address, data); + } + + /// Signals an MSI interrupt to this target from a specific RID. 
+ pub fn signal_msi_with_rid(&self, rid: u16, address: u64, data: u32) { let inner = self.inner.read(); - inner.signal_msi.signal_msi(rid, address, data); + inner.signal_msi.signal_msi(Some(rid.into()), address, data); } /// Creates a new kernel-mediated MSI route for direct interrupt diff --git a/vm/devices/pci/pci_core/src/test_helpers/mod.rs b/vm/devices/pci/pci_core/src/test_helpers/mod.rs index efb045b703..2888bd0e3a 100644 --- a/vm/devices/pci/pci_core/src/test_helpers/mod.rs +++ b/vm/devices/pci/pci_core/src/test_helpers/mod.rs @@ -43,8 +43,7 @@ impl TestPciInterruptController { } impl SignalMsi for TestPciInterruptControllerInner { - fn signal_msi(&self, rid: u32, address: u64, data: u32) { - assert_eq!(rid, 0); + fn signal_msi(&self, _devid: Option, address: u64, data: u32) { self.msi_requests.lock().push_back((address, data)); } } diff --git a/vm/devices/pci/pcie/Cargo.toml b/vm/devices/pci/pcie/Cargo.toml index b99da29991..d4e1ac481f 100644 --- a/vm/devices/pci/pcie/Cargo.toml +++ b/vm/devices/pci/pcie/Cargo.toml @@ -13,6 +13,7 @@ inspect.workspace = true memory_range.workspace = true mesh.workspace = true pal_async.workspace = true +pal_event.workspace = true pci_bus.workspace = true pci_core.workspace = true tracing.workspace = true diff --git a/vm/devices/pci/pcie/fuzz/fuzz_pcie.rs b/vm/devices/pci/pcie/fuzz/fuzz_pcie.rs index c5def824a0..0121327b37 100644 --- a/vm/devices/pci/pcie/fuzz/fuzz_pcie.rs +++ b/vm/devices/pci/pcie/fuzz/fuzz_pcie.rs @@ -142,7 +142,7 @@ impl FuzzRootComplex { name: impl AsRef, dev: Box, ) -> Result<(), Arc> { - self.rc.add_pcie_device(port, name, dev) + self.rc.add_pcie_device(port, name, dev, None) } pub fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> Result<(), IoError> { diff --git a/vm/devices/pci/pcie/src/bus_range.rs b/vm/devices/pci/pcie/src/bus_range.rs new file mode 100644 index 0000000000..5eb8ab6339 --- /dev/null +++ b/vm/devices/pci/pcie/src/bus_range.rs @@ -0,0 +1,66 @@ +// Copyright (c) Microsoft 
Corporation. +// Licensed under the MIT License. + +//! Shared PCIe bus range tracking. +//! +//! An [`AssignedBusRange`] holds the bus range `(secondary_bus, subordinate_bus)` +//! assigned to the PCIe port that owns a device. It is updated by the +//! PCIe port when the guest assigns bus numbers. +//! +//! Consumers (ITS wrappers, SMMU) compose a full device identity from the +//! bus range plus the device's BDF. + +use std::sync::Arc; +use std::sync::atomic::AtomicU16; +use std::sync::atomic::Ordering; + +/// Shared PCIe bus range. +/// +/// Stores a packed `(secondary_bus, subordinate_bus)` as an atomic u16, +/// updated when the PCIe port's bus numbers change. +/// +/// Clone is cheap (just an `Arc` bump). +#[derive(Clone, Debug)] +pub struct AssignedBusRange(Arc); + +impl AssignedBusRange { + /// Creates a new bus range initialized to `(0, 0)`. + pub fn new() -> Self { + Self(Arc::new(AtomicU16::new(0))) + } + + /// Updates the bus range for the downstream port. + pub fn set_bus_range(&self, secondary: u8, subordinate: u8) { + self.0.store( + (secondary as u16) << 8 | subordinate as u16, + Ordering::Relaxed, + ); + } + + /// Returns the current `(secondary_bus, subordinate_bus)`. + pub fn bus_range(&self) -> (u8, u8) { + let v = self.0.load(Ordering::Relaxed); + ((v >> 8) as u8, v as u8) + } + + /// Composes an ITS device ID from the current bus range, segment, and + /// an optional per-device BDF override. + /// + /// Returns `None` if the secondary bus has not been assigned yet (still 0). + /// When `devid` is `None`, defaults to `(secondary_bus, dev 0, fn 0)`. + /// Logs a rate-limited warning and returns `None` if the BDF's bus + /// number falls outside the port's assigned range.
+ pub fn compose_its_devid(&self, segment: u16, devid: Option) -> Option { + let (secondary, subordinate) = self.bus_range(); + if secondary == 0 { + return None; + } + let bdf = devid.unwrap_or((secondary as u32) << 8); + let bus = (bdf >> 8) as u8; + if bus < secondary || bus > subordinate { + tracelimit::warn_ratelimited!(bus, secondary, subordinate, "BDF out of port bus range"); + return None; + } + Some((segment as u32) << 16 | (bdf & 0xFFFF)) + } +} diff --git a/vm/devices/pci/pcie/src/its.rs b/vm/devices/pci/pcie/src/its.rs new file mode 100644 index 0000000000..462154e0a5 --- /dev/null +++ b/vm/devices/pci/pcie/src/its.rs @@ -0,0 +1,113 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! GICv3 ITS interrupt wrappers for PCIe devices. +//! +//! The ITS routes MSIs using a 32-bit device ID. For PCIe, this is `(segment << +//! 16) | bdf`, where `bdf = (bus << 8) | (dev << 3) | fn`. +//! +//! [`ItsSignalMsi`] and [`ItsIrqFd`] wrap a partition's generic MSI and irqfd +//! implementations to inject the ITS device ID. The bus range comes from a +//! shared [`AssignedBusRange`] (updated by the PCIe port when the guest assigns +//! bus numbers); the segment is fixed at construction time. +//! +//! For single-function devices (`devid == None`), the wrapper defaults to +//! device 0, function 0 on the port's secondary bus. Multi-function devices +//! pass `Some(bdf)` where `bdf = (bus << 8) | (dev << 3) | fn`. + +use crate::bus_range::AssignedBusRange; +use pal_event::Event; +use pci_core::msi::SignalMsi; +use std::sync::Arc; +use vmcore::irqfd::IrqFd; +use vmcore::irqfd::IrqFdRoute; + +/// A [`SignalMsi`] wrapper that composes the ITS device ID before +/// forwarding to the inner implementation. +pub struct ItsSignalMsi { + inner: Arc, + bus_range: AssignedBusRange, + segment: u16, +} + +impl ItsSignalMsi { + /// Creates a new wrapper. + /// + /// `segment` is the PCI segment number of the root complex that + /// owns this device. 
+ pub fn new(inner: Arc, bus_range: AssignedBusRange, segment: u16) -> Self { + Self { + inner, + bus_range, + segment, + } + } +} + +impl SignalMsi for ItsSignalMsi { + fn signal_msi(&self, devid: Option, address: u64, data: u32) { + let Some(its_devid) = self.bus_range.compose_its_devid(self.segment, devid) else { + return; + }; + self.inner.signal_msi(Some(its_devid), address, data); + } +} + +/// An [`IrqFd`] wrapper that produces ITS irqfd routes, each +/// of which injects the ITS device ID into the `devid` parameter on +/// `enable`. +pub struct ItsIrqFd { + inner: Arc, + bus_range: AssignedBusRange, + segment: u16, +} + +impl ItsIrqFd { + /// Creates a new wrapper. + /// + /// `segment` is the PCI segment number of the root complex that + /// owns this device. + pub fn new(inner: Arc, bus_range: AssignedBusRange, segment: u16) -> Self { + Self { + inner, + bus_range, + segment, + } + } +} + +impl IrqFd for ItsIrqFd { + fn new_irqfd_route(&self) -> anyhow::Result> { + let inner_route = self.inner.new_irqfd_route()?; + Ok(Box::new(ItsIrqFdRoute { + inner: inner_route, + bus_range: self.bus_range.clone(), + segment: self.segment, + })) + } +} + +/// An [`IrqFdRoute`] wrapper that composes the ITS device ID on +/// `enable`. 
+struct ItsIrqFdRoute { + inner: Box, + bus_range: AssignedBusRange, + segment: u16, +} + +impl IrqFdRoute for ItsIrqFdRoute { + fn event(&self) -> &Event { + self.inner.event() + } + + fn enable(&self, address: u64, data: u32, devid: Option) { + let Some(its_devid) = self.bus_range.compose_its_devid(self.segment, devid) else { + return; + }; + self.inner.enable(address, data, Some(its_devid)); + } + + fn disable(&self) { + self.inner.disable(); + } +} diff --git a/vm/devices/pci/pcie/src/lib.rs b/vm/devices/pci/pcie/src/lib.rs index a5f6a7dd09..29444418e3 100644 --- a/vm/devices/pci/pcie/src/lib.rs +++ b/vm/devices/pci/pcie/src/lib.rs @@ -5,6 +5,8 @@ #![forbid(unsafe_code)] +pub mod bus_range; +pub mod its; pub(crate) mod port; pub mod root; pub mod switch; diff --git a/vm/devices/pci/pcie/src/port.rs b/vm/devices/pci/pcie/src/port.rs index e7efc11aaf..da7e3e2f6e 100644 --- a/vm/devices/pci/pcie/src/port.rs +++ b/vm/devices/pci/pcie/src/port.rs @@ -3,6 +3,7 @@ //! Common PCIe port implementation shared between different port types. +use crate::bus_range::AssignedBusRange; use anyhow::bail; use chipset_device::io::IoResult; use inspect::Inspect; @@ -30,6 +31,12 @@ pub struct PcieDownstreamPort { /// The connected device, if any. #[inspect(skip)] pub link: Option<(Arc, Box)>, + + /// Shared bus range for the downstream device, updated whenever the + /// guest reprograms the secondary or subordinate bus number. + /// Also available for SMMU stream ID mapping. + #[inspect(skip)] + bus_range: Option, } impl PcieDownstreamPort { @@ -76,6 +83,45 @@ impl PcieDownstreamPort { name: port_name, cfg_space, link: None, + bus_range: None, + } + } + + /// Sets the shared bus range for the downstream device. + /// + /// The port will update this bus range when the guest programs the + /// secondary bus number. The same bus range is shared with MSI/irqfd + /// wrappers so that interrupt delivery uses the correct requester ID.
+ /// + /// The bus range is immediately initialized from the port's current + /// config space so that hotplugged devices see already-assigned bus + /// numbers without waiting for a guest write. + pub fn set_bus_range(&mut self, bus_range: AssignedBusRange) { + let secondary = *self.cfg_space.assigned_bus_range().start(); + let subordinate = *self.cfg_space.assigned_bus_range().end(); + bus_range.set_bus_range(secondary, subordinate); + self.bus_range = Some(bus_range); + } + + /// Writes to the port's config space and handles any side effects + /// (e.g., bus number changes affecting downstream device identity). + pub fn write_cfg(&mut self, offset: u16, value: u32) -> IoResult { + let old_secondary = *self.cfg_space.assigned_bus_range().start(); + let old_subordinate = *self.cfg_space.assigned_bus_range().end(); + let result = self.cfg_space.write_u32(offset, value); + let new_secondary = *self.cfg_space.assigned_bus_range().start(); + let new_subordinate = *self.cfg_space.assigned_bus_range().end(); + if old_secondary != new_secondary || old_subordinate != new_subordinate { + self.on_bus_range_changed(new_secondary, new_subordinate); + } + result + } + + /// Called when the bus range has changed. Updates the downstream + /// device's bus range to match. 
+ fn on_bus_range_changed(&self, secondary_bus: u8, subordinate_bus: u8) { + if let Some(bus_range) = &self.bus_range { + bus_range.set_bus_range(secondary_bus, subordinate_bus); } } diff --git a/vm/devices/pci/pcie/src/root.rs b/vm/devices/pci/pcie/src/root.rs index 5163deb6fd..7cc6516226 100644 --- a/vm/devices/pci/pcie/src/root.rs +++ b/vm/devices/pci/pcie/src/root.rs @@ -12,6 +12,7 @@ use crate::PAGE_SHIFT; use crate::PAGE_SIZE64; use crate::ROOT_PORT_DEVICE_ID; use crate::VENDOR_ID; +use crate::bus_range::AssignedBusRange; use crate::port::PcieDownstreamPort; use chipset_device::ChipsetDevice; use chipset_device::io::IoError; @@ -140,11 +141,16 @@ impl GenericPcieRootComplex { } /// Attach the provided `GenericPciBusDevice` to the port identified. + /// + /// `device_id` is an optional shared [`AssignedBusRange`] that the port keeps + /// in sync with its secondary and subordinate bus numbers as the guest programs them. + /// Pass `None` when bus range tracking is not needed. pub fn add_pcie_device( &mut self, port: u8, name: impl AsRef, dev: Box, + device_id: Option, ) -> Result<(), Arc> { let (_port_name, root_port) = self.ports.get_mut(&port).ok_or_else(|| -> Arc { tracing::error!( @@ -155,7 +161,7 @@ impl GenericPcieRootComplex { format!("Port {:#x} not found", port).into() })?; - match root_port.connect_device(name, dev) { + match root_port.connect_device(name, dev, device_id) { Ok(()) => Ok(()), Err(existing_device) => { tracing::warn!( @@ -180,17 +186,23 @@ impl GenericPcieRootComplex { } /// Hot-add a device to a named port. + /// + /// `device_id` is an optional shared [`AssignedBusRange`] used for RID/device ID tracking.
pub fn hotplug_add_device( &mut self, port_name: &str, device_name: &str, device: Box, + device_id: Option, ) -> anyhow::Result<()> { let (_, (_, root_port)) = self .ports .iter_mut() .find(|(_, (name, _))| name.as_ref() == port_name) .ok_or_else(|| anyhow::anyhow!("port '{}' not found", port_name))?; + if let Some(id) = device_id { + root_port.port.set_bus_range(id); + } root_port.port.hotplug_add_device(device_name, device) } @@ -382,7 +394,7 @@ impl MmioIntercept for GenericPcieRootComplex { tracelimit::warn_ratelimited!("unroutable config space access"); } DecodedEcamAccess::InternalBus(port, cfg_offset) => { - check_result!(port.port.cfg_space.write_u32(cfg_offset, write_dword)); + check_result!(port.port.write_cfg(cfg_offset, write_dword)); } DecodedEcamAccess::DownstreamPort(port, bus_number, function, cfg_offset) => { check_result!(port.forward_cfg_write( @@ -447,12 +459,18 @@ impl RootPort { &mut self, name: impl AsRef, dev: Box, + device_id: Option, ) -> Result<(), Arc> { let device_name = name.as_ref(); let port_name = self.port.name.clone(); match self.port.add_pcie_device(&port_name, device_name, dev) { - Ok(()) => Ok(()), + Ok(()) => { + if let Some(id) = device_id { + self.port.set_bus_range(id); + } + Ok(()) + } Err(_error) => { // If the connection failed, it means the port is already occupied // We need to get the name of the existing device @@ -766,9 +784,10 @@ mod tests { |_, _| Some(IoResult::Err(IoError::InvalidRegister)), ); - rc.add_pcie_device(0, "ep1", Box::new(endpoint1)).unwrap(); + rc.add_pcie_device(0, "ep1", Box::new(endpoint1), None) + .unwrap(); - match rc.add_pcie_device(0, "ep2", Box::new(endpoint2)) { + match rc.add_pcie_device(0, "ep2", Box::new(endpoint2), None) { Ok(()) => panic!("should have failed"), Err(name) => { assert_eq!(name, "ep1".into()); @@ -823,7 +842,7 @@ mod tests { |_, _| Some(IoResult::Err(IoError::InvalidRegister)), ); - rc.add_pcie_device(0, "test-ep", Box::new(endpoint)) + rc.add_pcie_device(0, "test-ep", 
Box::new(endpoint), None) .unwrap(); // The secondary bus behind root port 0 has been assigned bus number @@ -1084,4 +1103,55 @@ mod tests { let result = rc2.restore(saved_state); assert!(result.is_err()); } + + #[test] + fn test_bus_range_updated_on_cfg_write() { + use crate::bus_range::AssignedBusRange; + + const SECONDARY_BUS_NUM_REG: u64 = 0x19; + const SUBORDINATE_BUS_NUM_REG: u64 = 0x1A; + + let mut rc = instantiate_root_complex(0, 255, 1); + + let endpoint = TestPcieEndpoint::new( + |_, _| Some(IoResult::Err(IoError::InvalidRegister)), + |_, _| Some(IoResult::Err(IoError::InvalidRegister)), + ); + + let bus_range = AssignedBusRange::new(); + assert_eq!(bus_range.bus_range(), (0, 0)); + + rc.add_pcie_device(0, "ep", Box::new(endpoint), Some(bus_range.clone())) + .unwrap(); + + // Program secondary=5, subordinate=10 via ECAM MMIO writes. + rc.mmio_write(SECONDARY_BUS_NUM_REG, &[5]).unwrap(); + rc.mmio_write(SUBORDINATE_BUS_NUM_REG, &[10]).unwrap(); + + // The shared AssignedBusRange should reflect the new values. + assert_eq!(bus_range.bus_range(), (5, 10)); + + // compose_its_devid should produce (segment << 16 | secondary << 8) + // for a single-function device (devid=None). + let segment = 2u16; + let devid = bus_range.compose_its_devid(segment, None); + assert_eq!(devid, Some((2 << 16) | (5 << 8))); + + // With a specific BDF within range: bus=7, dev=1, fn=2 + let bdf: u32 = (7 << 8) | (1 << 3) | 2; + let devid = bus_range.compose_its_devid(segment, Some(bdf)); + assert_eq!(devid, Some((2 << 16) | bdf)); + + // BDF outside range should return None. + let out_of_range_bdf: u32 = 11 << 8; // bus=11, beyond subordinate=10 + assert_eq!( + bus_range.compose_its_devid(segment, Some(out_of_range_bdf)), + None + ); + + // Reprogram bus numbers and verify tracking follows. 
+ rc.mmio_write(SECONDARY_BUS_NUM_REG, &[20]).unwrap(); + rc.mmio_write(SUBORDINATE_BUS_NUM_REG, &[30]).unwrap(); + assert_eq!(bus_range.bus_range(), (20, 30)); + } } diff --git a/vm/devices/pci/pcie/src/switch.rs b/vm/devices/pci/pcie/src/switch.rs index 930b56f8f5..138390246b 100644 --- a/vm/devices/pci/pcie/src/switch.rs +++ b/vm/devices/pci/pcie/src/switch.rs @@ -300,7 +300,7 @@ impl GenericPcieSwitch { value: u32, ) -> Option { if let Some((_, downstream_port)) = self.downstream_ports.get_mut(&function) { - Some(downstream_port.port.cfg_space.write_u32(cfg_offset, value)) + Some(downstream_port.port.write_cfg(cfg_offset, value)) } else { // No downstream switch port found for this device function None @@ -366,11 +366,14 @@ impl GenericPcieSwitch { } /// Attach the provided `GenericPciBusDevice` to the port identified. + /// + /// `device_id` is an optional shared identity for RID/device ID tracking. pub fn add_pcie_device( &mut self, port: u8, name: &str, dev: Box, + device_id: Option, ) -> anyhow::Result<()> { // Find the specific downstream port that matches the port number if let Some((port_name, downstream_port)) = self.downstream_ports.get_mut(&port) { @@ -379,6 +382,9 @@ impl GenericPcieSwitch { .port .add_pcie_device(port_name.as_ref(), name, dev) .context("failed to add PCIe device to downstream port")?; + if let Some(id) = device_id { + downstream_port.port.set_bus_range(id); + } Ok(()) } else { // No downstream port found with matching port number @@ -750,7 +756,8 @@ mod tests { .add_pcie_device( 0, // Port number instead of port name "downstream-dev", - Box::new(downstream_device) + Box::new(downstream_device), + None, ) .is_ok() ); @@ -760,7 +767,7 @@ mod tests { |_, _| Some(IoResult::Err(IoError::InvalidRegister)), |_, _| Some(IoResult::Err(IoError::InvalidRegister)), ); - let result = switch.add_pcie_device(99, "invalid-dev", Box::new(invalid_device)); // Use invalid port number + let result = switch.add_pcie_device(99, "invalid-dev", 
Box::new(invalid_device), None); // Use invalid port number assert!(result.is_err()); // add_pcie_device returns an anyhow::Error on failure, // so we just verify that the connection failed @@ -783,7 +790,7 @@ mod tests { // This tests that the switch can accept device connections (routing capability) let test_device = TestPcieEndpoint::new(|_, _| Some(IoResult::Ok), |_, _| Some(IoResult::Ok)); - let add_result = switch.add_pcie_device(0, "test-device", Box::new(test_device)); + let add_result = switch.add_pcie_device(0, "test-device", Box::new(test_device), None); // Should succeed for port 0 (first downstream port) assert!(add_result.is_ok()); diff --git a/vm/devices/pci/vpci/src/test_helpers/mod.rs b/vm/devices/pci/vpci/src/test_helpers/mod.rs index 69b4a85482..f0391da2d9 100644 --- a/vm/devices/pci/vpci/src/test_helpers/mod.rs +++ b/vm/devices/pci/vpci/src/test_helpers/mod.rs @@ -76,8 +76,7 @@ impl TestVpciInterruptControllerInner { } impl SignalMsi for TestVpciInterruptControllerInner { - fn signal_msi(&self, rid: u32, address: u64, data: u32) { - assert_eq!(rid, 0); + fn signal_msi(&self, _devid: Option, address: u64, data: u32) { self.deliver_interrupt(address, data); } } diff --git a/vm/devices/storage/nvme/src/tests/test_helpers.rs b/vm/devices/storage/nvme/src/tests/test_helpers.rs index 6fd2edaa3f..8f95f57a60 100644 --- a/vm/devices/storage/nvme/src/tests/test_helpers.rs +++ b/vm/devices/storage/nvme/src/tests/test_helpers.rs @@ -46,8 +46,7 @@ impl TestPciInterruptController { } impl SignalMsi for TestPciInterruptController { - fn signal_msi(&self, rid: u32, address: u64, data: u32) { - assert_eq!(rid, 0); + fn signal_msi(&self, _devid: Option, address: u64, data: u32) { self.inner.msi_requests.lock().push_back((address, data)); } } diff --git a/vm/devices/storage/nvme_test/src/tests/test_helpers.rs b/vm/devices/storage/nvme_test/src/tests/test_helpers.rs index 6fd2edaa3f..8f95f57a60 100644 --- a/vm/devices/storage/nvme_test/src/tests/test_helpers.rs +++ 
b/vm/devices/storage/nvme_test/src/tests/test_helpers.rs @@ -46,8 +46,7 @@ impl TestPciInterruptController { } impl SignalMsi for TestPciInterruptController { - fn signal_msi(&self, rid: u32, address: u64, data: u32) { - assert_eq!(rid, 0); + fn signal_msi(&self, _devid: Option, address: u64, data: u32) { self.inner.msi_requests.lock().push_back((address, data)); } } diff --git a/vm/devices/user_driver_emulated_mock/src/lib.rs b/vm/devices/user_driver_emulated_mock/src/lib.rs index f04476b41f..b6f95d8c22 100644 --- a/vm/devices/user_driver_emulated_mock/src/lib.rs +++ b/vm/devices/user_driver_emulated_mock/src/lib.rs @@ -59,11 +59,8 @@ impl MsiController { } impl SignalMsi for MsiController { - fn signal_msi(&self, rid: u32, address: u64, _data: u32) { + fn signal_msi(&self, _devid: Option, address: u64, _data: u32) { let index = address as usize; - if rid != 0 { - return; - } if let Some(event) = self.events.get(index) { tracing::debug!(index, "signaling interrupt"); event.signal_uncached(); diff --git a/vm/kvm/src/lib.rs b/vm/kvm/src/lib.rs index 61c2e7caf4..00d5c953ee 100644 --- a/vm/kvm/src/lib.rs +++ b/vm/kvm/src/lib.rs @@ -520,24 +520,37 @@ impl Partition { entries: [Default::default(); MAX_ROUTES], }; for (i, route) in routes.iter().enumerate() { - let (type_, u) = match route.1 { + let (type_, flags, u) = match route.1 { RoutingEntry::Msi { address_lo, address_hi, data, - } => ( - KVM_IRQ_ROUTING_MSI, - kvm_irq_routing_entry__bindgen_ty_1 { - msi: kvm_irq_routing_msi { - address_lo, - address_hi, - data, - __bindgen_anon_1: Default::default(), + devid, + } => { + let (flags, anon) = if let Some(devid) = devid { + ( + KVM_MSI_VALID_DEVID, + kvm_irq_routing_msi__bindgen_ty_1 { devid }, + ) + } else { + (0, Default::default()) + }; + ( + KVM_IRQ_ROUTING_MSI, + flags, + kvm_irq_routing_entry__bindgen_ty_1 { + msi: kvm_irq_routing_msi { + address_lo, + address_hi, + data, + __bindgen_anon_1: anon, + }, }, - }, - ), + ) + } RoutingEntry::HvSint { vp, sint } => ( 
KVM_IRQ_ROUTING_HV_SINT, + 0, kvm_irq_routing_entry__bindgen_ty_1 { hv_sint: kvm_irq_routing_hv_sint { vcpu: vp, @@ -547,6 +560,7 @@ impl Partition { ), RoutingEntry::Irqchip { pin } => ( KVM_IRQ_ROUTING_IRQCHIP, + 0, kvm_irq_routing_entry__bindgen_ty_1 { irqchip: kvm_irq_routing_irqchip { pin, irqchip: 0 }, }, @@ -555,7 +569,7 @@ impl Partition { kvm_routes.entries[i] = kvm_irq_routing_entry { gsi: route.0, type_, - flags: 0, + flags, pad: 0, u, }; @@ -719,6 +733,7 @@ pub enum RoutingEntry { address_lo: u32, address_hi: u32, data: u32, + devid: Option, }, HvSint { vp: u32, diff --git a/vm/vmcore/src/irqfd.rs b/vm/vmcore/src/irqfd.rs index 97d3e6219e..4ecaf95f7e 100644 --- a/vm/vmcore/src/irqfd.rs +++ b/vm/vmcore/src/irqfd.rs @@ -51,7 +51,9 @@ pub trait IrqFdRoute: Send + Sync { /// /// `address` and `data` are the MSI address and data values that the /// hypervisor will use when injecting the interrupt into the guest. - fn enable(&self, address: u64, data: u32); + /// `devid` is an optional device identity used by backends that need a + /// device ID for MSI routing (e.g., GICv3 ITS). + fn enable(&self, address: u64, data: u32, devid: Option); /// Disables the MSI routing for this irqfd's GSI. /// diff --git a/vm/vmcore/vm_topology/src/processor/aarch64.rs b/vm/vmcore/vm_topology/src/processor/aarch64.rs index 2660108eee..6af59067ec 100644 --- a/vm/vmcore/vm_topology/src/processor/aarch64.rs +++ b/vm/vmcore/vm_topology/src/processor/aarch64.rs @@ -71,8 +71,8 @@ pub struct Aarch64PlatformConfig { pub gic_distributor_base: u64, /// GIC version and version-specific addresses. pub gic_version: GicVersion, - /// GIC v2m MSI frame, if MSIs via v2m are supported. - pub gic_v2m: Option, + /// MSI controller for PCIe interrupt delivery. + pub gic_msi: GicMsiController, /// Performance Monitor Unit GSIV (GIC INTID). `None` if not available. pub pmu_gsiv: Option, /// Virtual timer PPI (GIC INTID, e.g. 20 for PPI 4). 
@@ -97,6 +97,28 @@ pub struct GicV2mInfo { pub spi_count: u32, } +/// GICv3 ITS (Interrupt Translation Service) parameters. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[cfg_attr(feature = "inspect", derive(inspect::Inspect))] +pub struct GicItsInfo { + /// Physical base address of the ITS MMIO region (must be 64 KiB aligned). + #[cfg_attr(feature = "inspect", inspect(hex))] + pub its_base: u64, +} + +/// MSI controller configuration for PCIe interrupt delivery. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[cfg_attr(feature = "inspect", derive(inspect::Inspect))] +#[cfg_attr(feature = "inspect", inspect(external_tag))] +pub enum GicMsiController { + /// No MSI controller configured. + None, + /// GICv2m — maps MSI writes to a fixed pool of SPIs. + V2m(GicV2mInfo), + /// GICv3 ITS — routes MSIs via LPIs using (DeviceID, EventID) lookup. + Its(GicItsInfo), +} + /// ARM64 specific VP info. #[cfg_attr(feature = "inspect", derive(inspect::Inspect))] #[derive(Debug, Copy, Clone)] @@ -240,9 +262,9 @@ impl ProcessorTopology { self.arch.platform.pmu_gsiv } - /// Returns the GIC v2m MSI frame info, if present. - pub fn gic_v2m(&self) -> Option { - self.arch.platform.gic_v2m + /// Returns the MSI controller configuration. + pub fn gic_msi(&self) -> GicMsiController { + self.arch.platform.gic_msi } /// Returns the virtual timer PPI (GIC INTID). diff --git a/vmm_core/src/acpi_builder.rs b/vmm_core/src/acpi_builder.rs index a3cb682cf7..791edcb857 100644 --- a/vmm_core/src/acpi_builder.rs +++ b/vmm_core/src/acpi_builder.rs @@ -89,6 +89,15 @@ pub const OEM_INFO: acpi::builder::OemInfo = acpi::builder::OemInfo { pub trait AcpiTopology: ArchTopology + Inspect + Sized { fn extend_srat(topology: &ProcessorTopology, srat: &mut Vec); fn extend_madt(topology: &ProcessorTopology, madt: &mut Vec); + fn needs_iort(_topology: &ProcessorTopology) -> bool { + false + } + /// If the platform has an ITS, return its identifier for the IORT ITS + /// Group node. 
Returns `None` when no ITS is present (root complex + /// nodes will have no ID mappings). + fn iort_its_id(_topology: &ProcessorTopology) -> Option { + None + } } /// The maximum ID that can be used for a legacy APIC ID in an ACPI table. @@ -188,7 +197,7 @@ impl AcpiTopology for Aarch64Topology { } // GIC v2m MSI frame for PCIe MSI support. - if let Some(v2m) = topology.gic_v2m() { + if let vm_topology::processor::aarch64::GicMsiController::V2m(v2m) = topology.gic_msi() { madt.extend_from_slice( acpi_spec::madt::MadtGicMsiFrame::new( 0, @@ -199,6 +208,22 @@ impl AcpiTopology for Aarch64Topology { .as_bytes(), ); } + + // GICv3 ITS for PCIe MSI routing via LPIs. + if let vm_topology::processor::aarch64::GicMsiController::Its(its) = topology.gic_msi() { + madt.extend_from_slice(acpi_spec::madt::MadtGicIts::new(0, its.its_base).as_bytes()); + } + } + + fn needs_iort(_topology: &ProcessorTopology) -> bool { + true + } + + fn iort_its_id(topology: &ProcessorTopology) -> Option { + match topology.gic_msi() { + vm_topology::processor::aarch64::GicMsiController::Its(_) => Some(0), + _ => None, + } } } @@ -324,6 +349,62 @@ impl AcpiTablesBuilder<'_, T> { )) } + fn with_iort(&self, f: F) -> R + where + F: FnOnce(&acpi::builder::Table<'_>) -> R, + { + use acpi_spec::iort; + + let its_id = T::iort_its_id(self.processor_topology); + let has_its = its_id.is_some(); + let its_node_count: u32 = if has_its { 1 } else { 0 }; + let node_count = its_node_count + self.pcie_host_bridges.len() as u32; + let mapping_count: u32 = if has_its { 1 } else { 0 }; + + let mut iort_extra: Vec = Vec::new(); + + // ITS Group node comes first so root complexes can reference it. + // The ITS Group node offset (from table start) is IORT_NODE_OFFSET. + let its_group_offset = iort::IORT_NODE_OFFSET; + if let Some(id) = its_id { + iort_extra.extend_from_slice(iort::IortItsGroup::new(0, 1).as_bytes()); + // Followed by the ITS identifier (u32, little-endian as ACPI requires).
+ iort_extra.extend_from_slice(&id.to_le_bytes()); + } + + for bridge in self.pcie_host_bridges { + let rc = iort::IortPciRootComplex::new(bridge.index, bridge.segment, mapping_count); + iort_extra.extend_from_slice(rc.as_bytes()); + + if has_its { + // Single ID mapping: full RID range → ITS Group node. + // output_base uses (segment << 16) so device IDs in the + // ITS namespace are unique across PCI segments. + iort_extra.extend_from_slice( + iort::IortIdMapping::new( + 0, // input_base + 0xFFFF, // id_count (full 16-bit BDF range, minus 1 per IORT spec) + (bridge.segment as u32) << 16, // output_base + its_group_offset, // output_reference + 0, // flags + ) + .as_bytes(), + ); + } + } + + (f)(&acpi::builder::Table::new_dyn( + iort::IORT_REVISION, + None, + &iort::Iort::new(node_count), + &[iort_extra.as_slice()], + )) + } + + fn should_build_iort(&self) -> bool { + T::needs_iort(self.processor_topology) && !self.pcie_host_bridges.is_empty() + } + fn with_pptt(&self, f: F) -> R where F: FnOnce(&acpi::builder::Table<'_>) -> R, @@ -658,6 +739,10 @@ impl AcpiTablesBuilder<'_, T> { if !self.pcie_host_bridges.is_empty() { self.with_mcfg(|t| b.append(t)); + if self.should_build_iort() { + self.with_iort(|t| b.append(t)); + } + let mut ssdt = Ssdt::new(); for bridge in self.pcie_host_bridges { ssdt.add_pcie( @@ -704,6 +789,13 @@ impl AcpiTablesBuilder<'_, T> { self.with_mcfg(|t| t.to_vec(&OEM_INFO)) } + /// Helper method to construct an IORT without constructing the rest of the + /// ACPI tables. Returns `None` if IORT is not needed for this configuration. + pub fn build_iort(&self) -> Option> { + self.should_build_iort() + .then(|| self.with_iort(|t| t.to_vec(&OEM_INFO))) + } + /// Helper method to construct a PPTT without constructing the rest of the /// ACPI tables.
/// @@ -873,4 +965,175 @@ mod test { }) .unwrap(); } + + fn new_aarch64_its_topology() -> ProcessorTopology { + use vm_topology::processor::aarch64::Aarch64PlatformConfig; + use vm_topology::processor::aarch64::GicItsInfo; + use vm_topology::processor::aarch64::GicMsiController; + use vm_topology::processor::aarch64::GicVersion; + + TopologyBuilder::new_aarch64(Aarch64PlatformConfig { + gic_distributor_base: 0xffff0000, + gic_version: GicVersion::V3 { + redistributors_base: 0xefff0000, + }, + gic_msi: GicMsiController::Its(GicItsInfo { + its_base: 0xeffc0000, + }), + pmu_gsiv: None, + virt_timer_ppi: 20, + gic_nr_irqs: 992, + }) + .build(2) + .unwrap() + } + + fn new_aarch64_builder<'a>( + mem_layout: &'a MemoryLayout, + processor_topology: &'a ProcessorTopology, + pcie_host_bridges: &'a Vec, + ) -> AcpiTablesBuilder<'a, Aarch64Topology> { + AcpiTablesBuilder { + processor_topology, + mem_layout, + cache_topology: None, + pcie_host_bridges, + arch: AcpiArchConfig::Aarch64 { + hypervisor_vendor_identity: 0, + virt_timer_ppi: 20, + }, + } + } + + fn u32_at(data: &[u8], offset: usize) -> u32 { + u32::from_ne_bytes(data[offset..offset + 4].try_into().unwrap()) + } + + fn checksum(data: &[u8]) -> u8 { + data.iter().fold(0, |sum, byte| sum.wrapping_add(*byte)) + } + + fn contains_signature(data: &[u8], signature: &[u8; 4]) -> bool { + data.windows(signature.len()) + .any(|window| window == signature) + } + + #[test] + fn test_aarch64_iort_with_its() { + use acpi_spec::iort; + + let mem = new_mem(); + let topology = new_aarch64_its_topology(); + let pcie_host_bridges = vec![ + PcieHostBridge { + index: 0, + segment: 0, + start_bus: 0, + end_bus: 255, + ecam_range: MemoryRange::new(0..256 * 256 * 4096), + low_mmio: MemoryRange::new(0xdc000000..0xe0000000), + high_mmio: MemoryRange::new(0x1000000000..0x1040000000), + }, + PcieHostBridge { + index: 7, + segment: 3, + start_bus: 32, + end_bus: 63, + ecam_range: MemoryRange::new(5 * GB..5 * GB + 32 * 256 * 4096), + low_mmio: 
MemoryRange::new(0xe0000000..0xe4000000), + high_mmio: MemoryRange::new(0x1040000000..0x1080000000), + }, + ]; + let builder = new_aarch64_builder(&mem, &topology, &pcie_host_bridges); + + let data = builder.build_iort().unwrap(); + + // IORT header + assert_eq!(&data[0..4], b"IORT"); + assert_eq!(u32_at(&data, 4) as usize, data.len()); + assert_eq!(checksum(&data), 0); + + // 3 nodes: 1 ITS Group + 2 Root Complexes + assert_eq!(u32_at(&data, 36), 3); + assert_eq!(u32_at(&data, 40), iort::IORT_NODE_OFFSET); + + // First node: ITS Group at IORT_NODE_OFFSET + let its_node = iort::IORT_NODE_OFFSET as usize; + assert_eq!(data[its_node], iort::IORT_NODE_TYPE_ITS_GROUP); + // its_count = 1 + assert_eq!(u32_at(&data, its_node + 16), 1); + // ITS identifier = 0 + assert_eq!(u32_at(&data, its_node + 20), 0); + + // Second node: Root Complex 0 (after ITS Group: 20 + 4 = 24 bytes) + let rc0 = its_node + 24; + assert_eq!(data[rc0], iort::IORT_NODE_TYPE_PCI_ROOT_COMPLEX); + assert_eq!(u32_at(&data, rc0 + 4), 0); // identifier + assert_eq!(u32_at(&data, rc0 + 8), 1); // mapping_count + // pci_segment_number at offset 28 from node start + assert_eq!(u32_at(&data, rc0 + 28), 0); + // ID mapping follows the root complex node (36 bytes in) + let mapping0 = rc0 + 36; + assert_eq!(u32_at(&data, mapping0), 0); // input_base + assert_eq!(u32_at(&data, mapping0 + 4), 0xFFFF); // id_count + assert_eq!(u32_at(&data, mapping0 + 8), 0); // output_base (seg 0 << 16) + assert_eq!(u32_at(&data, mapping0 + 12), iort::IORT_NODE_OFFSET); // -> ITS group + + // Third node: Root Complex 7 + let rc1 = mapping0 + 20; + assert_eq!(data[rc1], iort::IORT_NODE_TYPE_PCI_ROOT_COMPLEX); + assert_eq!(u32_at(&data, rc1 + 4), 7); // identifier + assert_eq!(u32_at(&data, rc1 + 28), 3); // pci_segment_number + let mapping1 = rc1 + 36; + assert_eq!(u32_at(&data, mapping1 + 8), 3 << 16); // output_base (seg 3 << 16) + } + + #[test] + fn test_iort_not_built_for_x86() { + let mem = new_mem(); + let topology = 
TopologyBuilder::new_x86().build(1).unwrap(); + let pcie_host_bridges = vec![PcieHostBridge { + index: 0, + segment: 0, + start_bus: 0, + end_bus: 255, + ecam_range: MemoryRange::new(0..256 * 256 * 4096), + low_mmio: MemoryRange::new(0xdc000000..0xe0000000), + high_mmio: MemoryRange::new(0x1000000000..0x1040000000), + }]; + let builder = new_builder(&mem, &topology, &pcie_host_bridges); + assert!(builder.build_iort().is_none()); + + let tables = builder.build_acpi_tables(0x100000, |_, _| {}); + assert!(!contains_signature(&tables.tables, b"IORT")); + } + + #[test] + fn test_iort_not_built_without_pcie() { + let mem = new_mem(); + let topology = new_aarch64_its_topology(); + let empty: Vec = Vec::new(); + let builder = new_aarch64_builder(&mem, &topology, &empty); + assert!(builder.build_iort().is_none()); + } + + #[test] + fn test_aarch64_acpi_tables_include_iort() { + let mem = new_mem(); + let topology = new_aarch64_its_topology(); + let pcie_host_bridges = vec![PcieHostBridge { + index: 0, + segment: 0, + start_bus: 0, + end_bus: 255, + ecam_range: MemoryRange::new(0..256 * 256 * 4096), + low_mmio: MemoryRange::new(0xdc000000..0xe0000000), + high_mmio: MemoryRange::new(0x1000000000..0x1040000000), + }]; + let builder = new_aarch64_builder(&mem, &topology, &pcie_host_bridges); + + let tables = builder.build_acpi_tables(0x100000, |_, _| {}); + assert!(contains_signature(&tables.tables, b"MCFG")); + assert!(contains_signature(&tables.tables, b"IORT")); + } } diff --git a/vmm_core/src/device_builder.rs b/vmm_core/src/device_builder.rs index 56dfdcfda4..9b09fa550f 100644 --- a/vmm_core/src/device_builder.rs +++ b/vmm_core/src/device_builder.rs @@ -102,12 +102,17 @@ pub async fn build_pcie_device( mapper: Option<&dyn guestmem::MemoryMapper>, interrupt_target: Option>, irqfd: Option>, + device_id: Option, ) -> anyhow::Result<()> { let dev_name = format!("pcie:{}-{}", port_name, resource.id()); - let device_builder = chipset_builder + let mut device_builder = 
chipset_builder .arc_mutex_device(dev_name) .on_pcie_port(vmotherboard::BusId::new(&port_name)); + if let Some(id) = device_id { + device_builder = device_builder.with_pci_device_id(id); + } + let (_, msi_conn) = resolve_and_add_pci_device( device_builder, driver_source, diff --git a/vmm_core/virt/src/aarch64/gic_software_device.rs b/vmm_core/virt/src/aarch64/gic_software_device.rs index bebbcb49be..c252e02789 100644 --- a/vmm_core/virt/src/aarch64/gic_software_device.rs +++ b/vmm_core/virt/src/aarch64/gic_software_device.rs @@ -65,7 +65,7 @@ impl MapVpciInterrupt for GicSoftwareDevice { } impl SignalMsi for GicSoftwareDevice { - fn signal_msi(&self, _rid: u32, _address: u64, data: u32) { + fn signal_msi(&self, _devid: Option, _address: u64, data: u32) { if SPI_RANGE.contains(&data) { self.irqcon.set_spi_irq(data, true); } diff --git a/vmm_core/virt/src/aarch64/gic_v2m.rs b/vmm_core/virt/src/aarch64/gic_v2m.rs index bd22bedb78..33a517806b 100644 --- a/vmm_core/virt/src/aarch64/gic_v2m.rs +++ b/vmm_core/virt/src/aarch64/gic_v2m.rs @@ -39,7 +39,7 @@ impl GicV2mSignalMsi { } impl SignalMsi for GicV2mSignalMsi { - fn signal_msi(&self, _rid: u32, address: u64, data: u32) { + fn signal_msi(&self, _devid: Option, address: u64, data: u32) { if address != self.setspi_addr { tracelimit::warn_ratelimited!( address, diff --git a/vmm_core/virt/src/generic.rs b/vmm_core/virt/src/generic.rs index 5bfcad421a..e31204174a 100644 --- a/vmm_core/virt/src/generic.rs +++ b/vmm_core/virt/src/generic.rs @@ -58,6 +58,10 @@ pub struct PlatformInfo { /// Whether the hypervisor supports GICv3. When `false`, only /// GICv2 is available (e.g., Raspberry Pi 5 with GIC-400). pub supports_gic_v3: bool, + /// Whether the hypervisor supports an in-kernel GICv3 ITS for + /// MSI delivery via LPIs. When `true`, the topology can include + /// a `GicItsInfo` and the backend will create/manage the ITS device. 
+ pub supports_its: bool, } pub trait Hypervisor: 'static { @@ -680,7 +684,7 @@ impl MapVpciInterrupt for UnimplementedDevice { } impl SignalMsi for UnimplementedDevice { - fn signal_msi(&self, _rid: u32, _address: u64, _data: u32) { + fn signal_msi(&self, _devid: Option, _address: u64, _data: u32) { match *self {} } } diff --git a/vmm_core/virt/src/x86/apic_software_device.rs b/vmm_core/virt/src/x86/apic_software_device.rs index 4f13a67d53..5bf2d5369f 100644 --- a/vmm_core/virt/src/x86/apic_software_device.rs +++ b/vmm_core/virt/src/x86/apic_software_device.rs @@ -260,13 +260,13 @@ impl InterruptTable { } impl SignalMsi for ApicSoftwareDevice { - fn signal_msi(&self, _rid: u32, address: u64, _data: u32) { + fn signal_msi(&self, _devid: Option, address: u64, _data: u32) { let mut table = self.table.lock(); let table = &mut *table; let index = InterruptTable::interrupt_index_from_address(address); if let Some(interrupt) = table.entries.get(index) { let target = interrupt.msi_params(); - self.target.signal_msi(0, target.address, target.data) + self.target.signal_msi(None, target.address, target.data) } } } diff --git a/vmm_core/virt_hvf/src/lib.rs b/vmm_core/virt_hvf/src/lib.rs index c49c09dd7f..ae32d12165 100644 --- a/vmm_core/virt_hvf/src/lib.rs +++ b/vmm_core/virt_hvf/src/lib.rs @@ -96,6 +96,7 @@ impl virt::Hypervisor for HvfHypervisor { virt::PlatformInfo { platform_gsiv: None, supports_gic_v3: true, + supports_its: false, } } diff --git a/vmm_core/virt_kvm/src/arch/aarch64/mod.rs b/vmm_core/virt_kvm/src/arch/aarch64/mod.rs index e6788b7038..890a99b13a 100644 --- a/vmm_core/virt_kvm/src/arch/aarch64/mod.rs +++ b/vmm_core/virt_kvm/src/arch/aarch64/mod.rs @@ -29,10 +29,12 @@ use kvm::KVM_DEV_ARM_VGIC_CTRL_INIT; use kvm::KVM_DEV_ARM_VGIC_GRP_ADDR; use kvm::KVM_DEV_ARM_VGIC_GRP_CTRL; use kvm::KVM_DEV_ARM_VGIC_GRP_NR_IRQS; +use kvm::KVM_VGIC_ITS_ADDR_TYPE; use kvm::KVM_VGIC_V2_ADDR_TYPE_CPU; use kvm::KVM_VGIC_V2_ADDR_TYPE_DIST; use kvm::KVM_VGIC_V3_ADDR_TYPE_DIST; use 
kvm::KVM_VGIC_V3_ADDR_TYPE_REDIST; +use kvm::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_ITS; use kvm::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V2; use kvm::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3; use kvm::kvm_regs; @@ -53,6 +55,7 @@ use virt::vp::Registers; use virt::vp::SystemRegisters; use virt::x86::DebugState; use vm_topology::processor::aarch64::Aarch64VpInfo; +use vm_topology::processor::aarch64::GicMsiController; use vmcore::reference_time::ReferenceTimeSource; use vmcore::vmtime::VmTimeAccess; @@ -220,6 +223,7 @@ use vm_topology::processor::aarch64::GicVersion; pub struct Kvm { kvm: kvm::Kvm, supports_gic_v3: bool, + supports_its: bool, } impl Kvm { @@ -250,9 +254,18 @@ impl Kvm { tracing::info!(supports_gic_v3, "detected KVM GIC version"); + // Probe ITS support: only available with GICv3. + let supports_its = supports_gic_v3 + && probe_vm + .test_create_device(kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_ITS) + .is_ok(); + + tracing::info!(supports_its, "detected KVM ITS support"); + Ok(Self { kvm, supports_gic_v3, + supports_its, }) } } @@ -691,6 +704,44 @@ impl KvmProtoPartition<'_> { Ok(gicv2) } + fn add_its(&mut self, its_base: u64) -> Result { + const ITS_ALIGNMENT: u64 = 0x10000; + if !its_base.is_multiple_of(ITS_ALIGNMENT) { + return Err(KvmError::Misaligned); + } + + let its = self + .vm + .create_device(kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_ITS, 0) + .map_err(kvm::Error::CreateDevice)?; + + // SAFETY: passing the right type for the attribute. + unsafe { + its.set_device_attr::( + KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, + &its_base, + 0, + ) + .map_err(kvm::Error::SetDeviceAttr)?; + } + + // Initialize the ITS device. + // + // SAFETY: passing the right type for the attribute. 
+ unsafe { + its.set_device_attr::<()>( + KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, + &(), + 0, + ) + .map_err(kvm::Error::SetDeviceAttr)?; + } + + Ok(its) + } + fn set_timer_ppis(&mut self, virt: u32, phys: u32) -> Result<(), KvmError> { // SAFETY: passing the right type for the attribute. unsafe { @@ -746,6 +797,14 @@ impl virt::ProtoPartition for KvmProtoPartition<'_> { GicVersion::V2 { cpu_interface_base } => self.add_gicv2(cpu_interface_base)?, }; + // Create the ITS device after the GIC, if configured. + let gic_msi = self.config.processor_topology.gic_msi(); + let its_device = if let GicMsiController::Its(its_info) = &gic_msi { + Some(self.add_its(its_info.its_base)?) + } else { + None + }; + // Configure the virtual timer PPI from topology. KVM also requires // a physical timer PPI, but we don't expose it to the guest. self.set_timer_ppis( @@ -787,7 +846,8 @@ impl virt::ProtoPartition for KvmProtoPartition<'_> { gsi_routing: Mutex::new(GsiRouting::new()), caps, _gic_device: gic_device, - gic_v2m: self.config.processor_topology.gic_v2m(), + _its_device: its_device, + gic_msi, gic_nr_irqs: self.config.processor_topology.gic_nr_irqs(), synic_ports: Default::default(), }); @@ -834,16 +894,25 @@ impl virt::Partition for KvmPartition { } fn as_signal_msi(&self, _minimum_vtl: Vtl) -> Option> { - let v2m = self.inner.gic_v2m.as_ref()?; - let irqcon = self.inner.clone() as Arc; - Some(Arc::new(virt::aarch64::gic_v2m::GicV2mSignalMsi::new( - v2m, irqcon, - ))) + match &self.inner.gic_msi { + GicMsiController::Its(its) => Some(Arc::new(GicItsSignalMsi { + kvm: self.inner.clone(), + translater_addr: its.its_base + GITS_TRANSLATER_OFFSET, + })), + GicMsiController::V2m(v2m) => { + let irqcon = self.inner.clone() as Arc; + Some(Arc::new(virt::aarch64::gic_v2m::GicV2mSignalMsi::new( + v2m, irqcon, + ))) + } + GicMsiController::None => None, + } } fn irqfd(&self) -> Option> { - // The irqfd implementation requires a GICv2m frame to be present. 
- self.inner.gic_v2m?; + if matches!(self.inner.gic_msi, GicMsiController::None) { + return None; + } Some(self.irqfd_state.clone()) } @@ -923,11 +992,12 @@ impl MsiRouteBuilder for KvmGicV2mRouteBuilder { partition: &KvmPartitionInner, address: u64, data: u32, + _devid: Option, ) -> Option { - let v2m = partition - .gic_v2m - .as_ref() - .expect("partition does not expose a GICv2m MSI frame"); + let v2m = match &partition.gic_msi { + GicMsiController::V2m(v2m) => v2m, + _ => panic!("partition does not expose a GICv2m MSI frame"), + }; let setspi_addr = v2m.frame_base + GicV2mRegister::SETSPI_NS.0 as u64; if address != setspi_addr { return None; @@ -941,13 +1011,94 @@ impl MsiRouteBuilder for KvmGicV2mRouteBuilder { } } +/// ITS MSI route builder for irqfd-based interrupt delivery. +/// +/// The ITS GITS_TRANSLATER address is `its_base + 0x10040`. All MSI writes +/// target this single address; the device ID distinguishes sources. +struct KvmItsRouteBuilder { + /// GITS_TRANSLATER physical address. + translater_addr: u64, +} + +/// Offset of the GITS_TRANSLATER register from the ITS base. +/// It's in the second 64 KiB page at offset 0x40. +const GITS_TRANSLATER_OFFSET: u64 = 0x10040; + +impl MsiRouteBuilder for KvmItsRouteBuilder { + fn routing_entry( + &self, + _partition: &KvmPartitionInner, + address: u64, + data: u32, + devid: Option, + ) -> Option { + if address != self.translater_addr { + return None; + } + Some(kvm::RoutingEntry::Msi { + address_lo: address as u32, + address_hi: (address >> 32) as u32, + data, + devid, + }) + } +} + +/// A [`SignalMsi`](pci_core::msi::SignalMsi) implementation that injects MSIs +/// via `KVM_SIGNAL_MSI` with the `KVM_MSI_VALID_DEVID` flag for ITS routing. +struct GicItsSignalMsi { + kvm: Arc, + /// GITS_TRANSLATER physical address. 
+ translater_addr: u64, +} + +impl pci_core::msi::SignalMsi for GicItsSignalMsi { + fn signal_msi(&self, devid: Option, address: u64, data: u32) { + if address != self.translater_addr { + tracelimit::warn_ratelimited!( + address, + data, + expected = self.translater_addr, + "unexpected MSI address (expected ITS GITS_TRANSLATER)" + ); + return; + } + let (flags, raw_devid) = match devid { + Some(id) => (kvm::KVM_MSI_VALID_DEVID, id), + None => (0, 0), + }; + let msi = kvm::kvm_msi { + address_lo: address as u32, + address_hi: (address >> 32) as u32, + data, + flags, + devid: raw_devid, + pad: [0; 12], + }; + if let Err(err) = self.kvm.kvm.request_msi(&msi) { + tracelimit::warn_ratelimited!( + ?devid, + data, + err = &err as &dyn std::error::Error, + "failed to signal MSI via ITS" + ); + } + } +} + impl virt::irqfd::IrqFd for KvmIrqFdState { fn new_irqfd_route(&self) -> anyhow::Result> { - assert!( - self.partition.gic_v2m.is_some(), - "GICv2m is required for irqfd support" - ); - Ok(Box::new(self.new_irqfd_route(KvmGicV2mRouteBuilder)?)) + match &self.partition.gic_msi { + GicMsiController::Its(its) => { + Ok(Box::new(self.new_irqfd_route(KvmItsRouteBuilder { + translater_addr: its.its_base + GITS_TRANSLATER_OFFSET, + })?)) + } + GicMsiController::V2m(_) => Ok(Box::new(self.new_irqfd_route(KvmGicV2mRouteBuilder)?)), + GicMsiController::None => { + anyhow::bail!("no MSI controller configured for irqfd") + } + } } } @@ -994,6 +1145,7 @@ impl virt::Hypervisor for Kvm { virt::PlatformInfo { platform_gsiv: None, supports_gic_v3: self.supports_gic_v3, + supports_its: self.supports_its, } } diff --git a/vmm_core/virt_kvm/src/arch/x86_64/mod.rs b/vmm_core/virt_kvm/src/arch/x86_64/mod.rs index 4c91f12b13..c12c6d083e 100644 --- a/vmm_core/virt_kvm/src/arch/x86_64/mod.rs +++ b/vmm_core/virt_kvm/src/arch/x86_64/mod.rs @@ -851,6 +851,7 @@ impl MsiRouteBuilder for KvmX86MsiRouteBuilder { _partition: &KvmPartitionInner, address: u64, data: u32, + _devid: Option, ) -> Option { let 
KvmMsi { address_lo, @@ -861,6 +862,7 @@ impl MsiRouteBuilder for KvmX86MsiRouteBuilder { address_lo, address_hi, data, + devid: None, }) } } @@ -883,6 +885,7 @@ impl IoApicRouting for KvmPartitionInner { address_lo, address_hi, data, + devid: None, }), None => { tracelimit::warn_ratelimited!( @@ -1456,7 +1459,7 @@ impl GuestEventPort for KvmGuestEventPort { } impl SignalMsi for KvmPartitionInner { - fn signal_msi(&self, _rid: u32, address: u64, data: u32) { + fn signal_msi(&self, _devid: Option, address: u64, data: u32) { self.request_msi(MsiRequest { address, data }); } } diff --git a/vmm_core/virt_kvm/src/gsi.rs b/vmm_core/virt_kvm/src/gsi.rs index f070d81311..97eb79710e 100644 --- a/vmm_core/virt_kvm/src/gsi.rs +++ b/vmm_core/virt_kvm/src/gsi.rs @@ -216,6 +216,7 @@ pub(crate) trait MsiRouteBuilder: Send + Sync { partition: &KvmPartitionInner, address: u64, data: u32, + devid: Option, ) -> Option; } @@ -224,9 +225,9 @@ impl IrqFdRoute for KvmIrqFdRoute { &self.event } - fn enable(&self, address: u64, data: u32) { + fn enable(&self, address: u64, data: u32, devid: Option) { if let Some(partition) = self.route.partition.upgrade() { - if let Some(entry) = self.builder.routing_entry(&partition, address, data) { + if let Some(entry) = self.builder.routing_entry(&partition, address, data, devid) { self.route.inner.enable(&partition, entry); } else { tracelimit::warn_ratelimited!( diff --git a/vmm_core/virt_kvm/src/lib.rs b/vmm_core/virt_kvm/src/lib.rs index 91609f3214..de9901ce33 100644 --- a/vmm_core/virt_kvm/src/lib.rs +++ b/vmm_core/virt_kvm/src/lib.rs @@ -110,9 +110,14 @@ struct KvmPartitionInner { #[cfg(guest_arch = "aarch64")] #[inspect(skip)] _gic_device: kvm::Device, + /// The ITS device fd, kept alive for the VM lifetime. #[cfg(guest_arch = "aarch64")] #[inspect(skip)] - gic_v2m: Option, + _its_device: Option, + /// MSI controller configuration (v2m, ITS, or none). 
+ #[cfg(guest_arch = "aarch64")] + #[inspect(skip)] + gic_msi: vm_topology::processor::aarch64::GicMsiController, /// Total configured GIC interrupt count (SGIs + PPIs + SPIs). #[cfg(guest_arch = "aarch64")] gic_nr_irqs: u32, diff --git a/vmm_core/virt_mshv/src/aarch64/mod.rs b/vmm_core/virt_mshv/src/aarch64/mod.rs index 038ded7646..9e95dfbe91 100644 --- a/vmm_core/virt_mshv/src/aarch64/mod.rs +++ b/vmm_core/virt_mshv/src/aarch64/mod.rs @@ -57,6 +57,7 @@ impl virt::Hypervisor for LinuxMshv { platform_gsiv: None, // TODO: query from hypervisor supports_gic_v3: true, + supports_its: false, } } @@ -192,7 +193,7 @@ impl virt::Partition for MshvPartition { } fn request_msi(&self, _vtl: Vtl, request: MsiRequest) { - self.inner.signal_msi(0, request.address, request.data); + self.inner.signal_msi(None, request.address, request.data); } fn request_yield(&self, vp_index: VpIndex) { @@ -294,7 +295,7 @@ impl virt::DeviceBuilder for MshvPartition { } impl SignalMsi for MshvPartitionInner { - fn signal_msi(&self, _rid: u32, _address: u64, data: u32) { + fn signal_msi(&self, _devid: Option, _address: u64, data: u32) { self.set_spi_irq(data, true); } } diff --git a/vmm_core/virt_mshv/src/irqfd.rs b/vmm_core/virt_mshv/src/irqfd.rs index 45ba2e4fcf..da17abdd9b 100644 --- a/vmm_core/virt_mshv/src/irqfd.rs +++ b/vmm_core/virt_mshv/src/irqfd.rs @@ -240,7 +240,7 @@ impl IrqFdRoute for MshvIrqFdRoute { &self.event } - fn enable(&self, address: u64, data: u32) { + fn enable(&self, address: u64, data: u32, _devid: Option) { let mut armed = self.armed.lock(); let route = MsiRoute { address_lo: address as u32, diff --git a/vmm_core/virt_mshv/src/x86_64/mod.rs b/vmm_core/virt_mshv/src/x86_64/mod.rs index 3114f350d3..89848d5ac2 100644 --- a/vmm_core/virt_mshv/src/x86_64/mod.rs +++ b/vmm_core/virt_mshv/src/x86_64/mod.rs @@ -479,7 +479,7 @@ impl MshvPartitionInner { } impl SignalMsi for MshvPartitionInner { - fn signal_msi(&self, _rid: u32, address: u64, data: u32) { + fn signal_msi(&self, 
_devid: Option, address: u64, data: u32) { self.request_msi(MsiRequest { address, data }); } } diff --git a/vmm_core/virt_whp/src/device.rs b/vmm_core/virt_whp/src/device.rs index f170b4f84e..af7dce3804 100644 --- a/vmm_core/virt_whp/src/device.rs +++ b/vmm_core/virt_whp/src/device.rs @@ -106,7 +106,7 @@ impl Drop for Device { } impl SignalMsi for Device { - fn signal_msi(&self, _rid: u32, address: u64, data: u32) { + fn signal_msi(&self, _devid: Option, address: u64, data: u32) { if let Err(err) = self.device().interrupt(address, data) { tracelimit::warn_ratelimited!( address, diff --git a/vmm_core/virt_whp/src/lib.rs b/vmm_core/virt_whp/src/lib.rs index 71432b757d..7611128055 100644 --- a/vmm_core/virt_whp/src/lib.rs +++ b/vmm_core/virt_whp/src/lib.rs @@ -120,7 +120,7 @@ struct WhpPartitionInner { isolation: IsolationType, #[cfg(guest_arch = "aarch64")] #[inspect(skip)] - gic_v2m: Option, + gic_msi: vm_topology::processor::aarch64::GicMsiController, synic_ports: virt::synic::SynicPortMap, } @@ -554,7 +554,10 @@ impl virt::Partition for WhpPartition { #[cfg(guest_arch = "aarch64")] fn as_signal_msi(&self, minimum_vtl: Vtl) -> Option> { - let v2m = self.inner.gic_v2m.as_ref()?; + let v2m = match &self.inner.gic_msi { + vm_topology::processor::aarch64::GicMsiController::V2m(v2m) => v2m, + _ => return None, + }; let irqcon = self.with_vtl(minimum_vtl).clone() as Arc; Some(Arc::new(virt::aarch64::gic_v2m::GicV2mSignalMsi::new( v2m, irqcon, @@ -774,6 +777,7 @@ impl virt::Hypervisor for Whp { virt::PlatformInfo { platform_gsiv: Some(WHP_PMU_GSIV), supports_gic_v3: true, + supports_its: false, } } } @@ -1144,7 +1148,7 @@ impl WhpPartitionInner { hvstate, isolation: proto_config.isolation, #[cfg(guest_arch = "aarch64")] - gic_v2m: proto_config.processor_topology.gic_v2m(), + gic_msi: proto_config.processor_topology.gic_msi(), synic_ports: Default::default(), }; @@ -1344,7 +1348,10 @@ impl VtlPartition { // (GICD_TYPER.LPIS=0) so Linux uses the GICv2m MSI frame // instead 
of ITS for PCIe MSIs. Otherwise keep LPI // enabled (1 ID bit minimum). - GicLpiIntIdBits: if config.processor_topology.gic_v2m().is_some() { + GicLpiIntIdBits: if matches!( + config.processor_topology.gic_msi(), + vm_topology::processor::aarch64::GicMsiController::V2m(_) + ) { 0 } else { 1 diff --git a/vmm_core/virt_whp/src/synic.rs b/vmm_core/virt_whp/src/synic.rs index e2a591af0c..e4fb04f789 100644 --- a/vmm_core/virt_whp/src/synic.rs +++ b/vmm_core/virt_whp/src/synic.rs @@ -441,7 +441,7 @@ mod x86 { use virt::irqcon::MsiRequest; impl SignalMsi for WhpPartitionAndVtl { - fn signal_msi(&self, _rid: u32, address: u64, data: u32) { + fn signal_msi(&self, _devid: Option, address: u64, data: u32) { if let Err(err) = self .partition .interrupt(self.vtl, MsiRequest { address, data }) diff --git a/vmm_core/vmotherboard/src/base_chipset.rs b/vmm_core/vmotherboard/src/base_chipset.rs index 4c2d602870..3d19582bd9 100644 --- a/vmm_core/vmotherboard/src/base_chipset.rs +++ b/vmm_core/vmotherboard/src/base_chipset.rs @@ -918,9 +918,15 @@ mod weak_mutex_pci { port: u8, name: Arc, dev: Weak>, + device_id: Option, ) -> Result<(), PcieConflict> { self.lock() - .add_pcie_device(port, name.clone(), Box::new(WeakMutexPciDeviceWrapper(dev))) + .add_pcie_device( + port, + name.clone(), + Box::new(WeakMutexPciDeviceWrapper(dev)), + device_id, + ) .map_err(|existing_dev_name| PcieConflict { reason: PcieConflictReason::ExistingDev(existing_dev_name), conflict_dev: name, @@ -939,9 +945,15 @@ mod weak_mutex_pci { port: u8, name: Arc, dev: Weak>, + device_id: Option, ) -> Result<(), PcieConflict> { self.lock() - .add_pcie_device(port, &name, Box::new(WeakMutexPciDeviceWrapper(dev))) + .add_pcie_device( + port, + &name, + Box::new(WeakMutexPciDeviceWrapper(dev)), + device_id, + ) .map_err(|err| PcieConflict { reason: PcieConflictReason::ExistingDev(err.to_string().into()), conflict_dev: name, diff --git a/vmm_core/vmotherboard/src/chipset/backing/arc_mutex/device.rs 
b/vmm_core/vmotherboard/src/chipset/backing/arc_mutex/device.rs index 52b1dae6d1..91eed2a311 100644 --- a/vmm_core/vmotherboard/src/chipset/backing/arc_mutex/device.rs +++ b/vmm_core/vmotherboard/src/chipset/backing/arc_mutex/device.rs @@ -75,6 +75,7 @@ pub struct ArcMutexChipsetDeviceBuilder<'a, 'b, T> { pci_addr: Option<(u8, u8, u8)>, pci_bus_id: Option, pcie_port: Option, + pcie_device_id: Option, external_pci: bool, } @@ -102,6 +103,7 @@ where pci_addr: None, pci_bus_id: None, pcie_port: None, + pcie_device_id: None, external_pci: false, } } @@ -131,6 +133,14 @@ where self } + /// For PCIe devices: set the shared device identity for RID/device ID + /// tracking. The downstream port will update this with the device's + /// RID when the guest programs the secondary bus number. + pub fn with_pci_device_id(mut self, device_id: pcie::bus_range::AssignedBusRange) -> Self { + self.pcie_device_id = Some(device_id); + self + } + /// For PCI devices: do not register the device with any PCI bus. This is /// used when the device is hooked up to a bus (such as a VPCI bus) outside /// of the vmotherboard infrastructure. @@ -175,7 +185,8 @@ where } if let Some(bus_id_port) = self.pcie_port { - self.services.register_static_pcie(bus_id_port); + self.services + .register_static_pcie(bus_id_port, self.pcie_device_id.take()); } else { // static pci registration let bdf = match (self.pci_addr, dev.suggested_bdf()) { diff --git a/vmm_core/vmotherboard/src/chipset/backing/arc_mutex/pci.rs b/vmm_core/vmotherboard/src/chipset/backing/arc_mutex/pci.rs index c6df3cde7e..f9b096c317 100644 --- a/vmm_core/vmotherboard/src/chipset/backing/arc_mutex/pci.rs +++ b/vmm_core/vmotherboard/src/chipset/backing/arc_mutex/pci.rs @@ -84,6 +84,7 @@ pub trait RegisterWeakMutexPcie: Send { port: u8, name: Arc, device: Weak>, + device_id: Option, ) -> Result<(), PcieConflict>; /// Enumerate the downstream ports. 
@@ -94,6 +95,7 @@ pub struct WeakMutexPcieDeviceEntry { pub bus_id_port: BusIdPcieDownstreamPort, pub name: Arc, pub dev: Weak>, + pub device_id: Option, } #[derive(Default)] @@ -111,6 +113,7 @@ impl BusResolverWeakMutexPcie { bus_id_port, name, dev, + device_id, } in self.devices { let (port_number, bus_id_enumerator) = match self.ports.get(&bus_id_port) { @@ -135,7 +138,7 @@ impl BusResolverWeakMutexPcie { } }; - match enumerator.add_pcie_device(*port_number, name, dev) { + match enumerator.add_pcie_device(*port_number, name, dev, device_id) { Ok(()) => {} Err(conflict) => { errs.push(conflict); diff --git a/vmm_core/vmotherboard/src/chipset/backing/arc_mutex/services.rs b/vmm_core/vmotherboard/src/chipset/backing/arc_mutex/services.rs index fc5c4bcf88..7446246431 100644 --- a/vmm_core/vmotherboard/src/chipset/backing/arc_mutex/services.rs +++ b/vmm_core/vmotherboard/src/chipset/backing/arc_mutex/services.rs @@ -195,11 +195,16 @@ impl<'a, 'b> ArcMutexChipsetServices<'a, 'b> { ); } - pub fn register_static_pcie(&mut self, bus_id: BusIdPcieDownstreamPort) { + pub fn register_static_pcie( + &mut self, + bus_id: BusIdPcieDownstreamPort, + device_id: Option, + ) { self.builder.register_weak_mutex_pcie_device( bus_id, self.dev_name.clone(), self.dev.clone(), + device_id, ); } diff --git a/vmm_core/vmotherboard/src/chipset/builder/mod.rs b/vmm_core/vmotherboard/src/chipset/builder/mod.rs index 74872d0006..271c440c68 100644 --- a/vmm_core/vmotherboard/src/chipset/builder/mod.rs +++ b/vmm_core/vmotherboard/src/chipset/builder/mod.rs @@ -243,6 +243,7 @@ impl<'a> ChipsetBuilder<'a> { bus_id_port: BusIdPcieDownstreamPort, name: Arc, dev: Weak>, + device_id: Option, ) { self.inner .lock() @@ -253,6 +254,7 @@ impl<'a> ChipsetBuilder<'a> { bus_id_port, name, dev, + device_id, }); } diff --git a/vmm_core/vmotherboard/src/lib.rs b/vmm_core/vmotherboard/src/lib.rs index e5c0070c10..4038d094f8 100644 --- a/vmm_core/vmotherboard/src/lib.rs +++ b/vmm_core/vmotherboard/src/lib.rs @@ 
-25,6 +25,7 @@ pub use self::chipset::DynamicDeviceUnit; // _all_ device instantiation into `vmotherboard` itself. pub use self::chipset::ChipsetBuilder; pub use self::chipset::backing::arc_mutex::device::ArcMutexChipsetDeviceBuilder; +pub use pcie::bus_range::AssignedBusRange; use chipset_device::ChipsetDevice; use inspect::InspectMut;