diff --git a/vm/devices/net/mana_driver/src/queues.rs b/vm/devices/net/mana_driver/src/queues.rs
index c69e2ea7a8..8a623d9a57 100644
--- a/vm/devices/net/mana_driver/src/queues.rs
+++ b/vm/devices/net/mana_driver/src/queues.rs
@@ -69,7 +69,8 @@ pub struct DoorbellPage {
 }
 
 impl DoorbellPage {
-    pub(crate) fn null() -> Self {
+    /// Returns a null doorbell page that silently discards all writes.
+    pub fn null() -> Self {
         Self {
             doorbell: Arc::new(NullDoorbell),
             doorbell_id: 0,
diff --git a/vm/devices/net/net_mana/src/lib.rs b/vm/devices/net/net_mana/src/lib.rs
index 621129b5a1..2a7aeb3ab1 100644
--- a/vm/devices/net/net_mana/src/lib.rs
+++ b/vm/devices/net/net_mana/src/lib.rs
@@ -14,9 +14,11 @@ use gdma_defs::Cqe;
 use gdma_defs::CqeParams;
 use gdma_defs::GDMA_EQE_COMPLETION;
 use gdma_defs::Sge;
+use gdma_defs::WQE_ALIGNMENT;
 use gdma_defs::WqeHeader;
 use gdma_defs::bnic::CQE_RX_OKAY;
 use gdma_defs::bnic::CQE_TX_GDMA_ERR;
+use gdma_defs::bnic::CQE_TX_INVALID_ETH_TYPE;
 use gdma_defs::bnic::CQE_TX_INVALID_OOB;
 use gdma_defs::bnic::CQE_TX_OKAY;
 use gdma_defs::bnic::MANA_LONG_PKT_FMT;
@@ -642,6 +644,10 @@ struct PostedTx {
     id: TxId,
     wqe_len: u32,
     bounced_len_with_padding: u32,
+    /// GPA of the first data segment, used for error tracing.
+    head_gpa: u64,
+    /// Length of the first data segment, used for error tracing.
+    head_len: u32,
 }
 
 #[derive(Default, Inspect)]
@@ -835,6 +841,40 @@ impl ManaQueue {
         }
     }
 
+    fn trace_tx_eth_frame(
+        &mut self,
+        tracing_level: tracing::Level,
+        wqe_offset: u32,
+        guest_memory: &GuestMemory,
+    ) {
+        let Some(packet) = self.posted_tx.front() else {
+            return;
+        };
+        // Determine the location of the first SGE. If `bounced_len_with_padding` > 0,
+        // some part of the packet was bounced.
+        let head_gpa = packet.head_gpa;
+        let head_len = packet.head_len;
+        if packet.bounced_len_with_padding > 0 {
+            // Look for the head SGE in the bounce buffer.
+            let found_in_bounce_buffer = trace_tx_eth_frame_from_bounce_buffer(
+                &mut self.tx_wq,
+                &self.tx_bounce_buffer,
+                tracing_level,
+                wqe_offset,
+            );
+            // In the DirectDma tail-coalesce path, the head segment is not always
+            // bounced. If it is not found in the bounce buffer, read it from guest
+            // memory instead. Only attempt tracing when the head segment is at least
+            // an Ethernet II header long; otherwise the EtherType must be in a later
+            // segment. We don't walk the SGL, since that case should be rare.
+            if !found_in_bounce_buffer && head_len as usize >= MIN_ETH_HEADER_LEN {
+                trace_tx_eth_frame_from_guest_memory(tracing_level, head_gpa, guest_memory);
+            }
+        } else if head_len as usize >= MIN_ETH_HEADER_LEN {
+            trace_tx_eth_frame_from_guest_memory(tracing_level, head_gpa, guest_memory);
+        }
+    }
+
     fn trace_rx_wqe_from_offset(&mut self, wqe_offset: u32) {
         let size = size_of::<WqeHeader>();
         let bytes = self.rx_wq.read(wqe_offset, size);
@@ -1061,14 +1101,14 @@ impl Queue for ManaQueue {
     fn tx_poll(
         &mut self,
-        _pool: &mut dyn BufferAccess,
+        pool: &mut dyn BufferAccess,
         done: &mut [TxId],
     ) -> Result<usize, TxError> {
         let mut i = 0;
         while i < done.len() {
             let id = if let Some(cqe) = self.tx_cq.pop() {
                 let tx_oob = ManaTxCompOob::read_from_prefix(&cqe.data[..]).unwrap().0; // TODO: zerocopy: use-rest-of-range (https://github.com/microsoft/openvmm/issues/759)
-                self.handle_tx_cqe(&tx_oob, cqe.params, done.len())?
+                self.handle_tx_cqe(&tx_oob, cqe.params, done.len(), pool.guest_memory())?
             } else if let Some(id) = self.dropped_tx.pop_front() {
                 self.stats.tx_dropped.increment();
                 id
@@ -1119,6 +1159,7 @@ impl ManaQueue {
         tx_oob: &ManaTxCompOob,
         cqe_params: CqeParams,
         done_len: usize,
+        guest_memory: &GuestMemory,
     ) -> Result<TxId, TxError> {
         match tx_oob.cqe_hdr.cqe_type() {
             CQE_TX_OKAY => {
@@ -1139,6 +1180,14 @@ impl ManaQueue {
                 self.stats.tx_errors.increment();
                 self.trace_tx(tracing::Level::WARN, cqe_params, tx_oob, done_len);
             }
+            CQE_TX_INVALID_ETH_TYPE => {
+                // The hardware rejected the packet due to an unsupported EtherType.
+                // Non-fatal: only the individual packet is dropped; the queue continues.
+                self.stats.tx_errors.increment();
+                let wqe_offset = tx_oob.offsets.tx_wqe_offset();
+                self.trace_tx(tracing::Level::WARN, cqe_params, tx_oob, done_len);
+                self.trace_tx_eth_frame(tracing::Level::WARN, wqe_offset, guest_memory);
+            }
             ty => {
                 tracelimit::error_ratelimited!(
                     ty,
@@ -1413,6 +1462,8 @@ impl ManaQueue {
             id: meta.id,
             wqe_len,
             bounced_len_with_padding: bounce_buffer.commit(),
+            head_gpa: head.gpa,
+            head_len: head.len,
         };
         Ok(Some(tx))
     }
@@ -1576,6 +1627,99 @@ impl ContiguousBufferManager {
     pub fn as_slice(&self) -> &[AtomicU8] {
         self.mem.as_slice()
     }
+
+    /// Given a GPA (DMA address) from an SGE, returns the corresponding
+    /// offset in the bounce buffer, or `None` if the GPA does not belong
+    /// to this buffer.
+    fn gpa_to_offset(&self, gpa: u64) -> Option<u32> {
+        let page_pfn = gpa / PAGE_SIZE64;
+        let offset_in_page = (gpa - page_pfn * PAGE_SIZE64) as u32;
+        let page_idx = self.mem.pfns().iter().position(|&pfn| pfn == page_pfn)? as u32;
+        Some(page_idx * PAGE_SIZE32 + offset_in_page)
+    }
+}
+
+/// Logs the EtherType of an Ethernet II frame.
+/// MAC addresses are omitted for customer privacy.
+fn log_eth_type(tracing_level: tracing::Level, eth_type_bytes: [u8; 2]) {
+    tracelimit::event_ratelimited!(
+        tracing_level,
+        eth_type = format_args!("{:#06x}", u16::from_be_bytes(eth_type_bytes)),
+        "tx ethernet frame"
+    );
+}
+
+/// Byte offset of the EtherType field within an Ethernet II frame (after
+/// the 6-byte destination MAC and 6-byte source MAC addresses).
+const ETH_TYPE_OFFSET: usize = 12;
+
+/// Minimum length of an Ethernet II header (dst MAC 6 + src MAC 6 + EtherType 2).
+const MIN_ETH_HEADER_LEN: usize = 14;
+
+/// Attempts to read the first SGE from a TX WQE and log the EtherType
+/// from the bounce buffer. Returns `true` if the EtherType was found and logged.
+fn trace_tx_eth_frame_from_bounce_buffer(
+    tx_wq: &mut Wq,
+    tx_bounce_buffer: &ContiguousBufferManager,
+    tracing_level: tracing::Level,
+    wqe_offset: u32,
+) -> bool {
+    // Read 64 bytes total: two 32-byte (WQE_ALIGNMENT) chunks. This covers the
+    // 8-byte WQE header plus up to 56 bytes after it, which is enough to reach
+    // the first 16-byte SGE even when the inline client OOB consumes up to 40
+    // bytes after the header.
+    // Read in two chunks to handle ring-buffer wrap-around: `Wq::read` does not
+    // wrap a single read across the end of the ring, so each read must stay
+    // within a single alignment slot.
+    let mut bytes = tx_wq.read(wqe_offset, WQE_ALIGNMENT);
+    bytes.extend_from_slice(&tx_wq.read(wqe_offset.wrapping_add(1), WQE_ALIGNMENT));
+    let Ok((wqe_header, _)) = WqeHeader::read_from_prefix(&bytes) else {
+        return false;
+    };
+
+    if wqe_header.params.num_sgl_entries() == 0 {
+        return false;
+    }
+
+    let sge_start = size_of::<WqeHeader>() + wqe_header.sgl_offset();
+    let Some(sge_bytes) = bytes.get(sge_start..) else {
+        return false;
+    };
+    let Ok((sge, _)) = Sge::read_from_prefix(sge_bytes) else {
+        return false;
+    };
+
+    if (sge.size as usize) < MIN_ETH_HEADER_LEN {
+        return false;
+    }
+    let Some(buf_offset) = tx_bounce_buffer.gpa_to_offset(sge.address) else {
+        return false;
+    };
+    let start = buf_offset as usize + ETH_TYPE_OFFSET;
+    let Some(slot) = tx_bounce_buffer.as_slice().get(start..start + 2) else {
+        return false;
+    };
+    let mut eth_type_bytes = [0u8; 2];
+    slot.atomic_read(&mut eth_type_bytes);
+    log_eth_type(tracing_level, eth_type_bytes);
+    true
+}
+
+/// Reads the EtherType directly from guest memory at the given GPA.
+/// Returns `true` if the EtherType was found and logged.
+fn trace_tx_eth_frame_from_guest_memory(
+    tracing_level: tracing::Level,
+    head_gpa: u64,
+    guest_memory: &GuestMemory,
+) -> bool {
+    let mut eth_type_bytes = [0u8; 2];
+    let Some(addr) = head_gpa.checked_add(ETH_TYPE_OFFSET as u64) else {
+        return false;
+    };
+    if guest_memory.read_at(addr, &mut eth_type_bytes).is_err() {
+        return false;
+    }
+    log_eth_type(tracing_level, eth_type_bytes);
+    true
 }
 
 impl Inspect for ContiguousBufferManager {
@@ -1590,7 +1734,28 @@ mod tests {
     use super::*;
     use anyhow::{Result, anyhow, ensure};
+    use gdma_defs::WqeParams;
+    use mana_driver::queues::DoorbellPage;
+    use test_with_tracing::test;
+    use user_driver::DmaClient;
     use user_driver_emulated_mock::DeviceTestMemory;
+    use zerocopy::IntoBytes;
+
+    /// Builds a send WQ containing a single WQE with a single SGE.
+    fn build_tx_wq_with_sge(dma_client: &dyn DmaClient, sge: Sge) -> Wq {
+        let wqe_header = WqeHeader {
+            reserved: [0; 3],
+            last_vbytes: 0,
+            params: WqeParams::new()
+                .with_num_sgl_entries(1)
+                .with_inline_client_oob_size(gdma_defs::CLIENT_OOB_8),
+        };
+        let wq_mem = dma_client.allocate_dma_buffer(4096).unwrap();
+        wq_mem.write_at(0, wqe_header.as_bytes()); // [WqeHeader (8B)]
+        wq_mem.write_at(8, &[0u8; 8]); // [Short OOB (8B)] (placeholder)
+        wq_mem.write_at(16, sge.as_bytes()); // [SGE (16B)]
+        Wq::new_sq(wq_mem, DoorbellPage::null(), 0)
+    }
 
     #[test]
     fn page_counts_powers_of_two_only() -> Result<()> {
@@ -1614,4 +1779,170 @@ mod tests {
         Ok(())
     }
+
+    #[test]
+    fn test_gpa_to_offset_round_trip() {
+        // Allocate a multi-page bounce buffer so we can exercise different
+        // page indices and offsets within pages.
+        let page_count: u32 = 4;
+        let dtm = DeviceTestMemory::new((page_count * 2).into(), false, "test_gpa_to_offset");
+        let mut bounce = ContiguousBufferManager::new(dtm.dma_client(), page_count).unwrap();
+        let pfns = bounce.mem.pfns().to_vec();
+        assert_eq!(pfns.len(), page_count as usize);
+
+        // Drive GPAs through the production `reserve()` path and check that
+        // `gpa_to_offset` recovers the same offset that `reserve` reported.
+        // This guards against drift between `reserve` and `gpa_to_offset`.
+        let alloc_sizes = [1u32, 12, 13, PAGE_SIZE32 / 2, PAGE_SIZE32 - 1];
+        let mut tx = bounce.start_allocation();
+        let mut reserved = Vec::new();
+        for &len in &alloc_sizes {
+            let buf = tx.allocate(len).unwrap();
+            reserved.push(buf.reserve());
+        }
+        tx.commit();
+        for r in &reserved {
+            assert_eq!(
+                bounce.gpa_to_offset(r.gpa),
+                Some(r.offset),
+                "round-trip failed for offset={} gpa={:#x}",
+                r.offset,
+                r.gpa
+            );
+        }
+
+        // A GPA outside this buffer returns `None`.
+        assert_eq!(bounce.gpa_to_offset(0xDEAD_0000), None);
+
+        // A GPA one page before the first mapped page returns `None`.
+        let below = pfns[0].saturating_sub(1) * PAGE_SIZE64;
+        if below != pfns[0] * PAGE_SIZE64 {
+            assert_eq!(bounce.gpa_to_offset(below), None);
+        }
+    }
+
+    #[test]
+    fn test_trace_tx_eth_frame_bounce_buffer() {
+        let mem = DeviceTestMemory::new(16, false, "test_trace_tx_eth_frame");
+        let dma_client = mem.dma_client();
+
+        let bounce_buffer = ContiguousBufferManager::new(dma_client.clone(), 1).unwrap();
+        // EtherType 0x0800 (IPv4) at bytes 12-13.
+        bounce_buffer.mem.write_at(12, &0x0800u16.to_be_bytes());
+
+        let sge_gpa = bounce_buffer.mem.pfns()[0] * PAGE_SIZE64; // offset 0
+        let mut wq = build_tx_wq_with_sge(
+            dma_client.as_ref(),
+            Sge {
+                address: sge_gpa,
+                mem_key: 0,
+                size: 1500,
+            },
+        );
+
+        // Should parse the WQE, resolve the SGE GPA in the bounce buffer,
+        // and emit a rate-limited trace. The trace `eth_type: 0x0800` is
+        // visible when the test is run with `--nocapture`.
+        assert!(trace_tx_eth_frame_from_bounce_buffer(
+            &mut wq,
+            &bounce_buffer,
+            tracing::Level::WARN,
+            0,
+        ));
+
+        // An SGE with an out-of-range GPA should log nothing.
+        let mut wq = build_tx_wq_with_sge(
+            dma_client.as_ref(),
+            Sge {
+                address: 0xDEAD_0000,
+                mem_key: 0,
+                size: 1500,
+            },
+        );
+
+        assert!(!trace_tx_eth_frame_from_bounce_buffer(
+            &mut wq,
+            &bounce_buffer,
+            tracing::Level::WARN,
+            0,
+        ));
+    }
+
+    #[test]
+    fn test_trace_tx_eth_frame_bounce_buffer_ring_wrap() {
+        let mem = DeviceTestMemory::new(16, false, "test_wrap");
+        let dma_client = mem.dma_client();
+
+        let bounce_buffer = ContiguousBufferManager::new(dma_client.clone(), 1).unwrap();
+        // EtherType 0x0800 (IPv4) at bytes 12-13.
+        bounce_buffer.mem.write_at(12, &0x0800u16.to_be_bytes());
+
+        let sge_gpa = bounce_buffer.mem.pfns()[0] * PAGE_SIZE64;
+
+        // Use CLIENT_OOB_32 so the WQE prefix is exactly 64 bytes:
+        //   WqeHeader (8) + sgl_offset (40) + SGE (16) = 64
+        // Placing this at the last 32-byte slot forces the SGE to wrap
+        // around to the beginning of the ring.
+        let wqe_header = WqeHeader {
+            reserved: [0; 3],
+            last_vbytes: 0,
+            params: WqeParams::new()
+                .with_num_sgl_entries(1)
+                .with_inline_client_oob_size(gdma_defs::CLIENT_OOB_32),
+        };
+        let sge = Sge {
+            address: sge_gpa,
+            mem_key: 0,
+            size: 1500,
+        };
+
+        // Build the 64-byte WQE prefix linearly, then split it across the
+        // ring boundary at offset 4064.
+        let mut wqe_buf = [0u8; 64];
+        wqe_buf[..8].copy_from_slice(wqe_header.as_bytes());
+        // OOB + padding (bytes 8..48) left as zeros.
+        wqe_buf[48..64].copy_from_slice(sge.as_bytes());
+
+        let wq_mem = dma_client.allocate_dma_buffer(4096).unwrap();
+        let wrap_offset: usize = 4096 - 32;
+        wq_mem.write_at(wrap_offset, &wqe_buf[..32]); // last 32 bytes of the ring
+        wq_mem.write_at(0, &wqe_buf[32..]); // wraps to the beginning
+        let mut wq = Wq::new_sq(wq_mem, DoorbellPage::null(), 0);
+
+        // `wqe_offset` is in WQE alignment units (32 bytes).
+        let wqe_offset = (wrap_offset / 32) as u32;
+        assert!(trace_tx_eth_frame_from_bounce_buffer(
+            &mut wq,
+            &bounce_buffer,
+            tracing::Level::WARN,
+            wqe_offset,
+        ));
+    }
+
+    #[test]
+    fn test_trace_tx_eth_frame_guest_memory() {
+        use guestmem::GuestMemory;
+
+        let mut data = vec![0u8; 4096];
+        // Write EtherType 0x86DD (IPv6) at offset 12.
+        data[12] = 0x86;
+        data[13] = 0xDD;
+        let guest_memory = GuestMemory::allocate(4096);
+        guest_memory.write_at(0, &data).unwrap();
+
+        // Should read the EtherType out of guest memory and emit a rate-limited
+        // trace. The trace `eth_type: 0x86dd` is visible when the test is run
+        // with `--nocapture`.
+        assert!(trace_tx_eth_frame_from_guest_memory(
+            tracing::Level::WARN,
+            0,
+            &guest_memory
+        ));
+
+        // An out-of-range GPA should log nothing.
+        assert!(!trace_tx_eth_frame_from_guest_memory(
+            tracing::Level::WARN,
+            u64::MAX - 1,
+            &guest_memory
+        ));
+    }
 }
diff --git a/vm/devices/net/net_mana/src/test.rs b/vm/devices/net/net_mana/src/test.rs
index 244874ce8c..40128428cd 100644
--- a/vm/devices/net/net_mana/src/test.rs
+++ b/vm/devices/net/net_mana/src/test.rs
@@ -1106,7 +1106,8 @@ async fn tx_spurious_cqe_panics(driver: DefaultDriver) {
     let mut oob = ManaTxCompOob::new_zeroed();
     oob.cqe_hdr.set_cqe_type(CQE_TX_OKAY);
 
-    let _ = queue.handle_tx_cqe(&oob, CqeParams::new(), 8);
+    let guest_memory = guestmem::GuestMemory::allocate(4096);
+    let _ = queue.handle_tx_cqe(&oob, CqeParams::new(), 8, &guest_memory);
 }
 
 #[async_test]
@@ -1121,13 +1122,16 @@ async fn tx_cqe_gdma_err_returns_try_restart(driver: DefaultDriver) {
         id: TxId(42),
         wqe_len: 0,
         bounced_len_with_padding: 0,
+        head_gpa: 0,
+        head_len: 0,
     });
 
     let mut oob = ManaTxCompOob::new_zeroed();
     oob.cqe_hdr.set_cqe_type(CQE_TX_GDMA_ERR);
 
     // CQE_TX_GDMA_ERR returns TryRestart without popping posted_tx.
-    let result = queue.handle_tx_cqe(&oob, CqeParams::new(), 8);
+    let guest_memory = guestmem::GuestMemory::allocate(4096);
+    let result = queue.handle_tx_cqe(&oob, CqeParams::new(), 8, &guest_memory);
     assert!(
         matches!(result, Err(TxError::TryRestart(_))),
         "expected TryRestart, got {result:?}"
@@ -1152,13 +1156,16 @@ async fn tx_cqe_invalid_oob_completes_packet(driver: DefaultDriver) {
         id: TxId(7),
         wqe_len: 0,
         bounced_len_with_padding: 0,
+        head_gpa: 0,
+        head_len: 0,
     });
 
     let mut oob = ManaTxCompOob::new_zeroed();
     oob.cqe_hdr.set_cqe_type(CQE_TX_INVALID_OOB);
 
     // CQE_TX_INVALID_OOB logs an error but still pops posted_tx.
-    let result = queue.handle_tx_cqe(&oob, CqeParams::new(), 8);
+    let guest_memory = guestmem::GuestMemory::allocate(4096);
+    let result = queue.handle_tx_cqe(&oob, CqeParams::new(), 8, &guest_memory);
     assert_eq!(result.unwrap().0, 7);
     assert_eq!(queue.stats.tx_errors.get(), 1);
     assert!(queue.posted_tx.is_empty());
@@ -1179,12 +1186,15 @@ async fn tx_cqe_okay_completes_packet(driver: DefaultDriver) {
         id: TxId(99),
         wqe_len: 0,
         bounced_len_with_padding: 0,
+        head_gpa: 0,
+        head_len: 0,
     });
 
     let mut oob = ManaTxCompOob::new_zeroed();
     oob.cqe_hdr.set_cqe_type(CQE_TX_OKAY);
 
-    let result = queue.handle_tx_cqe(&oob, CqeParams::new(), 8);
+    let guest_memory = guestmem::GuestMemory::allocate(4096);
+    let result = queue.handle_tx_cqe(&oob, CqeParams::new(), 8, &guest_memory);
     assert_eq!(result.unwrap().0, 99);
     assert_eq!(queue.stats.tx_packets.get(), 1);
     assert!(queue.posted_tx.is_empty());
@@ -1193,3 +1203,55 @@ async fn tx_cqe_okay_completes_packet(driver: DefaultDriver) {
     drop(queue);
     endpoint.vport.destroy(arena).await;
     endpoint.stop().await;
 }
+
+#[async_test]
+async fn tx_cqe_invalid_eth_type_completes_packet(driver: DefaultDriver) {
+    use crate::PostedTx;
+    use gdma_defs::bnic::CQE_TX_INVALID_ETH_TYPE;
+
+    let (mut queue, arena, mut endpoint) = new_test_queue(&driver).await;
+
+    // Place an Ethernet header with EtherType 0x88CC (LLDP) at GPA 0 in
+    // guest memory; the CQE_TX_INVALID_ETH_TYPE arm will attempt to read
+    // it via `trace_tx_eth_frame` -> `trace_tx_eth_frame_from_guest_memory`.
+    let guest_memory = guestmem::GuestMemory::allocate(4096);
+    let mut eth_header = [0u8; 14];
+    eth_header[12] = 0x88;
+    eth_header[13] = 0xCC;
+    guest_memory.write_at(0, &eth_header).unwrap();
+
+    queue.posted_tx.push_back(PostedTx {
+        id: TxId(123),
+        wqe_len: 0,
+        bounced_len_with_padding: 0,
+        head_gpa: 0,
+        head_len: 14,
+    });
+
+    let mut oob = ManaTxCompOob::new_zeroed();
+    oob.cqe_hdr.set_cqe_type(CQE_TX_INVALID_ETH_TYPE);
+
+    // CQE_TX_INVALID_ETH_TYPE logs a warning, attempts to trace the
+    // EtherType, and pops posted_tx like a normal completion.
+    let result = queue.handle_tx_cqe(&oob, CqeParams::new(), 8, &guest_memory);
+    assert_eq!(result.unwrap().0, 123);
+    assert_eq!(queue.stats.tx_errors.get(), 1);
+    assert!(queue.posted_tx.is_empty());
+
+    // A short head segment must skip the guest-memory read without panicking.
+    queue.posted_tx.push_back(PostedTx {
+        id: TxId(124),
+        wqe_len: 0,
+        bounced_len_with_padding: 0,
+        head_gpa: 0,
+        head_len: 4, // shorter than MIN_ETH_HEADER_LEN (14)
+    });
+    let result = queue.handle_tx_cqe(&oob, CqeParams::new(), 8, &guest_memory);
+    assert_eq!(result.unwrap().0, 124);
+    assert_eq!(queue.stats.tx_errors.get(), 2);
+    assert!(queue.posted_tx.is_empty());
+
+    drop(queue);
+    endpoint.vport.destroy(arena).await;
+    endpoint.stop().await;
+}
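
For reference, a minimal standalone sketch (not part of the diff) of the EtherType extraction that both new tracing helpers perform once the head bytes are in hand. eth_type_of is a hypothetical name used only for illustration; the constants mirror ETH_TYPE_OFFSET and MIN_ETH_HEADER_LEN from the change.

// Illustrative sketch only, assuming the Ethernet II layout described above.
fn eth_type_of(frame_head: &[u8]) -> Option<u16> {
    const ETH_TYPE_OFFSET: usize = 12; // after dst MAC (6) + src MAC (6)
    const MIN_ETH_HEADER_LEN: usize = 14;
    if frame_head.len() < MIN_ETH_HEADER_LEN {
        // Too short: the EtherType lives in a later segment, so the helpers bail out.
        return None;
    }
    // The EtherType is big-endian on the wire.
    Some(u16::from_be_bytes([
        frame_head[ETH_TYPE_OFFSET],
        frame_head[ETH_TYPE_OFFSET + 1],
    ]))
}

fn main() {
    let mut frame = [0u8; 64];
    frame[12..14].copy_from_slice(&0x0800u16.to_be_bytes());
    assert_eq!(eth_type_of(&frame), Some(0x0800)); // IPv4
    assert_eq!(eth_type_of(&frame[..10]), None); // EtherType not in the head segment
    println!("eth_type = {:#06x}", eth_type_of(&frame).unwrap());
}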