diff --git a/lib/opte/src/engine/port/mod.rs b/lib/opte/src/engine/port/mod.rs index 69c7c584..f216fe9c 100644 --- a/lib/opte/src/engine/port/mod.rs +++ b/lib/opte/src/engine/port/mod.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! A virtual switch port. @@ -44,6 +44,7 @@ use super::rule::HdrTransformError; use super::rule::PresavedMeoi; use super::rule::Rule; use super::rule::TransformFlags; +use super::tcp::INCIPIENT_EXPIRE_TTL; use super::tcp::KEEPALIVE_EXPIRE_TTL; use super::tcp::TIME_WAIT_EXPIRE_TTL; use super::tcp_state::TcpFlowState; @@ -2992,14 +2993,16 @@ impl Dump for TcpFlowEntryState { /// Expiry behaviour for TCP flows dependent on the connection FSM. #[derive(Debug)] pub struct TcpExpiry { - time_wait_ttl: Ttl, + incipient_ttl: Ttl, + quiescent_ttl: Ttl, keepalive_ttl: Ttl, } impl Default for TcpExpiry { fn default() -> Self { Self { - time_wait_ttl: TIME_WAIT_EXPIRE_TTL, + incipient_ttl: INCIPIENT_EXPIRE_TTL, + quiescent_ttl: TIME_WAIT_EXPIRE_TTL, keepalive_ttl: KEEPALIVE_EXPIRE_TTL, } } @@ -3012,8 +3015,16 @@ impl ExpiryPolicy for TcpExpiry { now: Moment, ) -> bool { let ttl = match entry.state().tcp_state() { - TcpState::TimeWait => self.time_wait_ttl, - _ => self.keepalive_ttl, + TcpState::TimeWait + | TcpState::LastAck + | TcpState::CloseWait + | TcpState::FinWait1 + | TcpState::FinWait2 => self.quiescent_ttl, + TcpState::SynSent | TcpState::SynRcvd | TcpState::Listen => { + self.incipient_ttl + } + TcpState::Established => self.keepalive_ttl, + TcpState::Closed => Ttl::new_seconds(0), }; ttl.is_expired(entry.last_hit(), now) } diff --git a/lib/opte/src/engine/tcp.rs b/lib/opte/src/engine/tcp.rs index a4875539..905dc918 100644 --- a/lib/opte/src/engine/tcp.rs +++ b/lib/opte/src/engine/tcp.rs @@ -2,7 +2,7 @@ // License, v. 2.0. 
If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2025 Oxide Computer Company +// Copyright 2026 Oxide Computer Company //! TCP headers. @@ -16,11 +16,18 @@ pub const TCP_HDR_OFFSET_SHIFT: u8 = 4; pub const TCP_PORT_RDP: u16 = 3389; pub const TCP_PORT_SSH: u16 = 22; -/// The duration after which a connection in TIME-WAIT should be -/// considered free for either side to reuse. +/// The duration after which we can remove a TCP state entry which is still in +/// the three-way handshake. /// -/// This value is chosen by Windows and MacOS, which is larger -/// than Linux's default 60s. Allowances for tuned servers and/or +/// This value is set very low to prevent SYN-flood-like traffic (or many +/// unacknowledged SYNs from the guest) from holding TCP flow entry slots for +/// the full [`KEEPALIVE_EXPIRE_SECS`]. +pub const INCIPIENT_EXPIRE_SECS: u64 = 5; +/// The duration after which a connection in TIME-WAIT or another closing state +/// should be considered free for either side to reuse. +/// +/// This value is chosen from the TIME-WAIT duration of Windows and MacOS, which +/// is larger than Linux's default 60s. Allowances for tuned servers and/or /// more aggressive reuse via RFCs 1323/7323 and/or 6191 are made in /// `tcp_state`. pub const TIME_WAIT_EXPIRE_SECS: u64 = 120; @@ -31,6 +38,7 @@ pub const TIME_WAIT_EXPIRE_SECS: u64 = 120; /// keepalive, when interval + probe count will result in a timeout after /// 8mins (illumos) / 11mins (linux). 
pub const KEEPALIVE_EXPIRE_SECS: u64 = 8_000; +pub const INCIPIENT_EXPIRE_TTL: Ttl = Ttl::new_seconds(INCIPIENT_EXPIRE_SECS); pub const TIME_WAIT_EXPIRE_TTL: Ttl = Ttl::new_seconds(TIME_WAIT_EXPIRE_SECS); pub const KEEPALIVE_EXPIRE_TTL: Ttl = Ttl::new_seconds(KEEPALIVE_EXPIRE_SECS); diff --git a/lib/opte/src/engine/tcp_state.rs b/lib/opte/src/engine/tcp_state.rs index 33ee93e9..cd40f83a 100644 --- a/lib/opte/src/engine/tcp_state.rs +++ b/lib/opte/src/engine/tcp_state.rs @@ -148,26 +148,32 @@ impl TcpFlowState { match self.tcp_state { Closed => { - // We have a new inbound SYN. We assume for now the - // guest is listening on the given port by moving to - // the LISTEN state. - if flags.contains(IngotTcpFlags::SYN) { - return Some(Listen); - } - - // We pontentially have a legitimate inbound data + // We potentially have a legitimate inbound data // segment for an ESTABLISHED connection that // previously expired in OPTE but is still active in - // the guest. We immeidately move this to the + // the guest. We immediately move this to the // ESTABLISHED state even though that might be a lie. // We rely on the fact that the guest will immediately // respond with an ACK or RST. In the future we could // instead keep this in some type of probationary // state (or separate table). + // + // Alternately, we've received a SYN-ACK, but don't have + // state indicating that we sent an initial SYN because + // the remote half took longer than the incipient expiry + // period to respond. In this case, this is identical to + // the transition from `SynSent`. if flags.contains(IngotTcpFlags::ACK) { return Some(Established); } + // We have a new inbound SYN. We assume for now the + // guest is listening on the given port by moving to + // the LISTEN state. 
+ if flags.contains(IngotTcpFlags::SYN) { + return Some(Listen); + } + None } diff --git a/lib/oxide-vpc/tests/integration_tests.rs b/lib/oxide-vpc/tests/integration_tests.rs index e23f424a..7472883c 100644 --- a/lib/oxide-vpc/tests/integration_tests.rs +++ b/lib/oxide-vpc/tests/integration_tests.rs @@ -15,6 +15,7 @@ use common::icmp::*; use common::*; +use opte::api::L4Info; use opte::api::MacAddr; use opte::api::OpteError; use opte::api::TcpState; @@ -48,6 +49,7 @@ use opte::engine::port::DropReason; use opte::engine::port::ProcessError; use opte::engine::port::ProcessResult; use opte::engine::rule::MappingResource; +use opte::engine::tcp::INCIPIENT_EXPIRE_SECS; use opte::engine::tcp::TIME_WAIT_EXPIRE_SECS; use opte::ingot::ethernet::Ethertype; use opte::ingot::geneve::GeneveRef; @@ -3708,6 +3710,103 @@ fn early_tcp_invalidation() { assert_eq!(TcpState::SynSent, g1.port.tcp_state(&flow).unwrap()); } +// We have aggressive TCP flow entry expiry for flows in the three-way +// handshake, to ensure that they do not consume table entry space for +// extremely long periods of time in potential SYN-flood DOS scenarios. +// +// However, a slow handshake should still function using the underlying +// LFT entries where, e.g., the default firewall disposition is in use. +#[test] +fn tcp_invalidation_does_not_block_connection() { + let g1_cfg = g1_cfg(); + let mut g1 = oxide_net_setup("g1_port", &g1_cfg, None, None); + g1.port.start(); + set!(g1, "port_state=running"); + + // Ensure we only have the default rules: allow all outbound, block + // all inbound. 
+ firewall::set_fw_rules( + &g1.port, + &SetFwRulesReq { port_name: g1.port.name().to_string(), rules: vec![] }, + ) + .unwrap(); + update!( + g1, + [ + "incr:epoch", + "set:firewall.flows.in=0, firewall.flows.out=0", + "set:firewall.rules.out=0, firewall.rules.in=0", + ] + ); + + let g1_phys = TestIpPhys { + ip: g1_cfg.phys_ip, + mac: g1_cfg.guest_mac, + vni: g1_cfg.vni, + }; + + let dst_ip = Ipv4Addr::from_const([172, 30, 0, 6]); + g1.vpc_map.add(dst_ip.into(), g1_cfg.phys_addr()); + + // Attempt to connect to a hypothetical TCP recipient in the same VPC, + // on the same sled. This will create new TCP state and setup inbound + // LFTs for a SYN-ACK to use. + let mut pkt1_m = http_syn2( + g1_cfg.guest_mac, + g1_cfg.ipv4().private_ip, + GW_MAC_ADDR, + dst_ip, + ); + let pkt1 = parse_outbound(&mut pkt1_m, VpcParser {}).unwrap(); + let flow = pkt1.flow(); + let remote_port = if let Some(L4Info::Ports(a)) = flow.l4_info() { + a.src_port + } else { + panic!() + }; + let res = g1.port.process(Out, pkt1); + expect_modified!(res, pkt1_m); + incr!( + g1, + [ + "firewall.flows.out, firewall.flows.in", + "uft.out", + "stats.port.out_modified, stats.port.out_uft_miss", + ] + ); + assert_eq!(TcpState::SynSent, g1.port.tcp_state(&flow).unwrap()); + + // Assume that the recipient takes some time to get back to us, but not + // long enough to expire the UFT/LFTs. The TCP state will expire. + let t0 = Moment::now(); + let t1 = t0 + Duration::from_secs(INCIPIENT_EXPIRE_SECS + 1); + g1.port.expire_flows_at(t1).unwrap(); + assert_eq!(None, g1.port.tcp_state(&flow)); + + // The SYN-ACK arrives, and we allow it through. This creates a new + // instance of TCP state. 
+ let mut pkt2_m = http_syn_ack2( + BS_MAC_ADDR, + dst_ip, + g1_cfg.guest_mac, + g1_cfg.ipv4().private_ip, + remote_port, + ); + pkt2_m = encap(pkt2_m, g1_phys, g1_phys); + let pkt2 = parse_inbound(&mut pkt2_m, VpcParser {}).unwrap(); + let res = g1.port.process(In, pkt2); + expect_modified!(res, pkt2_m); + incr!(g1, ["stats.port.in_modified, stats.port.in_uft_miss, uft.in"]); + assert_eq!(TcpState::Established, g1.port.tcp_state(&flow).unwrap()); + + // Receiving a SYN-ACK moves the connection into established. We'd expect + // this normally from `SynSent`, if the state hadn't been lost. This state + // will survive a short wait. + let t2 = t1 + Duration::from_secs(INCIPIENT_EXPIRE_SECS + 1); + g1.port.expire_flows_at(t2).unwrap(); + assert_eq!(Some(TcpState::Established), g1.port.tcp_state(&flow)); +} + #[test] fn ephemeral_ip_preferred_over_snat_outbound() { let ip_cfg = IpCfg::DualStack {