Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/opte-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ pub use ulp::*;
///
/// We rely on CI and the check-api-version.sh script to verify that
/// this number is incremented anytime the oxide-api code changes.
pub const API_VERSION: u64 = 39;
pub const API_VERSION: u64 = 40;
Comment thread
rcgoodfellow marked this conversation as resolved.
Outdated

/// Major version of the OPTE package.
pub const MAJOR_VERSION: u64 = 0;
Expand Down
21 changes: 16 additions & 5 deletions lib/opte/src/engine/port/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// Copyright 2025 Oxide Computer Company
// Copyright 2026 Oxide Computer Company

//! A virtual switch port.

Expand Down Expand Up @@ -44,6 +44,7 @@ use super::rule::HdrTransformError;
use super::rule::PresavedMeoi;
use super::rule::Rule;
use super::rule::TransformFlags;
use super::tcp::INCIPIENT_EXPIRE_TTL;
use super::tcp::KEEPALIVE_EXPIRE_TTL;
use super::tcp::TIME_WAIT_EXPIRE_TTL;
use super::tcp_state::TcpFlowState;
Expand Down Expand Up @@ -2992,14 +2993,16 @@ impl Dump for TcpFlowEntryState {
/// Expiry behaviour for TCP flows dependent on the connection FSM.
#[derive(Debug)]
pub struct TcpExpiry {
time_wait_ttl: Ttl,
incipient_ttl: Ttl,
quiescent_ttl: Ttl,
keepalive_ttl: Ttl,
}

impl Default for TcpExpiry {
fn default() -> Self {
Self {
time_wait_ttl: TIME_WAIT_EXPIRE_TTL,
incipient_ttl: INCIPIENT_EXPIRE_TTL,
quiescent_ttl: TIME_WAIT_EXPIRE_TTL,
keepalive_ttl: KEEPALIVE_EXPIRE_TTL,
}
}
Expand All @@ -3012,8 +3015,16 @@ impl ExpiryPolicy<TcpFlowEntryState> for TcpExpiry {
now: Moment,
) -> bool {
let ttl = match entry.state().tcp_state() {
TcpState::TimeWait => self.time_wait_ttl,
_ => self.keepalive_ttl,
TcpState::TimeWait
| TcpState::CloseWait
| TcpState::FinWait1
| TcpState::FinWait2 => self.quiescent_ttl,
TcpState::SynSent
| TcpState::SynRcvd
| TcpState::Listen
| TcpState::LastAck => self.incipient_ttl,
TcpState::Established => self.keepalive_ttl,
TcpState::Closed => Ttl::new_seconds(0),
};
ttl.is_expired(entry.last_hit(), now)
}
Expand Down
18 changes: 13 additions & 5 deletions lib/opte/src/engine/tcp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// Copyright 2025 Oxide Computer Company
// Copyright 2026 Oxide Computer Company

//! TCP headers.

Expand All @@ -16,11 +16,18 @@ pub const TCP_HDR_OFFSET_SHIFT: u8 = 4;
pub const TCP_PORT_RDP: u16 = 3389;
pub const TCP_PORT_SSH: u16 = 22;

/// The duration after which a connection in TIME-WAIT should be
/// considered free for either side to reuse.
/// The duration after which we can remove a TCP state entry which is still in
/// the three-way handshake.
///
/// This value is chosen by Windows and MacOS, which is larger
/// than Linux's default 60s. Allowances for tuned servers and/or
/// This value is set very low to prevent SYN-flood-like traffic (or many
/// unacknowledged SYNs from the guest) from holding TCP flow entry slots for
/// the full [`KEEPALIVE_EXPIRE_SECS`].
pub const INCIPIENT_EXPIRE_SECS: u64 = 5;
/// The duration after which a connection in TIME-WAIT or another closing state
/// should be considered free for either side to reuse.
///
/// This value is chosen from the TIME-WAIT duration of Windows and macOS, which
Comment thread
rcgoodfellow marked this conversation as resolved.
Outdated
Comment thread
rcgoodfellow marked this conversation as resolved.
Outdated
/// is larger than Linux's default 60s. Allowances for tuned servers and/or
/// more aggressive reuse via RFCs 1323/7323 and/or 6191 are made in
/// `tcp_state`.
pub const TIME_WAIT_EXPIRE_SECS: u64 = 120;
Expand All @@ -31,6 +38,7 @@ pub const TIME_WAIT_EXPIRE_SECS: u64 = 120;
/// keepalive, when interval + probe count will result in a timeout after
/// 8mins (illumos) / 11mins (linux).
pub const KEEPALIVE_EXPIRE_SECS: u64 = 8_000;
pub const INCIPIENT_EXPIRE_TTL: Ttl = Ttl::new_seconds(INCIPIENT_EXPIRE_SECS);
pub const TIME_WAIT_EXPIRE_TTL: Ttl = Ttl::new_seconds(TIME_WAIT_EXPIRE_SECS);
pub const KEEPALIVE_EXPIRE_TTL: Ttl = Ttl::new_seconds(KEEPALIVE_EXPIRE_SECS);

Expand Down
24 changes: 15 additions & 9 deletions lib/opte/src/engine/tcp_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,26 +148,32 @@ impl TcpFlowState {

match self.tcp_state {
Closed => {
// We have a new inbound SYN. We assume for now the
// guest is listening on the given port by moving to
// the LISTEN state.
if flags.contains(IngotTcpFlags::SYN) {
return Some(Listen);
}

// We pontentially have a legitimate inbound data
// We potentially have a legitimate inbound data
// segment for an ESTABLISHED connection that
// previously expired in OPTE but is still active in
// the guest. We immeidately move this to the
// the guest. We immediately move this to the
// ESTABLISHED state even though that might be a lie.
// We rely on the fact that the guest will immediately
// respond with an ACK or RST. In the future we could
// instead keep this in some type of probationary
// state (or separate table).
//
// Alternately, we've received a SYN-ACK, but don't have
// state indicating that we sent an initial SYN because
// the remote half took longer than the incipient expiry
// period to respond. In this case, this is identical to
// the transition from `SynSent`.
if flags.contains(IngotTcpFlags::ACK) {
return Some(Established);
}

// We have a new inbound SYN. We assume for now the
// guest is listening on the given port by moving to
// the LISTEN state.
if flags.contains(IngotTcpFlags::SYN) {
return Some(Listen);
}
Comment thread
bnaecker marked this conversation as resolved.

None
}

Expand Down
99 changes: 99 additions & 0 deletions lib/oxide-vpc/tests/integration_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

use common::icmp::*;
use common::*;
use opte::api::L4Info;
use opte::api::MacAddr;
use opte::api::OpteError;
use opte::api::TcpState;
Expand Down Expand Up @@ -48,6 +49,7 @@ use opte::engine::port::DropReason;
use opte::engine::port::ProcessError;
use opte::engine::port::ProcessResult;
use opte::engine::rule::MappingResource;
use opte::engine::tcp::INCIPIENT_EXPIRE_SECS;
use opte::engine::tcp::TIME_WAIT_EXPIRE_SECS;
use opte::ingot::ethernet::Ethertype;
use opte::ingot::geneve::GeneveRef;
Expand Down Expand Up @@ -3708,6 +3710,103 @@ fn early_tcp_invalidation() {
assert_eq!(TcpState::SynSent, g1.port.tcp_state(&flow).unwrap());
}

// We have aggressive TCP flow entry expiry for flows in the three-way
// handshake, to ensure that they do not consume table entry space for
// extremely long periods of time in potential SYN-flood DoS scenarios.
//
// However, a slow handshake should still function using the underlying
// LFT entries where, e.g., the default firewall disposition is in use.
#[test]
fn tcp_invalidation_does_not_block_connection() {
    // Scenario: an outbound SYN creates TCP state in `SynSent`, that state
    // is expired before the SYN-ACK arrives, and the late SYN-ACK must still
    // be admitted and leave the flow in `Established`.
    let g1_cfg = g1_cfg();
    let mut g1 = oxide_net_setup("g1_port", &g1_cfg, None, None);
    g1.port.start();
    set!(g1, "port_state=running");

    // Ensure we only have the default rules: allow all outbound, block
    // all inbound.
    firewall::set_fw_rules(
        &g1.port,
        &SetFwRulesReq { port_name: g1.port.name().to_string(), rules: vec![] },
    )
    .unwrap();
    update!(
        g1,
        [
            "incr:epoch",
            "set:firewall.flows.in=0, firewall.flows.out=0",
            "set:firewall.rules.out=0, firewall.rules.in=0",
        ]
    );

    let g1_phys = TestIpPhys {
        ip: g1_cfg.phys_ip,
        mac: g1_cfg.guest_mac,
        vni: g1_cfg.vni,
    };

    let dst_ip = Ipv4Addr::from_const([172, 30, 0, 6]);
    g1.vpc_map.add(dst_ip.into(), g1_cfg.phys_addr());

    // Attempt to connect to a hypothetical TCP recipient in the same VPC,
    // on the same sled. This will create new TCP state and set up inbound
    // LFTs for a SYN-ACK to use.
    let mut pkt1_m = http_syn2(
        g1_cfg.guest_mac,
        g1_cfg.ipv4().private_ip,
        GW_MAC_ADDR,
        dst_ip,
    );
    let pkt1 = parse_outbound(&mut pkt1_m, VpcParser {}).unwrap();
    let flow = pkt1.flow();

    // Remember the source port of the outbound SYN; it is handed to
    // `http_syn_ack2` below when building the reply.
    let Some(L4Info::Ports(ports)) = flow.l4_info() else {
        panic!("outbound TCP SYN flow should carry L4 port information");
    };
    let remote_port = ports.src_port;

    let res = g1.port.process(Out, pkt1);
    expect_modified!(res, pkt1_m);
    incr!(
        g1,
        [
            "firewall.flows.out, firewall.flows.in",
            "uft.out",
            "stats.port.out_modified, stats.port.out_uft_miss",
        ]
    );
    assert_eq!(TcpState::SynSent, g1.port.tcp_state(&flow).unwrap());

    // Assume that the recipient takes some time to get back to us, but not
    // long enough to expire the UFT/LFTs. The TCP state will expire: we
    // step one second past the incipient (handshake) expiry period.
    let t0 = Moment::now();
    let t1 = t0 + Duration::from_secs(INCIPIENT_EXPIRE_SECS + 1);
    g1.port.expire_flows_at(t1).unwrap();
    assert_eq!(None, g1.port.tcp_state(&flow));

    // The SYN-ACK arrives, and we allow it through. This creates a new
    // instance of TCP state.
    let mut pkt2_m = http_syn_ack2(
        BS_MAC_ADDR,
        dst_ip,
        g1_cfg.guest_mac,
        g1_cfg.ipv4().private_ip,
        remote_port,
    );
    pkt2_m = encap(pkt2_m, g1_phys, g1_phys);
    let pkt2 = parse_inbound(&mut pkt2_m, VpcParser {}).unwrap();
    let res = g1.port.process(In, pkt2);
    expect_modified!(res, pkt2_m);
    incr!(g1, ["stats.port.in_modified, stats.port.in_uft_miss, uft.in"]);
    assert_eq!(TcpState::Established, g1.port.tcp_state(&flow).unwrap());

    // Receiving a SYN-ACK moves the connection into established. We'd expect
    // this normally from `SynSent`, if the state hadn't been lost. This state
    // will survive a short wait: a further incipient expiry period must not
    // remove an established flow.
    let t2 = t1 + Duration::from_secs(INCIPIENT_EXPIRE_SECS + 1);
    g1.port.expire_flows_at(t2).unwrap();
    assert_eq!(Some(TcpState::Established), g1.port.tcp_state(&flow));
}

#[test]
fn ephemeral_ip_preferred_over_snat_outbound() {
let ip_cfg = IpCfg::DualStack {
Expand Down