Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions lib/opte/src/engine/port/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// Copyright 2025 Oxide Computer Company
// Copyright 2026 Oxide Computer Company

//! A virtual switch port.

Expand Down Expand Up @@ -44,6 +44,7 @@ use super::rule::HdrTransformError;
use super::rule::PresavedMeoi;
use super::rule::Rule;
use super::rule::TransformFlags;
use super::tcp::INCIPIENT_EXPIRE_TTL;
use super::tcp::KEEPALIVE_EXPIRE_TTL;
use super::tcp::TIME_WAIT_EXPIRE_TTL;
use super::tcp_state::TcpFlowState;
Expand Down Expand Up @@ -2992,14 +2993,16 @@ impl Dump for TcpFlowEntryState {
/// Expiry behaviour for TCP flows dependent on the connection FSM.
#[derive(Debug)]
pub struct TcpExpiry {
time_wait_ttl: Ttl,
incipient_ttl: Ttl,
quiescent_ttl: Ttl,
keepalive_ttl: Ttl,
}

impl Default for TcpExpiry {
fn default() -> Self {
Self {
time_wait_ttl: TIME_WAIT_EXPIRE_TTL,
incipient_ttl: INCIPIENT_EXPIRE_TTL,
quiescent_ttl: TIME_WAIT_EXPIRE_TTL,
keepalive_ttl: KEEPALIVE_EXPIRE_TTL,
}
}
Expand All @@ -3012,8 +3015,16 @@ impl ExpiryPolicy<TcpFlowEntryState> for TcpExpiry {
now: Moment,
) -> bool {
let ttl = match entry.state().tcp_state() {
TcpState::TimeWait => self.time_wait_ttl,
_ => self.keepalive_ttl,
TcpState::TimeWait
| TcpState::LastAck
Comment thread
rcgoodfellow marked this conversation as resolved.
Outdated
| TcpState::CloseWait
| TcpState::FinWait1
| TcpState::FinWait2 => self.quiescent_ttl,
TcpState::SynSent | TcpState::SynRcvd | TcpState::Listen => {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm slightly concerned about LISTEN being here. IIUC, this will expire connections that make the handshake and then don't do much for a while. I dunno how bad that is, but it seems like a "normal" connection and one we'd want to apply the existing logic to, which would map this to keepalive_ttl.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LISTEN is the handshake state for passive open, I think we should be ok here. We only exit the handshake through established.

self.incipient_ttl
}
TcpState::Established => self.keepalive_ttl,
TcpState::Closed => Ttl::new_seconds(0),
};
ttl.is_expired(entry.last_hit(), now)
}
Expand Down
18 changes: 13 additions & 5 deletions lib/opte/src/engine/tcp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// Copyright 2025 Oxide Computer Company
// Copyright 2026 Oxide Computer Company

//! TCP headers.

Expand All @@ -16,11 +16,18 @@ pub const TCP_HDR_OFFSET_SHIFT: u8 = 4;
pub const TCP_PORT_RDP: u16 = 3389;
pub const TCP_PORT_SSH: u16 = 22;

/// The duration after which a connection in TIME-WAIT should be
/// considered free for either side to reuse.
/// The duration after which we can remove a TCP state entry which is still in
/// the three-way handshake.
///
/// This value is chosen by Windows and MacOS, which is larger
/// than Linux's default 60s. Allowances for tuned servers and/or
/// This value is set very low to prevent SYN-flood-like traffic (or many
/// unacknowledged SYNs from the guest) from holding TCP flow entry slots for
/// the full [`KEEPALIVE_EXPIRE_SECS`].
pub const INCIPIENT_EXPIRE_SECS: u64 = 5;
/// The duration after which a connection in TIME-WAIT or another closing state
/// should be considered free for either side to reuse.
///
/// This value is chosen from the TIME-WAIT duration of Windows and MacOS, which
Comment thread
rcgoodfellow marked this conversation as resolved.
Outdated
Comment thread
rcgoodfellow marked this conversation as resolved.
Outdated
/// is larger than Linux's default 60s. Allowances for tuned servers and/or
/// more aggressive reuse via RFCs 1323/7323 and/or 6191 are made in
/// `tcp_state`.
pub const TIME_WAIT_EXPIRE_SECS: u64 = 120;
Expand All @@ -31,6 +38,7 @@ pub const TIME_WAIT_EXPIRE_SECS: u64 = 120;
/// keepalive, when interval + probe count will result in a timeout after
/// 8mins (illumos) / 11mins (linux).
pub const KEEPALIVE_EXPIRE_SECS: u64 = 8_000;
pub const INCIPIENT_EXPIRE_TTL: Ttl = Ttl::new_seconds(INCIPIENT_EXPIRE_SECS);
pub const TIME_WAIT_EXPIRE_TTL: Ttl = Ttl::new_seconds(TIME_WAIT_EXPIRE_SECS);
pub const KEEPALIVE_EXPIRE_TTL: Ttl = Ttl::new_seconds(KEEPALIVE_EXPIRE_SECS);

Expand Down
24 changes: 15 additions & 9 deletions lib/opte/src/engine/tcp_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,26 +148,32 @@ impl TcpFlowState {

match self.tcp_state {
Closed => {
// We have a new inbound SYN. We assume for now the
// guest is listening on the given port by moving to
// the LISTEN state.
if flags.contains(IngotTcpFlags::SYN) {
return Some(Listen);
}

// We pontentially have a legitimate inbound data
// We potentially have a legitimate inbound data
// segment for an ESTABLISHED connection that
// previously expired in OPTE but is still active in
// the guest. We immeidately move this to the
// the guest. We immediately move this to the
// ESTABLISHED state even though that might be a lie.
// We rely on the fact that the guest will immediately
// respond with an ACK or RST. In the future we could
// instead keep this in some type of probationary
// state (or separate table).
//
// Alternately, we've received a SYN-ACK, but don't have
// state indicating that we sent an initial SYN because
// the remote half took longer than the incipient expiry
// period to respond. In this case, this is identical to
// the transition from `SynSent`.
if flags.contains(IngotTcpFlags::ACK) {
return Some(Established);
}

// We have a new inbound SYN. We assume for now the
// guest is listening on the given port by moving to
// the LISTEN state.
if flags.contains(IngotTcpFlags::SYN) {
return Some(Listen);
}
Comment thread
bnaecker marked this conversation as resolved.

None
}

Expand Down
99 changes: 99 additions & 0 deletions lib/oxide-vpc/tests/integration_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

use common::icmp::*;
use common::*;
use opte::api::L4Info;
use opte::api::MacAddr;
use opte::api::OpteError;
use opte::api::TcpState;
Expand Down Expand Up @@ -48,6 +49,7 @@ use opte::engine::port::DropReason;
use opte::engine::port::ProcessError;
use opte::engine::port::ProcessResult;
use opte::engine::rule::MappingResource;
use opte::engine::tcp::INCIPIENT_EXPIRE_SECS;
use opte::engine::tcp::TIME_WAIT_EXPIRE_SECS;
use opte::ingot::ethernet::Ethertype;
use opte::ingot::geneve::GeneveRef;
Expand Down Expand Up @@ -3708,6 +3710,103 @@ fn early_tcp_invalidation() {
assert_eq!(TcpState::SynSent, g1.port.tcp_state(&flow).unwrap());
}

// We have aggressive TCP flow entry expiry for flows in the three-way
// handshake, to ensure that they do not consume table entry space for
// extremely long periods of time in potential SYN-flood DoS scenarios.
//
// However, a slow handshake should still function using the underlying
// LFT entries where, e.g., the default firewall disposition is in use.
//
// Shape of the test:
//   1. The guest sends a SYN; the flow's TCP state becomes `SynSent`.
//   2. Flows are expired at a moment just past `INCIPIENT_EXPIRE_SECS`;
//      the TCP state entry is gone afterwards.
//   3. A late SYN-ACK arrives inbound, is still allowed through, and the
//      freshly created TCP state is `Established`.
//   4. A second expiry pass, again just past `INCIPIENT_EXPIRE_SECS`,
//      leaves the `Established` entry in place.
#[test]
fn tcp_invalidation_does_not_block_connection() {
let g1_cfg = g1_cfg();
let mut g1 = oxide_net_setup("g1_port", &g1_cfg, None, None);
g1.port.start();
set!(g1, "port_state=running");

// Ensure we only have the default rules: allow all outbound, block
// all inbound.
firewall::set_fw_rules(
&g1.port,
&SetFwRulesReq { port_name: g1.port.name().to_string(), rules: vec![] },
)
.unwrap();
update!(
g1,
[
"incr:epoch",
"set:firewall.flows.in=0, firewall.flows.out=0",
"set:firewall.rules.out=0, firewall.rules.in=0",
]
);

// Physical-side identity of g1, used further down as both ends of the
// encapsulation for the inbound SYN-ACK.
let g1_phys = TestIpPhys {
ip: g1_cfg.phys_ip,
mac: g1_cfg.guest_mac,
vni: g1_cfg.vni,
};

// Map the hypothetical peer's address to g1's own physical address, i.e.
// the peer is treated as living on the same sled.
let dst_ip = Ipv4Addr::from_const([172, 30, 0, 6]);
g1.vpc_map.add(dst_ip.into(), g1_cfg.phys_addr());

// Attempt to connect to a hypothetical TCP recipient in the same VPC,
// on the same sled. This will create new TCP state and set up inbound
// LFTs for a SYN-ACK to use.
let mut pkt1_m = http_syn2(
g1_cfg.guest_mac,
g1_cfg.ipv4().private_ip,
GW_MAC_ADDR,
dst_ip,
);
let pkt1 = parse_outbound(&mut pkt1_m, VpcParser {}).unwrap();
let flow = pkt1.flow();
// Despite the name, this is the *source* port of the outbound SYN's flow.
// It is handed to `http_syn_ack2` below, presumably so that the reply is
// addressed back to that port (NOTE(review): confirm against the helper).
let remote_port = if let Some(L4Info::Ports(a)) = flow.l4_info() {
a.src_port
} else {
panic!()
};
let res = g1.port.process(Out, pkt1);
expect_modified!(res, pkt1_m);
incr!(
g1,
[
"firewall.flows.out, firewall.flows.in",
"uft.out",
"stats.port.out_modified, stats.port.out_uft_miss",
]
);
assert_eq!(TcpState::SynSent, g1.port.tcp_state(&flow).unwrap());

// Assume that the recipient takes some time to get back to us, but not
// long enough to expire the UFT/LFTs. The TCP state will expire.
//
// `t0` is sampled after the SYN has been processed, so `t1` lies more than
// `INCIPIENT_EXPIRE_SECS` after the SYN was handled.
let t0 = Moment::now();
let t1 = t0 + Duration::from_secs(INCIPIENT_EXPIRE_SECS + 1);
g1.port.expire_flows_at(t1).unwrap();
assert_eq!(None, g1.port.tcp_state(&flow));

// The SYN-ACK arrives, and we allow it through. This creates a new
// instance of TCP state.
let mut pkt2_m = http_syn_ack2(
BS_MAC_ADDR,
dst_ip,
g1_cfg.guest_mac,
g1_cfg.ipv4().private_ip,
remote_port,
);
pkt2_m = encap(pkt2_m, g1_phys, g1_phys);
let pkt2 = parse_inbound(&mut pkt2_m, VpcParser {}).unwrap();
let res = g1.port.process(In, pkt2);
expect_modified!(res, pkt2_m);
incr!(g1, ["stats.port.in_modified, stats.port.in_uft_miss, uft.in"]);
assert_eq!(TcpState::Established, g1.port.tcp_state(&flow).unwrap());

// Receiving a SYN-ACK moves the connection into established. We'd expect
// this normally from `SynSent`, if the state hadn't been lost. This state
// will survive a short wait.
//
// The wait is deliberately the same length as the one that expired the
// `SynSent` entry above: long enough to trip the incipient TTL, but the
// entry is now `Established` and is expected to remain.
let t2 = t1 + Duration::from_secs(INCIPIENT_EXPIRE_SECS + 1);
g1.port.expire_flows_at(t2).unwrap();
assert_eq!(Some(TcpState::Established), g1.port.tcp_state(&flow));
}

#[test]
fn ephemeral_ip_preferred_over_snat_outbound() {
let ip_cfg = IpCfg::DualStack {
Expand Down