diff --git a/.github/buildomat/jobs/linux.sh b/.github/buildomat/jobs/linux.sh index 8fc9e7ee..5a21404a 100755 --- a/.github/buildomat/jobs/linux.sh +++ b/.github/buildomat/jobs/linux.sh @@ -28,6 +28,26 @@ #: series = "linux" #: name = "mgadm.sha256.txt" #: from_output = "/work/release/mgadm.sha256.txt" +#: +#: [[publish]] +#: series = "linux" +#: name = "ddmd" +#: from_output = "/work/release/ddmd" +#: +#: [[publish]] +#: series = "linux" +#: name = "ddmd.sha256.txt" +#: from_output = "/work/release/ddmd.sha256.txt" +#: +#: [[publish]] +#: series = "linux" +#: name = "ddmadm" +#: from_output = "/work/release/ddmadm" +#: +#: [[publish]] +#: series = "linux" +#: name = "ddmadm.sha256.txt" +#: from_output = "/work/release/ddmadm.sha256.txt" set -o errexit set -o pipefail @@ -64,3 +84,21 @@ popd cp target/debug/mgadm /work/debug cp target/release/mgadm /work/release digest /work/release/mgadm > /work/release/mgadm.sha256.txt + +banner "ddmd" +pushd ddmd +cargo build --bin ddmd --no-default-features +cargo build --bin ddmd --no-default-features --release +popd +cp target/debug/ddmd /work/debug +cp target/release/ddmd /work/release +digest /work/release/ddmd > /work/release/ddmd.sha256.txt + +banner "ddmadm" +pushd ddmadm +cargo build --bin ddmadm +cargo build --bin ddmadm --release +popd +cp target/debug/ddmadm /work/debug +cp target/release/ddmadm /work/release +digest /work/release/ddmadm > /work/release/ddmadm.sha256.txt diff --git a/Cargo.lock b/Cargo.lock index 252173c9..7ced62a8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1319,6 +1319,7 @@ dependencies = [ "chrono", "ddm-api", "ddm-types", + "ddm-types-versions", "dpd-client", "dropshot 0.17.0", "hostname 0.4.2", @@ -1341,6 +1342,7 @@ dependencies = [ "sled", "slog", "socket2", + "tempfile", "thiserror 2.0.18", "tokio", "uuid", @@ -3798,15 +3800,18 @@ dependencies = [ "clap", "libc", "libnet", + "omicron-common", "oximeter", "oximeter-producer", "oxnet", "schemars 0.8.22", "serde", + "serde_json", "slog", 
"slog-async", "slog-bunyan", "smf 0.10.0 (git+https://github.com/illumos/smf-rs?branch=main)", + "thiserror 2.0.18", "uuid", ] @@ -3882,6 +3887,7 @@ version = "0.1.0" dependencies = [ "bfd", "bgp", + "mg-common", "rdb", "schemars 0.8.22", "serde", diff --git a/ddm-admin-client/src/lib.rs b/ddm-admin-client/src/lib.rs index ea45cea2..2f65f6c9 100644 --- a/ddm-admin-client/src/lib.rs +++ b/ddm-admin-client/src/lib.rs @@ -39,3 +39,39 @@ impl std::hash::Hash for types::TunnelOrigin { self.metric.hash(state); } } + +impl std::cmp::PartialEq for types::Vni { + fn eq(&self, other: &Self) -> bool { + self.0.eq(&other.0) + } +} + +impl std::cmp::Eq for types::Vni {} + +impl std::hash::Hash for types::Vni { + fn hash(&self, state: &mut H) { + self.0.hash(state); + } +} + +impl std::cmp::PartialEq for types::MulticastOrigin { + fn eq(&self, other: &Self) -> bool { + self.overlay_group.eq(&other.overlay_group) + && self.underlay_group.eq(&other.underlay_group) + && self.vni.eq(&other.vni) + && self.source.eq(&other.source) + } +} + +impl std::cmp::Eq for types::MulticastOrigin {} + +/// Metric is excluded from identity so that metric changes update +/// an existing entry rather than creating a duplicate. +impl std::hash::Hash for types::MulticastOrigin { + fn hash(&self, state: &mut H) { + self.overlay_group.hash(state); + self.underlay_group.hash(state); + self.vni.hash(state); + self.source.hash(state); + } +} diff --git a/ddm-api/src/lib.rs b/ddm-api/src/lib.rs index 546797dc..b86622ce 100644 --- a/ddm-api/src/lib.rs +++ b/ddm-api/src/lib.rs @@ -3,6 +3,7 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
use ddm_types_versions::latest; +use ddm_types_versions::v1; use dropshot::HttpError; use dropshot::HttpResponseOk; use dropshot::HttpResponseUpdatedNoContent; @@ -10,7 +11,7 @@ use dropshot::Path; use dropshot::RequestContext; use dropshot::TypedBody; use dropshot_api_manager_types::api_versions; -use mg_common::net::TunnelOrigin; +use mg_common::net::{MulticastOrigin, TunnelOrigin}; use oxnet::Ipv6Net; use std::collections::{HashMap, HashSet}; @@ -26,6 +27,7 @@ api_versions!([ // | example for the next person. // v // (next_int, IDENT), + (2, MULTICAST_SUPPORT), (1, INITIAL), ]); @@ -45,17 +47,48 @@ api_versions!([ pub trait DdmAdminApi { type Context; - #[endpoint { method = GET, path = "/peers" }] + #[endpoint { + method = GET, + path = "/peers", + versions = VERSION_MULTICAST_SUPPORT.. + }] async fn get_peers( ctx: RequestContext, ) -> Result>, HttpError>; + /// Returns peers without interface name information. + #[endpoint { + method = GET, + path = "/peers", + versions = ..VERSION_MULTICAST_SUPPORT + }] + async fn get_peers_v1( + ctx: RequestContext, + ) -> Result>, HttpError>; + #[endpoint { method = DELETE, path = "/peers/{addr}" }] async fn expire_peer( ctx: RequestContext, params: Path, ) -> Result; + /// Set peer information for a given interface index, bypassing the state machine. + /// + /// Intended for test fixtures that run `ddmd` with `--no-state-machine`. + /// In a normal run, discovery writes peer entries keyed by interface + /// index whenever it processes an advertisement, so any directly-injected + /// entry for an active interface will be overwritten the next time a + /// peer is observed there. + #[endpoint { + method = PUT, + path = "/peer", + versions = VERSION_MULTICAST_SUPPORT.. 
+ }] + async fn put_peer( + ctx: RequestContext, + request: TypedBody, + ) -> Result; + #[endpoint { method = GET, path = "/originated" }] async fn get_originated( ctx: RequestContext, @@ -100,6 +133,44 @@ pub trait DdmAdminApi { request: TypedBody>, ) -> Result; + #[endpoint { + method = GET, + path = "/originated_multicast_groups", + versions = VERSION_MULTICAST_SUPPORT.. + }] + async fn get_originated_multicast_groups( + ctx: RequestContext, + ) -> Result>, HttpError>; + + #[endpoint { + method = GET, + path = "/multicast_groups", + versions = VERSION_MULTICAST_SUPPORT.. + }] + async fn get_multicast_groups( + ctx: RequestContext, + ) -> Result>, HttpError>; + + #[endpoint { + method = PUT, + path = "/multicast_group", + versions = VERSION_MULTICAST_SUPPORT.. + }] + async fn advertise_multicast_groups( + ctx: RequestContext, + request: TypedBody>, + ) -> Result; + + #[endpoint { + method = DELETE, + path = "/multicast_group", + versions = VERSION_MULTICAST_SUPPORT.. + }] + async fn withdraw_multicast_groups( + ctx: RequestContext, + request: TypedBody>, + ) -> Result; + #[endpoint { method = PUT, path = "/sync" }] async fn sync( ctx: RequestContext, diff --git a/ddm-types/versions/src/latest.rs b/ddm-types/versions/src/latest.rs index 08057601..dcee5b6f 100644 --- a/ddm-types/versions/src/latest.rs +++ b/ddm-types/versions/src/latest.rs @@ -8,16 +8,20 @@ pub mod admin { pub use crate::v1::admin::EnableStatsRequest; pub use crate::v1::admin::ExpirePathParams; pub use crate::v1::admin::PrefixMap; + pub use crate::v2::admin::PutPeerRequest; } pub mod db { - pub use crate::v1::db::PeerInfo; pub use crate::v1::db::PeerStatus; pub use crate::v1::db::RouterKind; pub use crate::v1::db::TunnelRoute; + pub use crate::v2::db::MulticastRoute; + pub use crate::v2::db::PeerInfo; } pub mod exchange { pub use crate::v1::exchange::PathVector; pub use crate::v1::exchange::PathVectorV2; + pub use crate::v2::exchange::MulticastPathHop; + pub use 
crate::v2::exchange::MulticastPathVector; } diff --git a/ddm-types/versions/src/lib.rs b/ddm-types/versions/src/lib.rs index 9f8dcdc7..f723aaf0 100644 --- a/ddm-types/versions/src/lib.rs +++ b/ddm-types/versions/src/lib.rs @@ -32,3 +32,5 @@ pub mod latest; #[path = "initial/mod.rs"] pub mod v1; +#[path = "multicast_support/mod.rs"] +pub mod v2; diff --git a/ddm-types/versions/src/multicast_support/admin.rs b/ddm-types/versions/src/multicast_support/admin.rs new file mode 100644 index 00000000..dc217dc1 --- /dev/null +++ b/ddm-types/versions/src/multicast_support/admin.rs @@ -0,0 +1,16 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use super::db::PeerInfo; + +/// Body for `PUT /peer`. Sets `info` at the slot keyed by `if_index` +/// (interface index) in the in-memory peer map. +#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)] +pub struct PutPeerRequest { + pub if_index: u32, + pub info: PeerInfo, +} diff --git a/ddm-types/versions/src/multicast_support/db.rs b/ddm-types/versions/src/multicast_support/db.rs new file mode 100644 index 00000000..8fedfe64 --- /dev/null +++ b/ddm-types/versions/src/multicast_support/db.rs @@ -0,0 +1,89 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::net::Ipv6Addr; + +use mg_common::net::MulticastOrigin; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use crate::v1::db::{PeerStatus, RouterKind}; +use crate::v2::exchange::MulticastPathHop; + +/// A multicast route learned via DDM. 
+/// +/// Carries a MulticastOrigin (overlay group + ff04::/64 underlay +/// mapping) and the path vector from the originating subscriber +/// through intermediate transit routers. +// The path enables loop detection and (in multi-rack topologies) +// replication optimizations (RFD 488) in the future. +// +// Equality and hashing consider only `origin` and `nexthop`, so a route +// update with a different path compares equal to the stored entry. Note +// that `HashSet::insert`/`extend` keep the stored entry; use `replace`. +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +pub struct MulticastRoute { + /// The multicast group origin information. + pub origin: MulticastOrigin, + + /// Underlay nexthop address (DDM peer that advertised this route). + /// Used to associate the route with a peer for expiration. + pub nexthop: Ipv6Addr, + + /// Path vector from the originating subscriber outward. + /// Each hop records the router that redistributed this + /// subscription announcement. Used for loop detection on pull + /// and for future replication optimization in multi-rack + /// topologies. + #[serde(default)] + pub path: Vec, +} + +impl PartialEq for MulticastRoute { + fn eq(&self, other: &Self) -> bool { + self.origin == other.origin && self.nexthop == other.nexthop + } +} + +impl Eq for MulticastRoute {} + +impl std::hash::Hash for MulticastRoute { + fn hash(&self, state: &mut H) { + self.origin.hash(state); + self.nexthop.hash(state); + } +} + +impl From for MulticastOrigin { + fn from(x: MulticastRoute) -> Self { + x.origin + } +} + +/// Peer information with an optional interface name. +/// +// Adds the `if_name` field to identify which underlay interface the peer +// was discovered on. +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema, PartialEq)] +pub struct PeerInfo { + pub status: PeerStatus, + pub addr: Ipv6Addr, + pub host: String, + pub kind: RouterKind, + /// Interface name the peer was discovered on (e.g., "tfportrear0_0"). 
+ #[serde(default)] + pub if_name: Option, +} + +/// Downconvert v2 PeerInfo to v1 PeerInfo by dropping `if_name`. +impl From for crate::v1::db::PeerInfo { + fn from(p: PeerInfo) -> Self { + Self { + status: p.status, + addr: p.addr, + host: p.host, + kind: p.kind, + } + } +} diff --git a/ddm-types/versions/src/multicast_support/exchange.rs b/ddm-types/versions/src/multicast_support/exchange.rs new file mode 100644 index 00000000..ca0cb161 --- /dev/null +++ b/ddm-types/versions/src/multicast_support/exchange.rs @@ -0,0 +1,74 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use std::net::Ipv6Addr; + +/// A single hop in the multicast path, carrying metadata needed for +/// replication optimization. +// Unlike unicast paths which only need hostnames, multicast hops carry +// additional information for computing optimal replication points +// (RFD 488). +#[derive( + Debug, Clone, PartialEq, Eq, Hash, Deserialize, Serialize, JsonSchema, +)] +pub struct MulticastPathHop { + /// Router identifier (hostname). + pub router_id: String, + + /// The underlay address of this router (for replication targeting). + pub underlay_addr: Ipv6Addr, + + /// Number of downstream subscribers reachable via this hop. + /// Used for load-aware replication decisions in multi-rack + /// topologies. + #[serde(default)] + pub downstream_subscriber_count: u32, +} + +impl MulticastPathHop { + /// Create a hop with the given router identity and a zero subscriber + /// count. The count will be populated once transit routers track + /// downstream subscriber counts for load-aware replication (RFD 488). 
+ pub fn new(router_id: String, underlay_addr: Ipv6Addr) -> Self { + Self { + router_id, + underlay_addr, + downstream_subscriber_count: 0, + } + } +} + +/// Multicast group subscription announcement propagating through DDM. +/// +/// Contains a MulticastOrigin (overlay group + ff04::/64 underlay +/// mapping) and the path from the original subscriber outward. +// Currently, this is used for loop detection: if our router_id appears in the +// path, the announcement has already traversed us and is dropped. The path +// structure also carries topology information for future replication +// optimizations (RFD 488). +#[derive( + Debug, Clone, PartialEq, Eq, Hash, Deserialize, Serialize, JsonSchema, +)] +pub struct MulticastPathVector { + /// The multicast group origin information. + pub origin: mg_common::net::MulticastOrigin, + + /// The path from the original subscriber to the current router. + /// Ordered from subscriber outward (subscriber router first). + pub path: Vec, +} + +impl MulticastPathVector { + /// Append a hop to this path vector. + pub fn with_hop(&self, hop: MulticastPathHop) -> Self { + let mut path = self.path.clone(); + path.push(hop); + Self { + origin: self.origin.clone(), + path, + } + } +} diff --git a/ddm-types/versions/src/multicast_support/mod.rs b/ddm-types/versions/src/multicast_support/mod.rs new file mode 100644 index 00000000..9ac0ecc6 --- /dev/null +++ b/ddm-types/versions/src/multicast_support/mod.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Types from API version 2 (MULTICAST_SUPPORT) that add multicast +//! group management to the DDM admin API. 
+ +pub mod admin; +pub mod db; +pub mod exchange; diff --git a/ddm/Cargo.toml b/ddm/Cargo.toml index f505632d..1e7f5a85 100644 --- a/ddm/Cargo.toml +++ b/ddm/Cargo.toml @@ -5,6 +5,7 @@ edition = "2024" [dev-dependencies] pretty_assertions.workspace = true +tempfile = "3" [dependencies] slog.workspace = true @@ -21,10 +22,6 @@ hyper.workspace = true hyper-util.workspace = true http-body-util.workspace = true serde_json.workspace = true -libnet.workspace = true -dpd-client.workspace = true -opte-ioctl.workspace = true -oxide-vpc.workspace = true sled.workspace = true mg-common.workspace = true chrono.workspace = true @@ -35,3 +32,16 @@ oxnet.workspace = true uuid.workspace = true ddm-api.workspace = true ddm-types.workspace = true +ddm-types-versions.workspace = true + +# illumos-only deps used by the routing state machine and platform sys layer. +# Gated by the `illumos` feature so non-illumos builds (e.g. Linux test +# fixtures running ddmd with `--no-state-machine`) link cleanly. +libnet = { workspace = true, optional = true } +dpd-client = { workspace = true, optional = true } +opte-ioctl = { workspace = true, optional = true } +oxide-vpc = { workspace = true, optional = true } + +[features] +default = ["illumos"] +illumos = ["dep:libnet", "dep:dpd-client", "dep:opte-ioctl", "dep:oxide-vpc"] diff --git a/ddm/src/admin.rs b/ddm/src/admin.rs index 6d49a368..1322eb10 100644 --- a/ddm/src/admin.rs +++ b/ddm/src/admin.rs @@ -6,14 +6,14 @@ use crate::db::Db; use crate::sm::{AdminEvent, Event, PrefixSet, SmContext}; use ddm_api::DdmAdminApi; use ddm_api::ddm_admin_api_mod; -use ddm_types::admin::{EnableStatsRequest, ExpirePathParams, PrefixMap}; -use ddm_types::db::{PeerInfo, TunnelRoute}; +use ddm_types::admin::{ + EnableStatsRequest, ExpirePathParams, PrefixMap, PutPeerRequest, +}; +use ddm_types::db::{MulticastRoute, PeerInfo, TunnelRoute}; use ddm_types::exchange::PathVector; use dropshot::ApiDescription; use dropshot::ApiDescriptionBuildErrors; use 
dropshot::ConfigDropshot; -use dropshot::ConfigLogging; -use dropshot::ConfigLoggingLevel; use dropshot::HttpError; use dropshot::HttpResponseOk; use dropshot::HttpResponseUpdatedNoContent; @@ -21,9 +21,9 @@ use dropshot::Path; use dropshot::RequestContext; use dropshot::TypedBody; use mg_common::lock; -use mg_common::net::TunnelOrigin; +use mg_common::net::{MulticastOrigin, TunnelOrigin}; use oxnet::Ipv6Net; -use slog::{Logger, error, info}; +use slog::{Logger, error, info, o}; use std::collections::{HashMap, HashSet}; use std::net::{IpAddr, SocketAddr, SocketAddrV4, SocketAddrV6}; use std::sync::Arc; @@ -35,6 +35,8 @@ use tokio::task::JoinHandle; pub const DDM_STATS_PORT: u16 = 8001; +const UNIT_API_SERVER: &str = "api_server"; + #[derive(Default)] pub struct RouterStats { pub originated_underlay_prefixes: AtomicU64, @@ -68,11 +70,11 @@ pub fn handler( ..Default::default() }; - let ds_log = ConfigLogging::StderrTerminal { - level: ConfigLoggingLevel::Error, - } - .to_logger("admin") - .map_err(|e| e.to_string())?; + let ds_log = log.new(o!( + "component" => crate::COMPONENT_DDM, + "module" => crate::MOD_ADMIN, + "unit" => UNIT_API_SERVER, + )); let api = api_description().map_err(|e| e.to_string())?; @@ -112,8 +114,23 @@ impl DdmAdminApi for DdmAdminApiImpl { async fn get_peers( ctx: RequestContext, ) -> Result>, HttpError> { + Ok(HttpResponseOk(do_get_peers(ctx.context()))) + } + + async fn get_peers_v1( + ctx: RequestContext, + ) -> Result< + HttpResponseOk>, + HttpError, + > { let ctx = lock!(ctx.context()); - Ok(HttpResponseOk(ctx.db.peers())) + let peers = ctx + .db + .peers() + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(); + Ok(HttpResponseOk(peers)) } async fn expire_peer( @@ -135,6 +152,14 @@ impl DdmAdminApi for DdmAdminApiImpl { Ok(HttpResponseUpdatedNoContent()) } + async fn put_peer( + ctx: RequestContext, + request: TypedBody, + ) -> Result { + do_put_peer(ctx.context(), request.into_inner()); + Ok(HttpResponseUpdatedNoContent()) + } + 
async fn get_originated( ctx: RequestContext, ) -> Result>, HttpError> { @@ -333,6 +358,71 @@ impl DdmAdminApi for DdmAdminApiImpl { Ok(HttpResponseUpdatedNoContent()) } + async fn get_originated_multicast_groups( + ctx: RequestContext, + ) -> Result>, HttpError> { + let ctx = lock!(ctx.context()); + let originated = ctx + .db + .originated_mcast() + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseOk(originated)) + } + + async fn get_multicast_groups( + ctx: RequestContext, + ) -> Result>, HttpError> { + let ctx = lock!(ctx.context()); + let imported = ctx.db.imported_mcast(); + Ok(HttpResponseOk(imported)) + } + + async fn advertise_multicast_groups( + ctx: RequestContext, + request: TypedBody>, + ) -> Result { + let ctx = lock!(ctx.context()); + let groups = request.into_inner(); + slog::info!(ctx.log, "advertise multicast groups: {groups:#?}"); + ctx.db + .originate_mcast(&groups) + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + + for e in &ctx.event_channels { + e.send(Event::Admin(AdminEvent::Announce(PrefixSet::Multicast( + groups.clone(), + )))) + .map_err(|e| { + HttpError::for_internal_error(format!("admin event send: {e}")) + })?; + } + + Ok(HttpResponseUpdatedNoContent()) + } + + async fn withdraw_multicast_groups( + ctx: RequestContext, + request: TypedBody>, + ) -> Result { + let ctx = lock!(ctx.context()); + let groups = request.into_inner(); + slog::info!(ctx.log, "withdraw multicast groups: {groups:#?}"); + ctx.db + .withdraw_mcast(&groups) + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + + for e in &ctx.event_channels { + e.send(Event::Admin(AdminEvent::Withdraw(PrefixSet::Multicast( + groups.clone(), + )))) + .map_err(|e| { + HttpError::for_internal_error(format!("admin event send: {e}")) + })?; + } + + Ok(HttpResponseUpdatedNoContent()) + } + async fn sync( ctx: RequestContext, ) -> Result { @@ -400,3 +490,93 @@ pub fn api_description() { ddm_admin_api_mod::api_description::() } + +/// 
Snapshot the current peer table, keyed by interface index. +pub(crate) fn do_get_peers( + ctx: &Arc>, +) -> HashMap { + let ctx = lock!(ctx); + ctx.db.peers() +} + +/// Insert or replace the peer entry at `request.if_index`. Tests bypass +/// the dropshot endpoint and call this directly; production goes through +/// [`DdmAdminApiImpl::put_peer`]. +pub(crate) fn do_put_peer( + ctx: &Arc>, + request: PutPeerRequest, +) { + let PutPeerRequest { if_index, info } = request; + let ctx = lock!(ctx); + ctx.db.set_peer(if_index, info); +} + +#[cfg(test)] +mod tests { + use super::{HandlerContext, RouterStats, do_get_peers, do_put_peer}; + use crate::db::Db; + use ddm_types::admin::PutPeerRequest; + use ddm_types::db::{PeerInfo, PeerStatus, RouterKind}; + use slog::{Discard, Logger, o}; + use std::sync::{Arc, Mutex}; + use tempfile::TempDir; + + fn build_context(tmpdir: &TempDir) -> Arc> { + let log = Logger::root(Discard, o!()); + let db_path = tmpdir.path().join("ddm").to_str().unwrap().to_string(); + let db = Db::new(&db_path, log.clone()).expect("open db"); + Arc::new(Mutex::new(HandlerContext { + event_channels: vec![], + db, + stats: Arc::new(RouterStats::default()), + peers: vec![], + stats_handler: Arc::new(Mutex::new(None)), + log, + })) + } + + #[test] + fn put_peer_round_trips() { + let tmpdir = TempDir::new().expect("tempdir"); + let ctx = build_context(&tmpdir); + + let info = PeerInfo { + status: PeerStatus::Active, + addr: "fd00::1".parse().unwrap(), + host: "test-sled-1".to_string(), + kind: RouterKind::Server, + if_name: Some("tfportrear0_0".to_string()), + }; + + do_put_peer( + &ctx, + PutPeerRequest { + if_index: 7, + info: info.clone(), + }, + ); + + let peers = do_get_peers(&ctx); + assert_eq!(peers.len(), 1); + let got = peers.get(&7).expect("peer at if_index 7"); + assert_eq!(got, &info); + + // Overwriting at the same `if_index` replaces the entry rather + // than creating a second one. 
+ let info2 = PeerInfo { + addr: "fd00::2".parse().unwrap(), + host: "test-sled-1-replaced".to_string(), + ..info + }; + do_put_peer( + &ctx, + PutPeerRequest { + if_index: 7, + info: info2.clone(), + }, + ); + let peers = do_get_peers(&ctx); + assert_eq!(peers.len(), 1, "overwrite at same if_index keeps map size",); + assert_eq!(peers[&7].addr, info2.addr); + } +} diff --git a/ddm/src/db.rs b/ddm/src/db.rs index 13338cc4..30eec234 100644 --- a/ddm/src/db.rs +++ b/ddm/src/db.rs @@ -2,9 +2,9 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use ddm_types::db::{PeerInfo, TunnelRoute}; +use ddm_types::db::{MulticastRoute, PeerInfo, TunnelRoute}; use mg_common::lock; -use mg_common::net::TunnelOrigin; +use mg_common::net::{MulticastOrigin, TunnelOrigin}; use oxnet::{IpNet, Ipv6Net}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -21,6 +21,10 @@ const ORIGINATE: &str = "originate"; /// tunnel endpoints. const TUNNEL_ORIGINATE: &str = "tunnel_originate"; +/// The handle used to open a persistent key-value tree for originated +/// multicast groups. 
+const MCAST_ORIGINATE: &str = "mcast_originate"; + #[derive(thiserror::Error, Debug)] pub enum Error { #[error("datastore error {0}")] @@ -48,6 +52,7 @@ pub struct DbData { pub peers: HashMap, pub imported: HashSet, pub imported_tunnel: HashSet, + pub imported_mcast: HashSet, } unsafe impl Sync for Db {} @@ -85,6 +90,14 @@ impl Db { lock!(self.data).imported_tunnel.len() } + pub fn imported_mcast(&self) -> HashSet { + lock!(self.data).imported_mcast.clone() + } + + pub fn imported_mcast_count(&self) -> usize { + lock!(self.data).imported_mcast.len() + } + pub fn import(&self, r: &HashSet) { lock!(self.data).imported.extend(r.clone()); } @@ -93,6 +106,10 @@ impl Db { lock!(self.data).imported_tunnel.extend(r.clone()); } + pub fn import_mcast(&self, r: &HashSet) { + lock!(self.data).imported_mcast.extend(r.clone()); + } + pub fn delete_import(&self, r: &HashSet) { let imported = &mut lock!(self.data).imported; for x in r { @@ -107,6 +124,38 @@ impl Db { } } + pub fn delete_import_mcast(&self, r: &HashSet) { + let imported = &mut lock!(self.data).imported_mcast; + for x in r { + imported.remove(x); + } + } + + /// Atomically import and delete multicast routes under a single lock, + /// returning the effective difference (additions + removals) against the + /// state before mutation. + /// + /// This avoids a TOCTOU race where concurrent mutations between separate + /// lock acquisitions could produce an incorrect view difference. 
+ pub fn update_imported_mcast( + &self, + import: &HashSet, + remove: &HashSet, + ) -> (HashSet, HashSet) { + let mut data = lock!(self.data); + + let before = data.imported_mcast.clone(); + data.imported_mcast.extend(import.iter().cloned()); + + for x in remove { + data.imported_mcast.remove(x); + } + + let to_add = data.imported_mcast.difference(&before).cloned().collect(); + let to_del = before.difference(&data.imported_mcast).cloned().collect(); + (to_add, to_del) + } + pub fn originate(&self, prefixes: &HashSet) -> Result<(), Error> { let tree = self.persistent_data.open_tree(ORIGINATE)?; for p in prefixes { @@ -129,6 +178,19 @@ impl Db { Ok(()) } + pub fn originate_mcast( + &self, + origins: &HashSet, + ) -> Result<(), Error> { + let tree = self.persistent_data.open_tree(MCAST_ORIGINATE)?; + for o in origins { + let entry = serde_json::to_string(o)?; + tree.insert(entry.as_str(), "")?; + } + tree.flush()?; + Ok(()) + } + pub fn originated(&self) -> Result, Error> { let tree = self.persistent_data.open_tree(ORIGINATE)?; let result = tree @@ -178,6 +240,7 @@ impl Db { return None; } }; + let value = String::from_utf8_lossy(&key); let value: TunnelOrigin = match serde_json::from_str(&value) { Ok(item) => item, @@ -199,6 +262,44 @@ impl Db { Ok(self.originated_tunnel()?.len()) } + pub fn originated_mcast(&self) -> Result, Error> { + let tree = self.persistent_data.open_tree(MCAST_ORIGINATE)?; + let result = tree + .scan_prefix(vec![]) + .filter_map(|item| { + let (key, _value) = match item { + Ok(item) => item, + Err(e) => { + error!( + self.log, + "db: error fetching ddm mcast origin entry: {e}" + ); + return None; + } + }; + + let value = String::from_utf8_lossy(&key); + let value: MulticastOrigin = match serde_json::from_str(&value) + { + Ok(item) => item, + Err(e) => { + error!( + self.log, + "db: error parsing ddm mcast origin: {e}" + ); + return None; + } + }; + Some(value) + }) + .collect(); + Ok(result) + } + + pub fn originated_mcast_count(&self) -> 
Result { + Ok(self.originated_mcast()?.len()) + } + pub fn withdraw(&self, prefixes: &HashSet) -> Result<(), Error> { let tree = self.persistent_data.open_tree(ORIGINATE)?; for p in prefixes { @@ -221,6 +322,19 @@ impl Db { Ok(()) } + pub fn withdraw_mcast( + &self, + origins: &HashSet, + ) -> Result<(), Error> { + let tree = self.persistent_data.open_tree(MCAST_ORIGINATE)?; + for o in origins { + let entry = serde_json::to_string(o)?; + tree.remove(entry.as_str())?; + } + tree.flush()?; + Ok(()) + } + /// Set peer info at the given index. Returns true if peer information was /// changed. pub fn set_peer(&self, index: u32, info: PeerInfo) -> bool { @@ -233,7 +347,11 @@ impl Db { pub fn remove_nexthop_routes( &self, nexthop: Ipv6Addr, - ) -> (HashSet, HashSet) { + ) -> ( + HashSet, + HashSet, + HashSet, + ) { let mut data = lock!(self.data); // Routes are generally held in sets to prevent duplication and provide // handy set-algebra operations. @@ -256,7 +374,18 @@ impl Db { for x in &tnl_removed { data.imported_tunnel.remove(x); } - (removed, tnl_removed) + + let mut mcast_removed = HashSet::new(); + for x in &data.imported_mcast { + if x.nexthop == nexthop { + mcast_removed.insert(x.clone()); + } + } + for x in &mcast_removed { + data.imported_mcast.remove(x); + } + + (removed, tnl_removed, mcast_removed) } pub fn remove_peer(&self, index: u32) { diff --git a/ddm/src/discovery/mod.rs b/ddm/src/discovery/mod.rs new file mode 100644 index 00000000..c3feb1bb --- /dev/null +++ b/ddm/src/discovery/mod.rs @@ -0,0 +1,118 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! This module implements the ddm router discovery mechanisms. These +//! mechanisms are responsible for three primary things +//! +//! 1. Soliciting other routers through UDP/IPv6 link local multicast. +//! 2. 
Sending out router advertisements in response to solicitations. +//! 3. Continuously soliciting link-local at a configurable rate to keep +//! sessions alive and sending out notifications when peering arrangements +//! expire due to not getting a solicitation response within a configurable +//! time threshold. +//! +//! [`Version`] and [`DiscoveryError`] are platform-agnostic and stay in this +//! module so the state machine type definitions in [`crate::sm`] continue to +//! compile when the routing runtime is gated out (e.g. Linux test fixtures +//! running ddmd with `--no-state-machine`). The runtime helpers that drive +//! the protocol over UDPv6 sockets live in the [`runtime`] submodule and +//! are illumos-only. +//! +//! ## Protocol +//! +//! The general sequence of events is depicted in the following diagram. +//! +//! *==========* *==========* +//! | violin | | piano | +//! *==========* *==========* +//! | | +//! | solicit(ff02::dd) | +//! |-------------------------->| +//! | advertise(fe80::47) | +//! |<--------------------------| +//! | | +//! | ... | +//! | | +//! | | +//! | solicit(ff02::dd) | +//! |-------------------------->| +//! | advertise(fe80::47) | +//! |<--------------------------| +//! | | +//! | solicit(ff02::dd) | +//! |-------------------------->| +//! | solicit(ff02::dd) | +//! |-------------------------->| +//! | solicit(ff02::dd) | +//! |-------------------------->| +//! | | +//! +----| | +//! expire | | | +//! piano | | | +//! +--->| | +//! +//! This shows violin sending a link-local multicast solicitation over the wire. +//! That solicitation is received by piano and piano responds with an +//! advertisement to violin's link-local unicast address. From this point +//! forward solicitations and responses continue. Each time violin gets a +//! response from piano, it updates the last seen timestamp for piano. If at +//! some point piano stops responding to solicitations and the last seen +//! 
timestamp is older than the expiration threshold, violin will expire the +//! session and send out a notification to the ddm state machine that started +//! it. Violin will continue to send out solicitations in case piano comes back. +//! +//! In the event that piano undergoes renumbering e.g. its link-local unicast +//! address changes, this will be detected by violin and an advertisement update +//! will be sent to the ddm state machine through the notification channel +//! provided to the discovery subsystem. +//! +//! The DDM discovery multicast address is ff02::dd. Discovery packets are sent +//! over UDP using port number 0xddd. +//! +//! ## Packets +//! +//! Discovery packets follow a very simple format +//! +//! 1 2 3 +//! 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +//! +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +//! | version |S A r r r r r r| router kind | hostname len | +//! +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +//! | hostname : +//! +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +//! : .... : +//! +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +//! +//! The first byte indicates the version. The only valid version at present is +//! version 1. The second byte is a flags bitfield. The first position `S` +//! indicates a solicitation. The second position `A` indicates an +//! advertisement. All other positions are reserved for future use. The third +//! byte indicates the kind of router. Current values are 0 for a server router +//! and 1 for a transit router. The fourth byte is a hostname length followed +//! directly by a hostname of up to 255 bytes in length. 
+ +use thiserror::Error; + +#[cfg(all(feature = "illumos", target_os = "illumos"))] +mod runtime; + +#[cfg(all(feature = "illumos", target_os = "illumos"))] +pub(crate) use runtime::handler; + +#[derive(Debug, Copy, Clone)] +#[repr(u8)] +pub enum Version { + V2 = 2, + V3 = 3, + V4 = 4, +} + +#[derive(Error, Debug)] +pub enum DiscoveryError { + #[error("io error: {0}")] + Io(#[from] std::io::Error), + + #[error("serialization error: {0}")] + Serialization(#[from] ispf::Error), +} diff --git a/ddm/src/discovery.rs b/ddm/src/discovery/runtime.rs similarity index 72% rename from ddm/src/discovery.rs rename to ddm/src/discovery/runtime.rs index fc4a84e8..8c675664 100644 --- a/ddm/src/discovery.rs +++ b/ddm/src/discovery/runtime.rs @@ -2,92 +2,14 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! This file implements the ddm router discovery mechanisms. These mechanisms -//! are responsible for three primary things -//! -//! 1. Soliciting other routers through UDP/IPv6 link local multicast. -//! 2. Sending out router advertisements in response to solicitations. -//! 3. Continuously soliciting link-local at a configurable rate to keep -//! sessions alive and sending out notifications when peering arrangements -//! expire due to not getting a solicitation response within a configurable -//! time threshold. -//! -//! ## Protocol -//! -//! The general sequence of events is depicted in the following diagram. -//! -//! *==========* *==========* -//! | violin | | piano | -//! *==========* *==========* -//! | | -//! | solicit(ff02::dd) | -//! |-------------------------->| -//! | advertise(fe80::47) | -//! |<--------------------------| -//! | | -//! | ... | -//! | | -//! | | -//! | solicit(ff02::dd) | -//! |-------------------------->| -//! | advertise(fe80::47) | -//! |<--------------------------| -//! | | -//! | solicit(ff02::dd) | -//! |-------------------------->| -//! 
| solicit(ff02::dd) | -//! |-------------------------->| -//! | solicit(ff02::dd) | -//! |-------------------------->| -//! | | -//! +----| | -//! expire | | | -//! piano | | | -//! +--->| | -//! -//! This shows violin sending a link-local multicast solicitation over the wire. -//! That solicitation is received by piano and piano respons with an -//! advertisement to violin's link-local unicast address. From this point -//! forward solicitations and responses continue. Each time violin gets a -//! response from piano, it updates the last seen timestamp for piano. If at -//! some point piano stops responding to solicitations and the last seen -//! timestamp is older than the expiration threshold, violin will expire the -//! session and send out a notification to the ddm state machine that started -//! it. Violin will continue to send out solicitations in case piano comes back. -//! -//! In the event that piano undergoes renumbering e.g. it's link-local unicast -//! address changes, this will be detected by violin and an advertisement update -//! will be sent to the ddm state machine through the notification channel -//! provided to the discovery subsystem. -//! -//! The DDM discovery multicast address is ff02::dd. Discovery packets are sent -//! over UDP using port number 0xddd. -//! -//! ## Packets -//! -//! Discovery packets follow a very simple format -//! -//! 1 2 3 -//! 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -//! +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -//! | version |S A r r r r r r| router kind | hostname len | -//! +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -//! | hostname : -//! +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -//! : .... : -//! +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -//! -//! The first byte indicates the version. The only valid version at present is -//! version 1. The second byte is a flags bitfield. 
The first position `S` -//! indicates a solicitation. The second position `A` indicates and -//! advertisement. All other positions are reserved for future use. The third -//! byte indicates the kind of router. Current values are 0 for a server router -//! and 1 for a transit routers. The fourth byte is a hostname length followed -//! directly by a hostname of up to 255 bytes in length. +//! Runtime helpers for ddm router discovery: link-local UDPv6 sockets, +//! solicitation/advertisement loops, neighbor liveness, and the +//! [`handler`] entry point invoked by the routing state machine. +//! illumos-only. +use super::{DiscoveryError, Version}; use crate::db::Db; use crate::sm::{Config, Event, NeighborEvent, SessionStats}; -use crate::util::u8_slice_assume_init_ref; use crate::{dbg, err, inf, trc, wrn}; use ddm_types::db::{PeerInfo, PeerStatus, RouterKind}; use mg_common::lock; @@ -101,31 +23,22 @@ use std::sync::mpsc::Sender; use std::sync::{Arc, RwLock}; use std::thread::{sleep, spawn}; use std::time::{Duration, Instant}; -use thiserror::Error; const DDM_MADDR: Ipv6Addr = Ipv6Addr::new(0xff02, 0, 0, 0, 0, 0, 0, 0xdd); const DDM_PORT: u16 = 0xddd; const SOLICIT: u8 = 1; const ADVERTISE: u8 = 1 << 1; -#[derive(Debug, Copy, Clone)] -#[repr(u8)] -pub enum Version { - V2 = 2, - V3 = 3, -} - -#[derive(Error, Debug)] -pub enum DiscoveryError { - #[error("io error: {0}")] - Io(#[from] std::io::Error), - - #[error("serialization error: {0}")] - Serialization(#[from] ispf::Error), +/// Reinterpret an initialized prefix of `[MaybeUninit]` as `[u8]`. +/// +/// TODO: trade for `MaybeUninit::slice_assume_init_ref` when it stabilizes. 
+#[inline(always)] +const unsafe fn u8_slice_assume_init_ref(slice: &[MaybeUninit]) -> &[u8] { + unsafe { &*(slice as *const [MaybeUninit] as *const [u8]) } } #[derive(Debug, Serialize, Deserialize)] -pub struct DiscoveryPacket { +struct DiscoveryPacket { version: u8, flags: u8, kind: RouterKind, @@ -134,34 +47,28 @@ pub struct DiscoveryPacket { } impl DiscoveryPacket { - pub fn new_solicitation(hostname: String, kind: RouterKind) -> Self { + fn new_solicitation(hostname: String, kind: RouterKind) -> Self { Self { - version: Version::V2 as u8, + version: Version::V4 as u8, flags: SOLICIT, hostname, kind, } } - pub fn new_advertisement(hostname: String, kind: RouterKind) -> Self { + fn new_advertisement(hostname: String, kind: RouterKind) -> Self { Self { - version: Version::V2 as u8, + version: Version::V4 as u8, flags: ADVERTISE, hostname, kind, } } - pub fn is_solicitation(&self) -> bool { + fn is_solicitation(&self) -> bool { (self.flags & SOLICIT) != 0 } - pub fn is_advertisement(&self) -> bool { + fn is_advertisement(&self) -> bool { (self.flags & ADVERTISE) != 0 } - pub fn set_solicitation(&mut self) { - self.flags &= SOLICIT; - } - pub fn set_advertisement(&mut self) { - self.flags &= ADVERTISE; - } } #[derive(Clone)] @@ -461,12 +368,12 @@ fn handle_advertisement( let version = match version { 2 => Version::V2, 3 => Version::V3, + 4 => Version::V4, x => { err!( ctx.log, ctx.config.if_name, - "unknown protocol version {}, known versions are: 1, 2", - x + "unknown protocol version {x}, known versions are: 2, 3, 4" ); return; } @@ -526,6 +433,7 @@ fn handle_advertisement( addr: *sender, host: hostname, kind, + if_name: Some(ctx.config.if_name.clone()), }, ); if updated { diff --git a/ddm/src/exchange/mod.rs b/ddm/src/exchange/mod.rs new file mode 100644 index 00000000..0e384607 --- /dev/null +++ b/ddm/src/exchange/mod.rs @@ -0,0 +1,585 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! This module implements the ddm router prefix exchange mechanisms. These +//! mechanisms are responsible for announcing and withdrawing prefix sets to +//! and from peers. +//! +//! The module has a set of request initiators and request handlers for +//! announcing, withdrawing, and synchronizing routes with a given peer. +//! Communication between peers is over HTTP(s) requests. +//! +//! This module only contains basic mechanisms for prefix information exchange +//! with peers. How those mechanisms are used in the overall state machine +//! model of a ddm router is defined in the state machine implementation in +//! [`crate::sm`]. +//! +//! The wire types ([`Update`], [`UnderlayUpdate`], [`TunnelUpdate`], +//! [`MulticastUpdate`], and their versioned counterparts) are +//! platform-agnostic and stay in this module. The runtime helpers that drive +//! the HTTP exchange protocol and program forwarding state live in the +//! [`runtime`] submodule and are illumos-only, since they call into +//! [`crate::sys`] to install routes. + +use ddm_types::exchange::{ + MulticastPathHop, MulticastPathVector, PathVector, PathVectorV2, +}; +use mg_common::net::{TunnelOrigin, TunnelOriginV2}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use std::collections::HashSet; +use thiserror::Error; + +#[cfg(all(feature = "illumos", target_os = "illumos"))] +mod runtime; + +#[cfg(all(feature = "illumos", target_os = "illumos"))] +pub(crate) use runtime::{ + announce_multicast, announce_tunnel, announce_underlay, do_pull_v4, + handler, pull, withdraw_multicast, withdraw_tunnel, withdraw_underlay, +}; + +/// THIS TYPE IS FOR DDM PROTOCOL VERSION 1. IT SHALL NEVER CHANGE. THIS TYPE +/// CAN BE REMOVED WHEN DDMV1 CLIENTS AND SERVERS NO LONGER EXIST BUT ITS +/// DEFINITION SHALL NEVER CHANGE. 
+#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] +pub struct UpdateV1 { + pub announce: HashSet, + pub withdraw: HashSet, +} + +/// THIS TYPE IS FOR DDM PROTOCOL VERSION 2. IT SHALL NEVER CHANGE. THIS TYPE +/// CAN BE REMOVED WHEN DDMV2 CLIENTS AND SERVERS NO LONGER EXIST BUT ITS +/// DEFINITION SHALL NEVER CHANGE. +#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] +pub struct UpdateV2 { + pub underlay: Option, + pub tunnel: Option, +} + +/// THIS TYPE IS FOR DDM PROTOCOL VERSION 3. IT SHALL NEVER CHANGE. THIS TYPE +/// CAN BE REMOVED WHEN DDMV3 CLIENTS AND SERVERS NO LONGER EXIST BUT ITS +/// DEFINITION SHALL NEVER CHANGE. +#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] +pub struct UpdateV3 { + pub underlay: Option, + pub tunnel: Option, +} + +#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] +pub struct Update { + pub underlay: Option, + pub tunnel: Option, + pub multicast: Option, +} + +impl From for Update { + fn from(value: UpdateV1) -> Self { + Update { + tunnel: None, + underlay: Some(UnderlayUpdate { + announce: value.announce, + withdraw: value.withdraw, + }), + multicast: None, + } + } +} + +impl From for Update { + fn from(value: UpdateV2) -> Self { + Update { + tunnel: value.tunnel.map(TunnelUpdate::from), + underlay: value.underlay.map(UnderlayUpdate::from), + // V2 protocol doesn't support multicast + multicast: None, + } + } +} + +impl From for UpdateV1 { + fn from(value: Update) -> Self { + let (announce, withdraw) = match value.underlay { + Some(underlay) => (underlay.announce, underlay.withdraw), + None => (HashSet::new(), HashSet::new()), + }; + UpdateV1 { announce, withdraw } + } +} + +impl From for UpdateV2 { + fn from(value: Update) -> Self { + UpdateV2 { + tunnel: value.tunnel.map(TunnelUpdateV2::from), + underlay: value.underlay.map(UnderlayUpdateV2::from), + } + } +} + +impl From for Update { + fn from(value: UpdateV3) -> Self { + Update { + underlay: 
value.underlay, + tunnel: value.tunnel, + multicast: None, + } + } +} + +impl From for UpdateV3 { + fn from(value: Update) -> Self { + UpdateV3 { + underlay: value.underlay, + tunnel: value.tunnel, + } + } +} + +impl From for Update { + fn from(u: UnderlayUpdate) -> Self { + Update { + underlay: Some(u), + tunnel: None, + multicast: None, + } + } +} + +impl From for Update { + fn from(t: TunnelUpdate) -> Self { + Update { + underlay: None, + tunnel: Some(t), + multicast: None, + } + } +} + +impl From for Update { + fn from(m: MulticastUpdate) -> Self { + Update { + underlay: None, + tunnel: None, + multicast: Some(m), + } + } +} + +/// THIS TYPE IS FOR DDM PROTOCOL VERSION 3. IT SHALL NEVER CHANGE. THIS TYPE +/// CAN BE REMOVED WHEN DDMV3 CLIENTS AND SERVERS NO LONGER EXIST BUT ITS +/// DEFINITION SHALL NEVER CHANGE. +#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] +pub struct PullResponseV3 { + pub underlay: Option>, + pub tunnel: Option>, +} + +#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] +pub struct PullResponse { + pub underlay: Option>, + pub tunnel: Option>, + pub multicast: Option>, +} + +/// THIS TYPE IS FOR DDM PROTOCOL VERSION 2. IT SHALL NEVER CHANGE. THIS TYPE +/// CAN BE REMOVED WHEN DDMV2 CLIENTS AND SERVERS NO LONGER EXIST BUT ITS +/// DEFINITION SHALL NEVER CHANGE. 
+#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] +pub struct PullResponseV2 { + pub underlay: Option>, + pub tunnel: Option>, +} + +impl From for PullResponse { + fn from(value: PullResponseV2) -> Self { + PullResponse { + underlay: value + .underlay + .map(|x| x.into_iter().map(PathVector::from).collect()), + tunnel: value + .tunnel + .map(|x| x.into_iter().map(TunnelOrigin::from).collect()), + // V2 protocol doesn't support multicast + multicast: None, + } + } +} + +impl From for PullResponse { + fn from(value: PullResponseV3) -> Self { + PullResponse { + underlay: value.underlay, + tunnel: value.tunnel, + multicast: None, + } + } +} + +impl From> for PullResponse { + fn from(value: HashSet) -> Self { + PullResponse { + underlay: Some(value), + tunnel: None, + multicast: None, + } + } +} + +#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] +pub struct UnderlayUpdate { + pub announce: HashSet, + pub withdraw: HashSet, +} + +/// THIS TYPE IS FOR DDM PROTOCOL VERSION 2. IT SHALL NEVER CHANGE. THIS TYPE +/// CAN BE REMOVED WHEN DDMV2 CLIENTS AND SERVERS NO LONGER EXIST BUT ITS +/// DEFINITION SHALL NEVER CHANGE. 
+#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] +pub struct UnderlayUpdateV2 { + pub announce: HashSet, + pub withdraw: HashSet, +} + +impl From for UnderlayUpdateV2 { + fn from(value: UnderlayUpdate) -> Self { + UnderlayUpdateV2 { + announce: value + .announce + .into_iter() + .map(PathVectorV2::from) + .collect(), + withdraw: value + .withdraw + .into_iter() + .map(PathVectorV2::from) + .collect(), + } + } +} + +impl From for UnderlayUpdate { + fn from(value: UnderlayUpdateV2) -> Self { + UnderlayUpdate { + announce: value + .announce + .into_iter() + .map(PathVector::from) + .collect(), + withdraw: value + .withdraw + .into_iter() + .map(PathVector::from) + .collect(), + } + } +} + +impl UnderlayUpdate { + pub fn announce(prefixes: HashSet) -> Self { + Self { + announce: prefixes, + ..Default::default() + } + } + pub fn withdraw(prefixes: HashSet) -> Self { + Self { + withdraw: prefixes, + ..Default::default() + } + } + pub fn with_path_element(&self, element: String) -> Self { + Self { + announce: self + .announce + .iter() + .map(|x| { + let mut pv = x.clone(); + pv.path.push(element.clone()); + pv + }) + .collect(), + withdraw: self + .withdraw + .iter() + .map(|x| { + let mut pv = x.clone(); + pv.path.push(element.clone()); + pv + }) + .collect(), + } + } +} + +#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] +pub struct TunnelUpdate { + pub announce: HashSet, + pub withdraw: HashSet, +} + +/// THIS TYPE IS FOR DDM PROTOCOL VERSION 2. IT SHALL NEVER CHANGE. THIS TYPE +/// CAN BE REMOVED WHEN DDMV2 CLIENTS AND SERVERS NO LONGER EXIST BUT ITS +/// DEFINITION SHALL NEVER CHANGE. 
+#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] +pub struct TunnelUpdateV2 { + pub announce: HashSet, + pub withdraw: HashSet, +} + +impl From for TunnelUpdate { + fn from(value: TunnelUpdateV2) -> Self { + TunnelUpdate { + announce: value + .announce + .into_iter() + .map(TunnelOrigin::from) + .collect(), + withdraw: value + .withdraw + .into_iter() + .map(TunnelOrigin::from) + .collect(), + } + } +} + +impl From for TunnelUpdateV2 { + fn from(value: TunnelUpdate) -> Self { + TunnelUpdateV2 { + announce: value + .announce + .into_iter() + .map(TunnelOriginV2::from) + .collect(), + withdraw: value + .withdraw + .into_iter() + .map(TunnelOriginV2::from) + .collect(), + } + } +} + +impl TunnelUpdate { + pub fn announce(prefixes: HashSet) -> Self { + Self { + announce: prefixes, + ..Default::default() + } + } + pub fn withdraw(prefixes: HashSet) -> Self { + Self { + withdraw: prefixes, + ..Default::default() + } + } +} + +/// Multicast group subscription updates. +/// +/// Each entry carries a [`MulticastPathVector`] containing a +/// [`MulticastOrigin`] (overlay group + ff04::/64 underlay mapping) +/// and the path vector for loop detection. +/// +/// [`MulticastOrigin`]: mg_common::net::MulticastOrigin +#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] +pub struct MulticastUpdate { + pub announce: HashSet, + pub withdraw: HashSet, +} + +impl MulticastUpdate { + pub fn announce(groups: HashSet) -> Self { + Self { + announce: groups, + ..Default::default() + } + } + pub fn withdraw(groups: HashSet) -> Self { + Self { + withdraw: groups, + ..Default::default() + } + } + + /// Add a hop to all path vectors in this update. 
+ pub fn with_hop(&self, hop: MulticastPathHop) -> Self { + Self { + announce: self + .announce + .iter() + .map(|pv| pv.with_hop(hop.clone())) + .collect(), + withdraw: self + .withdraw + .iter() + .map(|pv| pv.with_hop(hop.clone())) + .collect(), + } + } +} + +#[derive(Error, Debug)] +pub enum ExchangeError { + #[error("io error: {0}")] + Io(#[from] std::io::Error), + + #[error("hyper error: {0}")] + Hyper(#[from] hyper::Error), + + #[error("hyper client error: {0}")] + HyperClient(#[from] hyper_util::client::legacy::Error), + + #[error("timeout error: {0}")] + Timeout(#[from] tokio::time::error::Elapsed), + + #[error("json error: {0}")] + SerdeJson(#[from] serde_json::Error), +} + +#[cfg(test)] +mod tests { + use super::*; + use ddm_types::exchange::MulticastPathHop; + use mg_common::net::{MulticastOrigin, UnderlayMulticastIpv6, Vni}; + use std::net::Ipv6Addr; + + fn sample_multicast_update() -> MulticastUpdate { + let origin = MulticastOrigin { + overlay_group: "233.252.0.1".parse().unwrap(), + underlay_group: UnderlayMulticastIpv6::new( + "ff04::1".parse().unwrap(), + ) + .unwrap(), + vni: Vni::try_from(77u32).unwrap(), + metric: 0, + source: None, + }; + let pv = MulticastPathVector { + origin, + path: vec![MulticastPathHop::new( + "router-1".into(), + Ipv6Addr::LOCALHOST, + )], + }; + MulticastUpdate::announce([pv].into_iter().collect()) + } + + #[test] + fn v4_update_round_trips() { + let update = Update { + underlay: None, + tunnel: None, + multicast: Some(sample_multicast_update()), + }; + let json = serde_json::to_string(&update).unwrap(); + let back: Update = serde_json::from_str(&json).unwrap(); + assert!(back.multicast.is_some()); + assert_eq!(back.multicast.unwrap().announce.len(), 1,); + } + + #[test] + fn v4_update_deserializes_as_v3_drops_multicast() { + let update = Update { + underlay: None, + tunnel: None, + multicast: Some(sample_multicast_update()), + }; + let json = serde_json::to_string(&update).unwrap(); + // A V3 peer would deserialize 
this as UpdateV3, silently + // dropping the unknown multicast field. + let v3: UpdateV3 = serde_json::from_str(&json).unwrap(); + assert!(v3.underlay.is_none()); + assert!(v3.tunnel.is_none()); + } + + #[test] + fn v3_update_deserializes_as_v4_multicast_none() { + let v3 = UpdateV3 { + underlay: None, + tunnel: None, + }; + let json = serde_json::to_string(&v3).unwrap(); + // A V4 peer receiving a V3 update gets multicast: None. + let update: Update = serde_json::from_str(&json).unwrap(); + assert!(update.multicast.is_none()); + } + + #[test] + fn v4_pull_response_round_trips() { + let origin = MulticastOrigin { + overlay_group: "ff0e::1".parse().unwrap(), + underlay_group: UnderlayMulticastIpv6::new( + "ff04::2".parse().unwrap(), + ) + .unwrap(), + vni: Vni::try_from(77u32).unwrap(), + metric: 0, + source: None, + }; + let pv = MulticastPathVector { + origin, + path: vec![], + }; + let resp = PullResponse { + underlay: None, + tunnel: None, + multicast: Some([pv].into_iter().collect()), + }; + let json = serde_json::to_string(&resp).unwrap(); + let back: PullResponse = serde_json::from_str(&json).unwrap(); + assert!(back.multicast.is_some()); + } + + #[test] + fn v4_pull_response_deserializes_as_v3() { + let origin = MulticastOrigin { + overlay_group: "233.252.0.1".parse().unwrap(), + underlay_group: UnderlayMulticastIpv6::new( + "ff04::1".parse().unwrap(), + ) + .unwrap(), + vni: Vni::try_from(77u32).unwrap(), + metric: 0, + source: None, + }; + let pv = MulticastPathVector { + origin, + path: vec![], + }; + let resp = PullResponse { + underlay: None, + tunnel: None, + multicast: Some([pv].into_iter().collect()), + }; + let json = serde_json::to_string(&resp).unwrap(); + // V3 peer drops the multicast field. 
+ let v3: PullResponseV3 = serde_json::from_str(&json).unwrap(); + assert!(v3.underlay.is_none()); + assert!(v3.tunnel.is_none()); + } + + #[test] + fn v3_pull_response_deserializes_as_v4() { + let v3 = PullResponseV3 { + underlay: None, + tunnel: None, + }; + let json = serde_json::to_string(&v3).unwrap(); + let resp: PullResponse = serde_json::from_str(&json).unwrap(); + assert!(resp.multicast.is_none()); + } + + #[test] + fn from_conversions_strip_multicast() { + let update = Update { + underlay: None, + tunnel: None, + multicast: Some(sample_multicast_update()), + }; + let v3 = UpdateV3::from(update); + let back = Update::from(v3); + assert!(back.multicast.is_none()); + } +} diff --git a/ddm/src/exchange.rs b/ddm/src/exchange/runtime.rs similarity index 62% rename from ddm/src/exchange.rs rename to ddm/src/exchange/runtime.rs index 2c1cc876..ea069f1d 100644 --- a/ddm/src/exchange.rs +++ b/ddm/src/exchange/runtime.rs @@ -2,29 +2,25 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! This file implements the ddm router prefix exchange mechanisms. These -//! mechanisms are responsible for announcing and withdrawing prefix sets to and -//! from peers. -//! -//! This file has a set of request initiators and request handlers for -//! announcing, withdrawing and synchronizing routes with a a given peer. -//! Communication between peers is over HTTP(s) requests. -//! -//! This file only contains basic mechanisms for prefix information exchange -//! with peers. How those mechanisms are used in the overall state machine model -//! of a ddm router is defined in the state machine implementation in sm.rs. -//! - +//! Runtime helpers for the ddm prefix exchange protocol: HTTP push/pull +//! initiators and dropshot endpoint handlers, and the route programming +//! plumbing that drains received updates into the local DB and the +//! forwarding platform via [`crate::sys`]. illumos-only. 
+ +use super::{ + ExchangeError, MulticastUpdate, PullResponse, PullResponseV2, + PullResponseV3, TunnelUpdate, UnderlayUpdate, Update, UpdateV2, UpdateV3, +}; use crate::db::{Route, effective_route_set}; use crate::discovery::Version; use crate::sm::{Config, Event, PeerEvent, SmContext}; use crate::{dbg, err, inf, wrn}; -use ddm_types::db::{RouterKind, TunnelRoute}; -use ddm_types::exchange::{PathVector, PathVectorV2}; +use ddm_types::db::{MulticastRoute, RouterKind, TunnelRoute}; +use ddm_types::exchange::{ + MulticastPathHop, MulticastPathVector, PathVector, PathVectorV2, +}; use dropshot::ApiDescription; use dropshot::ConfigDropshot; -use dropshot::ConfigLogging; -use dropshot::ConfigLoggingLevel; use dropshot::HttpError; use dropshot::HttpResponseOk; use dropshot::HttpResponseUpdatedNoContent; @@ -37,18 +33,17 @@ use hyper::body::Bytes; use hyper_util::client::legacy::Client; use hyper_util::rt::TokioExecutor; use mg_common::net::{TunnelOrigin, TunnelOriginV2}; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; -use slog::Logger; +use slog::{Logger, o}; use std::collections::HashSet; use std::net::{Ipv6Addr, SocketAddrV6}; use std::sync::Arc; use std::sync::atomic::Ordering; use std::time::Duration; -use thiserror::Error; use tokio::sync::Mutex; use tokio::time::timeout; +const UNIT_EXCHANGE_SERVER: &str = "exchange_server"; + #[derive(Clone)] pub struct HandlerContext { ctx: SmContext, @@ -56,302 +51,19 @@ pub struct HandlerContext { log: Logger, } -/// THIS TYPE IS FOR DDM PROTOCOL VERSION 1. IT SHALL NEVER CHANGE. THIS TYPE -/// CAN BE REMOVED WHEN DDMV1 CLIENTS AND SERVERS NO LONGER EXIST BUT ITS -/// DEFINITION SHALL NEVER CHANGE. -#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] -pub struct UpdateV1 { - pub announce: HashSet, - pub withdraw: HashSet, -} - -/// THIS TYPE IS FOR DDM PROTOCOL VERSION 2. IT SHALL NEVER CHANGE. 
THIS TYPE -/// CAN BE REMOVED WHEN DDMV2 CLIENTS AND SERVERS NO LONGER EXIST BUT ITS -/// DEFINITION SHALL NEVER CHANGE. -#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] -pub struct UpdateV2 { - pub underlay: Option, - pub tunnel: Option, -} - -#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] -pub struct Update { - pub underlay: Option, - pub tunnel: Option, -} - -impl From for Update { - fn from(value: UpdateV1) -> Self { - Update { - tunnel: None, - underlay: Some(UnderlayUpdate { - announce: value.announce, - withdraw: value.withdraw, - }), - } - } -} - -impl From for Update { - fn from(value: UpdateV2) -> Self { - Update { - tunnel: value.tunnel.map(TunnelUpdate::from), - underlay: value.underlay.map(UnderlayUpdate::from), - } - } -} - -impl From for UpdateV1 { - fn from(value: Update) -> Self { - let (announce, withdraw) = match value.underlay { - Some(underlay) => (underlay.announce, underlay.withdraw), - None => (HashSet::new(), HashSet::new()), - }; - UpdateV1 { announce, withdraw } - } -} - -impl From for UpdateV2 { - fn from(value: Update) -> Self { - UpdateV2 { - tunnel: value.tunnel.map(TunnelUpdateV2::from), - underlay: value.underlay.map(UnderlayUpdateV2::from), - } - } -} - -impl From for Update { - fn from(u: UnderlayUpdate) -> Self { - Update { - underlay: Some(u), - tunnel: None, - } - } -} - -impl From for Update { - fn from(t: TunnelUpdate) -> Self { - Update { - underlay: None, - tunnel: Some(t), - } - } -} - impl Update { + /// Build an `Update` whose underlay/tunnel/multicast halves carry the + /// announcements from `pr`. Used by [`pull`] to project a pull response + /// back into the update event stream. 
fn announce(pr: PullResponse) -> Self { Self { underlay: pr.underlay.map(UnderlayUpdate::announce), tunnel: pr.tunnel.map(TunnelUpdate::announce), + multicast: pr.multicast.map(MulticastUpdate::announce), } } } -#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] -pub struct PullResponse { - pub underlay: Option>, - pub tunnel: Option>, -} - -/// THIS TYPE IS FOR DDM PROTOCOL VERSION 2. IT SHALL NEVER CHANGE. THIS TYPE -/// CAN BE REMOVED WHEN DDMV2 CLIENTS AND SERVERS NO LONGER EXIST BUT ITS -/// DEFINITION SHALL NEVER CHANGE. -#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] -pub struct PullResponseV2 { - pub underlay: Option>, - pub tunnel: Option>, -} - -impl From for PullResponse { - fn from(value: PullResponseV2) -> Self { - PullResponse { - underlay: value - .underlay - .map(|x| x.into_iter().map(PathVector::from).collect()), - tunnel: value - .tunnel - .map(|x| x.into_iter().map(TunnelOrigin::from).collect()), - } - } -} - -impl From> for PullResponse { - fn from(value: HashSet) -> Self { - PullResponse { - underlay: Some(value), - tunnel: None, - } - } -} - -#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] -pub struct UnderlayUpdate { - pub announce: HashSet, - pub withdraw: HashSet, -} - -/// THIS TYPE IS FOR DDM PROTOCOL VERSION 2. IT SHALL NEVER CHANGE. THIS TYPE -/// CAN BE REMOVED WHEN DDMV2 CLIENTS AND SERVERS NO LONGER EXIST BUT ITS -/// DEFINITION SHALL NEVER CHANGE. 
-#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] -pub struct UnderlayUpdateV2 { - pub announce: HashSet, - pub withdraw: HashSet, -} - -impl From for UnderlayUpdateV2 { - fn from(value: UnderlayUpdate) -> Self { - UnderlayUpdateV2 { - announce: value - .announce - .into_iter() - .map(PathVectorV2::from) - .collect(), - withdraw: value - .withdraw - .into_iter() - .map(PathVectorV2::from) - .collect(), - } - } -} - -impl From for UnderlayUpdate { - fn from(value: UnderlayUpdateV2) -> Self { - UnderlayUpdate { - announce: value - .announce - .into_iter() - .map(PathVector::from) - .collect(), - withdraw: value - .withdraw - .into_iter() - .map(PathVector::from) - .collect(), - } - } -} - -impl UnderlayUpdate { - pub fn announce(prefixes: HashSet) -> Self { - Self { - announce: prefixes, - ..Default::default() - } - } - pub fn withdraw(prefixes: HashSet) -> Self { - Self { - withdraw: prefixes, - ..Default::default() - } - } - pub fn with_path_element(&self, element: String) -> Self { - Self { - announce: self - .announce - .iter() - .map(|x| { - let mut pv = x.clone(); - pv.path.push(element.clone()); - pv - }) - .collect(), - withdraw: self - .withdraw - .iter() - .map(|x| { - let mut pv = x.clone(); - pv.path.push(element.clone()); - pv - }) - .collect(), - } - } -} - -#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] -pub struct TunnelUpdate { - pub announce: HashSet, - pub withdraw: HashSet, -} - -/// THIS TYPE IS FOR DDM PROTOCOL VERSION 2. IT SHALL NEVER CHANGE. THIS TYPE -/// CAN BE REMOVED WHEN DDMV2 CLIENTS AND SERVERS NO LONGER EXIST BUT ITS -/// DEFINITION SHALL NEVER CHANGE. 
-#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, Default)] -pub struct TunnelUpdateV2 { - pub announce: HashSet, - pub withdraw: HashSet, -} - -impl From for TunnelUpdate { - fn from(value: TunnelUpdateV2) -> Self { - TunnelUpdate { - announce: value - .announce - .into_iter() - .map(TunnelOrigin::from) - .collect(), - withdraw: value - .withdraw - .into_iter() - .map(TunnelOrigin::from) - .collect(), - } - } -} - -impl From for TunnelUpdateV2 { - fn from(value: TunnelUpdate) -> Self { - TunnelUpdateV2 { - announce: value - .announce - .into_iter() - .map(TunnelOriginV2::from) - .collect(), - withdraw: value - .withdraw - .into_iter() - .map(TunnelOriginV2::from) - .collect(), - } - } -} - -impl TunnelUpdate { - pub fn announce(prefixes: HashSet) -> Self { - Self { - announce: prefixes, - ..Default::default() - } - } - pub fn withdraw(prefixes: HashSet) -> Self { - Self { - withdraw: prefixes, - ..Default::default() - } - } -} - -#[derive(Error, Debug)] -pub enum ExchangeError { - #[error("io error: {0}")] - Io(#[from] std::io::Error), - - #[error("hyper error: {0}")] - Hyper(#[from] hyper::Error), - - #[error("hyper client error: {0}")] - HyperClient(#[from] hyper_util::client::legacy::Error), - - #[error("timeout error: {0}")] - Timeout(#[from] tokio::time::error::Elapsed), - - #[error("json error: {0}")] - SerdeJson(#[from] serde_json::Error), -} - pub(crate) fn announce_underlay( ctx: &SmContext, config: Config, @@ -404,15 +116,52 @@ pub(crate) fn withdraw_tunnel( send_update(ctx, config, update.into(), addr, version, rt, log) } -pub(crate) fn do_pull( +pub(crate) fn announce_multicast( + ctx: &SmContext, + config: Config, + groups: HashSet, + addr: Ipv6Addr, + version: Version, + rt: Arc, + log: Logger, +) -> Result<(), ExchangeError> { + let update = MulticastUpdate::announce(groups); + send_update(ctx, config, update.into(), addr, version, rt, log) +} + +pub(crate) fn withdraw_multicast( + ctx: &SmContext, + config: Config, + groups: HashSet, + 
addr: Ipv6Addr, + version: Version, + rt: Arc, + log: Logger, +) -> Result<(), ExchangeError> { + let update = MulticastUpdate::withdraw(groups); + send_update(ctx, config, update.into(), addr, version, rt, log) +} + +pub(crate) fn do_pull_v4( ctx: &SmContext, addr: &Ipv6Addr, rt: &Arc, ) -> Result { - let uri = format!( - "http://[{}%{}]:{}/v3/pull", - addr, ctx.config.if_index, ctx.config.exchange_port, - ); + let if_index = ctx.config.if_index; + let port = ctx.config.exchange_port; + let uri = format!("http://[{addr}%{if_index}]:{port}/v4/pull"); + let body = do_pull_common(uri, rt)?; + Ok(serde_json::from_slice(&body)?) +} + +pub(crate) fn do_pull_v3( + ctx: &SmContext, + addr: &Ipv6Addr, + rt: &Arc, +) -> Result { + let if_index = ctx.config.if_index; + let port = ctx.config.exchange_port; + let uri = format!("http://[{addr}%{if_index}]:{port}/v3/pull"); let body = do_pull_common(uri, rt)?; Ok(serde_json::from_slice(&body)?) } @@ -464,7 +213,8 @@ pub(crate) fn pull( ) -> Result<(), ExchangeError> { let pr: PullResponse = match version { Version::V2 => do_pull_v2(&ctx, &addr, &rt)?.into(), - Version::V3 => do_pull(&ctx, &addr, &rt)?, + Version::V3 => do_pull_v3(&ctx, &addr, &rt)?.into(), + Version::V4 => do_pull_v4(&ctx, &addr, &rt)?, }; let update = Update::announce(pr); @@ -489,43 +239,14 @@ fn send_update( log: Logger, ) -> Result<(), ExchangeError> { ctx.stats.updates_sent.fetch_add(1, Ordering::Relaxed); - match version { - Version::V2 => { - send_update_v2(ctx, config, update.into(), addr, rt, log) - } - Version::V3 => send_update_v3(ctx, config, update, addr, rt, log), - } -} - -fn send_update_v2( - ctx: &SmContext, - config: Config, - update: UpdateV2, - addr: Ipv6Addr, - rt: Arc, - log: Logger, -) -> Result<(), ExchangeError> { - let payload = serde_json::to_string(&update)?; - let uri = format!( - "http://[{}%{}]:{}/v2/push", - addr, config.if_index, config.exchange_port, - ); - send_update_common(ctx, uri, payload, config, rt, log) -} - -fn 
send_update_v3( - ctx: &SmContext, - config: Config, - update: Update, - addr: Ipv6Addr, - rt: Arc, - log: Logger, -) -> Result<(), ExchangeError> { - let payload = serde_json::to_string(&update)?; - let uri = format!( - "http://[{}%{}]:{}/v3/push", - addr, config.if_index, config.exchange_port, - ); + let (payload, path) = match version { + Version::V2 => (serde_json::to_string(&UpdateV2::from(update))?, "v2"), + Version::V3 => (serde_json::to_string(&UpdateV3::from(update))?, "v3"), + Version::V4 => (serde_json::to_string(&update)?, "v4"), + }; + let if_index = config.if_index; + let port = config.exchange_port; + let uri = format!("http://[{addr}%{if_index}]:{port}/{path}/push"); send_update_common(ctx, uri, payload, config, rt, log) } @@ -587,11 +308,11 @@ pub fn handler( ..Default::default() }; - let ds_log = ConfigLogging::StderrTerminal { - level: ConfigLoggingLevel::Error, - } - .to_logger("exchange") - .map_err(|e| e.to_string())?; + let ds_log = log.new(o!( + "component" => crate::COMPONENT_DDM, + "module" => crate::MOD_EXCHANGE, + "unit" => UNIT_EXCHANGE_SERVER, + )); inf!(log, ctx.config.if_name, "exchange: listening on {}", sa); @@ -630,9 +351,11 @@ pub fn api_description() -> Result< > { let mut api = ApiDescription::new(); api.register(push_handler_v2)?; - api.register(push_handler)?; + api.register(push_handler_v3)?; + api.register(push_handler_v4)?; api.register(pull_handler_v2)?; - api.register(pull_handler)?; + api.register(pull_handler_v3)?; + api.register(pull_handler_v4)?; Ok(api) } @@ -647,7 +370,16 @@ async fn push_handler_v2( } #[endpoint { method = PUT, path = "/v3/push" }] -async fn push_handler( +async fn push_handler_v3( + ctx: RequestContext>>, + request: TypedBody, +) -> Result { + let update = Update::from(request.into_inner()); + push_handler_common(ctx, update).await +} + +#[endpoint { method = PUT, path = "/v4/push" }] +async fn push_handler_v4( ctx: RequestContext>>, request: TypedBody, ) -> Result { @@ -744,19 +476,15 @@ async 
fn pull_handler_v2( })) } -#[endpoint { method = GET, path = "/v3/pull" }] -async fn pull_handler( - ctx: RequestContext>>, -) -> Result, HttpError> { - let ctx = ctx.context().lock().await.clone(); - +/// Collect underlay and tunnel routes for pull responses (shared by V3/V4). +fn collect_underlay_tunnel( + ctx: &HandlerContext, +) -> Result<(HashSet, HashSet), HttpError> { let mut underlay = HashSet::new(); let mut tunnel = HashSet::new(); - // Only transit routers redistribute prefixes if ctx.ctx.config.kind == RouterKind::Transit { for route in &ctx.ctx.db.imported() { - // don't redistribute prefixes to their originators if route.nexthop == ctx.peer { continue; } @@ -771,21 +499,20 @@ async fn pull_handler( if route.nexthop == ctx.peer { continue; } - let tv = route.origin; - tunnel.insert(tv); + tunnel.insert(route.origin); } } + let originated = ctx .ctx .db .originated() .map_err(|e| HttpError::for_internal_error(e.to_string()))?; for prefix in &originated { - let pv = PathVector { + underlay.insert(PathVector { destination: *prefix, path: vec![ctx.ctx.hostname.clone()], - }; - underlay.insert(pv); + }); } let originated_tunnel = ctx @@ -794,26 +521,83 @@ async fn pull_handler( .originated_tunnel() .map_err(|e| HttpError::for_internal_error(e.to_string()))?; for prefix in &originated_tunnel { - let tv = TunnelOrigin { + tunnel.insert(TunnelOrigin { overlay_prefix: prefix.overlay_prefix, boundary_addr: prefix.boundary_addr, vni: prefix.vni, metric: prefix.metric, - }; - tunnel.insert(tv); + }); + } + + Ok((underlay, tunnel)) +} + +/// Collect multicast routes for V4 pull responses. 
+fn collect_multicast( + ctx: &HandlerContext, +) -> Result, HttpError> { + let mut multicast = HashSet::new(); + + if ctx.ctx.config.kind == RouterKind::Transit { + for route in &ctx.ctx.db.imported_mcast() { + if route.nexthop == ctx.peer { + continue; + } + let hop = MulticastPathHop::new( + ctx.ctx.hostname.clone(), + ctx.ctx.config.addr, + ); + let mut path = route.path.clone(); + path.push(hop); + multicast.insert(MulticastPathVector { + origin: route.origin.clone(), + path, + }); + } + } + + let originated_mcast = ctx + .ctx + .db + .originated_mcast() + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + for origin in &originated_mcast { + let hop = MulticastPathHop::new( + ctx.ctx.hostname.clone(), + ctx.ctx.config.addr, + ); + multicast.insert(MulticastPathVector { + origin: origin.clone(), + path: vec![hop], + }); } + Ok(multicast) +} + +#[endpoint { method = GET, path = "/v3/pull" }] +async fn pull_handler_v3( + ctx: RequestContext>>, +) -> Result, HttpError> { + let ctx = ctx.context().lock().await.clone(); + let (underlay, tunnel) = collect_underlay_tunnel(&ctx)?; + Ok(HttpResponseOk(PullResponseV3 { + underlay: crate::non_empty(underlay), + tunnel: crate::non_empty(tunnel), + })) +} + +#[endpoint { method = GET, path = "/v4/pull" }] +async fn pull_handler_v4( + ctx: RequestContext>>, +) -> Result, HttpError> { + let ctx = ctx.context().lock().await.clone(); + let (underlay, tunnel) = collect_underlay_tunnel(&ctx)?; + let multicast = collect_multicast(&ctx)?; Ok(HttpResponseOk(PullResponse { - underlay: if underlay.is_empty() { - None - } else { - Some(underlay) - }, - tunnel: if tunnel.is_empty() { - None - } else { - Some(tunnel) - }, + underlay: crate::non_empty(underlay), + tunnel: crate::non_empty(tunnel), + multicast: crate::non_empty(multicast), })) } @@ -831,6 +615,10 @@ fn handle_update(update: &Update, ctx: &HandlerContext) { handle_tunnel_update(tunnel_update, ctx); } + if let Some(multicast_update) = &update.multicast { + 
handle_multicast_update(multicast_update, ctx); + } + // distribute updates if ctx.ctx.config.kind == RouterKind::Transit { @@ -846,13 +634,24 @@ fn handle_update(update: &Update, ctx: &HandlerContext) { .as_ref() .map(|update| update.with_path_element(ctx.ctx.hostname.clone())); - let push = Update { + // Add our hop info to multicast path vectors before redistribution + let multicast = update.multicast.as_ref().map(|update| { + let hop = MulticastPathHop::new( + ctx.ctx.hostname.clone(), + ctx.ctx.config.addr, + ); + update.with_hop(hop) + }); + + let push = Arc::new(Update { underlay, tunnel: update.tunnel.clone(), - }; + multicast, + }); for ec in &ctx.ctx.event_channels { - ec.send(Event::Peer(PeerEvent::Push(push.clone()))).unwrap(); + ec.send(Event::Peer(PeerEvent::Push(Arc::clone(&push)))) + .unwrap(); } } } @@ -999,3 +798,50 @@ fn handle_underlay_update(update: &UnderlayUpdate, ctx: &HandlerContext) { .imported_underlay_prefixes .store(ctx.ctx.db.imported_count() as u64, Ordering::Relaxed); } + +fn handle_multicast_update(update: &MulticastUpdate, ctx: &HandlerContext) { + let db = &ctx.ctx.db; + let hostname = &ctx.ctx.hostname; + + let mut import = HashSet::new(); + for pv in &update.announce { + // Path-vector RPF: drop if our router_id appears in the path, + // indicating the announcement has already traversed us. + if pv.path.iter().any(|hop| &hop.router_id == hostname) { + dbg!( + ctx.log, + ctx.ctx.config.if_name, + "dropping multicast announce for {:?} - loop detected \ + (path length {})", + pv.origin.overlay_group, + pv.path.len(), + ); + continue; + } + + import.insert(MulticastRoute { + origin: pv.origin.clone(), + nexthop: ctx.peer, + path: pv.path.clone(), + }); + } + + let mut remove = HashSet::new(); + for pv in &update.withdraw { + // Empty path is safe: MulticastRoute's PartialEq/Hash exclude + // the path field, so this matches by (origin, nexthop) only. 
+ remove.insert(MulticastRoute { + origin: pv.origin.clone(), + nexthop: ctx.peer, + path: Vec::new(), + }); + } + + // Atomic import + delete + diff under a single lock. + // + // DDM stores learned multicast state, which feeds back into Omicron, as + // the latter owns OPTE M2P programming via sled-agent (the M2P table is + // global to xde). + // Learned state is queryable via the DDM admin API (get_multicast_groups). + db.update_imported_mcast(&import, &remove); +} diff --git a/ddm/src/lib.rs b/ddm/src/lib.rs index 447109ba..a382d518 100644 --- a/ddm/src/lib.rs +++ b/ddm/src/lib.rs @@ -8,8 +8,20 @@ pub mod discovery; pub mod exchange; pub mod oxstats; pub mod sm; +#[cfg(all(feature = "illumos", target_os = "illumos"))] pub mod sys; -mod util; + +pub const COMPONENT_DDM: &str = "ddm"; +pub const MOD_ADMIN: &str = "admin"; +pub const MOD_EXCHANGE: &str = "exchange"; + +/// Returns `None` if the set is empty, otherwise `Some(s)`. +#[cfg(all(feature = "illumos", target_os = "illumos"))] +pub(crate) fn non_empty( + set: std::collections::HashSet, +) -> Option> { + (!set.is_empty()).then_some(set) +} #[macro_export] macro_rules! err { diff --git a/ddm/src/sm/mod.rs b/ddm/src/sm/mod.rs new file mode 100644 index 00000000..51a2c1bb --- /dev/null +++ b/ddm/src/sm/mod.rs @@ -0,0 +1,195 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! State machine type definitions and the [`StateMachine`] handle. The +//! routing state machine implementation (discovery, solicit, exchange) lives +//! in the [`state`] submodule and is illumos-only, since it programs kernel +//! routes via [`crate::sys`] and reads interface addressing through `libnet`. 
+ +use crate::db::Db; +use crate::discovery::{self, Version}; +use crate::exchange::Update; +use ddm_types::db::RouterKind; +use mg_common::net::{MulticastOrigin, TunnelOrigin}; +use oxnet::Ipv6Net; +use slog::Logger; +use std::collections::HashSet; +use std::net::Ipv6Addr; +use std::sync::atomic::AtomicU64; +use std::sync::mpsc::{Receiver, Sender}; +use std::sync::{Arc, Mutex}; +use thiserror::Error; + +#[cfg(all(feature = "illumos", target_os = "illumos"))] +mod state; + +#[derive(Debug)] +pub enum AdminEvent { + /// Announce a set of IPv6 prefixes + Announce(PrefixSet), + + /// Withdraw a set of IPv6 prefixes + Withdraw(PrefixSet), + + /// Expire the peer at the specified address + Expire(Ipv6Addr), + + /// Synchronize with active peers by pulling their prefixes. + Sync, +} + +#[derive(Debug)] +pub enum PrefixSet { + Underlay(HashSet), + Tunnel(HashSet), + Multicast(HashSet), +} + +#[derive(Debug)] +pub enum PeerEvent { + Push(Arc), +} + +#[derive(Debug)] +pub enum NeighborEvent { + Advertise((Ipv6Addr, Version)), + SolicitFail, + Expire, +} + +#[derive(Debug)] +pub enum Event { + Neighbor(NeighborEvent), + Peer(PeerEvent), + Admin(AdminEvent), +} + +impl From for Event { + fn from(e: NeighborEvent) -> Self { + Self::Neighbor(e) + } +} + +impl From for Event { + fn from(e: PeerEvent) -> Self { + Self::Peer(e) + } +} + +impl From for Event { + fn from(e: AdminEvent) -> Self { + Self::Admin(e) + } +} + +#[derive(Debug)] +pub enum StateType { + Solicit, + Exchange, +} + +#[derive(Debug)] +pub enum EventError { + InvalidEvent(StateType), +} + +#[derive(Debug)] +pub enum EventResponse { + Success, + Prefixes(Vec), +} + +#[derive(Error, Debug)] +pub enum SmError { + #[error("io error: {0}")] + Io(#[from] std::io::Error), + + #[error("discovery error: {0}")] + Discovery(#[from] discovery::DiscoveryError), +} + +#[derive(Clone)] +pub struct Config { + /// Interface this state machine is associated with. 
+ pub if_index: u32, + + /// Interface name this state machine is associated with. + pub if_name: String, + + /// Address object name the state machine uses for peering. Must correspond + /// to IPv6 link local address. + pub aobj_name: String, + + /// Link local Ipv6 address this state machine is associated with + pub addr: Ipv6Addr, + + /// How long to wait between solicitations (milliseconds). + pub solicit_interval: u64, + + /// How often to check for link failure while waiting for discovery messages. + pub discovery_read_timeout: u64, + + /// How long to wait between attempts to get an IP address for a specified + /// address object. + pub ip_addr_wait: u64, + + /// How long to wait without a solicitation response before expiring a peer + /// (milliseconds). + pub expire_threshold: u64, + + /// How long to wait for a response to exchange messages. + pub exchange_timeout: u64, + + /// The kind of router this is, server or transit. + pub kind: RouterKind, + + /// TCP port to use for prefix exchange. 
+ pub exchange_port: u16, + + /// Dendrite dpd config + pub dpd: Option, +} + +#[derive(Clone)] +pub struct DpdConfig { + pub host: String, + pub port: u16, +} + +#[derive(Default)] +pub struct SessionStats { + // Discovery + pub solicitations_sent: AtomicU64, + pub solicitations_received: AtomicU64, + pub advertisements_sent: AtomicU64, + pub advertisements_received: AtomicU64, + pub peer_expirations: AtomicU64, + pub peer_address_changes: AtomicU64, + pub peer_established: AtomicU64, + pub peer_address: Mutex>, + + // Exchange + pub updates_sent: AtomicU64, + pub updates_received: AtomicU64, + pub imported_underlay_prefixes: AtomicU64, + pub imported_tunnel_endpoints: AtomicU64, + pub update_send_fail: AtomicU64, +} + +#[derive(Clone)] +pub struct SmContext { + pub config: Config, + pub db: Db, + pub tx: Sender, + pub event_channels: Vec>, + pub rt: Arc, + pub hostname: String, + pub stats: Arc, + pub log: Logger, +} + +pub struct StateMachine { + pub ctx: SmContext, + pub rx: Option>, +} diff --git a/ddm/src/sm.rs b/ddm/src/sm/state.rs similarity index 71% rename from ddm/src/sm.rs rename to ddm/src/sm/state.rs index 24215795..d6039689 100644 --- a/ddm/src/sm.rs +++ b/ddm/src/sm/state.rs @@ -2,194 +2,32 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use crate::db::Db; -use crate::discovery::Version; -use crate::exchange::{TunnelUpdate, UnderlayUpdate, Update}; +//! Routing state machine implementation. The `Init` -> `Solicit` -> +//! `Exchange` lifecycle drives kernel route programming via [`crate::sys`] +//! and reads interface addressing through `libnet`. This module is +//! illumos-only. 
+ +use super::{ + AdminEvent, Event, NeighborEvent, PeerEvent, PrefixSet, SmContext, SmError, + StateMachine, +}; +use crate::exchange::{MulticastUpdate, TunnelUpdate, UnderlayUpdate, Update}; use crate::{dbg, discovery, err, exchange, inf, wrn}; use ddm_types::db::RouterKind; -use ddm_types::exchange::PathVector; +use ddm_types::exchange::{MulticastPathHop, PathVector}; use libnet::get_ipaddr_info; -use mg_common::net::TunnelOrigin; -use oxnet::Ipv6Net; use slog::Logger; use std::collections::HashSet; -use std::net::{IpAddr, Ipv6Addr}; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; -use std::sync::mpsc::{Receiver, Sender}; -use std::sync::{Arc, Mutex}; -use std::thread::sleep; -use std::thread::spawn; +use std::net::IpAddr; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::mpsc::Receiver; +use std::thread::{sleep, spawn}; use std::time::Duration; -use thiserror::Error; - -#[derive(Debug)] -pub enum AdminEvent { - /// Announce a set of IPv6 prefixes - Announce(PrefixSet), - - /// Withdraw a set of IPv6 prefixes - Withdraw(PrefixSet), - - /// Expire the peer at the specified address - Expire(Ipv6Addr), - - /// Synchronize with active peers by pulling their prefixes. 
- Sync, -} - -#[derive(Debug)] -pub enum PrefixSet { - Underlay(HashSet), - Tunnel(HashSet), -} - -#[derive(Debug)] -pub enum PeerEvent { - Push(Update), -} - -#[derive(Debug)] -pub enum NeighborEvent { - Advertise((Ipv6Addr, Version)), - SolicitFail, - Expire, -} - -#[derive(Debug)] -pub enum Event { - Neighbor(NeighborEvent), - Peer(PeerEvent), - Admin(AdminEvent), -} - -impl From for Event { - fn from(e: NeighborEvent) -> Self { - Self::Neighbor(e) - } -} - -impl From for Event { - fn from(e: PeerEvent) -> Self { - Self::Peer(e) - } -} - -impl From for Event { - fn from(e: AdminEvent) -> Self { - Self::Admin(e) - } -} -#[derive(Debug)] -pub enum StateType { - Solicit, - Exchange, -} - -#[derive(Debug)] -pub enum EventError { - InvalidEvent(StateType), -} - -#[derive(Debug)] -pub enum EventResponse { - Success, - Prefixes(Vec), -} - -#[derive(Error, Debug)] -pub enum SmError { - #[error("io error: {0}")] - Io(#[from] std::io::Error), - - #[error("discovery error: {0}")] - Discovery(#[from] discovery::DiscoveryError), -} - -#[derive(Clone)] -pub struct Config { - /// Interface this state machine is associated with. - pub if_index: u32, - - /// Interface name this state machine is associated with. - pub if_name: String, - - /// Address object name the state machine uses for peering. Must correspond - /// to IPv6 link local address. - pub aobj_name: String, - - /// Link local Ipv6 address this state machine is associated with - pub addr: Ipv6Addr, - - /// How long to wait between solicitations (milliseconds). - pub solicit_interval: u64, - - /// How often to check for link failure while waiting for discovery messges. - pub discovery_read_timeout: u64, - - /// How long to wait between attempts to get an IP address for a specified - /// address object. - pub ip_addr_wait: u64, - - /// How long to wait without a solicitation response before expiring a peer - /// (milliseconds). - pub expire_threshold: u64, - - /// How long to wait for a response to exchange messages. 
- pub exchange_timeout: u64, - - /// The kind of router this is, server or transit. - pub kind: RouterKind, - - /// TCP port to use for prefix exchange. - pub exchange_port: u16, - - /// Dendrite dpd config - pub dpd: Option, -} - -#[derive(Clone)] -pub struct DpdConfig { - pub host: String, - pub port: u16, -} - -#[derive(Default)] -pub struct SessionStats { - // Discovery - pub solicitations_sent: AtomicU64, - pub solicitations_received: AtomicU64, - pub advertisements_sent: AtomicU64, - pub advertisements_received: AtomicU64, - pub peer_expirations: AtomicU64, - pub peer_address_changes: AtomicU64, - pub peer_established: AtomicU64, - pub peer_address: Mutex>, - - // Exchange - pub updates_sent: AtomicU64, - pub updates_received: AtomicU64, - pub imported_underlay_prefixes: AtomicU64, - pub imported_tunnel_endpoints: AtomicU64, - pub update_send_fail: AtomicU64, -} - -#[derive(Clone)] -pub struct SmContext { - pub config: Config, - pub db: Db, - pub tx: Sender, - pub event_channels: Vec>, - pub rt: Arc, - pub hostname: String, - pub stats: Arc, - pub log: Logger, -} - -pub struct StateMachine { - pub ctx: SmContext, - pub rx: Option>, -} +use crate::discovery::Version; +use mg_common::net::TunnelOrigin; +use std::net::Ipv6Addr; impl StateMachine { pub fn run(&mut self) -> Result<(), SmError> { @@ -369,8 +207,8 @@ impl State for Solicit { } } -pub struct Exchange { - pub peer: Ipv6Addr, +struct Exchange { + peer: Ipv6Addr, version: Version, ctx: SmContext, log: Logger, @@ -425,7 +263,7 @@ impl Exchange { ); let interval = 250; // TODO as parameter loop { - match exchange::do_pull( + match exchange::do_pull_v4( &self.ctx, &self.ctx.config.addr, &self.ctx.rt, @@ -455,7 +293,7 @@ impl Exchange { ) { exchange_thread.abort(); self.ctx.db.remove_peer(self.ctx.config.if_index); - let (to_remove, to_remove_tnl) = + let (to_remove, to_remove_tnl, to_remove_mcast) = self.ctx.db.remove_nexthop_routes(self.peer); let mut routes: Vec = Vec::new(); for x in &to_remove { @@ 
-492,12 +330,9 @@ impl Exchange { self.ctx.event_channels.len() ); - let underlay = if to_remove.is_empty() { - None - } else { - Some(UnderlayUpdate::withdraw( - to_remove - .iter() + let underlay = crate::non_empty(to_remove).map(|set| { + UnderlayUpdate::withdraw( + set.iter() .map(|x| PathVector { destination: x.destination, path: { @@ -507,20 +342,39 @@ impl Exchange { }, }) .collect(), - )) - }; - - let tunnel = if to_remove_tnl.is_empty() { - None - } else { - Some(TunnelUpdate::withdraw( - to_remove_tnl.iter().cloned().map(Into::into).collect(), - )) - }; - - let push = Update { underlay, tunnel }; + ) + }); + + let tunnel = crate::non_empty(to_remove_tnl).map(|set| { + TunnelUpdate::withdraw( + set.iter().cloned().map(Into::into).collect(), + ) + }); + + // Build multicast withdrawal with our hop info. + let multicast = crate::non_empty(to_remove_mcast).map(|set| { + let hop = MulticastPathHop::new( + self.ctx.hostname.clone(), + self.ctx.config.addr, + ); + MulticastUpdate::withdraw( + set.iter() + .map(|route| ddm_types::exchange::MulticastPathVector { + origin: route.origin.clone(), + path: vec![hop.clone()], + }) + .collect(), + ) + }); + + let push = Arc::new(Update { + underlay, + tunnel, + multicast, + }); for ec in &self.ctx.event_channels { - ec.send(Event::Peer(PeerEvent::Push(push.clone()))).unwrap(); + ec.send(Event::Peer(PeerEvent::Push(Arc::clone(&push)))) + .unwrap(); } } pull_stop.store(true, Ordering::Relaxed); @@ -728,6 +582,104 @@ impl State for Exchange { ); } } + Event::Admin(AdminEvent::Announce(PrefixSet::Multicast( + groups, + ))) => { + // Convert `MulticastOrigin` to `MulticastPathVector` with + // our hop info + let hop = MulticastPathHop::new( + self.ctx.hostname.clone(), + self.ctx.config.addr, + ); + let pvs: HashSet<_> = groups + .iter() + .map(|origin| { + ddm_types::exchange::MulticastPathVector { + origin: origin.clone(), + path: vec![hop.clone()], + } + }) + .collect(); + + if let Err(e) = 
crate::exchange::announce_multicast( + &self.ctx, + self.ctx.config.clone(), + pvs, + self.peer, + self.version, + self.ctx.rt.clone(), + self.log.clone(), + ) { + err!( + self.log, + self.ctx.config.if_name, + "announce multicast: {}", + e, + ); + wrn!( + self.log, + self.ctx.config.if_name, + "expiring peer {} due to failed multicast announce", + self.peer, + ); + self.expire_peer(&exchange_thread, &pull_stop); + return ( + Box::new(Solicit::new( + self.ctx.clone(), + self.log.clone(), + )), + event, + ); + } + } + Event::Admin(AdminEvent::Withdraw(PrefixSet::Multicast( + groups, + ))) => { + // Convert MulticastOrigin to MulticastPathVector for withdrawal + let hop = MulticastPathHop::new( + self.ctx.hostname.clone(), + self.ctx.config.addr, + ); + let pvs: HashSet<_> = groups + .iter() + .map(|origin| { + ddm_types::exchange::MulticastPathVector { + origin: origin.clone(), + path: vec![hop.clone()], + } + }) + .collect(); + + if let Err(e) = crate::exchange::withdraw_multicast( + &self.ctx, + self.ctx.config.clone(), + pvs, + self.peer, + self.version, + self.ctx.rt.clone(), + self.log.clone(), + ) { + err!( + self.log, + self.ctx.config.if_name, + "withdraw multicast: {e}", + ); + wrn!( + self.log, + self.ctx.config.if_name, + "expiring peer {} due to failed multicast withdraw", + self.peer, + ); + self.expire_peer(&exchange_thread, &pull_stop); + return ( + Box::new(Solicit::new( + self.ctx.clone(), + self.log.clone(), + )), + event, + ); + } + } Event::Admin(AdminEvent::Expire(peer)) => { if self.peer == peer { inf!( @@ -770,6 +722,8 @@ impl State for Exchange { self.peer, update, ); + let update = Arc::try_unwrap(update) + .unwrap_or_else(|arc| (*arc).clone()); if let Some(push) = update.underlay { if !push.announce.is_empty() && let Err(e) = crate::exchange::announce_underlay( @@ -817,8 +771,7 @@ impl State for Exchange { err!( self.log, self.ctx.config.if_name, - "withdraw: {}", - e, + "withdraw: {e}", ); wrn!( self.log, @@ -836,6 +789,71 @@ impl State 
for Exchange { ); } } + // Handle multicast redistribution + if let Some(push) = update.multicast { + if !push.announce.is_empty() + && let Err(e) = crate::exchange::announce_multicast( + &self.ctx, + self.ctx.config.clone(), + push.announce, + self.peer, + self.version, + self.ctx.rt.clone(), + self.log.clone(), + ) + { + err!( + self.log, + self.ctx.config.if_name, + "announce multicast: {e}", + ); + wrn!( + self.log, + self.ctx.config.if_name, + "expiring peer {} due to failed multicast announce", + self.peer, + ); + self.expire_peer(&exchange_thread, &pull_stop); + return ( + Box::new(Solicit::new( + self.ctx.clone(), + self.log.clone(), + )), + event, + ); + } + if !push.withdraw.is_empty() + && let Err(e) = crate::exchange::withdraw_multicast( + &self.ctx, + self.ctx.config.clone(), + push.withdraw, + self.peer, + self.version, + self.ctx.rt.clone(), + self.log.clone(), + ) + { + err!( + self.log, + self.ctx.config.if_name, + "withdraw multicast: {e}", + ); + wrn!( + self.log, + self.ctx.config.if_name, + "expiring peer {} due to failed multicast withdraw", + self.peer, + ); + self.expire_peer(&exchange_thread, &pull_stop); + return ( + Box::new(Solicit::new( + self.ctx.clone(), + self.log.clone(), + )), + event, + ); + } + } } Event::Neighbor(NeighborEvent::Expire) => { wrn!( diff --git a/ddm/src/util.rs b/ddm/src/util.rs deleted file mode 100644 index f3a96c03..00000000 --- a/ddm/src/util.rs +++ /dev/null @@ -1,14 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -use std::mem::MaybeUninit; - -//TODO trade for `MaybeUninit::slice_assume_init_ref` when it becomes available -//in stable Rust. 
-#[inline(always)] -pub(crate) const unsafe fn u8_slice_assume_init_ref( - slice: &[MaybeUninit], -) -> &[u8] { - unsafe { &*(slice as *const [MaybeUninit] as *const [u8]) } -} diff --git a/ddmadm/src/main.rs b/ddmadm/src/main.rs index 800315d8..08ffe515 100644 --- a/ddmadm/src/main.rs +++ b/ddmadm/src/main.rs @@ -60,6 +60,18 @@ enum SubCommand { /// Withdraw prefixes from a DDM router. TunnelWithdraw(TunnelEndpoint), + /// Get multicast groups imported from DDM peers. + MulticastImported, + + /// Get locally originated multicast groups. + MulticastOriginated, + + /// Advertise multicast groups from this router. + MulticastAdvertise(MulticastGroup), + + /// Withdraw multicast groups from this router. + MulticastWithdraw(MulticastGroup), + /// Sync prefix information from peers. Sync, } @@ -84,6 +96,29 @@ struct TunnelEndpoint { pub metric: u64, } +#[derive(Debug, Parser)] +struct MulticastGroup { + /// Overlay multicast group address (e.g. 233.252.0.1 or ff0e::1). + #[arg(short = 'g', long)] + pub overlay_group: IpAddr, + + /// Underlay multicast address (ff04::/64 admin-local scope). + #[arg(short = 'u', long)] + pub underlay_group: Ipv6Addr, + + /// Virtual Network Identifier. + #[arg(short, long)] + pub vni: u32, + + /// Path metric. + #[arg(short, long, default_value_t = 0)] + pub metric: u64, + + /// Source address for (S,G) routes (omit for (*,G)). 
+ #[arg(short, long)] + pub source: Option, +} + #[derive(Debug, Parser)] struct Peer { addr: Ipv6Addr, @@ -242,6 +277,107 @@ async fn run() -> Result<()> { }]) .await?; } + SubCommand::MulticastImported => { + let msg = client.get_multicast_groups().await?; + let mut routes: Vec<_> = msg.into_inner().into_iter().collect(); + routes.sort_by(|a, b| { + a.origin + .overlay_group + .cmp(&b.origin.overlay_group) + .then_with(|| a.origin.source.cmp(&b.origin.source)) + }); + let mut tw = TabWriter::new(stdout()); + writeln!( + &mut tw, + "{}\t{}\t{}\t{}\t{}\t{}", + "Overlay Group".dimmed(), + "Underlay Group".dimmed(), + "VNI".dimmed(), + "Metric".dimmed(), + "Source".dimmed(), + "Path".dimmed(), + )?; + for route in &routes { + let source = match &route.origin.source { + Some(s) => s.to_string(), + None => "(*,G)".to_string(), + }; + let path: Vec<_> = route + .path + .iter() + .rev() + .map(|h| h.router_id.clone()) + .collect(); + writeln!( + &mut tw, + "{}\t{}\t{}\t{}\t{}\t{}", + route.origin.overlay_group, + route.origin.underlay_group, + route.origin.vni, + route.origin.metric, + source, + path.join(" "), + )?; + } + tw.flush()?; + } + SubCommand::MulticastOriginated => { + let msg = client.get_originated_multicast_groups().await?; + let mut origins: Vec<_> = msg.into_inner().into_iter().collect(); + origins.sort_by(|a, b| { + a.overlay_group + .cmp(&b.overlay_group) + .then_with(|| a.source.cmp(&b.source)) + }); + let mut tw = TabWriter::new(stdout()); + writeln!( + &mut tw, + "{}\t{}\t{}\t{}\t{}", + "Overlay Group".dimmed(), + "Underlay Group".dimmed(), + "VNI".dimmed(), + "Metric".dimmed(), + "Source".dimmed(), + )?; + for origin in &origins { + let source = match &origin.source { + Some(s) => s.to_string(), + None => "(*,G)".to_string(), + }; + writeln!( + &mut tw, + "{}\t{}\t{}\t{}\t{}", + origin.overlay_group, + origin.underlay_group, + origin.vni, + origin.metric, + source, + )?; + } + tw.flush()?; + } + SubCommand::MulticastAdvertise(mg) => { + client + 
.advertise_multicast_groups(&vec![types::MulticastOrigin { + overlay_group: mg.overlay_group, + underlay_group: mg.underlay_group, + vni: types::Vni(mg.vni), + metric: mg.metric, + source: mg.source, + }]) + .await?; + } + SubCommand::MulticastWithdraw(mg) => { + client + .withdraw_multicast_groups(&vec![types::MulticastOrigin { + overlay_group: mg.overlay_group, + underlay_group: mg.underlay_group, + vni: types::Vni(mg.vni), + metric: mg.metric, + source: mg.source, + }]) + .await?; + } SubCommand::Sync => { client.sync().await?; } diff --git a/ddmd/Cargo.toml b/ddmd/Cargo.toml index 5ec0804b..a0f327c6 100644 --- a/ddmd/Cargo.toml +++ b/ddmd/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2024" [dependencies] -ddm = { path = "../ddm" } +ddm = { path = "../ddm", default-features = false } mg-common = { path = "../mg-common" } anyhow.workspace = true clap.workspace = true @@ -19,3 +19,7 @@ dpd-client.workspace = true anstyle.workspace = true uuid.workspace = true smf.workspace = true + +[features] +default = ["illumos"] +illumos = ["ddm/illumos"] diff --git a/ddmd/src/main.rs b/ddmd/src/main.rs index d5db3f3c..71dd8440 100644 --- a/ddmd/src/main.rs +++ b/ddmd/src/main.rs @@ -6,11 +6,13 @@ use clap::Parser; use ddm::admin::{HandlerContext, RouterStats}; use ddm::db::Db; use ddm::sm::{DpdConfig, SmContext, StateMachine}; +#[cfg(all(feature = "illumos", target_os = "illumos"))] use ddm::sys::Route; use ddm_types::db::RouterKind; use signal::handle_signals; -use slog::{Drain, Logger, error}; +use slog::{Drain, Logger, error, warn}; use std::net::{IpAddr, Ipv6Addr}; +#[cfg(all(feature = "illumos", target_os = "illumos"))] use std::sync::mpsc::channel; use std::sync::{Arc, Mutex}; use uuid::Uuid; @@ -100,6 +102,15 @@ struct Arg { /// Id of the sled this router is running on. #[arg(long)] sled_uuid: Option, + + /// Skip the routing state machine (discovery, exchange, route + /// synchronization). 
Only the admin API server runs, allowing test + /// fixtures to obtain a real ddmd admin endpoint without the kernel-level + /// networking the state machine requires. + /// + /// Analogous to `mgd --no-bgp-dispatcher`. + #[arg(long, default_value_t = false)] + no_state_machine: bool, } #[derive(Debug, Parser, Clone)] @@ -121,11 +132,8 @@ async fn run() { .await .expect("set up refresh signal handler"); - let mut event_channels = Vec::new(); let db = Db::new(&format!("{}/ddmdb", arg.data_dir), log.clone()).unwrap(); - let mut sms = Vec::new(); - let dpd = match arg.dendrite { true => Some(DpdConfig { host: arg.dpd_host.clone(), @@ -140,54 +148,10 @@ async fn run() { .to_string_lossy() .to_string(); - for name in arg.addresses { - let (tx, rx) = channel(); - let config = ddm::sm::Config { - solicit_interval: arg.solicit_interval, - expire_threshold: arg.expire_threshold, - discovery_read_timeout: arg.discovery_read_timeout, - ip_addr_wait: arg.ip_addr_wait, - exchange_timeout: arg.exchange_timeout, - exchange_port: arg.exchange_port, - aobj_name: name.clone(), - if_name: String::new(), - if_index: 0, - kind: arg.kind, - dpd: dpd.clone(), - addr: Ipv6Addr::UNSPECIFIED, - }; - let ctx = SmContext { - config, - db: db.clone(), - event_channels: Vec::new(), - tx: tx.clone(), - log: log.clone(), - hostname: hostname.clone(), - rt: rt.clone(), - stats: Arc::new(ddm::sm::SessionStats::default()), - }; - let sm = StateMachine { ctx, rx: Some(rx) }; - sms.push(sm); - event_channels.push(tx); - } - - // Add an event channel sender for each state machine to every other state - // machine. - for (i, sm) in sms.iter_mut().enumerate() { - for (j, e) in event_channels.iter().enumerate() { - // dont give a state machine an event sender to itself. 
- if i == j { - continue; - } - sm.ctx.event_channels.push(e.clone()); - } - } + let (sms, event_channels) = + start_state_machines(&arg, &db, &dpd, &hostname, &rt, &log); - for sm in &mut sms { - sm.run().unwrap(); - } - - termination_handler(db.clone(), dpd, rt, log.clone()); + termination_handler(db.clone(), dpd.clone(), rt.clone(), log.clone()); let router_stats = Arc::new(RouterStats::default()); let peers: Vec = sms.iter().map(|x| x.ctx.clone()).collect(); @@ -237,6 +201,128 @@ async fn run() { std::thread::park(); } +/// Build, wire, and start the per-address routing state machines. +/// +/// Returns the running [`StateMachine`] handles plus the sender side of each +/// machine's event channel. When `--no-state-machine` is set the function +/// short-circuits to empty vectors, leaving the daemon to serve only its +/// admin API. The illumos and non-illumos variants share that early-exit +/// branch; only the actual machine setup is platform-specific. +#[cfg(all(feature = "illumos", target_os = "illumos"))] +fn start_state_machines( + arg: &Arg, + db: &Db, + dpd: &Option, + hostname: &str, + rt: &Arc, + log: &Logger, +) -> ( + Vec, + Vec>, +) { + if arg.no_state_machine { + if !arg.addresses.is_empty() { + warn!( + log, + "--no-state-machine set; ignoring {} --addr value(s)", + arg.addresses.len(), + ); + } + return (Vec::new(), Vec::new()); + } + + let mut sms = Vec::new(); + let mut event_channels = Vec::new(); + + for name in &arg.addresses { + let (tx, rx) = channel(); + + let config = ddm::sm::Config { + solicit_interval: arg.solicit_interval, + expire_threshold: arg.expire_threshold, + discovery_read_timeout: arg.discovery_read_timeout, + ip_addr_wait: arg.ip_addr_wait, + exchange_timeout: arg.exchange_timeout, + exchange_port: arg.exchange_port, + aobj_name: name.clone(), + if_name: String::new(), + if_index: 0, + kind: arg.kind, + dpd: dpd.clone(), + addr: Ipv6Addr::UNSPECIFIED, + }; + + let ctx = SmContext { + config, + db: db.clone(), + 
event_channels: Vec::new(), + tx: tx.clone(), + log: log.clone(), + hostname: hostname.to_string(), + rt: rt.clone(), + stats: Arc::new(ddm::sm::SessionStats::default()), + }; + + let sm = StateMachine { ctx, rx: Some(rx) }; + sms.push(sm); + event_channels.push(tx); + } + + // Add an event channel sender for each state machine to every other state + // machine. + for (i, sm) in sms.iter_mut().enumerate() { + for (j, e) in event_channels.iter().enumerate() { + // dont give a state machine an event sender to itself. + if i == j { + continue; + } + sm.ctx.event_channels.push(e.clone()); + } + } + + for sm in &mut sms { + sm.run().unwrap(); + } + + (sms, event_channels) +} + +/// Non-illumos variant: the routing state machine depends on illumos +/// kernel networking, so on every other platform the function logs a warning +/// and returns empty vectors. Test fixtures should pass `--no-state-machine` +/// to silence the warning. +#[cfg(not(all(feature = "illumos", target_os = "illumos")))] +fn start_state_machines( + arg: &Arg, + _db: &Db, + _dpd: &Option, + _hostname: &str, + _rt: &Arc, + log: &Logger, +) -> ( + Vec, + Vec>, +) { + if !arg.no_state_machine { + warn!( + log, + "routing state machine is not available on non-illumos builds; \ + behaving as if `--no-state-machine` were set", + ); + } + if !arg.addresses.is_empty() { + warn!( + log, + "--no-state-machine set; ignoring {} --addr value(s)", + arg.addresses.len(), + ); + } + (Vec::new(), Vec::new()) +} + +/// Install a Ctrl-C handler that withdraws ddmd's imported routes from the +/// kernel before exiting. illumos-only. +#[cfg(all(feature = "illumos", target_os = "illumos"))] fn termination_handler( db: Db, dendrite: Option, @@ -271,6 +357,24 @@ fn termination_handler( }); } +/// Non-illumos variant: there are no kernel routes to withdraw on these +/// platforms, so the handler installs a Ctrl-C task that just exits cleanly. 
+#[cfg(not(all(feature = "illumos", target_os = "illumos")))] +fn termination_handler( + _db: Db, + _dendrite: Option, + _rt: Arc, + _log: Logger, +) { + tokio::spawn(async { + tokio::signal::ctrl_c() + .await + .expect("error setting termination handler"); + const SIGTERM_EXIT: i32 = 130; + std::process::exit(SIGTERM_EXIT); + }); +} + pub(crate) fn init_logger() -> Logger { let drain = slog_bunyan::new(std::io::stdout()).build().fuse(); let drain = slog_async::Async::new(drain) diff --git a/mg-common/Cargo.toml b/mg-common/Cargo.toml index 87a30308..657d5556 100644 --- a/mg-common/Cargo.toml +++ b/mg-common/Cargo.toml @@ -12,6 +12,7 @@ schemars.workspace = true slog.workspace = true slog-bunyan.workspace = true slog-async.workspace = true +thiserror.workspace = true oximeter-producer.workspace = true oximeter.workspace = true oxnet.workspace = true @@ -19,12 +20,16 @@ backoff.workspace = true smf.workspace = true uuid.workspace = true libc.workspace = true +omicron-common.workspace = true # We need this on illumos, but must omit it on other platforms [target.'cfg(target_os = "illumos")'.dependencies.libnet] workspace = true optional = true +[dev-dependencies] +serde_json.workspace = true + [features] default = ["libnet"] libnet = ["dep:libnet"] diff --git a/mg-common/src/net.rs b/mg-common/src/net.rs index f1784afe..8aaa69b7 100644 --- a/mg-common/src/net.rs +++ b/mg-common/src/net.rs @@ -2,10 +2,114 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +// Re-export so consumers of MulticastOrigin.vni don't need a direct +// omicron_common dependency. 
+pub use omicron_common::api::external::Vni; + +use omicron_common::address::UNDERLAY_MULTICAST_SUBNET; use oxnet::{IpNet, Ipv4Net, Ipv6Net}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use std::net::{Ipv4Addr, Ipv6Addr}; +use std::fmt; +use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; +use std::str::FromStr; +use thiserror::Error; + +fn default_multicast_vni() -> Vni { + Vni::DEFAULT_MULTICAST_VNI +} + +/// Error constructing an [`UnderlayMulticastIpv6`] address. +#[derive(Debug, Clone, Error)] +pub enum UnderlayMulticastError { + /// The address is not within the underlay multicast subnet (ff04::/64). + #[error( + "underlay address {addr} is not within {UNDERLAY_MULTICAST_SUBNET}" + )] + NotInSubnet { addr: Ipv6Addr }, + + /// The string could not be parsed as an IPv6 address. + #[error("invalid IPv6 address: {0}")] + InvalidIpv6(#[from] std::net::AddrParseError), +} + +/// A validated underlay multicast IPv6 address within ff04::/64. +/// +/// The Oxide rack maps overlay multicast groups 1:1 to admin-local scoped +/// IPv6 multicast addresses in `UNDERLAY_MULTICAST_SUBNET` (ff04::/64). +/// This type enforces that invariant at construction time. +#[derive( + Debug, + Copy, + Clone, + Eq, + PartialEq, + PartialOrd, + Ord, + Hash, + Serialize, + Deserialize, + JsonSchema, +)] +#[serde(try_from = "Ipv6Addr", into = "Ipv6Addr")] +#[schemars(transparent)] +pub struct UnderlayMulticastIpv6(Ipv6Addr); + +impl UnderlayMulticastIpv6 { + /// Create a new validated underlay multicast address. + /// + /// # Errors + /// + /// Returns [`UnderlayMulticastError::NotInSubnet`] if the address is + /// not within ff04::/64. + pub fn new(value: Ipv6Addr) -> Result { + if !UNDERLAY_MULTICAST_SUBNET.contains(value) { + return Err(UnderlayMulticastError::NotInSubnet { addr: value }); + } + Ok(Self(value)) + } + + /// Returns the underlying IPv6 address. 
+ #[inline] + pub const fn ip(&self) -> Ipv6Addr { + self.0 + } +} + +impl fmt::Display for UnderlayMulticastIpv6 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +impl TryFrom for UnderlayMulticastIpv6 { + type Error = UnderlayMulticastError; + + fn try_from(value: Ipv6Addr) -> Result { + Self::new(value) + } +} + +impl From for Ipv6Addr { + fn from(addr: UnderlayMulticastIpv6) -> Self { + addr.0 + } +} + +impl From for IpAddr { + fn from(addr: UnderlayMulticastIpv6) -> Self { + IpAddr::V6(addr.0) + } +} + +impl FromStr for UnderlayMulticastIpv6 { + type Err = UnderlayMulticastError; + + fn from_str(s: &str) -> Result { + let addr: Ipv6Addr = s.parse()?; + Self::new(addr) + } +} #[derive( Debug, Copy, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema, @@ -93,3 +197,134 @@ pub enum IpPrefix { V4(Ipv4Prefix), V6(Ipv6Prefix), } + +/// Origin information for a multicast group announcement. +/// +/// This is analogous to TunnelOrigin but for multicast groups. +/// +/// This represents a subscription to a multicast group that should be +/// advertised via DDM. The overlay_group is the application-visible multicast +/// address (e.g., 233.252.0.1 or ff0e::1), while underlay_group is the mapped +/// admin-local scoped IPv6 address (ff04::X) used in the underlay network. +#[derive(Debug, Clone, Eq, Serialize, Deserialize, JsonSchema)] +pub struct MulticastOrigin { + /// The overlay multicast group address (IPv4 or IPv6). + /// This is the group address visible to applications. + pub overlay_group: IpAddr, + + /// The underlay multicast group address (ff04::X). + /// Validated at construction to be within ff04::/64. + pub underlay_group: UnderlayMulticastIpv6, + + /// VNI for this multicast group (identifies the VPC/network context). + #[serde(default = "default_multicast_vni")] + pub vni: Vni, + + /// Metric for path selection (lower is better). + /// + /// Used for multi-rack replication optimization. 
+ /// Excluded from identity (Hash/Eq) so that metric changes update + /// an existing entry rather than creating a duplicate. + #[serde(default)] + pub metric: u64, + + /// Optional source address for Source-Specific Multicast (S,G) routes. + /// None for Any-Source Multicast (*,G) routes. + #[serde(default)] + pub source: Option, +} + +// Equality and hashing consider only the identity fields (overlay_group, +// underlay_group, vni, source), not metric. This allows metric updates to +// replace existing entries in HashSet-based collections without creating +// duplicates. This type is not used in ordered collections (BTreeSet). +// See #649 for why adding Ord here would require more care. +impl PartialEq for MulticastOrigin { + fn eq(&self, other: &Self) -> bool { + self.overlay_group == other.overlay_group + && self.underlay_group == other.underlay_group + && self.vni == other.vni + && self.source == other.source + } +} + +impl std::hash::Hash for MulticastOrigin { + fn hash(&self, state: &mut H) { + self.overlay_group.hash(state); + self.underlay_group.hash(state); + self.vni.hash(state); + self.source.hash(state); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn underlay_valid_ff04() { + let addr = Ipv6Addr::new(0xff04, 0, 0, 0, 0, 0, 0, 1); + assert!(UnderlayMulticastIpv6::new(addr).is_ok()); + } + + #[test] + fn underlay_rejects_non_admin_local() { + // ff0e:: is global scope, not admin-local + let addr = Ipv6Addr::new(0xff0e, 0, 0, 0, 0, 0, 0, 1); + assert!(UnderlayMulticastIpv6::new(addr).is_err()); + } + + #[test] + fn underlay_rejects_unicast() { + let addr = Ipv6Addr::new(0x2001, 0xdb8, 0, 0, 0, 0, 0, 1); + assert!(UnderlayMulticastIpv6::new(addr).is_err()); + } + + #[test] + fn underlay_serde_round_trip() { + let addr = UnderlayMulticastIpv6::new(Ipv6Addr::new( + 0xff04, 0, 0, 0, 0, 0, 0, 42, + )) + .unwrap(); + let json = serde_json::to_string(&addr).unwrap(); + let back: UnderlayMulticastIpv6 = serde_json::from_str(&json).unwrap(); 
+ assert_eq!(addr, back); + } + + #[test] + fn underlay_serde_rejects_invalid() { + // ff0e::1 serialized as an Ipv6Addr, then deserialized as + // UnderlayMulticastIpv6 should fail via try_from. + let json = + serde_json::to_string(&Ipv6Addr::new(0xff0e, 0, 0, 0, 0, 0, 0, 1)) + .unwrap(); + let result: Result = + serde_json::from_str(&json); + assert!(result.is_err()); + } + + #[test] + fn multicast_origin_rejects_bad_underlay() { + let json = serde_json::json!({ + "overlay_group": "233.252.0.1", + "underlay_group": "ff0e::1", + "vni": 77 + }); + let result: Result = serde_json::from_value(json); + assert!(result.is_err()); + } + + #[test] + fn multicast_origin_accepts_valid() { + let json = serde_json::json!({ + "overlay_group": "233.252.0.1", + "underlay_group": "ff04::1", + "vni": 77 + }); + let origin: MulticastOrigin = serde_json::from_value(json).unwrap(); + assert_eq!( + origin.underlay_group.ip(), + Ipv6Addr::new(0xff04, 0, 0, 0, 0, 0, 0, 1), + ); + } +} diff --git a/mg-lower/src/ddm.rs b/mg-lower/src/ddm.rs index ac7d9708..2f987477 100644 --- a/mg-lower/src/ddm.rs +++ b/mg-lower/src/ddm.rs @@ -5,7 +5,7 @@ use crate::log::ddm_log; #[cfg(target_os = "illumos")] use ddm_admin_client::Client; -use ddm_admin_client::types::TunnelOrigin; +use ddm_admin_client::types::{MulticastOrigin, TunnelOrigin}; use oxnet::Ipv6Net; use slog::Logger; use std::{net::Ipv6Addr, sync::Arc}; @@ -111,3 +111,57 @@ pub(crate) fn remove_tunnel_routes<'a, I: Iterator>( pub fn new_ddm_client(log: &Logger) -> Client { Client::new("http://localhost:8000", log.clone()) } + +pub(crate) fn add_multicast_routes< + 'a, + I: Iterator, +>( + client: &impl Ddm, + routes: I, + rt: &Arc, + log: &Logger, +) { + let routes: Vec = routes.cloned().collect(); + if routes.is_empty() { + return; + } + let resp = + rt.block_on(async { client.advertise_multicast_groups(&routes).await }); + if let Err(e) = resp { + ddm_log!(log, + error, + "advertise multicast groups error: {e}"; + "error" => 
format!("{e}"), + "groups" => format!("{routes:#?}") + ); + } +} + +pub(crate) fn remove_multicast_routes< + 'a, + I: Iterator, +>( + client: &impl Ddm, + routes: I, + rt: &Arc, + log: &Logger, +) { + let routes: Vec = routes.cloned().collect(); + if routes.is_empty() { + return; + } + let resp = + rt.block_on(async { client.withdraw_multicast_groups(&routes).await }); + match resp { + Err(e) => ddm_log!(log, + error, + "withdraw multicast groups error: {e}"; + "groups" => format!("{routes:#?}") + ), + Ok(_) => ddm_log!(log, + debug, + "withdrew multicast groups"; + "groups" => format!("{routes:#?}") + ), + } +} diff --git a/mg-lower/src/lib.rs b/mg-lower/src/lib.rs index bcbf5b57..c1b84595 100644 --- a/mg-lower/src/lib.rs +++ b/mg-lower/src/lib.rs @@ -39,6 +39,7 @@ mod ddm; mod dendrite; mod error; mod log; +pub mod mrib; mod platform; #[cfg(test)] diff --git a/mg-lower/src/mrib.rs b/mg-lower/src/mrib.rs new file mode 100644 index 00000000..803587f0 --- /dev/null +++ b/mg-lower/src/mrib.rs @@ -0,0 +1,214 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! MRIB (Multicast Routing Information Base) synchronization to DDM. +//! +//! This module watches for MRIB changes and propagates multicast group +//! subscriptions to DDM for distribution across the underlay network. +//! +//! ## Data Flow +//! +//! ```text +//! MRIB (loc_mrib changes) +//! | +//! v [MribChangeNotification] +//! mg-lower/mrib.rs +//! | +//! v [MulticastOrigin] +//! DDM admin API +//! | +//! v [DDM exchange protocol] +//! Other sleds/racks +//! 
``` + +use crate::ddm::{ + add_multicast_routes, new_ddm_client, remove_multicast_routes, +}; +use crate::platform::{Ddm, ProductionDdm}; +use ddm_admin_client::types::MulticastOrigin; +use mg_common::net::Vni; +use rdb::Mrib; +use rdb::types::{MribChangeNotification, MulticastAddr, MulticastRoute}; +use slog::{Logger, debug, error, info}; +use std::collections::HashSet; +use std::sync::Arc; +use std::sync::mpsc::{RecvTimeoutError, channel}; +use std::thread::sleep; +use std::time::Duration; + +const MG_LOWER_MRIB_TAG: &str = "mg-lower-mrib"; + +/// Convert an MRIB [`MulticastRoute`] to a DDM [`MulticastOrigin`]. +/// +/// [`MulticastOrigin`]: ddm_admin_client::types::MulticastOrigin +fn ddm_origin(route: &MulticastRoute) -> MulticastOrigin { + MulticastOrigin { + overlay_group: route.key.group().ip(), + underlay_group: route.underlay_group.ip(), + vni: ddm_admin_client::types::Vni(route.key.vni().as_u32()), + metric: 0, + source: route.key.source(), + } +} + +/// Run the MRIB synchronization loop. +/// +/// This function loops forever, watching for MRIB changes and synchronizing +/// them to DDM. It runs on the calling thread. 
+pub fn run(mrib: Mrib, log: Logger, rt: Arc) { + loop { + let (tx, rx) = channel(); + + // Register as MRIB watcher + mrib.watch(MG_LOWER_MRIB_TAG.into(), tx); + + let ddm = ProductionDdm { + client: new_ddm_client(&log), + }; + + // Initial full sync + if let Err(e) = full_sync(&mrib, &ddm, &log, &rt) { + error!(log, "MRIB full sync failed: {e}"); + info!(log, "restarting MRIB sync loop in one second"); + sleep(Duration::from_secs(1)); + continue; + } + + // Handle incremental changes + loop { + match rx.recv_timeout(Duration::from_secs(10)) { + Ok(notification) => { + if let Err(e) = + handle_change(&mrib, notification, &ddm, &log, &rt) + { + error!(log, "MRIB change handling failed: {e}"); + } + } + Err(RecvTimeoutError::Timeout) => { + // Periodic full sync to catch any missed changes + if let Err(e) = full_sync(&mrib, &ddm, &log, &rt) { + error!(log, "MRIB periodic sync failed: {e}"); + } + } + Err(RecvTimeoutError::Disconnected) => { + error!(log, "MRIB watcher disconnected"); + break; + } + } + } + } +} + +/// Perform a full synchronization of MRIB to DDM. +/// +/// This compares the current MRIB loc_mrib with what DDM has advertised +/// and reconciles any differences. +pub(crate) fn full_sync( + mrib: &Mrib, + ddm: &D, + log: &Logger, + rt: &Arc, +) -> Result<(), String> { + // Get current MRIB state (installed/selected routes) + let mrib_routes = mrib.loc_mrib(); + + // Convert to DDM MulticastOrigin set + let mrib_origins: HashSet = + mrib_routes.values().map(ddm_origin).collect(); + + // Get current DDM advertised state + let ddm_current: HashSet = rt + .block_on(async { ddm.get_originated_multicast_groups().await }) + .map_err(|e| format!("failed to get DDM multicast groups: {e}"))? 
+ .into_inner() + .into_iter() + .collect(); + + // Compute diff + let to_add: Vec<_> = mrib_origins.difference(&ddm_current).collect(); + let to_remove: Vec<_> = ddm_current.difference(&mrib_origins).collect(); + + if !to_add.is_empty() { + info!( + log, + "MRIB sync: adding {} multicast groups to DDM", + to_add.len() + ); + add_multicast_routes(ddm, to_add.into_iter(), rt, log); + } + + if !to_remove.is_empty() { + info!( + log, + "MRIB sync: removing {} multicast groups from DDM", + to_remove.len() + ); + remove_multicast_routes(ddm, to_remove.into_iter(), rt, log); + } + + Ok(()) +} + +/// Handle an incremental MRIB change notification. +fn handle_change( + mrib: &Mrib, + notification: MribChangeNotification, + ddm: &D, + log: &Logger, + rt: &Arc, +) -> Result<(), String> { + // Get current DDM state for comparison + let ddm_current: HashSet = rt + .block_on(async { ddm.get_originated_multicast_groups().await }) + .map_err(|e| format!("failed to get DDM multicast groups: {e}"))? + .into_inner() + .into_iter() + .collect(); + + let mut to_add = Vec::new(); + let mut to_remove = Vec::new(); + + for key in notification.changed { + // Check if route exists in loc_mrib (installed) + if let Some(route) = mrib.get_selected_route(&key) { + let origin = ddm_origin(&route); + if !ddm_current.contains(&origin) { + to_add.push(origin); + } + } else { + // Route was removed from loc_mrib, so we need to find matching DDM + // origin. 
We check all DDM origins to find any that match this key + for ddm_origin in &ddm_current { + // Reconstruct the key from the DDM origin to compare + if let Ok(overlay_group) = + MulticastAddr::try_from(ddm_origin.overlay_group) + && let Ok(ddm_key) = rdb::types::MulticastRouteKey::new( + ddm_origin.source, + overlay_group, + Vni::DEFAULT_MULTICAST_VNI, + ) + && ddm_key == key + { + to_remove.push(ddm_origin.clone()); + } + } + } + } + + if !to_add.is_empty() { + debug!(log, "MRIB change: adding {} multicast groups", to_add.len()); + add_multicast_routes(ddm, to_add.iter(), rt, log); + } + + if !to_remove.is_empty() { + debug!( + log, + "MRIB change: removing {} multicast groups", + to_remove.len() + ); + remove_multicast_routes(ddm, to_remove.iter(), rt, log); + } + + Ok(()) +} diff --git a/mg-lower/src/platform.rs b/mg-lower/src/platform.rs index a05143b9..1d453a1e 100644 --- a/mg-lower/src/platform.rs +++ b/mg-lower/src/platform.rs @@ -216,6 +216,46 @@ pub trait Ddm { ddm_admin_client::ResponseValue<()>, ddm_admin_client::Error, >; + + /// Get multicast group subscriptions originated by this router. + /// + /// Each `MulticastOrigin` pairs an overlay group address with its + /// underlay mapping (ff04::/64) and optional source for (S,G) routes. + /// + /// Method names follow the DDM admin API convention + /// (`originated_multicast_groups`, not `originated_multicast_origins`). + async fn get_originated_multicast_groups( + &self, + ) -> Result< + ddm_admin_client::ResponseValue>, + ddm_admin_client::Error, + >; + + /// Advertise multicast group subscriptions to DDM peers. + /// + /// Each entry is a `MulticastOrigin` pairing an overlay group + /// with its ff04::/64 underlay mapping. + #[allow(clippy::ptr_arg)] + async fn advertise_multicast_groups<'a>( + &'a self, + body: &'a Vec, + ) -> Result< + ddm_admin_client::ResponseValue<()>, + ddm_admin_client::Error, + >; + + /// Withdraw multicast group subscriptions from DDM peers. 
+ /// + /// Each entry is a `MulticastOrigin` pairing an overlay group + /// with its ff04::/64 underlay mapping. + #[allow(clippy::ptr_arg)] + async fn withdraw_multicast_groups<'a>( + &'a self, + body: &'a Vec, + ) -> Result< + ddm_admin_client::ResponseValue<()>, + ddm_admin_client::Error, + >; } /// This trait wraps the methods that have expectations about switch zone @@ -405,6 +445,35 @@ impl Ddm for ProductionDdm { > { self.client.withdraw_tunnel_endpoints(body).await } + + async fn get_originated_multicast_groups( + &self, + ) -> Result< + ddm_admin_client::ResponseValue>, + ddm_admin_client::Error, + > { + self.client.get_originated_multicast_groups().await + } + + async fn advertise_multicast_groups<'a>( + &'a self, + body: &'a Vec, + ) -> Result< + ddm_admin_client::ResponseValue<()>, + ddm_admin_client::Error, + > { + self.client.advertise_multicast_groups(body).await + } + + async fn withdraw_multicast_groups<'a>( + &'a self, + body: &'a Vec, + ) -> Result< + ddm_admin_client::ResponseValue<()>, + ddm_admin_client::Error, + > { + self.client.withdraw_multicast_groups(body).await + } } /// Production switch zone that uses libnet for route lookups (illumos only). 
@@ -430,6 +499,7 @@ pub(crate) mod test { use crate::MG_LOWER_TAG; use super::*; + use mg_common::lock; use std::sync::Mutex; use std::{collections::HashMap, net::IpAddr}; @@ -484,7 +554,7 @@ pub(crate) mod test { link_id: &LinkId, ) -> Result, DpdClientError> { - let links = self.links.lock().unwrap(); + let links = lock!(self.links); let link = links .iter() .find(|x| &x.port_id == port_id && &x.link_id == link_id); @@ -502,10 +572,7 @@ pub(crate) mod test { dpd_client::ResponseValue>, DpdClientError, > { - let result = self - .v4_routes - .lock() - .unwrap() + let result = lock!(self.v4_routes) .get(cidr) .cloned() .unwrap_or(Vec::default()); @@ -519,10 +586,7 @@ pub(crate) mod test { dpd_client::ResponseValue>, DpdClientError, > { - let result = self - .v6_routes - .lock() - .unwrap() + let result = lock!(self.v6_routes) .get(cidr) .cloned() .unwrap_or(Vec::default()); @@ -534,7 +598,7 @@ pub(crate) mod test { addr: &Ipv6Entry, ) -> Result, DpdClientError> { - self.loopback.lock().unwrap().replace(addr.clone()); + lock!(self.loopback).replace(addr.clone()); Ok(dpd_response_ok!(())) } @@ -545,7 +609,7 @@ pub(crate) mod test { dpd_client::ResponseValue>, DpdClientError, > { - let links = self.links.lock().unwrap(); + let links = lock!(self.links); let result = links .iter() .filter(|x| match filter { @@ -612,7 +676,7 @@ pub(crate) mod test { RouteTarget::V4(v4) => Route::V4(v4.clone()), RouteTarget::V6(v6) => Route::V6(v6.clone()), }; - let mut routes = self.v4_routes.lock().unwrap(); + let mut routes = lock!(self.v4_routes); match routes.get_mut(&body.cidr) { Some(targets) => { targets.push(route); @@ -629,7 +693,7 @@ pub(crate) mod test { body: &'a Ipv6RouteUpdate, ) -> Result, DpdClientError> { - let mut routes = self.v6_routes.lock().unwrap(); + let mut routes = lock!(self.v6_routes); match routes.get_mut(&body.cidr) { Some(targets) => { targets.push(body.target.clone()); @@ -649,7 +713,7 @@ pub(crate) mod test { tgt_ip: &'a IpAddr, ) -> Result, 
DpdClientError> { - let mut routes = self.v4_routes.lock().unwrap(); + let mut routes = lock!(self.v4_routes); if let Some(targets) = routes.get_mut(cidr) { targets.retain(|x| match (x, tgt_ip) { (Route::V4(x), IpAddr::V4(ip)) => { @@ -677,7 +741,7 @@ pub(crate) mod test { tgt_ip: &'a std::net::Ipv6Addr, ) -> Result, DpdClientError> { - let mut routes = self.v6_routes.lock().unwrap(); + let mut routes = lock!(self.v6_routes); if let Some(targets) = routes.get_mut(cidr) { targets.retain(|x| { !(x.tgt_ip == *tgt_ip @@ -699,6 +763,7 @@ pub(crate) mod test { pub(crate) struct TestDdm { pub(crate) tunnel_originated: Mutex>, pub(crate) originated: Mutex>, + pub(crate) multicast_originated: Mutex>, } impl Default for TestDdm { @@ -706,6 +771,7 @@ pub(crate) mod test { Self { tunnel_originated: Mutex::new(Vec::default()), originated: Mutex::new(Vec::default()), + multicast_originated: Mutex::new(Vec::default()), } } } @@ -717,9 +783,7 @@ pub(crate) mod test { ddm_admin_client::ResponseValue>, ddm_admin_client::Error, > { - Ok(ddm_response_ok!( - self.tunnel_originated.lock().unwrap().clone() - )) + Ok(ddm_response_ok!(lock!(self.tunnel_originated).clone())) } async fn get_originated( @@ -728,7 +792,7 @@ pub(crate) mod test { ddm_admin_client::ResponseValue>, ddm_admin_client::Error, > { - Ok(ddm_response_ok!(self.originated.lock().unwrap().clone())) + Ok(ddm_response_ok!(lock!(self.originated).clone())) } async fn advertise_prefixes<'a>( @@ -738,7 +802,7 @@ pub(crate) mod test { ddm_admin_client::ResponseValue<()>, ddm_admin_client::Error, > { - self.originated.lock().unwrap().extend(body); + lock!(self.originated).extend(body); Ok(ddm_response_ok!(())) } @@ -749,7 +813,7 @@ pub(crate) mod test { ddm_admin_client::ResponseValue<()>, ddm_admin_client::Error, > { - self.tunnel_originated.lock().unwrap().extend(body.clone()); + lock!(self.tunnel_originated).extend(body.clone()); Ok(ddm_response_ok!(())) } @@ -760,10 +824,38 @@ pub(crate) mod test { 
ddm_admin_client::ResponseValue<()>, ddm_admin_client::Error, > { - self.tunnel_originated - .lock() - .unwrap() - .retain(|x| !body.contains(x)); + lock!(self.tunnel_originated).retain(|x| !body.contains(x)); + Ok(ddm_response_ok!(())) + } + + async fn get_originated_multicast_groups( + &self, + ) -> Result< + ddm_admin_client::ResponseValue>, + ddm_admin_client::Error, + > { + Ok(ddm_response_ok!(lock!(self.multicast_originated).clone())) + } + + async fn advertise_multicast_groups<'a>( + &'a self, + body: &'a Vec, + ) -> Result< + ddm_admin_client::ResponseValue<()>, + ddm_admin_client::Error, + > { + lock!(self.multicast_originated).extend(body.clone()); + Ok(ddm_response_ok!(())) + } + + async fn withdraw_multicast_groups<'a>( + &'a self, + body: &'a Vec, + ) -> Result< + ddm_admin_client::ResponseValue<()>, + ddm_admin_client::Error, + > { + lock!(self.multicast_originated).retain(|x| !body.contains(x)); Ok(ddm_response_ok!(())) } } diff --git a/mg-types/versions/Cargo.toml b/mg-types/versions/Cargo.toml index 854e12c1..254b82e9 100644 --- a/mg-types/versions/Cargo.toml +++ b/mg-types/versions/Cargo.toml @@ -6,6 +6,7 @@ edition = "2024" [dependencies] bfd.workspace = true bgp.workspace = true +mg-common.workspace = true rdb.workspace = true schemars.workspace = true serde.workspace = true diff --git a/mg-types/versions/src/multicast_support/mrib.rs b/mg-types/versions/src/multicast_support/mrib.rs index e6779b3a..7e4e4f88 100644 --- a/mg-types/versions/src/multicast_support/mrib.rs +++ b/mg-types/versions/src/multicast_support/mrib.rs @@ -8,9 +8,8 @@ use std::net::IpAddr; -use rdb::types::{ - AddressFamily, MulticastRouteKey, UnderlayMulticastIpv6, Vni, -}; +use mg_common::net::UnderlayMulticastIpv6; +use rdb::types::{AddressFamily, MulticastRouteKey, Vni}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; diff --git a/mgadm/src/mrib.rs b/mgadm/src/mrib.rs index 402421d2..ab4548c5 100644 --- a/mgadm/src/mrib.rs +++ b/mgadm/src/mrib.rs @@ -21,9 
+21,9 @@ use mg_admin_client::types::{ MribRpfRebuildIntervalRequest, MulticastRoute, MulticastRouteKey, RouteOriginFilter, Vni, }; -use rdb::types::{AddressFamily, DEFAULT_MULTICAST_VNI}; +use rdb::types::AddressFamily; -const DEFAULT_VNI: u32 = DEFAULT_MULTICAST_VNI.as_u32(); +const DEFAULT_VNI: u32 = rdb::Vni::DEFAULT_MULTICAST_VNI.as_u32(); fn parse_route_origin(s: &str) -> Result { match s.to_lowercase().as_str() { diff --git a/openapi/ddm-admin/ddm-admin-1.0.0-b6eac7.json.gitstub b/openapi/ddm-admin/ddm-admin-1.0.0-b6eac7.json.gitstub new file mode 100644 index 00000000..0d935c8b --- /dev/null +++ b/openapi/ddm-admin/ddm-admin-1.0.0-b6eac7.json.gitstub @@ -0,0 +1 @@ +76204d2907209bd8b963fb2da976ea688282d990:openapi/ddm-admin/ddm-admin-1.0.0-b6eac7.json diff --git a/openapi/ddm-admin/ddm-admin-1.0.0-b6eac7.json b/openapi/ddm-admin/ddm-admin-2.0.0-0cfd90.json similarity index 60% rename from openapi/ddm-admin/ddm-admin-1.0.0-b6eac7.json rename to openapi/ddm-admin/ddm-admin-2.0.0-0cfd90.json index fe80efd3..5684230a 100644 --- a/openapi/ddm-admin/ddm-admin-1.0.0-b6eac7.json +++ b/openapi/ddm-admin/ddm-admin-2.0.0-0cfd90.json @@ -6,7 +6,7 @@ "url": "https://oxide.computer", "email": "api@oxide.computer" }, - "version": "1.0.0" + "version": "2.0.0" }, "paths": { "/disable-stats": { @@ -51,6 +51,94 @@ } } }, + "/multicast_group": { + "put": { + "operationId": "advertise_multicast_groups", + "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "Set_of_MulticastOrigin", + "type": "array", + "items": { + "$ref": "#/components/schemas/MulticastOrigin" + }, + "uniqueItems": true + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "operationId": "withdraw_multicast_groups", + "requestBody": { + "content": { + "application/json": { + "schema": { + 
"title": "Set_of_MulticastOrigin", + "type": "array", + "items": { + "$ref": "#/components/schemas/MulticastOrigin" + }, + "uniqueItems": true + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/multicast_groups": { + "get": { + "operationId": "get_multicast_groups", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Set_of_MulticastRoute", + "type": "array", + "items": { + "$ref": "#/components/schemas/MulticastRoute" + }, + "uniqueItems": true + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/originated": { "get": { "operationId": "get_originated", @@ -79,6 +167,34 @@ } } }, + "/originated_multicast_groups": { + "get": { + "operationId": "get_originated_multicast_groups", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Set_of_MulticastOrigin", + "type": "array", + "items": { + "$ref": "#/components/schemas/MulticastOrigin" + }, + "uniqueItems": true + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/originated_tunnel_endpoints": { "get": { "operationId": "get_originated_tunnel_endpoints", @@ -107,6 +223,34 @@ } } }, + "/peer": { + "put": { + "summary": "Set peer information for a given interface index, bypassing the state machine.", + "description": "Intended for test fixtures that run `ddmd` with `--no-state-machine`. 
In a normal run, discovery writes peer entries keyed by interface index whenever it processes an advertisement, so any directly-injected entry for an active interface will be overwritten the next time a peer is observed there.", + "operationId": "put_peer", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/PutPeerRequest" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/peers": { "get": { "operationId": "get_peers", @@ -444,6 +588,106 @@ "type": "string", "pattern": "^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))\\/([0-9]|[1-9][0-9]|1[0-1][0-9]|12[0-8])$" }, + "MulticastOrigin": { + "description": "Origin information for a multicast group announcement.\n\nThis is analogous to TunnelOrigin but for multicast groups.\n\nThis represents a subscription to a multicast group that should be advertised via DDM. 
The overlay_group is the application-visible multicast address (e.g., 233.252.0.1 or ff0e::1), while underlay_group is the mapped admin-local scoped IPv6 address (ff04::X) used in the underlay network.", + "type": "object", + "properties": { + "metric": { + "description": "Metric for path selection (lower is better).\n\nUsed for multi-rack replication optimization. Excluded from identity (Hash/Eq) so that metric changes update an existing entry rather than creating a duplicate.", + "default": 0, + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "overlay_group": { + "description": "The overlay multicast group address (IPv4 or IPv6). This is the group address visible to applications.", + "type": "string", + "format": "ip" + }, + "source": { + "nullable": true, + "description": "Optional source address for Source-Specific Multicast (S,G) routes. None for Any-Source Multicast (*,G) routes.", + "default": null, + "type": "string", + "format": "ip" + }, + "underlay_group": { + "description": "The underlay multicast group address (ff04::X). Validated at construction to be within ff04::/64.", + "type": "string", + "format": "ipv6" + }, + "vni": { + "description": "VNI for this multicast group (identifies the VPC/network context).", + "default": 77, + "allOf": [ + { + "$ref": "#/components/schemas/Vni" + } + ] + } + }, + "required": [ + "overlay_group", + "underlay_group" + ] + }, + "MulticastPathHop": { + "description": "A single hop in the multicast path, carrying metadata needed for replication optimization.", + "type": "object", + "properties": { + "downstream_subscriber_count": { + "description": "Number of downstream subscribers reachable via this hop. 
Used for load-aware replication decisions in multi-rack topologies.", + "default": 0, + "type": "integer", + "format": "uint32", + "minimum": 0 + }, + "router_id": { + "description": "Router identifier (hostname).", + "type": "string" + }, + "underlay_addr": { + "description": "The underlay address of this router (for replication targeting).", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "router_id", + "underlay_addr" + ] + }, + "MulticastRoute": { + "description": "A multicast route learned via DDM.\n\nCarries a MulticastOrigin (overlay group + ff04::/64 underlay mapping) and the path vector from the originating subscriber through intermediate transit routers.", + "type": "object", + "properties": { + "nexthop": { + "description": "Underlay nexthop address (DDM peer that advertised this route). Used to associate the route with a peer for expiration.", + "type": "string", + "format": "ipv6" + }, + "origin": { + "description": "The multicast group origin information.", + "allOf": [ + { + "$ref": "#/components/schemas/MulticastOrigin" + } + ] + }, + "path": { + "description": "Path vector from the originating subscriber outward. Each hop records the router that redistributed this subscription announcement. 
Used for loop detection on pull and for future replication optimization in multi-rack topologies.", + "default": [], + "type": "array", + "items": { + "$ref": "#/components/schemas/MulticastPathHop" + } + } + }, + "required": [ + "nexthop", + "origin" + ] + }, "PathVector": { "type": "object", "properties": { @@ -463,6 +707,7 @@ ] }, "PeerInfo": { + "description": "Peer information with an optional interface name.", "type": "object", "properties": { "addr": { @@ -472,6 +717,12 @@ "host": { "type": "string" }, + "if_name": { + "nullable": true, + "description": "Interface name the peer was discovered on (e.g., \"tfportrear0_0\").", + "default": null, + "type": "string" + }, "kind": { "$ref": "#/components/schemas/RouterKind" }, @@ -494,6 +745,24 @@ "Expired" ] }, + "PutPeerRequest": { + "description": "Body for `PUT /peer`. Sets `info` at the slot keyed by `if_index` (interface index) in the in-memory peer map.", + "type": "object", + "properties": { + "if_index": { + "type": "integer", + "format": "uint32", + "minimum": 0 + }, + "info": { + "$ref": "#/components/schemas/PeerInfo" + } + }, + "required": [ + "if_index", + "info" + ] + }, "RouterKind": { "type": "integer", "enum": [ @@ -544,6 +813,12 @@ "nexthop", "origin" ] + }, + "Vni": { + "description": "A Geneve Virtual Network Identifier", + "type": "integer", + "format": "uint32", + "minimum": 0 } }, "responses": { diff --git a/openapi/ddm-admin/ddm-admin-latest.json b/openapi/ddm-admin/ddm-admin-latest.json index 45446659..0032bd2a 120000 --- a/openapi/ddm-admin/ddm-admin-latest.json +++ b/openapi/ddm-admin/ddm-admin-latest.json @@ -1 +1 @@ -ddm-admin-1.0.0-b6eac7.json \ No newline at end of file +ddm-admin-2.0.0-0cfd90.json \ No newline at end of file diff --git a/openapi/mg-admin/mg-admin-9.0.0-2de23b.json b/openapi/mg-admin/mg-admin-9.0.0-a21db7.json similarity index 99% rename from openapi/mg-admin/mg-admin-9.0.0-2de23b.json rename to openapi/mg-admin/mg-admin-9.0.0-a21db7.json index 1efd66b0..384ed242 
100644 --- a/openapi/mg-admin/mg-admin-9.0.0-2de23b.json +++ b/openapi/mg-admin/mg-admin-9.0.0-a21db7.json @@ -5466,7 +5466,7 @@ ] }, "PeerId": { - "description": "Identifies a BGP peer for session management and route tracking.\n\nBGP peers can be identified in two ways: - **Numbered**: Traditional BGP peering using explicit IP addresses - **Unnumbered**: Modern peering using interface names with link-local addresses\n\n# Unnumbered Peering\n\nUnnumbered BGP uses interface names as stable identifiers instead of IP addresses. This is important because: - Link-local IPv6 addresses are discovered dynamically via NDP - Multiple interfaces may have peers with the same link-local address (e.g., fe80::1 on eth0 and fe80::1 on eth1) - Scope ID (interface index) disambiguates link-local addresses, but is not stable across reboots - Interface names provide stable, unambiguous peer identification\n\n# Route Tracking\n\nThis type is used in [`BgpPathProperties`](crate::BgpPathProperties) to track which peer advertised a route. Using `PeerId` instead of `IpAddr` ensures: - Unnumbered peers are properly distinguished even if they share link-local IPs - Route cleanup correctly removes only the routes from the intended peer - No cross-contamination when multiple unnumbered sessions exist\n\n# Examples\n\n``` use rdb_types::PeerId; use std::net::IpAddr;\n\n// Numbered peer let numbered = PeerId::Ip(\"192.0.2.1\".parse::<IpAddr>().unwrap());\n\n// Unnumbered peer let unnumbered = PeerId::Interface(\"eth0\".to_string()); ```", + "description": "Identifies a BGP peer for session management and route tracking.\n\nBGP peers can be identified in two ways: - **Numbered**: Traditional BGP peering using explicit IP addresses - **Unnumbered**: Modern peering using interface names with link-local addresses\n\n# Unnumbered Peering\n\nUnnumbered BGP uses interface names as stable identifiers instead of IP addresses. 
This is important because: - Link-local IPv6 addresses are discovered dynamically via NDP - Multiple interfaces may have peers with the same link-local address (e.g., fe80::1 on eth0 and fe80::1 on eth1) - Scope ID (interface index) disambiguates link-local addresses, but is not stable across reboots - Interface names provide stable, unambiguous peer identification\n\n# Route Tracking\n\nThis type is used in `BgpPathProperties` to track which peer advertised a route. Using `PeerId` instead of `IpAddr` ensures: - Unnumbered peers are properly distinguished even if they share link-local IPs - Route cleanup correctly removes only the routes from the intended peer - No cross-contamination when multiple unnumbered sessions exist\n\n# Examples\n\n``` use rdb_types::PeerId; use std::net::IpAddr;\n\n// Numbered peer let numbered = PeerId::Ip(\"192.0.2.1\".parse::<IpAddr>().unwrap());\n\n// Unnumbered peer let unnumbered = PeerId::Interface(\"eth0\".to_string()); ```", "oneOf": [ { "description": "Numbered peer identified by IP address\n\nUsed for traditional BGP sessions where peers are configured with explicit IP addresses (either IPv4 or IPv6 global unicast).", diff --git a/openapi/mg-admin/mg-admin-latest.json b/openapi/mg-admin/mg-admin-latest.json index 48fc10dc..f434d6dc 120000 --- a/openapi/mg-admin/mg-admin-latest.json +++ b/openapi/mg-admin/mg-admin-latest.json @@ -1 +1 @@ -mg-admin-9.0.0-2de23b.json \ No newline at end of file +mg-admin-9.0.0-a21db7.json \ No newline at end of file diff --git a/rdb-types/src/lib.rs b/rdb-types/src/lib.rs index af365e51..44a3777c 100644 --- a/rdb-types/src/lib.rs +++ b/rdb-types/src/lib.rs @@ -459,8 +459,8 @@ pub enum ProtocolFilter { /// /// # Route Tracking /// -/// This type is used in [`BgpPathProperties`](crate::BgpPathProperties) to track -/// which peer advertised a route. Using `PeerId` instead of `IpAddr` ensures: +/// This type is used in `BgpPathProperties` to track which peer advertised a +/// route. 
Using `PeerId` instead of `IpAddr` ensures: /// - Unnumbered peers are properly distinguished even if they share link-local IPs /// - Route cleanup correctly removes only the routes from the intended peer /// - No cross-contamination when multiple unnumbered sessions exist diff --git a/rdb/src/db.rs b/rdb/src/db.rs index 3144bcde..f61de096 100644 --- a/rdb/src/db.rs +++ b/rdb/src/db.rs @@ -1452,6 +1452,19 @@ impl Db { }); } + // Synchronously revalidate affected (S,G) routes against the + // updated unicast RIB. The poptrie rebuild triggered above is + // async, so without this the MRIB update would depend on + // the rebuild thread completing first. The linear-scan fallback + // in rpf_table's lookup is sufficient here. + for prefix in &pcn.changed { + let event = match prefix { + Prefix::V4(p) => crate::mrib::rpf::RebuildEvent::V4(Some(*p)), + Prefix::V6(p) => crate::mrib::rpf::RebuildEvent::V6(Some(*p)), + }; + self.revalidate_mrib(Some(event)); + } + self.notify(pcn); Ok(()) } @@ -2020,14 +2033,14 @@ impl Reaper { #[cfg(test)] mod test { use crate::{ - AddressFamily, DEFAULT_MULTICAST_VNI, DEFAULT_RIB_PRIORITY_STATIC, - Path, Prefix, Prefix4, Prefix6, StaticRouteKey, + AddressFamily, DEFAULT_RIB_PRIORITY_STATIC, Path, Prefix, Prefix4, + Prefix6, StaticRouteKey, db::Db, test::{TEST_WAIT_ITERATIONS, TestDb}, types::{ MulticastAddr, MulticastAddrV4, MulticastAddrV6, MulticastRoute, MulticastRouteKey, MulticastSourceProtocol, PrefixDbKey, - UnderlayMulticastIpv6, UnicastAddrV4, UnicastAddrV6, + UnderlayMulticastIpv6, UnicastAddrV4, UnicastAddrV6, Vni, test_helpers::path_vecs_equal, }, }; @@ -2385,7 +2398,7 @@ mod test { let key = MulticastRouteKey::new( Some(s_ip), group, - DEFAULT_MULTICAST_VNI, + Vni::DEFAULT_MULTICAST_VNI, ) .expect("AF match"); let route = MulticastRoute::new( diff --git a/rdb/src/types.rs b/rdb/src/types.rs index 10c8a4b6..ac0b8e36 100644 --- a/rdb/src/types.rs +++ b/rdb/src/types.rs @@ -800,9 +800,6 @@ impl Display for 
PrefixChangeNotification { // MRIB (Multicast RIB) Types // ============================================================================ -/// Default VNI for fleet-wide multicast routing. -pub const DEFAULT_MULTICAST_VNI: Vni = Vni::DEFAULT_MULTICAST_VNI; - /// A validated IPv4 unicast address suitable for multicast source fields. /// /// This rejects addresses that cannot appear as a forwarded unicast source: @@ -1118,92 +1115,7 @@ impl From for Ipv6Addr { } } -/// A validated underlay multicast IPv6 address within ff04::/64. -/// -/// The Oxide rack maps overlay multicast groups 1:1 to admin-local scoped -/// IPv6 multicast addresses in `UNDERLAY_MULTICAST_SUBNET` (ff04::/64). -/// This type enforces that invariant at construction time. -/// -// TODO: This duplicates `dpd_types::mcast::UnderlayMulticastIpv6` in dendrite. -// Both should be consolidated into `omicron_common` so maghemite, dendrite, -// and omicron share a single definition. -#[derive( - Debug, - Copy, - Clone, - Eq, - PartialEq, - PartialOrd, - Ord, - Hash, - Serialize, - Deserialize, - JsonSchema, -)] -#[serde(try_from = "Ipv6Addr", into = "Ipv6Addr")] -#[schemars(transparent)] -pub struct UnderlayMulticastIpv6(Ipv6Addr); - -impl UnderlayMulticastIpv6 { - /// Create a new validated underlay multicast address. - /// - /// # Errors - /// - /// Returns an error if the address is not within `UNDERLAY_MULTICAST_SUBNET` - /// (ff04::/64). - pub fn new(value: Ipv6Addr) -> Result { - if !UNDERLAY_MULTICAST_SUBNET.contains(value) { - return Err(Error::Validation(format!( - "underlay address {value} is not within \ - {UNDERLAY_MULTICAST_SUBNET}" - ))); - } - Ok(Self(value)) - } - - /// Returns the underlying IPv6 address. 
- #[inline] - pub const fn ip(&self) -> Ipv6Addr { - self.0 - } -} - -impl fmt::Display for UnderlayMulticastIpv6 { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.0) - } -} - -impl TryFrom for UnderlayMulticastIpv6 { - type Error = Error; - - fn try_from(value: Ipv6Addr) -> Result { - Self::new(value) - } -} - -impl From for Ipv6Addr { - fn from(addr: UnderlayMulticastIpv6) -> Self { - addr.0 - } -} - -impl From for IpAddr { - fn from(addr: UnderlayMulticastIpv6) -> Self { - IpAddr::V6(addr.0) - } -} - -impl FromStr for UnderlayMulticastIpv6 { - type Err = Error; - - fn from_str(s: &str) -> Result { - let addr: Ipv6Addr = s.parse().map_err(|_| { - Error::Validation(format!("invalid IPv6 address: {s}")) - })?; - Self::new(addr) - } -} +pub use mg_common::net::UnderlayMulticastIpv6; /// A validated multicast group address (IPv4 or IPv6). /// @@ -1700,6 +1612,18 @@ impl MulticastRoute { } } +impl From<&MulticastRoute> for mg_common::net::MulticastOrigin { + fn from(route: &MulticastRoute) -> Self { + Self { + overlay_group: route.key.group().ip(), + underlay_group: route.underlay_group, + vni: route.key.vni(), + metric: 0, + source: route.key.source(), + } + } +} + /// Source of a multicast route entry. 
#[derive( Debug, Copy, Clone, Serialize, Deserialize, JsonSchema, Eq, PartialEq, @@ -2040,7 +1964,7 @@ mod test { let result = MulticastRouteKey::new( Some(IpAddr::V4(src.ip())), group.into(), - DEFAULT_MULTICAST_VNI, + Vni::DEFAULT_MULTICAST_VNI, ); assert!( result.is_err(), @@ -2055,7 +1979,7 @@ mod test { let result = MulticastRouteKey::new( Some(IpAddr::V6(src)), group.into(), - DEFAULT_MULTICAST_VNI, + Vni::DEFAULT_MULTICAST_VNI, ); assert!( result.is_err(), diff --git a/tests/src/ddm.rs b/tests/src/ddm.rs index 4e0afdd0..fde1f8af 100644 --- a/tests/src/ddm.rs +++ b/tests/src/ddm.rs @@ -4,7 +4,10 @@ use anyhow::{Result, anyhow}; use ddm_admin_client::Client; -use ddm_admin_client::types::TunnelOrigin; +use ddm_admin_client::types::{ + MulticastOrigin, PeerInfo, PeerStatus, PutPeerRequest, RouterKind, + TunnelOrigin, Vni, +}; use slog::{Drain, Logger}; use std::env; use std::net::Ipv6Addr; @@ -462,6 +465,31 @@ async fn run_trio_tests( println!("initial peering test passed"); + // PUT /peer smoke against a running ddmd. Use an unused interface + // index so the live discovery handler does not race the injection + // on a real interface. + let synthetic = PeerInfo { + status: PeerStatus::Active, + addr: "fd00::dead:beef".parse().unwrap(), + host: "synthetic".to_string(), + // RouterKind is integer-encoded in the generated client schema; + // 0 is `Server`. See ddm-types::initial::db::RouterKind. 
+ kind: RouterKind::try_from(0_i64).unwrap(), + if_name: Some("synthetic0".to_string()), + }; + + t1.put_peer(&PutPeerRequest { + if_index: 9999, + info: synthetic.clone(), + }) + .await?; + + wait_for_eq!(t1.get_peers().await.map_or(99, |x| x.len()), 3); + let peers = t1.get_peers().await?; + assert_eq!(peers["9999"].host, "synthetic"); + + println!("put_peer synthetic injection passed"); + s1.advertise_prefixes(&vec!["fd00:1::/64".parse().unwrap()]) .await?; @@ -642,6 +670,49 @@ async fn run_trio_tests( println!("tunnel endpoint withdraw passed"); + // Multicast group advertise/withdraw across the trio. Mirrors how + // mg-lower in the switch zone publishes overlay→underlay multicast + // bindings: the transit router originates an advertisement, and the + // server routers learn it via DDM exchange. + wait_for_eq!(multicast_originated_count(&t1).await?, 0); + + let mcast_origin = MulticastOrigin { + overlay_group: "233.252.0.1".parse().unwrap(), + underlay_group: "ff04::100".parse().unwrap(), + vni: Vni(77), + source: None, + metric: 0, + }; + + t1.advertise_multicast_groups(&vec![mcast_origin.clone()]) + .await?; + + wait_for_eq!(multicast_originated_count(&t1).await?, 1); + wait_for_eq!(multicast_group_count(&t1).await?, 0); + wait_for_eq!(multicast_group_count(&s1).await?, 1); + wait_for_eq!(multicast_group_count(&s2).await?, 1); + + println!("multicast group advertise passed"); + + // Server router restart: s1's view of the multicast group must + // converge again after ddmd restarts. wait_for_eq tolerates the + // restart window via unwrap_or sentinel. 
+ zs1.stop_router()?; + zs1.start_router(false)?; + let s1 = Client::new("http://10.0.0.1:8000", log.clone()); + wait_for_eq!(multicast_group_count(&s1).await.unwrap_or(99), 1); + + println!("multicast router restart passed"); + + t1.withdraw_multicast_groups(&vec![mcast_origin]).await?; + + wait_for_eq!(multicast_originated_count(&t1).await?, 0); + wait_for_eq!(multicast_group_count(&t1).await?, 0); + wait_for_eq!(multicast_group_count(&s1).await?, 0); + wait_for_eq!(multicast_group_count(&s2).await?, 0); + + println!("multicast group withdraw passed"); + Ok(()) } @@ -812,6 +883,14 @@ async fn tunnel_originated_endpoint_count(c: &Client) -> Result<usize> { Ok(c.get_originated_tunnel_endpoints().await?.len()) } +async fn multicast_group_count(c: &Client) -> Result<usize> { + Ok(c.get_multicast_groups().await?.len()) +} + +async fn multicast_originated_count(c: &Client) -> Result<usize> { + Ok(c.get_originated_multicast_groups().await?.len()) +} + fn init_logger() -> Logger { let decorator = slog_term::TermDecorator::new().build(); let drain = slog_term::FullFormat::new(decorator).build().fuse();