diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 9d60b21ed9c..22d553fbdcc 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -5327,7 +5327,7 @@ async fn cmd_db_instance_info( let table = tabled::Table::new(vmms.iter().map(|vmm| { let &Vmm { - id, + id: _, sled_id, propolis_ip: _, propolis_port: _, @@ -5336,15 +5336,12 @@ async fn cmd_db_instance_info( time_created, time_deleted, time_state_updated: _, - generation, - state, + generation: _, + state: _, + failure_reason: _, } = vmm; VmmRow { - state: VmmStateRow { - id, - state, - generation: generation.0.into(), - }, + state: VmmStateRow::from(vmm), sled_id: sled_id.into(), time_created, time_deleted, @@ -5365,10 +5362,23 @@ async fn cmd_db_instance_info( struct VmmStateRow { id: Uuid, state: db::model::VmmState, + #[tabled(display_with = "display_option_blank")] + failure_reason: Option, #[tabled(rename = "GEN")] generation: u64, } +impl From<&'_ db::model::Vmm> for VmmStateRow { + fn from(vmm: &db::model::Vmm) -> Self { + Self { + id: vmm.id, + state: vmm.state, + failure_reason: vmm.failure_reason, + generation: vmm.generation.0.into(), + } + } +} + /// Common fields extracted from an InstanceAndActiveVmm, shared by /// both `CustomerInstanceRow` and `SledInstanceRow`. struct InstanceFields { @@ -7965,6 +7975,8 @@ fn prettyprint_vmm( const CPU_PLATFORM: &'static str = "CPU platform"; const ADDRESS: &'static str = "propolis address"; const STATE: &'static str = "state"; + const FAILURE_REASON: &'static str = " failure reason"; + const FAILURE_NOTE: &'static str = " note"; const WIDTH: usize = const_max_len(&[ ID, CREATED, @@ -7976,6 +7988,8 @@ fn prettyprint_vmm( CPU_PLATFORM, STATE, ADDRESS, + FAILURE_REASON, + FAILURE_NOTE, ]); let width = std::cmp::max(width, Some(WIDTH)).unwrap_or(WIDTH); @@ -7991,6 +8005,7 @@ fn prettyprint_vmm( state, generation, time_state_updated, + failure_reason, } = vmm; println!("{indent}{ID:>width$}: {id}"); @@ -8002,6 +8017,31 @@ fn prettyprint_vmm( println!("{indent}{DELETED:width$}: {deleted}"); } println!("{indent}{STATE:>width$}: {state}"); + if let Some(reason) = failure_reason { + println!("{indent}{FAILURE_REASON:>width$}: {reason}"); + + if state == &db::model::VmmState::Failed { + println!( + "{indent}{FAILURE_NOTE:>width$}: {}", + reason.description() + ); + } else { + println!( + "{:width$}: {time_state_updated:?} (generation {g})" @@ -8085,7 +8125,7 @@ async fn cmd_db_vmm_list( impl<'a> From<&'a (Vmm, Option)> for VmmRow<'a> { fn from((vmm, sled): &'a (Vmm, Option)) -> Self { let &Vmm { - id, + id: _, time_created: _, time_deleted: _, instance_id, @@ -8094,8 +8134,9 @@ async fn cmd_db_vmm_list( propolis_port: _, cpu_platform: _, time_state_updated: _, - generation, - state, + generation: _, + state: _, + failure_reason: _, } = vmm; let sled = match sled { Some(sled) => sled.serial_number(), @@ -8104,15 +8145,7 @@ async fn cmd_db_vmm_list( "" } }; - VmmRow { - instance_id, - state: VmmStateRow { - id, - state, - generation: generation.0.into(), - }, - sled, - } + VmmRow { instance_id, state: VmmStateRow::from(vmm), sled } } } diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index bb833e31572..bd78fd55f6d 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -246,6 +246,7 @@ mod virtual_provisioning_collection; mod virtual_provisioning_resource; mod vmm; mod vmm_cpu_platform; +mod vmm_failure_reason; mod vni; mod volume; mod volume_repair; @@ -386,6 +387,7 @@ pub use virtual_provisioning_collection::*; pub use virtual_provisioning_resource::*; pub use vmm::*; pub use vmm_cpu_platform::*; +pub use vmm_failure_reason::*; pub use vmm_state::*; pub use vni::*; pub use volume::*; diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 3071ce5d4a5..23ad6b94e3c 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(258, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(259, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ pub static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(259, "vmm-failure-reason"), KnownVersion::new(258, "lookup-unmarked-ereports-by-class"), KnownVersion::new(257, "add-disk-adoption-requests"), KnownVersion::new(256, "bgp-unnumbered-peer-cleanup"), diff --git a/nexus/db-model/src/vmm.rs b/nexus/db-model/src/vmm.rs index 5d31f3eac56..039171aa947 100644 --- a/nexus/db-model/src/vmm.rs +++ b/nexus/db-model/src/vmm.rs @@ -14,7 +14,7 @@ use super::{Generation, VmmState}; use crate::typed_uuid::DbTypedUuid; -use crate::{SqlU16, VmmCpuPlatform}; +use crate::{SqlU16, VmmCpuPlatform, VmmFailureReason}; use chrono::{DateTime, Utc}; use nexus_db_schema::schema::vmm; use omicron_uuid_kinds::*; @@ -72,6 +72,10 @@ pub struct Vmm { /// control plane if this VMM's instance didn't specify a required platform /// when it was started. pub cpu_platform: VmmCpuPlatform, + + /// If this VMM is in the `Failed` state, this field describes why it + /// failed. This is `None` for VMMs that are not in the `Failed` state. + pub failure_reason: Option, } impl Vmm { @@ -101,15 +105,34 @@ impl Vmm { propolis_port: SqlU16(propolis_port), state: VmmState::Creating, cpu_platform, + failure_reason: None, } } - /// Returns the runtime state of this VMM. - pub fn runtime(&self) -> VmmRuntimeState { - VmmRuntimeState { - time_state_updated: self.time_state_updated, - generation: self.generation, - state: self.state, + pub fn runtime(&self) -> nexus_types::instance::VmmRuntimeState { + use nexus_types::instance as types; + let state = match (self.state, self.failure_reason) { + (VmmState::Failed, None) => { + // Weird and bad! + types::VmmState::Failed(types::VmmFailureReason::Prehistoric) + } + (VmmState::Failed, Some(reason)) => { + types::VmmState::Failed(reason.into()) + } + (VmmState::Creating, _) => types::VmmState::Creating, + (VmmState::Starting, _) => types::VmmState::Starting, + (VmmState::Running, _) => types::VmmState::Running, + (VmmState::Stopping, _) => types::VmmState::Stopping, + (VmmState::Stopped, _) => types::VmmState::Stopped, + (VmmState::Rebooting, _) => types::VmmState::Rebooting, + (VmmState::Migrating, _) => types::VmmState::Migrating, + (VmmState::Destroyed, _) => types::VmmState::Destroyed, + (VmmState::SagaUnwound, _) => types::VmmState::SagaUnwound, + }; + types::VmmRuntimeState { + state, + generation: self.generation.into(), + time_updated: self.time_state_updated, } } @@ -117,50 +140,3 @@ impl Vmm { self.sled_id.into() } } - -/// Runtime state for a VMM, owned by the sled where that VMM is running. -#[derive( - Clone, - Debug, - AsChangeset, - Selectable, - Insertable, - Queryable, - Serialize, - Deserialize, - PartialEq, -)] -#[diesel(table_name = vmm)] -pub struct VmmRuntimeState { - /// The time at which this state was most recently updated. - pub time_state_updated: DateTime, - - /// The generation number protecting this VMM's state and update time. - #[diesel(column_name = state_generation)] - #[serde(rename = "gen")] - pub generation: Generation, - - /// The state of this VMM. If this VMM is the active VMM for a given - /// instance, this state is the instance's logical state. - pub state: VmmState, -} - -impl From for VmmRuntimeState { - fn from(value: sled_agent_types::instance::VmmRuntimeState) -> Self { - Self { - state: value.state.into(), - time_state_updated: value.time_updated, - generation: value.generation.into(), - } - } -} - -impl From for sled_agent_types::instance::VmmRuntimeState { - fn from(s: Vmm) -> Self { - Self { - generation: s.generation.into(), - state: s.state.into(), - time_updated: s.time_state_updated, - } - } -} diff --git a/nexus/db-model/src/vmm_failure_reason.rs b/nexus/db-model/src/vmm_failure_reason.rs new file mode 100644 index 00000000000..b2ab3e8531c --- /dev/null +++ b/nexus/db-model/src/vmm_failure_reason.rs @@ -0,0 +1,82 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Describes why a VMM record is in the `Failed` state. + +use super::impl_enum_type; +use nexus_types::instance as types; +use serde::{Deserialize, Serialize}; +use std::fmt; + +impl_enum_type!( + VmmFailureReasonEnum: + + #[derive( + Copy, + Clone, + Debug, + PartialEq, + AsExpression, + FromSqlRow, + Serialize, + Deserialize, + )] + pub enum VmmFailureReason; + + // The reason for this VMM's failure is unknown, because the VMM failed + // prior to the recording of failure reasons. + Prehistoric => b"prehistoric" + // The sled-agent reported that this VMM failed. + FromSledAgent => b"from_sled_agent" + // A request to the sled-agent received a response indicating that this + // VMM is no longer present on the sled. + NoSuchInstance => b"no_such_instance" + // The sled on which this VMM was running has been expunged. + SledExpunged => b"sled_expunged" + // The sled on which this VMM was running has powered off. + SledOff => b"sled_off" +); + +impl From for VmmFailureReason { + fn from(reason: types::VmmFailureReason) -> Self { + match reason { + types::VmmFailureReason::Prehistoric => Self::Prehistoric, + types::VmmFailureReason::FromSledAgent => Self::FromSledAgent, + types::VmmFailureReason::NoSuchInstance => Self::NoSuchInstance, + types::VmmFailureReason::SledExpunged => Self::SledExpunged, + types::VmmFailureReason::SledOff => Self::SledOff, + } + } +} + +impl From for types::VmmFailureReason { + fn from(reason: VmmFailureReason) -> Self { + match reason { + VmmFailureReason::Prehistoric => Self::Prehistoric, + VmmFailureReason::FromSledAgent => Self::FromSledAgent, + VmmFailureReason::NoSuchInstance => Self::NoSuchInstance, + VmmFailureReason::SledExpunged => Self::SledExpunged, + VmmFailureReason::SledOff => Self::SledOff, + } + } +} + +impl VmmFailureReason { + pub fn from_vmm_state(state: types::VmmState) -> Option { + match state { + types::VmmState::Failed(reason) => Some(reason.into()), + _ => None, + } + } + + pub fn description(&self) -> &'static str { + types::VmmFailureReason::from(*self).description() + } +} + +impl fmt::Display for VmmFailureReason { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + types::VmmFailureReason::from(*self).fmt(f) + } +} diff --git a/nexus/db-model/src/vmm_state.rs b/nexus/db-model/src/vmm_state.rs index f604e5a0c4e..4df9b32d190 100644 --- a/nexus/db-model/src/vmm_state.rs +++ b/nexus/db-model/src/vmm_state.rs @@ -36,21 +36,37 @@ impl_enum_type!( ); impl VmmState { - pub fn label(&self) -> &'static str { + /// Converts this DB VMM state to the corresponding + /// `nexus_types::instance::VmmState`, to pass through methods defined in + /// `nexus_types`. + /// + /// This is an internal conversion that always emits the `Prehistoric` + /// failure reason, since we don't know the actual `nexus_types` failure + /// reason. Doing this is fine *here*, as the methods we intend to call + /// don't care about the failure reason. + fn to_nexus_state(self) -> nexus_types::instance::VmmState { + use nexus_types::instance::VmmFailureReason; + use nexus_types::instance::VmmState as NexusVmmState; match self { - VmmState::Creating => "creating", - VmmState::Starting => "starting", - VmmState::Running => "running", - VmmState::Stopping => "stopping", - VmmState::Stopped => "stopped", - VmmState::Rebooting => "rebooting", - VmmState::Migrating => "migrating", - VmmState::Failed => "failed", - VmmState::Destroyed => "destroyed", - VmmState::SagaUnwound => "saga_unwound", + Self::Creating => NexusVmmState::Creating, + Self::Starting => NexusVmmState::Starting, + Self::Running => NexusVmmState::Running, + Self::Stopping => NexusVmmState::Stopping, + Self::Stopped => NexusVmmState::Stopped, + Self::Rebooting => NexusVmmState::Rebooting, + Self::Migrating => NexusVmmState::Migrating, + Self::Failed => { + NexusVmmState::Failed(VmmFailureReason::Prehistoric) + } + Self::Destroyed => NexusVmmState::Destroyed, + Self::SagaUnwound => NexusVmmState::SagaUnwound, } } + pub fn label(&self) -> &'static str { + self.to_nexus_state().label() + } + /// All VMM states. pub const ALL_STATES: &'static [Self] = ::VARIANTS; @@ -71,34 +87,42 @@ impl VmmState { &[Self::Creating, Self::SagaUnwound, Self::Destroyed]; pub fn is_terminal(&self) -> bool { - Self::TERMINAL_STATES.contains(self) + self.to_nexus_state().is_terminal() } - /// Returns `true` if the instance is in a state in which it exists on a + + /// Returns `true` if the VMM is in a state where it is safe to + /// deallocate its sled resources and mark it as deleted. + pub fn is_destroyable(&self) -> bool { + self.to_nexus_state().is_destroyable() + } + + /// Returns `true` if the VMM is in a state in which it exists on a /// sled. pub fn exists_on_sled(&self) -> bool { - !Self::NONEXISTENT_STATES.contains(self) + self.to_nexus_state().exists_on_sled() } } impl fmt::Display for VmmState { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self.label()) + fmt::Display::fmt(&self.to_nexus_state(), f) } } -impl From for sled_agent_types::instance::VmmState { - fn from(value: VmmState) -> Self { +impl From for VmmState { + fn from(value: nexus_types::instance::VmmState) -> Self { + use nexus_types::instance::VmmState as Input; match value { - // The `Creating` state is internal to Nexus; the outside world - // should treat it as equivalent to `Starting`. - VmmState::Creating | VmmState::Starting => Self::Starting, - VmmState::Running => Self::Running, - VmmState::Stopping => Self::Stopping, - VmmState::Stopped => Self::Stopped, - VmmState::Rebooting => Self::Rebooting, - VmmState::Migrating => Self::Migrating, - VmmState::Failed => Self::Failed, - VmmState::Destroyed | VmmState::SagaUnwound => Self::Destroyed, + Input::Creating => Self::Creating, + Input::Starting => Self::Starting, + Input::Running => Self::Running, + Input::Stopping => Self::Stopping, + Input::Stopped => Self::Stopped, + Input::Rebooting => Self::Rebooting, + Input::Migrating => Self::Migrating, + Input::Failed(_) => Self::Failed, + Input::Destroyed => Self::Destroyed, + Input::SagaUnwound => Self::SagaUnwound, } } } @@ -121,28 +145,7 @@ impl From for VmmState { impl From for omicron_common::api::external::InstanceState { fn from(value: VmmState) -> Self { - use omicron_common::api::external::InstanceState as Output; - - match value { - // An instance with a VMM which is in the `Creating` state maps to - // `InstanceState::Starting`, rather than `InstanceState::Creating`. - // If we are still creating the VMM, this is because we are - // attempting to *start* the instance; instances may be created - // without creating a VMM to run them, and then started later. - VmmState::Creating | VmmState::Starting => Output::Starting, - VmmState::Running => Output::Running, - VmmState::Stopping => Output::Stopping, - // `SagaUnwound` should map to `Stopped` so that an `instance_view` - // API call that produces an instance with an unwound VMM will appear to - // be `Stopped`. This is because instances with unwound VMMs can - // be started by a subsequent instance-start saga, just like - // instances whose internal state actually is `Stopped`. - VmmState::Stopped | VmmState::SagaUnwound => Output::Stopped, - VmmState::Rebooting => Output::Rebooting, - VmmState::Migrating => Output::Migrating, - VmmState::Failed => Output::Failed, - VmmState::Destroyed => Output::Destroyed, - } + value.to_nexus_state().into() } } @@ -201,4 +204,46 @@ mod tests { assert_eq!(Ok(dbg!(variant)), dbg!(variant.to_string().parse())); } } + + #[test] + fn test_terminal_states_consistent() { + for &state in VmmState::ALL_STATES { + assert_eq!( + VmmState::TERMINAL_STATES.contains(&state), + state.is_terminal(), + "inconsistency between nexus_db_model::VmmState::{state:?} \ + and nexus_types::instance::VmmState::{state:?}: if the \ + is_terminal() method in nexus_types returns true, the state \ + should be in TERMINAL_STATES, and vice versa", + ); + } + } + + #[test] + fn test_destroyable_states_consistent() { + for &state in VmmState::ALL_STATES { + assert_eq!( + VmmState::DESTROYABLE_STATES.contains(&state), + state.is_destroyable(), + "inconsistency between nexus_db_model::VmmState::{state:?} \ + and nexus_types::instance::VmmState::{state:?}: if the \ + is_destroyable() method in nexus_types returns true, the state \ + should be in DESTROYABLE_STATES, and vice versa", + ); + } + } + + #[test] + fn test_nonexistent_states_consistent() { + for &state in VmmState::ALL_STATES { + assert_eq!( + VmmState::NONEXISTENT_STATES.contains(&state), + !state.exists_on_sled(), + "inconsistency between nexus_db_model::VmmState::{state:?} \ + and nexus_types::instance::VmmState::{state:?}: if the \ + exists_on_sled() method in nexus_types returns false, the \ + state should be in NONEXISTENT_STATES, and vice versa", + ); + } + } } diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index eb00f9bc6ac..b6d199da917 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -2307,7 +2307,6 @@ mod tests { use nexus_db_model::InstanceState; use nexus_db_model::Project; use nexus_db_model::VmmCpuPlatform; - use nexus_db_model::VmmRuntimeState; use nexus_db_model::VmmState; use nexus_types::external_api::instance as instance_types; use nexus_types::external_api::project; @@ -2991,6 +2990,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::Running, + failure_reason: None, }, ) .await @@ -3052,6 +3052,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::Running, + failure_reason: None, }, ) .await @@ -3148,6 +3149,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::Stopped, + failure_reason: None, }, ) .await @@ -3187,6 +3189,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::Running, + failure_reason: None, }, ) .await @@ -3217,15 +3220,13 @@ mod tests { assert!(res.is_err()); // Okay, now, advance the active VMM to Running, and try again. + let vmm1_state = + vmm1.runtime().transition(nexus_types::instance::VmmState::Running); let updated = dbg!( datastore .vmm_update_runtime( &PropolisUuid::from_untyped_uuid(vmm1.id), - &VmmRuntimeState { - time_state_updated: Utc::now(), - generation: Generation(vmm2.generation.0.next()), - state: VmmState::Running, - }, + &vmm1_state, ) .await ) @@ -3288,6 +3289,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::Running, + failure_reason: None, }, ) .await @@ -3320,11 +3322,9 @@ mod tests { datastore .vmm_update_runtime( &PropolisUuid::from_untyped_uuid(vmm2.id), - &VmmRuntimeState { - time_state_updated: Utc::now(), - generation: Generation(vmm2.generation.0.next().next()), - state: VmmState::SagaUnwound, - }, + &vmm2.runtime().transition( + nexus_types::instance::VmmState::SagaUnwound + ) ) .await ) @@ -3433,6 +3433,7 @@ mod tests { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::Running, + failure_reason: None, }, ) .await diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index b83ccfad48f..68ad35fbb32 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -6,21 +6,25 @@ use super::DataStore; use crate::context::OpContext; +use crate::db::model; use crate::db::model::Vmm; -use crate::db::model::VmmRuntimeState; use crate::db::model::VmmState as DbVmmState; use crate::db::pagination::paginated; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateAndQueryResult; use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; +use chrono::DateTime; use chrono::Utc; use diesel::prelude::*; use nexus_db_errors::ErrorHandler; use nexus_db_errors::OptionalError; use nexus_db_errors::public_error_from_diesel; use nexus_db_lookup::DbConnection; +use nexus_db_schema::schema::vmm; use nexus_db_schema::schema::vmm::dsl; +use nexus_types::instance::Migrations; +use nexus_types::instance::VmmRuntimeState; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; @@ -32,7 +36,7 @@ use omicron_common::api::external::ResourceType; use omicron_common::api::external::UpdateResult; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PropolisUuid; -use sled_agent_types::instance::{MigrationRuntimeState, Migrations}; +use sled_agent_types::instance::MigrationRuntimeState; use std::net::SocketAddr; use uuid::Uuid; @@ -57,6 +61,50 @@ pub struct VmmStateUpdateResult { pub migration_out_updated: bool, } +/// Changeset for a VMM's runtime state. +#[derive(Clone, Debug, AsChangeset)] +#[diesel(table_name = vmm)] +struct RuntimeStateChangeset { + /// The time at which this state was most recently updated. + time_state_updated: DateTime, + + /// The generation number protecting this VMM's state and update time. + #[diesel(column_name = state_generation)] + generation: model::Generation, + + /// The state of this VMM. If this VMM is the active VMM for a given + /// instance, this state is the instance's logical state. + state: DbVmmState, + + /// If this VMM is in the `Failed` state, this field describes why it + /// failed. This is `None` for VMMs that are not in the `Failed` state. + failure_reason: Option, +} + +impl From for RuntimeStateChangeset { + fn from(value: nexus_types::instance::VmmRuntimeState) -> Self { + use nexus_types::instance as input; + let input::VmmRuntimeState { state, time_updated, generation } = value; + + // The `nexus_types` version of this includes the failure reason as + // a field of the `Failed` variant, to enforce the invariant that + // failed VMMs have a failure reason, and non-failed VMMs do not. + // The db model type cannot do that, so unpack it here. + let (state, failure_reason) = match state { + input::VmmState::Failed(reason) => { + (DbVmmState::Failed, Some(reason.into())) + } + state => (state.into(), None), + }; + Self { + state, + time_state_updated: time_updated, + generation: generation.into(), + failure_reason, + } + } +} + impl DataStore { pub async fn vmm_insert( &self, @@ -138,7 +186,7 @@ impl DataStore { self.vmm_update_runtime_on_connection( &*self.pool_connection_unauthorized().await?, vmm_id, - new_runtime, + new_runtime.clone().into(), ) .await .map(|r| match r.status { @@ -160,7 +208,7 @@ impl DataStore { &self, conn: &async_bb8_diesel::Connection, vmm_id: &PropolisUuid, - new_runtime: &VmmRuntimeState, + new_runtime: RuntimeStateChangeset, ) -> Result, diesel::result::Error> { diesel::update(dsl::vmm) .filter(dsl::time_deleted.is_null()) @@ -223,10 +271,12 @@ impl DataStore { let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; + let new_runtime = RuntimeStateChangeset::from(new_runtime.clone()); self.transaction_retry_wrapper("vmm_and_migration_update_runtime") .transaction(&conn, |conn| { let err = err.clone(); + let new_runtime = new_runtime.clone(); async move { let vmm_update_result = self .vmm_update_runtime_on_connection( @@ -440,10 +490,9 @@ mod tests { use crate::db; use crate::db::model::Generation; use crate::db::model::Migration; - use crate::db::model::VmmRuntimeState; - use crate::db::model::VmmState; use crate::db::pub_test_utils::TestDatabase; use nexus_db_model::VmmCpuPlatform; + use nexus_types::instance::VmmState; use omicron_test_utils::dev; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::SledUuid; @@ -472,7 +521,8 @@ mod tests { cpu_platform: VmmCpuPlatform::SledDefault, time_state_updated: Utc::now(), generation: Generation::new(), - state: VmmState::Running, + state: db::model::VmmState::Running, + failure_reason: None, }, ) .await @@ -492,11 +542,13 @@ mod tests { cpu_platform: VmmCpuPlatform::SledDefault, time_state_updated: Utc::now(), generation: Generation::new(), - state: VmmState::Running, + state: db::model::VmmState::Running, + failure_reason: None, }, ) .await .expect("VMM 2 should be inserted successfully!"); + let mut vmm2_state = vmm2.runtime(); let migration1 = datastore .migration_insert( @@ -524,11 +576,7 @@ mod tests { .vmm_and_migration_update_runtime( &opctx, PropolisUuid::from_untyped_uuid(vmm1.id), - &VmmRuntimeState { - time_state_updated: Utc::now(), - generation: Generation(vmm1.generation.0.next()), - state: VmmState::Stopping, - }, + &vmm1.runtime().transition(VmmState::Stopping), Migrations { migration_in: None, migration_out: Some(&vmm1_migration_out), @@ -542,22 +590,19 @@ mod tests { generation: Generation::new().0.next(), time_updated: Utc::now(), }; + vmm2_state = vmm2_state.transition(VmmState::Running); datastore .vmm_and_migration_update_runtime( &opctx, PropolisUuid::from_untyped_uuid(vmm2.id), - &VmmRuntimeState { - time_state_updated: Utc::now(), - generation: Generation(vmm2.generation.0.next()), - state: VmmState::Running, - }, + &vmm2_state, Migrations { migration_in: Some(&vmm2_migration_in), migration_out: None, }, ) .await - .expect("vmm1 state should update"); + .expect("vmm2 state should update"); let all_migrations = datastore .instance_list_migrations( @@ -601,7 +646,8 @@ mod tests { cpu_platform: VmmCpuPlatform::SledDefault, time_state_updated: Utc::now(), generation: Generation::new(), - state: VmmState::Running, + state: db::model::VmmState::Running, + failure_reason: None, }, ) .await @@ -628,15 +674,12 @@ mod tests { generation: Generation::new().0.next(), time_updated: Utc::now(), }; + vmm2_state = vmm2_state.transition(VmmState::Destroyed); datastore .vmm_and_migration_update_runtime( &opctx, PropolisUuid::from_untyped_uuid(vmm2.id), - &VmmRuntimeState { - time_state_updated: Utc::now(), - generation: Generation(vmm2.generation.0.next()), - state: VmmState::Destroyed, - }, + &vmm2_state, Migrations { migration_in: Some(&vmm2_migration_in), migration_out: Some(&vmm2_migration_out), @@ -656,11 +699,7 @@ mod tests { .vmm_and_migration_update_runtime( &opctx, PropolisUuid::from_untyped_uuid(vmm3.id), - &VmmRuntimeState { - time_state_updated: Utc::now(), - generation: Generation(vmm3.generation.0.next()), - state: VmmState::Destroyed, - }, + &vmm3.runtime().transition(VmmState::Running), Migrations { migration_in: Some(&vmm3_migration_in), migration_out: None, diff --git a/nexus/db-schema/src/enums.rs b/nexus/db-schema/src/enums.rs index d2288443874..cddaf8f4d36 100644 --- a/nexus/db-schema/src/enums.rs +++ b/nexus/db-schema/src/enums.rs @@ -122,6 +122,7 @@ define_enums! { UserDataExportStateEnum => "user_data_export_state", UserProvisionTypeEnum => "user_provision_type", VmmCpuPlatformEnum => "vmm_cpu_platform", + VmmFailureReasonEnum => "vmm_failure_reason", VmmStateEnum => "vmm_state", VolumeResourceUsageTypeEnum => "volume_resource_usage_type", VpcFirewallRuleActionEnum => "vpc_firewall_rule_action", diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 40378afb9ef..a9b8c880db1 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -485,6 +485,7 @@ table! { propolis_port -> Int4, state -> crate::enums::VmmStateEnum, cpu_platform -> crate::enums::VmmCpuPlatformEnum, + failure_reason -> Nullable, } } joinable!(vmm -> sled (sled_id)); diff --git a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs index 6db4c8c38b2..40d6c04b2d7 100644 --- a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs +++ b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs @@ -250,6 +250,7 @@ mod tests { state: VmmState::Destroyed, time_state_updated: Utc::now(), generation: Generation::new(), + failure_reason: None, }), ) .await diff --git a/nexus/src/app/background/tasks/instance_reincarnation.rs b/nexus/src/app/background/tasks/instance_reincarnation.rs index 8b497b618ef..1b67e86d402 100644 --- a/nexus/src/app/background/tasks/instance_reincarnation.rs +++ b/nexus/src/app/background/tasks/instance_reincarnation.rs @@ -458,6 +458,7 @@ mod test { time_state_updated: Utc::now(), generation: Generation::new(), state: VmmState::SagaUnwound, + failure_reason: None, }, ) .await diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index 1895e7091f1..6fd0af70983 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -20,6 +20,9 @@ use nexus_networking::GatewayClient; use nexus_types::external_api::sled::SledPolicy; use nexus_types::identity::Asset; use nexus_types::identity::Resource; +use nexus_types::instance::SledVmmState; +use nexus_types::instance::VmmFailureReason; +use nexus_types::instance::VmmState; use nexus_types::inventory; use omicron_common::api::external::Error; use omicron_common::api::external::InstanceState; @@ -30,8 +33,6 @@ use omicron_uuid_kinds::SledUuid; use oximeter::types::ProducerRegistry; use parallel_task_set::ParallelTaskSet; use sled_agent_client::Client as SledAgentClient; -use sled_agent_types::instance; -use sled_agent_types::instance::SledVmmState; use sled_hardware_types::BaseboardId; use slog_error_chain::InlineErrorChain; use std::borrow::Cow; @@ -202,15 +203,11 @@ impl Check { gateways: &[GatewayClient], client: &SledAgentClient, ) -> Option { - let mk_failed = || { + let mk_failed = |reason: VmmFailureReason| { // TODO(eliza): it would be nicer if this used the same // code path as `mark_instance_failed`... Some(SledVmmState { - vmm_state: instance::VmmRuntimeState { - generation: vmm.generation.0.next(), - state: instance::VmmState::Failed, - time_updated: chrono::Utc::now(), - }, + vmm_state: vmm.runtime().transition(VmmState::Failed(reason)), // It's fine to synthesize `None`s here because a `None` // just means "don't update the migration state", not // "there is no migration". @@ -228,7 +225,7 @@ impl Check { marking it as Failed"; ); self.outcome = CheckOutcome::Failure(Failure::SledExpunged); - return mk_failed(); + return mk_failed(VmmFailureReason::SledExpunged); } // Ask the sled-agent what it has to say for itself. @@ -243,7 +240,7 @@ impl Check { // Note that this does not always mean that the *VMM* is healthy, // but only that we successfully got its state from the sled-agent. Ok(rsp) => { - let state = rsp.into_inner(); + let state = SledVmmState::from(rsp.into_inner()); self.outcome = CheckOutcome::Success(state.vmm_state.state.into()); Some(state) @@ -260,7 +257,7 @@ impl Check { error, ); self.outcome = CheckOutcome::Failure(Failure::NoSuchInstance); - mk_failed() + mk_failed(VmmFailureReason::NoSuchInstance) } // We were able to contact the sled-agent, but it responded with an // error which does *not* tell us that the VMM has failed. Either @@ -336,7 +333,7 @@ impl Check { "sled_power_state" => ?state, ); self.outcome = CheckOutcome::Failure(Failure::SledOff); - return mk_failed(); + return mk_failed(VmmFailureReason::SledOff); } } diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index d24a401b317..d0b77f6ff3f 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -30,7 +30,6 @@ use nexus_db_model::InstanceUpdate; use nexus_db_model::IpAttachState; use nexus_db_model::IpKind; use nexus_db_model::Vmm as DbVmm; -use nexus_db_model::VmmRuntimeState; use nexus_db_model::VmmState as DbVmmState; use nexus_db_queries::authn; use nexus_db_queries::authz; @@ -46,6 +45,7 @@ use nexus_types::external_api::instance; use nexus_types::external_api::ip_pool; use nexus_types::external_api::multicast; use nexus_types::external_api::project; +use nexus_types::instance::SledVmmState; use omicron_common::address::ConcreteIp; use omicron_common::api::external::ByteCount; use omicron_common::api::external::CreateResult; @@ -82,7 +82,6 @@ use sled_agent_client::types::VmmPutStateBody; use sled_agent_types::instance as sled_agent; use sled_agent_types::instance::ExternalIpConfig; use sled_agent_types::instance::ExternalIps; -use sled_agent_types::instance::SledVmmState; use sled_agent_types::inventory::SourceNatConfig; use slog_error_chain::InlineErrorChain; use std::collections::BTreeSet; @@ -190,6 +189,16 @@ impl SledAgentInstanceError { _ => false, } } + + /// If this error indicates that the VMM should be marked as `Failed`, + /// returns the reason for the failure. + pub fn vmm_failure_reason(&self) -> Option { + if self.vmm_gone() { + Some(db::model::VmmFailureReason::NoSuchInstance) + } else { + None + } + } } /// An error that can be returned from an operation that changes the state of an @@ -1008,9 +1017,15 @@ impl super::Nexus { if let (InstanceStateChangeError::SledAgent(inner), Some(vmm)) = (&e, state.vmm()) { - if inner.vmm_gone() { + if let Some(reason) = inner.vmm_failure_reason() { let _ = self - .mark_vmm_failed(opctx, authz_instance, vmm, inner) + .mark_vmm_failed( + opctx, + authz_instance, + vmm, + inner, + reason, + ) .await; } } @@ -1123,9 +1138,15 @@ impl super::Nexus { if let (InstanceStateChangeError::SledAgent(inner), Some(vmm)) = (&e, state.vmm()) { - if inner.vmm_gone() { + if let Some(reason) = inner.vmm_failure_reason() { let _ = self - .mark_vmm_failed(opctx, authz_instance, vmm, inner) + .mark_vmm_failed( + opctx, + authz_instance, + vmm, + inner, + reason, + ) .await; } } @@ -1150,7 +1171,7 @@ impl super::Nexus { let sa = self.sled_client(&sled_id).await?; sa.vmm_unregister(propolis_id) .await - .map(|res| res.into_inner().updated_runtime) + .map(|res| res.into_inner().updated_runtime.map(Into::into)) .map_err(|e| { InstanceStateChangeError::SledAgent(SledAgentInstanceError(e)) }) @@ -1385,7 +1406,7 @@ impl super::Nexus { &VmmPutStateBody { state: requested.into() }, ) .await - .map(|res| res.into_inner().updated_runtime) + .map(|res| res.into_inner().updated_runtime.map(Into::into)) .map_err(|e| SledAgentInstanceError(e)); // If the operation succeeded, write the instance state back, @@ -1682,29 +1703,34 @@ impl super::Nexus { }, ) .await - .map(|res| res.into_inner()) + .map(|res| res.into_inner().into()) .map_err(|e| SledAgentInstanceError(e)); match instance_register_result { Ok(state) => { self.notify_vmm_updated(opctx, *propolis_id, &state).await?; - let runtime: db::model::VmmRuntimeState = - state.vmm_state.into(); + let runtime = state.vmm_state; Ok(db::model::Vmm { - time_state_updated: runtime.time_state_updated, - generation: runtime.generation, - state: runtime.state, + time_state_updated: runtime.time_updated, + generation: runtime.generation.into(), + state: DbVmmState::from(runtime.state), + failure_reason: db::model::VmmFailureReason::from_vmm_state( + runtime.state, + ), ..initial_vmm.clone() }) } Err(e) => { - if e.vmm_gone() { + // If this error indicates that the VMM should be marked as + // `Failed`, do that now. + if let Some(reason) = e.vmm_failure_reason() { let _ = self .mark_vmm_failed( &opctx, authz_instance.clone(), &initial_vmm, &e, + reason, ) .await; } @@ -1715,7 +1741,7 @@ impl super::Nexus { /// Attempts to transition an instance to to the `Failed` state in /// response to an error returned from a call to a sled - /// agent instance API, supplied in `reason`. + /// agent instance API, supplied in `error`. /// /// This is done by first marking the associated `vmm` as `Failed`, and then /// running an `instance-update` saga which transitions the instance record @@ -1734,25 +1760,30 @@ impl super::Nexus { opctx: &OpContext, authz_instance: authz::Instance, vmm: &db::model::Vmm, - reason: &SledAgentInstanceError, + error: &SledAgentInstanceError, + reason: db::model::VmmFailureReason, ) -> Result<(), Error> { let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); let vmm_id = PropolisUuid::from_untyped_uuid(vmm.id); - error!(self.log, "marking VMM failed due to sled agent API error"; - "instance_id" => %instance_id, - "vmm_id" => %vmm_id, - "error" => ?reason); - - let new_runtime = VmmRuntimeState { - state: db::model::VmmState::Failed, - time_state_updated: chrono::Utc::now(), - generation: db::model::Generation(vmm.generation.next()), - }; + error!( + &opctx.log, + "marking VMM failed due to sled agent API error"; + "instance_id" => %instance_id, + "vmm_id" => %vmm_id, + "error" => &InlineErrorChain::new(&error), + "reason" => %reason, + ); + + let new_runtime = vmm + .runtime() + .transition(nexus_types::instance::VmmState::Failed(reason.into())); match self.db_datastore.vmm_update_runtime(&vmm_id, &new_runtime).await { Ok(_) => { - info!(self.log, "marked VMM as Failed, preparing update saga"; + info!( + &opctx.log, + "marked VMM as Failed, preparing update saga"; "instance_id" => %instance_id, "vmm_id" => %vmm_id, "reason" => ?reason, @@ -1946,7 +1977,7 @@ impl super::Nexus { &self, opctx: &OpContext, propolis_id: PropolisUuid, - new_runtime_state: &sled_agent::SledVmmState, + new_runtime_state: &SledVmmState, ) -> Result<(), Error> { if let Some((instance_id, saga)) = process_vmm_update( &self.db_datastore, @@ -2752,7 +2783,7 @@ pub(crate) async fn process_vmm_update( &opctx, propolis_id, // TODO(eliza): probably should take this by value... - &new_runtime_state.vmm_state.clone().into(), + &new_runtime_state.vmm_state.clone(), migrations, ) .await?; diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index 4e6e428f885..0899ffcd474 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -982,36 +982,44 @@ async fn sis_ensure_registered_undo( // be a bit of a stretch. See the definition of `instance_unhealthy` for // more details. match e { - InstanceStateChangeError::SledAgent(inner) if inner.vmm_gone() => { - error!(osagactx.log(), - "start saga: failing instance after unregister failure"; - "instance_id" => %instance_id, - "start_reason" => ?params.reason, - "error" => ?inner); - - if let Err(set_failed_error) = osagactx - .nexus() - .mark_vmm_failed(&opctx, authz_instance, &db_vmm, &inner) - .await - { + InstanceStateChangeError::SledAgent(inner) => { + if let Some(reason) = inner.vmm_failure_reason() { error!(osagactx.log(), - "start saga: failed to mark instance as failed"; + "start saga: failing instance after unregister failure"; "instance_id" => %instance_id, "start_reason" => ?params.reason, - "error" => ?set_failed_error); - - Err(set_failed_error.into()) + "error" => ?inner, + "reason" => %reason); + + if let Err(set_failed_error) = osagactx + .nexus() + .mark_vmm_failed( + &opctx, + authz_instance, + &db_vmm, + &inner, + reason, + ) + .await + { + error!(osagactx.log(), + "start saga: failed to mark instance as failed"; + "instance_id" => %instance_id, + "start_reason" => ?params.reason, + "error" => ?set_failed_error); + + Err(set_failed_error.into()) + } else { + Err(inner.0.into()) + } } else { - Err(inner.0.into()) - } - } - InstanceStateChangeError::SledAgent(_) => { - info!(osagactx.log(), - "start saga: instance already unregistered from sled"; - "instance_id" => %instance_id, - "start_reason" => ?params.reason); + info!(osagactx.log(), + "start saga: instance already unregistered from sled"; + "instance_id" => %instance_id, + "start_reason" => ?params.reason); - Ok(()) + Ok(()) + } } InstanceStateChangeError::Other(inner) => { error!(osagactx.log(), diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 2a85f888d8e..6de40e3fc04 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -362,6 +362,7 @@ use chrono::Utc; use nexus_db_lookup::LookupPath; use nexus_db_queries::{authn, authz}; use nexus_types::identity::Resource; +use nexus_types::instance::SledVmmState; use nexus_types::saga::saga_action_failed; use omicron_common::api::external::Error; use omicron_common::backoff; @@ -370,7 +371,6 @@ use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; use omicron_uuid_kinds::SledUuid; use serde::{Deserialize, Serialize}; -use sled_agent_types::instance::SledVmmState; use steno::{ActionError, DagBuilder, Node}; use uuid::Uuid; @@ -1548,7 +1548,6 @@ mod test { use crate::app::OpContext; use crate::app::db; use crate::app::db::model::Instance; - use crate::app::db::model::VmmRuntimeState; use crate::app::saga::create_saga_dag; use crate::app::sagas::test_helpers; use anyhow::Context; @@ -1564,6 +1563,9 @@ mod test { use nexus_types::external_api::{ instance as instance_types, networking as networking_types, }; + use nexus_types::instance::Migrations; + use nexus_types::instance::VmmFailureReason; + use nexus_types::instance::VmmState as NexusVmmState; use nexus_types::internal_api::params::InstanceMigrateRequest; use omicron_common::api::external::{ ByteCount, DataPageParams, IdentityMetadataCreateParams, @@ -1573,9 +1575,7 @@ mod test { use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PropolisUuid; use sled_agent_types::early_networking::SwitchSlot; - use sled_agent_types::instance::{ - MigrationRuntimeState, MigrationState, Migrations, - }; + use sled_agent_types::instance::{MigrationRuntimeState, MigrationState}; use std::sync::Arc; use std::sync::Mutex; use std::time::Duration; @@ -1978,11 +1978,8 @@ mod test { datastore .vmm_update_runtime( &vmm_id, - &VmmRuntimeState { - time_state_updated: Utc::now(), - generation: Generation(vmm.generation.0.next()), - state: VmmState::Destroyed, - }, + &vmm.runtime() + .transition(nexus_types::instance::VmmState::Destroyed), ) .await .unwrap(); @@ -2038,7 +2035,7 @@ mod test { ) { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .source(MigrationState::Completed, VmmState::Stopping) + .source(MigrationState::Completed, NexusVmmState::Stopping) .setup_test(cptestctx) .await .run_saga_basic_usage_succeeds_test(cptestctx) @@ -2052,7 +2049,7 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .source(MigrationState::Completed, VmmState::Stopping) + .source(MigrationState::Completed, NexusVmmState::Stopping) .setup_test(cptestctx) .await .run_actions_succeed_idempotently_test(cptestctx) @@ -2064,7 +2061,7 @@ mod test { cptestctx: &ControlPlaneTestContext, ) { MigrationOutcome::default() - .source(MigrationState::Completed, VmmState::Stopping) + .source(MigrationState::Completed, NexusVmmState::Stopping) .run_unwinding_test(cptestctx) .await; } @@ -2078,7 +2075,7 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .target(MigrationState::Completed, VmmState::Running) + .target(MigrationState::Completed, NexusVmmState::Running) .setup_test(cptestctx) .await .run_saga_basic_usage_succeeds_test(cptestctx) @@ -2092,7 +2089,7 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .target(MigrationState::Completed, VmmState::Running) + .target(MigrationState::Completed, NexusVmmState::Running) .setup_test(cptestctx) .await .run_actions_succeed_idempotently_test(cptestctx) @@ -2104,7 +2101,7 @@ mod test { cptestctx: &ControlPlaneTestContext, ) { MigrationOutcome::default() - .target(MigrationState::Completed, VmmState::Running) + .target(MigrationState::Completed, NexusVmmState::Running) .run_unwinding_test(cptestctx) .await; } @@ -2118,8 +2115,8 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .target(MigrationState::Completed, VmmState::Running) - .source(MigrationState::Completed, VmmState::Destroyed) + .target(MigrationState::Completed, NexusVmmState::Running) + .source(MigrationState::Completed, NexusVmmState::Destroyed) .setup_test(cptestctx) .await .run_saga_basic_usage_succeeds_test(cptestctx) @@ -2133,8 +2130,8 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .target(MigrationState::Completed, VmmState::Running) - .source(MigrationState::Completed, VmmState::Destroyed) + .target(MigrationState::Completed, NexusVmmState::Running) + .source(MigrationState::Completed, NexusVmmState::Destroyed) .setup_test(cptestctx) .await .run_actions_succeed_idempotently_test(cptestctx) @@ -2146,8 +2143,8 @@ mod test { cptestctx: &ControlPlaneTestContext, ) { MigrationOutcome::default() - .target(MigrationState::Completed, VmmState::Running) - .source(MigrationState::Completed, VmmState::Destroyed) + .target(MigrationState::Completed, NexusVmmState::Running) + .source(MigrationState::Completed, NexusVmmState::Destroyed) .run_unwinding_test(cptestctx) .await; } @@ -2161,8 +2158,11 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .target(MigrationState::Failed, VmmState::Failed) - .source(MigrationState::Failed, VmmState::Running) + .target( + MigrationState::Failed, + NexusVmmState::Failed(VmmFailureReason::FromSledAgent), + ) + .source(MigrationState::Failed, NexusVmmState::Running) .setup_test(cptestctx) .await .run_saga_basic_usage_succeeds_test(cptestctx) @@ -2176,8 +2176,11 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .target(MigrationState::Failed, VmmState::Failed) - .source(MigrationState::Failed, VmmState::Running) + .target( + MigrationState::Failed, + NexusVmmState::Failed(VmmFailureReason::FromSledAgent), + ) + .source(MigrationState::Failed, NexusVmmState::Running) .setup_test(cptestctx) .await .run_actions_succeed_idempotently_test(cptestctx) @@ -2189,8 +2192,11 @@ mod test { cptestctx: &ControlPlaneTestContext, ) { MigrationOutcome::default() - .target(MigrationState::Failed, VmmState::Failed) - .source(MigrationState::Failed, VmmState::Running) + .target( + MigrationState::Failed, + NexusVmmState::Failed(VmmFailureReason::FromSledAgent), + ) + .source(MigrationState::Failed, NexusVmmState::Running) .run_unwinding_test(cptestctx) .await; } @@ -2204,8 +2210,8 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .target(MigrationState::Failed, VmmState::Destroyed) - .source(MigrationState::Failed, VmmState::Running) + .target(MigrationState::Failed, NexusVmmState::Destroyed) + .source(MigrationState::Failed, NexusVmmState::Running) .setup_test(cptestctx) .await .run_saga_basic_usage_succeeds_test(cptestctx) @@ -2219,8 +2225,8 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .target(MigrationState::Failed, VmmState::Destroyed) - .source(MigrationState::Failed, VmmState::Running) + .target(MigrationState::Failed, NexusVmmState::Destroyed) + .source(MigrationState::Failed, NexusVmmState::Running) .setup_test(cptestctx) .await .run_actions_succeed_idempotently_test(cptestctx) @@ -2232,8 +2238,8 @@ mod test { cptestctx: &ControlPlaneTestContext, ) { MigrationOutcome::default() - .target(MigrationState::Failed, VmmState::Destroyed) - .source(MigrationState::Failed, VmmState::Running) + .target(MigrationState::Failed, NexusVmmState::Destroyed) + .source(MigrationState::Failed, NexusVmmState::Running) .run_unwinding_test(cptestctx) .await; } @@ -2247,8 +2253,8 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .target(MigrationState::InProgress, VmmState::Running) - .source(MigrationState::Failed, VmmState::Destroyed) + .target(MigrationState::InProgress, NexusVmmState::Running) + .source(MigrationState::Failed, NexusVmmState::Destroyed) .setup_test(cptestctx) .await .run_saga_basic_usage_succeeds_test(cptestctx) @@ -2262,8 +2268,8 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .target(MigrationState::InProgress, VmmState::Running) - .source(MigrationState::Failed, VmmState::Destroyed) + .target(MigrationState::InProgress, NexusVmmState::Running) + .source(MigrationState::Failed, NexusVmmState::Destroyed) .setup_test(cptestctx) .await .run_actions_succeed_idempotently_test(cptestctx) @@ -2275,8 +2281,8 @@ mod test { cptestctx: &ControlPlaneTestContext, ) { MigrationOutcome::default() - .target(MigrationState::InProgress, VmmState::Running) - .source(MigrationState::Failed, VmmState::Destroyed) + .target(MigrationState::InProgress, NexusVmmState::Running) + .source(MigrationState::Failed, NexusVmmState::Destroyed) .run_unwinding_test(cptestctx) .await; } @@ -2290,8 +2296,8 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .target(MigrationState::Failed, VmmState::Destroyed) - .source(MigrationState::Failed, VmmState::Destroyed) + .target(MigrationState::Failed, NexusVmmState::Destroyed) + .source(MigrationState::Failed, NexusVmmState::Destroyed) .setup_test(cptestctx) .await .run_saga_basic_usage_succeeds_test(cptestctx) @@ -2305,8 +2311,8 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .target(MigrationState::Failed, VmmState::Destroyed) - .source(MigrationState::Failed, VmmState::Destroyed) + .target(MigrationState::Failed, NexusVmmState::Destroyed) + .source(MigrationState::Failed, NexusVmmState::Destroyed) .setup_test(cptestctx) .await .run_actions_succeed_idempotently_test(cptestctx) @@ -2318,8 +2324,8 @@ mod test { cptestctx: &ControlPlaneTestContext, ) { MigrationOutcome::default() - .target(MigrationState::Failed, VmmState::Destroyed) - .source(MigrationState::Failed, VmmState::Destroyed) + .target(MigrationState::Failed, NexusVmmState::Destroyed) + .source(MigrationState::Failed, NexusVmmState::Destroyed) .run_unwinding_test(cptestctx) .await; } @@ -2333,8 +2339,8 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .target(MigrationState::Completed, VmmState::Destroyed) - .source(MigrationState::Completed, VmmState::Stopping) + .target(MigrationState::Completed, NexusVmmState::Destroyed) + .source(MigrationState::Completed, NexusVmmState::Stopping) .setup_test(cptestctx) .await .run_saga_basic_usage_succeeds_test(cptestctx) @@ -2348,8 +2354,8 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; MigrationOutcome::default() - .target(MigrationState::Completed, VmmState::Destroyed) - .source(MigrationState::Completed, VmmState::Stopping) + .target(MigrationState::Completed, NexusVmmState::Destroyed) + .source(MigrationState::Completed, NexusVmmState::Stopping) .setup_test(cptestctx) .await .run_actions_succeed_idempotently_test(cptestctx) @@ -2361,8 +2367,8 @@ mod test { cptestctx: &ControlPlaneTestContext, ) { MigrationOutcome::default() - .target(MigrationState::Completed, VmmState::Destroyed) - .source(MigrationState::Completed, VmmState::Stopping) + .target(MigrationState::Completed, NexusVmmState::Destroyed) + .source(MigrationState::Completed, NexusVmmState::Stopping) .run_unwinding_test(cptestctx) .await; } @@ -2504,14 +2510,14 @@ mod test { migration_test .update_src_state( &cptestctx, - VmmState::Destroyed, + NexusVmmState::Destroyed, MigrationState::Completed, ) .await; migration_test .update_target_state( &cptestctx, - VmmState::Running, + NexusVmmState::Running, MigrationState::Completed, ) .await; @@ -2805,23 +2811,23 @@ mod test { #[derive(Clone, Copy, Default)] struct MigrationOutcome { - source: Option<(MigrationState, VmmState)>, - target: Option<(MigrationState, VmmState)>, + source: Option<(MigrationState, NexusVmmState)>, + target: Option<(MigrationState, NexusVmmState)>, failed: bool, } impl MigrationOutcome { - fn source(self, migration: MigrationState, vmm: VmmState) -> Self { + fn source(self, migration: MigrationState, vmm: NexusVmmState) -> Self { let failed = self.failed || migration == MigrationState::Failed - || vmm == VmmState::Failed; + || vmm.is_failed(); Self { source: Some((migration, vmm)), failed, ..self } } - fn target(self, migration: MigrationState, vmm: VmmState) -> Self { + fn target(self, migration: MigrationState, vmm: NexusVmmState) -> Self { let failed = self.failed || migration == MigrationState::Failed - || vmm == VmmState::Failed; + || vmm.is_failed(); Self { target: Some((migration, vmm)), failed, ..self } } @@ -3065,7 +3071,7 @@ mod test { async fn update_src_state( &self, cptestctx: &ControlPlaneTestContext, - vmm_state: VmmState, + vmm_state: NexusVmmState, migration_state: MigrationState, ) { let src_vmm = self @@ -3074,11 +3080,7 @@ mod test { .as_ref() .expect("must have an active VMM"); let vmm_id = PropolisUuid::from_untyped_uuid(src_vmm.id); - let new_runtime = nexus_db_model::VmmRuntimeState { - time_state_updated: Utc::now(), - generation: Generation(src_vmm.generation.0.next()), - state: vmm_state, - }; + let new_runtime = src_vmm.runtime().transition(vmm_state); let migration = self .initial_state @@ -3122,7 +3124,7 @@ mod test { async fn update_target_state( &self, cptestctx: &ControlPlaneTestContext, - vmm_state: VmmState, + vmm_state: NexusVmmState, migration_state: MigrationState, ) { let target_vmm = self @@ -3131,11 +3133,7 @@ mod test { .as_ref() .expect("must have a target VMM"); let vmm_id = PropolisUuid::from_untyped_uuid(target_vmm.id); - let new_runtime = nexus_db_model::VmmRuntimeState { - time_state_updated: Utc::now(), - generation: Generation(target_vmm.generation.0.next()), - state: vmm_state, - }; + let new_runtime = target_vmm.runtime().transition(vmm_state); let migration = self .initial_state @@ -3210,7 +3208,7 @@ mod test { .target .as_ref() .map(|&(_, state)| state) - .unwrap_or(VmmState::Migrating); + .unwrap_or(NexusVmmState::Migrating); if self.outcome.failed { assert_eq!( @@ -3279,7 +3277,8 @@ mod test { .source .as_ref() .map(|&(_, state)| state) - .unwrap_or(VmmState::Migrating); + .unwrap_or(NexusVmmState::Migrating); + assert_eq!( self.src_resource_records_exist(cptestctx).await, !dbg!(src_vmm_state).is_terminal(), @@ -3294,11 +3293,13 @@ mod test { source VMM is not in a terminal state (Destroyed/Failed)", ); - fn expected_instance_state(vmm: VmmState) -> InstanceState { - match vmm { - VmmState::Destroyed => InstanceState::NoVmm, - VmmState::Failed => InstanceState::Failed, - _ => InstanceState::Vmm, + fn expected_instance_state(vmm: NexusVmmState) -> InstanceState { + if vmm.is_failed() { + InstanceState::Failed + } else if vmm.is_terminal() { + InstanceState::NoVmm + } else { + InstanceState::Vmm } } diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index eed0c3e42eb..0637895b2a6 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -117,7 +117,7 @@ impl NexusInternalApi for NexusInternalApiImpl { let apictx = &rqctx.context().context; let nexus = &apictx.nexus; let path = path_params.into_inner(); - let new_state = new_runtime_state.into_inner(); + let new_state = new_runtime_state.into_inner().into(); let opctx = crate::context::op_context_for_internal_api(&rqctx).await; let handler = async { nexus diff --git a/nexus/types/src/instance.rs b/nexus/types/src/instance.rs new file mode 100644 index 00000000000..326f5b32b4a --- /dev/null +++ b/nexus/types/src/instance.rs @@ -0,0 +1,290 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use chrono::{DateTime, Utc}; +use omicron_common::api::external::Generation; +use serde::{Deserialize, Serialize}; +use sled_agent_types::instance as sled_agent; +use std::fmt; + +#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)] +pub struct SledVmmState { + pub vmm_state: VmmRuntimeState, + /// The current state of any inbound migration to this VMM. + pub migration_in: Option, + + /// The state of any outbound migration from this VMM. + pub migration_out: Option, +} + +impl From for SledVmmState { + fn from(state: sled_agent::SledVmmState) -> Self { + let sled_agent::SledVmmState { vmm_state, migration_in, migration_out } = + state; + Self { vmm_state: vmm_state.into(), migration_in, migration_out } + } +} + +impl SledVmmState { + pub fn migrations(&self) -> Migrations<'_> { + Migrations { + migration_in: self.migration_in.as_ref(), + migration_out: self.migration_out.as_ref(), + } + } +} + +/// The dynamic runtime properties of an individual VMM process. +#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)] +pub struct VmmRuntimeState { + /// The last state reported by this VMM. + pub state: VmmState, + /// The generation number for this VMM's state. + #[serde(rename = "gen")] + pub generation: Generation, + /// Timestamp for the VMM's state. + pub time_updated: DateTime, +} + +impl VmmRuntimeState { + /// Transitions the VMM state from the current state to `new_state`, + /// returning a `VmmRuntimeState` representing the VMM after transitioning + /// to the new state. + /// + /// If `new_state` is the same as the current state, returns a copy of + /// `self` without any changes. + pub fn transition(&self, new_state: VmmState) -> Self { + if new_state == self.state { + return self.clone(); + } + Self { + state: new_state, + generation: self.generation.next(), + time_updated: Utc::now(), + } + } +} + +impl From for VmmRuntimeState { + fn from(state: sled_agent::VmmRuntimeState) -> Self { + let sled_agent::VmmRuntimeState { state, generation, time_updated } = + state; + Self { state: state.into(), generation, time_updated } + } +} + +impl From for sled_agent::VmmRuntimeState { + fn from(state: VmmRuntimeState) -> Self { + let VmmRuntimeState { state, generation, time_updated } = state; + Self { state: state.into(), generation, time_updated } + } +} + +#[derive(Copy, Clone, Debug, Deserialize, Serialize, Eq, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum VmmState { + /// The VMM exists in the database but has not begun initialization on the sled. + Creating, + /// The VMM is initializing and has not started running guest CPUs yet. + Starting, + /// The VMM has finished initializing and may be running guest CPUs. + Running, + /// The VMM is shutting down. + Stopping, + /// The VMM's guest has stopped, and the guest will not run again, but the + /// VMM process may not have released all of its resources yet. + Stopped, + /// The VMM is being restarted or its guest OS is rebooting. + Rebooting, + /// The VMM is part of a live migration. + Migrating, + /// The VMM process reported an internal failure. + Failed(VmmFailureReason), + /// The VMM process has been destroyed and its resources have been released. + Destroyed, + /// The start saga which created this VMM has failed and unwound before the + /// instance was able to start. + SagaUnwound, +} + +impl VmmState { + /// Returns a human-readable label for this VMM state. + pub fn label(&self) -> &'static str { + match self { + VmmState::Creating => "creating", + VmmState::Starting => "starting", + VmmState::Running => "running", + VmmState::Stopping => "stopping", + VmmState::Stopped => "stopped", + VmmState::Rebooting => "rebooting", + VmmState::Migrating => "migrating", + VmmState::Failed(_) => "failed", + VmmState::Destroyed => "destroyed", + VmmState::SagaUnwound => "saga_unwound", + } + } + + /// Returns `true` if this is a terminal VMM state. + /// + /// A VMM in a terminal state will never transition to another state. + pub fn is_terminal(&self) -> bool { + matches!(self, VmmState::Destroyed | VmmState::Failed(_)) + } + + /// Returns `true` if the VMM is in a `Failed` state. + pub fn is_failed(&self) -> bool { + matches!(self, VmmState::Failed(_)) + } + + /// Returns `true` if the VMM is in a state where it is safe to + /// deallocate its sled resources and mark it as deleted. + pub fn is_destroyable(&self) -> bool { + matches!( + self, + VmmState::Destroyed | VmmState::Failed(_) | VmmState::SagaUnwound + ) + } + + /// Returns `true` if the VMM exists on a sled. + /// + /// This is `false` for VMMs that have not yet been created, have been + /// destroyed, or were produced by an unwound saga and will never exist. + pub fn exists_on_sled(&self) -> bool { + !matches!( + self, + VmmState::Creating | VmmState::SagaUnwound | VmmState::Destroyed + ) + } +} + +impl fmt::Display for VmmState { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.label()) + } +} + +impl From for VmmState { + fn from(state: sled_agent::VmmState) -> Self { + match state { + sled_agent::VmmState::Starting => VmmState::Starting, + sled_agent::VmmState::Running => VmmState::Running, + sled_agent::VmmState::Stopping => VmmState::Stopping, + sled_agent::VmmState::Stopped => VmmState::Stopped, + sled_agent::VmmState::Rebooting => VmmState::Rebooting, + sled_agent::VmmState::Migrating => VmmState::Migrating, + sled_agent::VmmState::Failed => { + // If we are converting a state received from a sled-agent that + // indicates that the VMM is failed, the failure reason is + // implicitly "from_sled_agent" for now. + VmmState::Failed(VmmFailureReason::FromSledAgent) + } + sled_agent::VmmState::Destroyed => VmmState::Destroyed, + } + } +} + +impl From for sled_agent::VmmState { + fn from(state: VmmState) -> Self { + match state { + // The `Creating` state is internal to Nexus; the outside world + // should treat it as equivalent to `Starting`. + VmmState::Starting | VmmState::Creating => { + sled_agent::VmmState::Starting + } + VmmState::Running => sled_agent::VmmState::Running, + VmmState::Stopping => sled_agent::VmmState::Stopping, + VmmState::Stopped => sled_agent::VmmState::Stopped, + VmmState::Rebooting => sled_agent::VmmState::Rebooting, + VmmState::Migrating => sled_agent::VmmState::Migrating, + VmmState::Failed(_) => sled_agent::VmmState::Failed, + + // The `SagaUnwound` state is internal to Nexus; the outside world + // should treat it as equivalent to `Destroyed`. + VmmState::Destroyed | VmmState::SagaUnwound => { + sled_agent::VmmState::Destroyed + } + } + } +} + +impl From for omicron_common::api::external::InstanceState { + fn from(value: VmmState) -> Self { + use omicron_common::api::external::InstanceState as Output; + + match value { + // An instance with a VMM which is in the `Creating` state maps to + // `InstanceState::Starting`, rather than `InstanceState::Creating`. + // If we are still creating the VMM, this is because we are + // attempting to *start* the instance; instances may be created + // without creating a VMM to run them, and then started later. + VmmState::Creating | VmmState::Starting => Output::Starting, + VmmState::Running => Output::Running, + VmmState::Stopping => Output::Stopping, + // `SagaUnwound` should map to `Stopped` so that an `instance_view` + // API call that produces an instance with an unwound VMM will appear to + // be `Stopped`. This is because instances with unwound VMMs can + // be started by a subsequent instance-start saga, just like + // instances whose internal state actually is `Stopped`. + VmmState::Stopped | VmmState::SagaUnwound => Output::Stopped, + VmmState::Rebooting => Output::Rebooting, + VmmState::Migrating => Output::Migrating, + VmmState::Failed(_) => Output::Failed, + VmmState::Destroyed => Output::Destroyed, + } + } +} + +#[derive( + Copy, + Clone, + Debug, + PartialEq, + Eq, + Serialize, + Deserialize, + strum::Display, + strum::IntoStaticStr, +)] +#[strum(serialize_all = "snake_case")] +#[serde(rename_all = "snake_case")] +pub enum VmmFailureReason { + /// The reason for this VMM's failure is unknown, because the VMM failed + /// prior to the recording of failure reasons. + Prehistoric, + /// The sled-agent reported that this VMM failed. + FromSledAgent, + /// A request to the sled-agent received a response indicating that this + /// VMM is no longer present on the sled. + NoSuchInstance, + /// The sled on which this VMM was running has been expunged. + SledExpunged, + /// The sled on which this VMM was running has powered off. + SledOff, +} + +impl VmmFailureReason { + pub fn description(&self) -> &'static str { + match self { + Self::Prehistoric => "VMM failed prior to recorded history", + Self::FromSledAgent => "failed VMM state received from sled-agent", + Self::NoSuchInstance => "VMM no longer present on sled", + Self::SledExpunged => { + "the sled this VMM was running on has been expunged" + } + Self::SledOff => "the sled this VMM was running on powered off", + } + } +} + +#[derive(Copy, Clone, Debug, Default)] +pub struct Migrations<'state> { + pub migration_in: Option<&'state sled_agent::MigrationRuntimeState>, + pub migration_out: Option<&'state sled_agent::MigrationRuntimeState>, +} + +impl Migrations<'_> { + pub fn empty() -> Self { + Self { migration_in: None, migration_out: None } + } +} diff --git a/nexus/types/src/lib.rs b/nexus/types/src/lib.rs index 13978312f5a..36e945f58eb 100644 --- a/nexus/types/src/lib.rs +++ b/nexus/types/src/lib.rs @@ -35,6 +35,7 @@ pub mod deployment; pub mod external_api; pub mod fm; pub mod identity; +pub mod instance; pub mod internal_api; pub mod inventory; pub mod multicast; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 4a9a2400c93..707c3270779 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -6022,6 +6022,37 @@ CREATE TYPE IF NOT EXISTS omicron.public.vmm_cpu_platform AS ENUM ( 'amd_turin' ); +/* Describes why a VMM record is in the 'failed' `vmm_state`. */ +CREATE TYPE IF NOT EXISTS omicron.public.vmm_failure_reason AS ENUM ( + /* + * The reason for this VMM's failure is unknown, because the VMM failed + * prior to the recording of failure reasons. + */ + 'prehistoric', + /* + * The sled-agent reported that this VMM failed. + * + * TODO(eliza): once omicron#10290 is resolved or client-side API + * versioning is implemented (RFD 567), it would be nice if the sled-agent + * could report more detailed failure reasons... + */ + 'from_sled_agent', + /* + * A request to the sled-agent received a response indicating that this + * VMM is no longer present on the sled. + */ + 'no_such_instance', + /* + * The sled on which this VMM was running has been expunged. + */ + 'sled_expunged', + /* + * The sled on which this VMM was running has powered off. + */ + 'sled_off' +); + + -- Per-VMM state. CREATE TABLE IF NOT EXISTS omicron.public.vmm ( id UUID PRIMARY KEY, @@ -6034,7 +6065,15 @@ CREATE TABLE IF NOT EXISTS omicron.public.vmm ( propolis_ip INET NOT NULL, propolis_port INT4 NOT NULL CHECK (propolis_port BETWEEN 0 AND 65535) DEFAULT 12400, state omicron.public.vmm_state NOT NULL, - cpu_platform omicron.public.vmm_cpu_platform NOT NULL + cpu_platform omicron.public.vmm_cpu_platform NOT NULL, + failure_reason omicron.public.vmm_failure_reason, + + -- If a VMM is in the 'failed' state, it must have a failure reason; if it + -- is not in the failed state, it must not have a failure reason. + CONSTRAINT failure_reason_iff_failed CHECK ( + (state = 'failed' AND failure_reason IS NOT NULL) + OR (state != 'failed' AND failure_reason IS NULL) + ) ); CREATE INDEX IF NOT EXISTS lookup_vmms_by_sled_id ON omicron.public.vmm ( @@ -8587,7 +8626,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '258.0.0', NULL) + (TRUE, NOW(), NOW(), '259.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/vmm-failure-reason/up01.sql b/schema/crdb/vmm-failure-reason/up01.sql new file mode 100644 index 00000000000..0d7c8695e35 --- /dev/null +++ b/schema/crdb/vmm-failure-reason/up01.sql @@ -0,0 +1,7 @@ +CREATE TYPE IF NOT EXISTS omicron.public.vmm_failure_reason AS ENUM ( + 'prehistoric', + 'from_sled_agent', + 'no_such_instance', + 'sled_expunged', + 'sled_off' +); diff --git a/schema/crdb/vmm-failure-reason/up02.sql b/schema/crdb/vmm-failure-reason/up02.sql new file mode 100644 index 00000000000..9deaae2cc4f --- /dev/null +++ b/schema/crdb/vmm-failure-reason/up02.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.vmm + ADD COLUMN IF NOT EXISTS failure_reason omicron.public.vmm_failure_reason; diff --git a/schema/crdb/vmm-failure-reason/up03.sql b/schema/crdb/vmm-failure-reason/up03.sql new file mode 100644 index 00000000000..62914d3185f --- /dev/null +++ b/schema/crdb/vmm-failure-reason/up03.sql @@ -0,0 +1,7 @@ +SET LOCAL disallow_full_table_scans = off; + +-- Backfill any VMMs that are already in the 'failed' state with a +-- 'prehistoric' failure reason. +UPDATE omicron.public.vmm + SET failure_reason = 'prehistoric' + WHERE state = 'failed' AND failure_reason IS NULL; diff --git a/schema/crdb/vmm-failure-reason/up04.sql b/schema/crdb/vmm-failure-reason/up04.sql new file mode 100644 index 00000000000..a1492ec5565 --- /dev/null +++ b/schema/crdb/vmm-failure-reason/up04.sql @@ -0,0 +1,5 @@ +ALTER TABLE omicron.public.vmm + ADD CONSTRAINT IF NOT EXISTS failure_reason_iff_failed CHECK ( + (state = 'failed' AND failure_reason IS NOT NULL) + OR (state != 'failed' AND failure_reason IS NULL) + ); diff --git a/schema/crdb/vmm-failure-reason/up04.verify.sql b/schema/crdb/vmm-failure-reason/up04.verify.sql new file mode 100644 index 00000000000..c1dc2ad3c26 --- /dev/null +++ b/schema/crdb/vmm-failure-reason/up04.verify.sql @@ -0,0 +1,2 @@ +-- DO NOT EDIT. Generated by test_migration_verification_files. +SELECT CAST(IF((SELECT true WHERE EXISTS (SELECT 1 FROM [SHOW CONSTRAINTS FROM vmm] WHERE constraint_name = 'failure_reason_iff_failed' AND validated = true)),'true','Schema change verification failed: constraint failure_reason_iff_failed not found on table vmm') AS BOOL); diff --git a/sled-agent/types/versions/src/impls/instance.rs b/sled-agent/types/versions/src/impls/instance.rs index 18521fced53..57177c899cf 100644 --- a/sled-agent/types/versions/src/impls/instance.rs +++ b/sled-agent/types/versions/src/impls/instance.rs @@ -6,8 +6,6 @@ use crate::latest::instance::ExternalIpConfig; use crate::latest::instance::ExternalIpv4Config; use crate::latest::instance::ExternalIpv6Config; use crate::latest::instance::MigrationState; -use crate::latest::instance::Migrations; -use crate::latest::instance::SledVmmState; use crate::latest::instance::VmmSpec; use crate::latest::instance::VmmState; use crate::latest::instance::VmmStateRequested; @@ -112,21 +110,6 @@ impl VmmStateRequested { } } -impl Migrations<'_> { - pub fn empty() -> Self { - Self { migration_in: None, migration_out: None } - } -} - -impl SledVmmState { - pub fn migrations(&self) -> Migrations<'_> { - Migrations { - migration_in: self.migration_in.as_ref(), - migration_out: self.migration_out.as_ref(), - } - } -} - impl MigrationState { pub fn label(&self) -> &'static str { match self { diff --git a/sled-agent/types/versions/src/initial/instance.rs b/sled-agent/types/versions/src/initial/instance.rs index 12474627be0..7b8db337973 100644 --- a/sled-agent/types/versions/src/initial/instance.rs +++ b/sled-agent/types/versions/src/initial/instance.rs @@ -103,12 +103,6 @@ pub struct SledVmmState { pub migration_out: Option, } -#[derive(Copy, Clone, Debug, Default)] -pub struct Migrations<'state> { - pub migration_in: Option<&'state MigrationRuntimeState>, - pub migration_out: Option<&'state MigrationRuntimeState>, -} - /// An update from a sled regarding the state of a migration, indicating the /// role of the VMM whose migration state was updated. #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, Eq, PartialEq)] diff --git a/sled-agent/types/versions/src/latest.rs b/sled-agent/types/versions/src/latest.rs index 60591115103..b4529cf1d27 100644 --- a/sled-agent/types/versions/src/latest.rs +++ b/sled-agent/types/versions/src/latest.rs @@ -95,7 +95,6 @@ pub mod instance { pub use crate::v1::instance::InstanceMigrationTargetParams; pub use crate::v1::instance::MigrationRuntimeState; pub use crate::v1::instance::MigrationState; - pub use crate::v1::instance::Migrations; pub use crate::v1::instance::SledVmmState; pub use crate::v1::instance::VmmIssueDiskSnapshotRequestBody; pub use crate::v1::instance::VmmIssueDiskSnapshotRequestPathParam;