diff --git a/docs/configuration/storage-config.md b/docs/configuration/storage-config.md index 5c5d24af416..2366462d88a 100644 --- a/docs/configuration/storage-config.md +++ b/docs/configuration/storage-config.md @@ -15,6 +15,7 @@ Quickwit currently supports four types of storage providers: Storage URIs refer to different storage providers identified by a URI "protocol" or "scheme". Quickwit supports the following storage URI protocols: - `s3://` for Amazon S3 and S3-compatible +- `s3+<name>://` for additional S3-compatible backends configured under `storage.s3.named.<name>` (see [Named S3 backends](#named-s3-backends)) - `azure://` for Azure Blob Storage - `file://` for local file systems - `gs://` for Google Cloud Storage @@ -104,6 +105,42 @@ storage: endpoint: https://storage.googleapis.com ``` +#### Named S3 backends + +In addition to the primary `s3:` block, you can declare any number of additional S3-compatible backends under `storage.s3.named.<name>`. Each entry is an independent endpoint with its own credentials, region, and flags. Indexes route to a named backend via the URI scheme `s3+<name>://bucket/path` (plain `s3://` continues to use the primary endpoint). + +Each named entry accepts the same fields as the primary `s3:` block, *except* `named` itself (no recursion). If `access_key_id` / `secret_access_key` are omitted on a named entry, the global AWS SDK credential chain is used (env vars, instance metadata, etc.). + +Named backends are self-contained: the process-wide `QW_S3_ENDPOINT` and `QW_S3_FORCE_PATH_STYLE_ACCESS` overrides apply to the primary `s3:` backend only. A named backend always uses its own `endpoint` and `force_path_style_access` values. + +```yaml +storage: + s3: + # Primary backend — addressed by plain `s3://...` URIs. + endpoint: https://s3.us-east-1.amazonaws.com + region: us-east-1 + named: + # Addressed by `s3+secondary://bucket/path` URIs.
+ secondary: + endpoint: https://s3.eu-west-3.amazonaws.com + region: eu-west-3 + access_key_id: ${SECONDARY_S3_ACCESS_KEY_ID} + secret_access_key: ${SECONDARY_S3_SECRET_ACCESS_KEY} + # Addressed by `s3+seaweed://bucket/path` URIs. Falls back to the + # global AWS SDK credentials when keys are omitted. + seaweed: + endpoint: http://seaweedfs-s3:8333 + region: us-east-1 + force_path_style_access: true +``` + +An index pointed at a named backend declares its URI accordingly: + +```yaml +index_id: logs-eu +index_uri: s3+secondary://logs-bucket/logs-eu +``` + ### Azure storage configuration | Property | Description | Default value | diff --git a/quickwit/quickwit-common/src/uri.rs b/quickwit/quickwit-common/src/uri.rs index ff191fb04cb..3200ba41cbe 100644 --- a/quickwit/quickwit-common/src/uri.rs +++ b/quickwit/quickwit-common/src/uri.rs @@ -89,6 +89,9 @@ impl FromStr for Protocol { "ram" => Ok(Protocol::Ram), "s3" => Ok(Protocol::S3), "gs" => Ok(Protocol::Google), + // `s3+<name>://...` for a named S3-compatible backend configured under + // `storage.s3.named.<name>`. Routes to the same factory as `s3://`. + s if s.starts_with("s3+") && s.len() > 3 => Ok(Protocol::S3), _ => bail!("unknown URI protocol `{protocol}`"), } } @@ -186,13 +189,33 @@ impl Uri { let parent_path = path.parent()?; Some(Self { - uri: format!("{protocol}{PROTOCOL_SEPARATOR}{}", parent_path.display()), + // Preserve the scheme verbatim so an `s3+` qualifier survives. + uri: format!( + "{}{PROTOCOL_SEPARATOR}{}", + self.scheme(), + parent_path.display() + ), protocol, }) } + /// Returns the URI scheme, preserving any `s3+` qualifier, which may + /// differ from the canonical protocol string (e.g. `s3+alt` vs `s3`).
+ fn scheme(&self) -> &str { + match self.uri.split_once(PROTOCOL_SEPARATOR) { + Some((scheme, _path)) => scheme, + None => self.protocol.as_str(), + } + } + fn path(&self) -> &Path { - Path::new(&self.uri[self.protocol.as_str().len() + PROTOCOL_SEPARATOR.len()..]) + // Slice at the actual `://` separator rather than assuming the scheme + // equals the canonical protocol — `s3+` schemes are longer. + let path = match self.uri.split_once(PROTOCOL_SEPARATOR) { + Some((_scheme, path)) => path, + None => &self.uri, + }; + Path::new(path) } /// Returns the last component of the URI. @@ -262,9 +285,13 @@ impl Uri { if uri_str.is_empty() { bail!("failed to parse empty URI"); } - let (protocol, mut path) = match uri_str.split_once(PROTOCOL_SEPARATOR) { - None => (Protocol::File, uri_str.to_string()), - Some((protocol, path)) => (Protocol::from_str(protocol)?, path.to_string()), + let (scheme_opt, protocol, mut path) = match uri_str.split_once(PROTOCOL_SEPARATOR) { + None => (None, Protocol::File, uri_str.to_string()), + Some((scheme, path)) => ( + Some(scheme.to_string()), + Protocol::from_str(scheme)?, + path.to_string(), + ), }; if protocol == Protocol::File { if path.starts_with('~') { @@ -292,8 +319,14 @@ impl Uri { .to_string_lossy() .to_string(); } + // Preserve `s3+` qualifier so the storage resolver can route to + // the named backend; other schemes normalize to canonical form. + let display_scheme: &str = match scheme_opt.as_deref() { + Some(s) if s.starts_with("s3+") => s, + _ => protocol.as_str(), + }; Ok(Self { - uri: format!("{protocol}{PROTOCOL_SEPARATOR}{path}"), + uri: format!("{display_scheme}{PROTOCOL_SEPARATOR}{path}"), protocol, }) } @@ -663,6 +696,26 @@ mod tests { ); } + #[test] + fn test_uri_named_s3_scheme() { + // `s3+` schemes are preserved end-to-end: `path` strips the real + // scheme (not the canonical `s3`), and `parent`/`file_name` keep the + // qualifier intact. 
+ let uri = Uri::for_test("s3+alt://bucket/foo/bar"); + assert_eq!(uri.as_str(), "s3+alt://bucket/foo/bar"); + assert_eq!(uri.protocol(), Protocol::S3); + assert_eq!(uri.parent().unwrap(), "s3+alt://bucket/foo"); + assert_eq!(uri.file_name().unwrap(), Path::new("bar")); + + let uri = Uri::for_test("s3+with-dash://bucket/key"); + assert_eq!(uri.parent().unwrap(), "s3+with-dash://bucket"); + assert_eq!(uri.file_name().unwrap(), Path::new("key")); + + // Mirrors the plain-`s3` guard: a bucket-only URI has no parent. + assert!(Uri::for_test("s3+alt://bucket").parent().is_none()); + assert!(Uri::for_test("s3+alt://bucket/").parent().is_none()); + } + #[test] fn test_uri_file_name() { assert!(Uri::for_test("file:///").file_name().is_none()); @@ -812,4 +865,24 @@ mod tests { serde_json::Value::String("s3://bucket/key".to_string()) ); } + + #[test] + fn test_uri_s3_named_preserved() { + // The `s3+` qualifier is the routing token for named S3-compatible + // backends (`storage.s3.named.<name>`). It must survive parse + serialize + // so the storage resolver can recover the backend name on deserialization; + // before this guarantee, the qualifier was stripped by URI normalization + // and every `s3+<name>://` URI silently resolved to the primary endpoint.
+ let uri = Uri::from_str("s3+alt://bucket/key").unwrap(); + assert_eq!(uri.protocol(), Protocol::S3); + assert_eq!(uri.as_str(), "s3+alt://bucket/key"); + let json = serde_json::to_value(&uri).unwrap(); + assert_eq!( + json, + serde_json::Value::String("s3+alt://bucket/key".to_string()) + ); + let round_trip: Uri = serde_json::from_value(json).unwrap(); + assert_eq!(round_trip.as_str(), "s3+alt://bucket/key"); + assert_eq!(round_trip.protocol(), Protocol::S3); + } } diff --git a/quickwit/quickwit-config/src/storage_config.rs b/quickwit/quickwit-config/src/storage_config.rs index d04cd93aaa0..707aaad11a9 100644 --- a/quickwit/quickwit-config/src/storage_config.rs +++ b/quickwit/quickwit-config/src/storage_config.rs @@ -13,7 +13,6 @@ // limitations under the License. use std::ops::Deref; -use std::sync::OnceLock; use std::{env, fmt}; use anyhow::ensure; @@ -92,6 +91,19 @@ pub enum StorageBackendFlavor { #[derive(Debug, Clone, Default, Eq, PartialEq, Serialize, Deserialize)] pub struct StorageConfigs(#[serde_as(as = "EnumMap")] Vec<StorageConfig>); +/// Named backend names must be valid lowercase URL-scheme tails (`s3+<name>://`).
+fn validate_named_s3_backend_name(name: &str) -> anyhow::Result<()> { + ensure!(!name.is_empty(), "named S3 backend name must not be empty"); + for character in name.chars() { + ensure!( + character.is_ascii_lowercase() || character.is_ascii_digit() || character == '-', + "invalid named S3 backend name `{name}`: only lowercase ASCII letters, digits, and \ + `-` are allowed (the name is used in the `s3+{name}://` URI scheme)" + ); + } + Ok(()) +} + impl StorageConfigs { pub fn new(storage_configs: Vec<StorageConfig>) -> Self { Self(storage_configs) @@ -125,6 +137,13 @@ impl StorageConfigs { "{left:?} storage config is defined multiple times", ); } + for storage_config in self.0.iter() { + if let StorageConfig::S3(s3_storage_config) = storage_config { + for name in s3_storage_config.named.keys() { + validate_named_s3_backend_name(name)?; + } + } + } Ok(()) } @@ -362,6 +381,118 @@ pub struct S3StorageConfig { pub disable_stalled_stream_protection_upload: bool, #[serde(default)] pub disable_stalled_stream_protection_download: bool, + /// Additional named S3-compatible backends, addressed via `s3+<name>://bucket/path` + /// URIs. Each entry is an independent endpoint with its own credentials, region, + /// etc. The map key (`<name>`) is the routing token used in the URI scheme. + #[serde(default)] + #[serde(skip_serializing_if = "std::collections::BTreeMap::is_empty")] + pub named: std::collections::BTreeMap<String, NamedS3StorageConfig>, + /// Set when this config is the projection of a named backend. Named + /// backends are self-contained, so the process-wide `QW_S3_ENDPOINT` / + /// `QW_S3_FORCE_PATH_STYLE_ACCESS` overrides apply to the primary backend + /// only. Not serialized; defaults to `false` (the primary backend). + #[serde(skip)] + pub is_named_backend: bool, +} + +/// Configuration for a named S3-compatible backend nested under +/// `storage.s3.named.<name>`. Mirrors `S3StorageConfig` but cannot itself +/// have a `named` field (no recursion).
+#[derive(Clone, Default, Eq, PartialEq, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct NamedS3StorageConfig { + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub flavor: Option<StorageBackendFlavor>, + #[serde(default)] + pub access_key_id: Option<String>, + #[serde(default)] + pub secret_access_key: Option<String>, + #[serde(default)] + pub region: Option<String>, + #[serde(default)] + pub endpoint: Option<String>, + #[serde(default)] + pub force_path_style_access: bool, + #[serde(alias = "disable_multi_object_delete_requests")] + #[serde(default)] + pub disable_multi_object_delete: bool, + #[serde(default)] + pub disable_multipart_upload: bool, + #[serde(default)] + pub checksum_algorithm: ChecksumAlgorithm, + /// Deprecated: applies into `checksum_algorithm: disabled`. + #[serde(default, skip_serializing)] + pub disable_checksums: bool, + #[serde(default)] + pub disable_stalled_stream_protection_upload: bool, + #[serde(default)] + pub disable_stalled_stream_protection_download: bool, +} + +impl NamedS3StorageConfig { + /// Project this named config back into a full `S3StorageConfig` + /// (with an empty `named` map) so it can flow through the existing + /// S3 client construction code unchanged.
+ pub fn as_s3_config(&self) -> S3StorageConfig { + let mut s3_config = S3StorageConfig { + flavor: self.flavor, + access_key_id: self.access_key_id.clone(), + secret_access_key: self.secret_access_key.clone(), + region: self.region.clone(), + endpoint: self.endpoint.clone(), + force_path_style_access: self.force_path_style_access, + disable_multi_object_delete: self.disable_multi_object_delete, + disable_multipart_upload: self.disable_multipart_upload, + checksum_algorithm: self.checksum_algorithm, + disable_checksums: self.disable_checksums, + disable_stalled_stream_protection_upload: self.disable_stalled_stream_protection_upload, + disable_stalled_stream_protection_download: self + .disable_stalled_stream_protection_download, + named: Default::default(), + is_named_backend: true, + }; + // Expand `flavor` shortcuts (region/path-style/checksum defaults) the + // same way the primary backend does at config load time. + s3_config.apply_flavor(); + s3_config + } + + pub fn redact(&mut self) { + if let Some(secret_access_key) = self.secret_access_key.as_mut() { + *secret_access_key = "***redacted***".to_string(); + } + } +} + +impl fmt::Debug for NamedS3StorageConfig { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("NamedS3StorageConfig") + .field("flavor", &self.flavor) + .field("access_key_id", &self.access_key_id) + .field( + "secret_access_key", + &self.secret_access_key.as_ref().map(|_| "***redacted***"), + ) + .field("region", &self.region) + .field("endpoint", &self.endpoint) + .field("force_path_style_access", &self.force_path_style_access) + .field( + "disable_multi_object_delete", + &self.disable_multi_object_delete, + ) + .field("disable_multipart_upload", &self.disable_multipart_upload) + .field("checksum_algorithm", &self.checksum_algorithm) + .field( + "disable_stalled_stream_protection_upload", + &self.disable_stalled_stream_protection_upload, + ) + .field( + "disable_stalled_stream_protection_download", + 
&self.disable_stalled_stream_protection_download, + ) + .finish() + } } impl S3StorageConfig { @@ -397,23 +528,32 @@ impl S3StorageConfig { if let Some(secret_access_key) = self.secret_access_key.as_mut() { *secret_access_key = "***redacted***".to_string(); } + for named_config in self.named.values_mut() { + named_config.redact(); + } } pub fn endpoint(&self) -> Option<String> { - env::var("QW_S3_ENDPOINT") - .ok() - .or_else(|| self.endpoint.clone()) + // `QW_S3_ENDPOINT` overrides the primary backend only; named backends + // are self-contained and use their own configured endpoint. + if !self.is_named_backend + && let Ok(endpoint) = env::var("QW_S3_ENDPOINT") + { + return Some(endpoint); + } + self.endpoint.clone() } pub fn force_path_style_access(&self) -> Option<bool> { - static FORCE_PATH_STYLE: OnceLock<Option<bool>> = OnceLock::new(); - *FORCE_PATH_STYLE.get_or_init(|| { - let force_path_style_access = get_bool_from_env( - "QW_S3_FORCE_PATH_STYLE_ACCESS", - self.force_path_style_access, - ); - Some(force_path_style_access) - }) + // `QW_S3_FORCE_PATH_STYLE_ACCESS` overrides the primary backend only. + // No process-wide cache: each backend must honor its own setting.
+ if self.is_named_backend { + return Some(self.force_path_style_access); + } + Some(get_bool_from_env( + "QW_S3_FORCE_PATH_STYLE_ACCESS", + self.force_path_style_access, + )) } } @@ -442,6 +582,7 @@ impl fmt::Debug for S3StorageConfig { "disable_stalled_stream_protection_download", &self.disable_stalled_stream_protection_download, ) + .field("named", &self.named) .finish() } } @@ -740,4 +881,150 @@ mod tests { assert_eq!(s3_storage_config.flavor, Some(StorageBackendFlavor::MinIO)); } } + + #[test] + fn test_storage_s3_named_backends_serde() { + let s3_storage_config_yaml = r#" + endpoint: https://primary.example.com + region: us-east-1 + named: + alt: + endpoint: https://alt.example.com + region: eu-west-3 + force_path_style_access: true + access_key_id: alt-key + secret_access_key: alt-secret + secondary: + endpoint: http://secondary.example.com:8333 + region: us-east-1 + force_path_style_access: true + "#; + let s3_storage_config: S3StorageConfig = + serde_yaml::from_str(s3_storage_config_yaml).unwrap(); + assert_eq!(s3_storage_config.named.len(), 2); + + let alt = s3_storage_config.named.get("alt").unwrap(); + assert_eq!(alt.region.as_deref(), Some("eu-west-3")); + assert_eq!(alt.access_key_id.as_deref(), Some("alt-key")); + assert!(alt.force_path_style_access); + + // `as_s3_config` projects a named entry back into a full S3StorageConfig + // (with an empty `named` map) so it can drive the S3 client builder + // unchanged. 
+ let projected = alt.as_s3_config(); + assert_eq!(projected.region.as_deref(), Some("eu-west-3")); + assert_eq!(projected.access_key_id.as_deref(), Some("alt-key")); + assert!(projected.force_path_style_access); + assert!(projected.named.is_empty()); + } + + #[test] + fn test_storage_s3_named_backends_redact() { + let mut named = NamedS3StorageConfig { + access_key_id: Some("public-key".to_string()), + secret_access_key: Some("super-secret".to_string()), + ..Default::default() + }; + named.redact(); + assert_eq!(named.access_key_id.as_deref(), Some("public-key")); + assert_eq!(named.secret_access_key.as_deref(), Some("***redacted***")); + } + + #[test] + fn test_storage_s3_named_backends_field_parity() { + // Named backends accept the same fields as the primary S3 block, + // including the legacy `disable_multi_object_delete_requests` alias and + // the stalled-stream toggles, and project them through `as_s3_config`. + let s3_storage_config_yaml = r#" + named: + alt: + endpoint: https://alt.example.com + disable_multi_object_delete_requests: true + disable_stalled_stream_protection_upload: true + disable_stalled_stream_protection_download: true + checksum_algorithm: disabled + "#; + let s3_storage_config: S3StorageConfig = + serde_yaml::from_str(s3_storage_config_yaml).unwrap(); + let alt = s3_storage_config.named.get("alt").unwrap(); + assert!(alt.disable_multi_object_delete); + assert!(alt.disable_stalled_stream_protection_upload); + assert!(alt.disable_stalled_stream_protection_download); + + let projected = alt.as_s3_config(); + assert!(projected.is_named_backend); + assert!(projected.disable_multi_object_delete); + assert!(projected.disable_stalled_stream_protection_upload); + assert!(projected.disable_stalled_stream_protection_download); + assert_eq!(projected.checksum_algorithm, ChecksumAlgorithm::Disabled); + + // A genuinely unknown field is still rejected. 
+ let invalid_yaml = r#" + named: + alt: + bogus_field: true + "#; + assert!(serde_yaml::from_str::<S3StorageConfig>(invalid_yaml).is_err()); + } + + #[test] + fn test_storage_s3_named_backend_applies_flavor() { + // `flavor` shortcuts expand for named backends just like the primary. + let s3_storage_config_yaml = r#" + named: + minio-backend: + flavor: minio + endpoint: http://minio.example.com:9000 + "#; + let s3_storage_config: S3StorageConfig = + serde_yaml::from_str(s3_storage_config_yaml).unwrap(); + let projected = s3_storage_config + .named + .get("minio-backend") + .unwrap() + .as_s3_config(); + assert_eq!(projected.region.as_deref(), Some("minio")); + assert!(projected.force_path_style_access); + } + + #[test] + fn test_storage_s3_named_backend_uses_own_endpoint() { + // A named backend is self-contained: `endpoint()` returns its configured + // endpoint regardless of the process-wide `QW_S3_ENDPOINT` override, + // which applies to the primary backend only. + let named = NamedS3StorageConfig { + endpoint: Some("https://named.example.com".to_string()), + ..Default::default() + }; + let projected = named.as_s3_config(); + assert!(projected.is_named_backend); + assert_eq!( + projected.endpoint(), + Some("https://named.example.com".to_string()) + ); + } + + #[test] + fn test_validate_named_s3_backend_name() { + for valid in ["alt", "seaweedfs", "ovh-morocco", "s3alt", "minio-backend"] { + validate_named_s3_backend_name(valid).unwrap(); + } + for invalid in ["", "prod_logs", "Prod", "a.b", "a/b", "a b"] { + validate_named_s3_backend_name(invalid).unwrap_err(); + } + } + + #[test] + fn test_storage_configs_reject_url_incompatible_named_backend() { + let s3_storage_config_yaml = r#" + named: + prod_logs: + endpoint: https://logs.example.com + "#; + let s3_storage_config: S3StorageConfig = + serde_yaml::from_str(s3_storage_config_yaml).unwrap(); + let storage_configs = StorageConfigs::new(vec![s3_storage_config.into()]); + let error =
storage_configs.validate().unwrap_err().to_string(); + assert!(error.contains("prod_logs"), "unexpected error: {error}"); + } } diff --git a/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage_resolver.rs b/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage_resolver.rs index 92b4406486b..1ef8357671a 100644 --- a/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage_resolver.rs +++ b/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage_resolver.rs @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; use async_trait::async_trait; use aws_sdk_s3::Client as S3Client; @@ -25,6 +26,14 @@ use crate::{ DebouncedStorage, S3CompatibleObjectStorage, Storage, StorageFactory, StorageResolverError, }; +/// Extracts the named-backend key out of an `s3+<name>://...` URI, if any. +/// Returns `None` for plain `s3://...`. +fn parse_named_key(uri: &Uri) -> Option<&str> { + let scheme_end = uri.as_str().find("://")?; + let scheme = &uri.as_str()[..scheme_end]; + scheme.strip_prefix("s3+") +} + /// S3 compatible object storage resolver. pub struct S3CompatibleObjectStorageFactory { storage_config: S3StorageConfig, @@ -34,6 +43,8 @@ pub struct S3CompatibleObjectStorageFactory { // end up being used, or if something like azure, gcs, or even local files, will be used // instead. s3_client: OnceCell<S3Client>, + // Per-named-backend client cell; the mutex is held only to fetch/insert the cell, never across the build.
+ named_s3_clients: Mutex<HashMap<String, Arc<OnceCell<S3Client>>>>, } impl S3CompatibleObjectStorageFactory { @@ -42,6 +53,7 @@ impl S3CompatibleObjectStorageFactory { Self { storage_config, s3_client: OnceCell::new(), + named_s3_clients: Mutex::new(HashMap::new()), } } } @@ -53,6 +65,32 @@ impl StorageFactory for S3CompatibleObjectStorageFactory { } async fn resolve(&self, uri: &Uri) -> Result<Arc<dyn Storage>, StorageResolverError> { + if let Some(name) = parse_named_key(uri) { + let named_config = self + .storage_config + .named + .get(name) + .ok_or_else(|| { + StorageResolverError::InvalidUri(format!( + "no `storage.s3.named.{name}` entry configured for URI `{uri}`" + )) + })? + .as_s3_config(); + let client_cell = { + let mut clients = self + .named_s3_clients + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + Arc::clone(clients.entry(name.to_string()).or_default()) + }; + let client = client_cell + .get_or_init(|| create_s3_client(&named_config)) + .await + .clone(); + let storage = + S3CompatibleObjectStorage::from_uri_and_client(&named_config, uri, client).await?; + return Ok(Arc::new(DebouncedStorage::new(storage))); + } let s3_client = self .s3_client .get_or_init(|| create_s3_client(&self.storage_config)) @@ -64,3 +102,23 @@ impl StorageFactory for S3CompatibleObjectStorageFactory { Ok(Arc::new(DebouncedStorage::new(storage))) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_named_key() { + // Plain s3:// URIs route through the primary backend. + assert_eq!(parse_named_key(&Uri::for_test("s3://bucket/key")), None); + // `s3+` URIs return the named-backend key. + assert_eq!( + parse_named_key(&Uri::for_test("s3+alt://bucket/key")), + Some("alt") + ); + assert_eq!( + parse_named_key(&Uri::for_test("s3+with-dash://bucket/key")), + Some("with-dash") + ); + } +}