diff --git a/.github/workflows/openvariant_tester.yml b/.github/workflows/openvariant_tester.yml index 59b197f..da4bc61 100644 --- a/.github/workflows/openvariant_tester.yml +++ b/.github/workflows/openvariant_tester.yml @@ -40,16 +40,20 @@ jobs: uses: actions-rust-lang/setup-rust-toolchain@v1 - uses: astral-sh/setup-uv@v4 - + - name: Lint with Ruff run: | uv sync --extra linting --frozen --no-install-project uv run ruff check openvariant - - name: Test + - name: Test Python-based code run: | uv sync --extra tests --frozen - uv run pytest --cov . + uv run pytest --cov . + + - name: Test Rust-based code + run: | + make test - name: Upload coverage reports to Codecov with GitHub Action uses: codecov/codecov-action@v3 diff --git a/Cargo.lock b/Cargo.lock index b07f0c9..05465b2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,18 +2,52 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + [[package]] name = "libc" version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" +[[package]] +name = "libyaml-rs" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e126dda6f34391ab7b444f9922055facc83c07a910da3eb16f1e4d9c45dc777" + [[package]] name = "once_cell" version = "1.21.4" @@ -25,7 +59,9 @@ name = "openvariant-core" version = "2.0.0" dependencies = [ "pyo3", + "serde", "thiserror", + "yaml_serde", ] [[package]] @@ -110,6 +146,42 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "syn" version = "2.0.117" @@ -152,3 +224,16 @@ name = "unicode-ident" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "yaml_serde" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c7c1b1a6a7c8a6b2741a6c21a4f8918e51899b111cfa08d1288202656e3975" +dependencies = [ + "indexmap", + "itoa", + "libyaml-rs", + "ryu", + "serde", +] diff --git a/Cargo.toml b/Cargo.toml index d503dfd..578a934 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,4 +9,6 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.28.3", features = ["extension-module"] } +serde = { version = "1.0.228", features = ["derive"] } thiserror = "2" +yaml_serde = "0.10.4" diff --git a/Makefile b/Makefile index 675906c..09dff5e 100644 --- a/Makefile +++ b/Makefile @@ -7,10 +7,16 @@ release: test: PYO3_PYTHON=$$(uv python find) cargo test +fmt: + PYO3_PYTHON=$$(uv python find) cargo fmt + lint: - PYO3_PYTHON=$$(uv python find) cargo clippy + PYO3_PYTHON=$$(uv python find) cargo clippy -- -D warnings -format: - PYO3_PYTHON=$$(uv python find) cargo fmt --check +fix: + PYO3_PYTHON=$$(uv python find) cargo clippy --fix --allow-dirty --allow-staged + PYO3_PYTHON=$$(uv python find) cargo fmt -check: lint format \ No newline at end of file +check: + PYO3_PYTHON=$$(uv python find) cargo fmt --check + PYO3_PYTHON=$$(uv python find) cargo clippy -- -D warnings \ No newline at end of file diff --git a/src/annotation/config.rs b/src/annotation/config.rs new file mode 100644 index 0000000..7fd126d --- /dev/null +++ b/src/annotation/config.rs @@ -0,0 +1,246 @@ +//! Core configuration types for OpenVariant annotation. +//! +//! This file defines the core configuration types for OpenVariant annotation, including +//! `AnnotationType`, `AnnotationDelimiter`, `AnnotationFormat`, `AnnotationEntry`, +//! `ExcludeEntry`, and `AnnotationConfig`. These types are designed to be serde-compatible, +//! allowing for easy serialisation and deserialisation to and from YAML annotation definition files. The structures defined here mirror the +//! expected structure of the YAML files, enabling seamless integration with the OpenVariant annotation system. + +use serde::{Deserialize, Serialize}; +use std::fmt; + +/// Constants +pub const ANNOTATION_EXTENSION: &str = "yaml"; +pub const DEFAULT_COLUMNS: &[&str] = &[]; +pub const DEFAULT_RECURSIVE: bool = false; + +/// AnnotationType + +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum AnnotationType { + /// Assign a fixed literal value to every output record. + /// + /// ```yaml + /// - type: static + /// field: GENOME_BUILD + /// value: GRCh38 + /// ``` + Static, + + /// Copy the value from another column already present in the same record. + /// + /// ```yaml + /// - type: internal + /// field: ALT_ALLELE + /// fieldSource: ALT + /// ``` + Internal, + + /// Set the field to the **directory name** of the source file being processed. + /// + /// ```yaml + /// - type: dirname + /// field: STUDY_DIR + /// ``` + Dirname, + + /// Set the field to the **filename** (no path) of the source file being processed. + /// + /// ```yaml + /// - type: filename + /// field: SOURCE_FILE + /// ``` + Filename, + + /// Delegate value computation to an external Python plugin function. + /// + /// ```yaml + /// - type: plugin + /// field: COMPUTED_SCORE + /// plugin: my_package.scoring + /// function: compute_score + /// ``` + Plugin, + + /// Look up the value in an external delimited mapping file. + /// + /// ```yaml + /// - type: mapping + /// field: GENE_NAME + /// fieldSource: ENSEMBL_ID + /// fileMapping: /data/gene_map.tsv + /// fieldMapping: ENSEMBL_ID + /// fieldValue: GENE_SYMBOL + /// ``` + Mapping, +} + +impl fmt::Display for AnnotationType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let s = match self { + AnnotationType::Static => "static", + AnnotationType::Internal => "internal", + AnnotationType::Dirname => "dirname", + AnnotationType::Filename => "filename", + AnnotationType::Plugin => "plugin", + AnnotationType::Mapping => "mapping", + }; + write!(f, "{s}") + } +} + +/// AnnotationDelimiter + +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] +pub enum AnnotationDelimiter { + /// Tab character (`\t`). + #[default] + T, + /// Comma character (`,`). + C, +} + +impl AnnotationDelimiter { + /// Returns the delimiter as a `char`. + pub fn as_char(&self) -> char { + match self { + AnnotationDelimiter::T => '\t', + AnnotationDelimiter::C => ',', + } + } +} + +impl fmt::Display for AnnotationDelimiter { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let s = match self { + AnnotationDelimiter::T => "T", + AnnotationDelimiter::C => "C", + }; + write!(f, "{s}") + } +} + +/// AnnotationFormat + +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] +pub enum AnnotationFormat { + /// Tab-Separated Values — field separator is `\t`. + #[serde(rename = "TSV")] + #[default] + Tsv, + /// Comma-Separated Values — field separator is `,`. + #[serde(rename = "CSV")] + Csv, +} + +impl AnnotationFormat { + /// Returns the field separator character for this format. + pub fn separator(&self) -> char { + match self { + AnnotationFormat::Tsv => '\t', + AnnotationFormat::Csv => ',', + } + } +} + +impl fmt::Display for AnnotationFormat { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let s = match self { + AnnotationFormat::Tsv => "TSV", + AnnotationFormat::Csv => "CSV", + }; + write!(f, "{s}") + } +} + +/// AnnotationEntry + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct AnnotationEntry { + /// The annotation strategy to apply (required). + #[serde(rename = "type")] + pub annotation_type: AnnotationType, + + /// Output column name that will receive the derived value (required, non-blank). + pub field: String, + + /// *(static, optional - internal)* Literal value to assign. Accepts any YAML scalar or + /// structure (`string`, `int`, `float`, `bool`, `null`). + #[serde(skip_serializing_if = "Option::is_none")] + pub value: Option, + + /// *(internal, mapping)* Name of the source column to copy from. + #[serde(rename = "fieldSource", skip_serializing_if = "Option::is_none")] + pub field_source: Option, + + /// *(plugin)* Dotted Python module path of the plugin (e.g. `my_pkg.scoring`). + #[serde(skip_serializing_if = "Option::is_none")] + pub plugin: Option, + + /// *(plugin)* Lambda function that will be executed. + #[serde(skip_serializing_if = "Option::is_none")] + pub function: Option, + + /// Optional regular-expression applied to the derived value before storage. + /// The first capture group is used when present. + #[serde(skip_serializing_if = "Option::is_none")] + pub regex: Option, + + /// *(mapping)* Path to the external lookup file. + #[serde(rename = "fileMapping", skip_serializing_if = "Option::is_none")] + pub file_mapping: Option, + + /// *(mapping)* Column in the mapping file used as the lookup key. + #[serde(rename = "fieldMapping", skip_serializing_if = "Option::is_none")] + pub field_mapping: Option, + + /// *(mapping)* Column in the mapping file whose value is returned. + #[serde(rename = "fieldValue", skip_serializing_if = "Option::is_none")] + pub field_value: Option, +} + +/// ExcludeEntry + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct ExcludeEntry { + /// The column name to exclude values from. + pub field: String, + /// The value that triggers exclusion (any YAML scalar). + pub value: yaml_serde::Value, +} + +/// AnnotationConfig + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct AnnotationConfig { + /// Glob pattern used to discover source files (e.g. `"**/*.vcf.gz"`). + #[serde(default)] + pub pattern: Vec, + + /// Recurse into sub-directories when scanning for source files. + /// Default: `false`. + #[serde(default)] + pub recursive: bool, + + /// Output file format. Default: `TSV`. + #[serde(default)] + pub format: AnnotationFormat, + + /// Field delimiter. Default: `T` (tab). + #[serde(default)] + pub delimiter: AnnotationDelimiter, + + /// Ordered list of columns to include in the output. + /// Empty list means "all columns". + #[serde(default)] + pub columns: Vec, + + /// The annotation rules to apply, in order. + #[serde(default)] + pub annotation: Vec, + + /// Records matching **any** exclude predicate are dropped. + #[serde(default)] + pub exclude: Vec, +} diff --git a/src/annotation/mod.rs b/src/annotation/mod.rs new file mode 100644 index 0000000..2f6c314 --- /dev/null +++ b/src/annotation/mod.rs @@ -0,0 +1,8 @@ +pub mod config; +pub mod validator; + +pub use config::{ + AnnotationConfig, AnnotationDelimiter, AnnotationEntry, AnnotationFormat, AnnotationType, + ExcludeEntry, +}; +pub use validator::{Severity, ValidationError, parse_and_validate, validate_config}; diff --git a/src/annotation/validator.rs b/src/annotation/validator.rs new file mode 100644 index 0000000..8b6a6d1 --- /dev/null +++ b/src/annotation/validator.rs @@ -0,0 +1,223 @@ +//! Annotation validators for OpenVariant annotation. +//! +//! This module provides validation logic for ensuring that annotation configurations are valid and consistent. +//! It includes functions to validate annotation types, delimiters, formats, and overall configuration structures. +//! The validators are designed to be used during the deserialization process of YAML annotation definition files, +//! providing immediate feedback on any issues with the configuration. This helps maintain the integrity of the annotation system and +//! prevents runtime errors due to misconfigurations. + +use std::fmt; + +use super::config::{AnnotationConfig, AnnotationType}; + +#[derive(Debug, Clone, PartialEq)] +pub struct ValidationError { + /// Human-readable problem description. + pub message: String, + /// The YAML path that contains the problem, e.g. `"annotation[2].fieldSource"`. + pub path: String, + /// Severity: `Error` = invalid config; `Warning` = suspicious but not fatal. + pub severity: Severity, +} + +/// Severity of a [`ValidationError`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Severity { + /// The annotation file is invalid and cannot be used. + Error, + /// A field is present but will be silently ignored by the runtime. + Warning, +} + +impl fmt::Display for ValidationError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let tag = match self.severity { + Severity::Error => "ERROR", + Severity::Warning => "WARN ", + }; + write!(f, "[{tag}] {}: {}", self.path, self.message) + } +} + +/// Parse YAML text and validate the resulting [`AnnotationConfig`]. +/// +/// Validation runs in three passes: +/// +/// 1. **Syntax** — `yaml_serde` rejects malformed YAML and unknown enum +/// variants, reporting the exact line/column of the problem. +/// 2. **Structural** — missing or blank required fields (`field`, type-specific +/// keys) and an empty `annotation` list are flagged as errors. +/// 3. **Semantic** — keys that exist but are irrelevant for the chosen +/// annotation type are not reported, and don't cause and error. +/// +/// # Returns +/// +/// - `Ok(config)` — the config is structurally and semantically valid. +/// Warnings are *not* returned; use [`validate_config`] directly to +/// inspect them. +/// - `Err(errors)` — one or more `Severity::Error` diagnostics (warnings +/// included for context). +pub fn parse_and_validate(yaml: &str) -> Result> { + // Pass 1 — syntax + structural (serde) + let config: AnnotationConfig = yaml_serde::from_str(yaml).map_err(|e| { + // yaml_serde errors include line/column information in their Display. + vec![ValidationError { + message: format!("YAML parse error — {e}"), + path: "".into(), + severity: Severity::Error, + }] + })?; + + // Pass 2 + 3 — structural + semantic + let diagnostics = validate_config(&config); + let has_errors = diagnostics.iter().any(|d| d.severity == Severity::Error); + + if has_errors { + Err(diagnostics) + } else { + Ok(config) + } +} + +/// Validate an already-deserialised [`AnnotationConfig`]. +/// +/// Returns *all* diagnostics (errors and warnings) found. +/// An empty `Vec` means the configuration is fully valid, no errors found. +pub fn validate_config(config: &AnnotationConfig) -> Vec { + let mut diags: Vec = Vec::new(); + + if config.pattern.is_empty() { + diags.push(err( + "", + "`pattern` is empty — at least one entry is required", + )); + // Remaining checks are per-entry; bail early. + return diags; + } + + // Empty file check — this is technically valid YAML but not a valid annotation config. + if config.annotation.is_empty() { + diags.push(err( + "", + "`annotation` list is empty — at least one entry is required", + )); + // Remaining checks are per-entry; bail early. + return diags; + } + + // Validate Annotation entries + for (i, entry) in config.annotation.iter().enumerate() { + let base = format!("annotation[{i}]"); + + // `field` must always be present and non-blank (serde already + // ensures it exists; we check for an accidental empty string). + if entry.field.trim().is_empty() { + diags.push(err(&base, "`field` must not be blank")); + } + + match &entry.annotation_type { + // Static + AnnotationType::Static => { + // Check that `value` is present (can be any YAML scalar or structure, so we don't require a specific type). + if entry.value.is_none() { + diags.push(err(&base, "`value` is required but missing")); + } + } + + // Internal + AnnotationType::Internal => { + check_required_str( + &base, + "fieldSource", + entry.field_source.as_deref(), + &mut diags, + ); + } + + // Dirname / Filename + AnnotationType::Dirname | AnnotationType::Filename => {} + + // Plugin + AnnotationType::Plugin => { + check_required_str(&base, "plugin", entry.plugin.as_deref(), &mut diags); + check_required_str(&base, "function", entry.function.as_deref(), &mut diags); + } + + // Mapping + AnnotationType::Mapping => { + check_required_str( + &base, + "fieldSource", + entry.field_source.as_deref(), + &mut diags, + ); + check_required_str( + &base, + "fileMapping", + entry.file_mapping.as_deref(), + &mut diags, + ); + check_required_str( + &base, + "fieldMapping", + entry.field_mapping.as_deref(), + &mut diags, + ); + check_required_str( + &base, + "fieldValue", + entry.field_value.as_deref(), + &mut diags, + ); + } + } + } + + // Validate Exclude entries + for (i, excl) in config.exclude.iter().enumerate() { + if excl.field.trim().is_empty() { + diags.push(err(&format!("exclude[{i}]"), "`field` must not be blank")); + } + } + + // Validate Column name uniqueness + let mut seen_fields: std::collections::HashSet<&str> = std::collections::HashSet::new(); + for entry in &config.annotation { + if !seen_fields.insert(entry.field.as_str()) { + diags.push(ValidationError { + message: format!( + "duplicate `field` name `{}` — each annotation field must be unique", + entry.field + ), + path: ".annotation".into(), + severity: Severity::Error, + }); + } + } + + diags +} + +fn err(path: &str, message: &str) -> ValidationError { + ValidationError { + message: message.into(), + path: path.into(), + severity: Severity::Error, + } +} + +/// Emit an error if `value` is `None` or blank. +fn check_required_str( + base: &str, + key: &str, + value: Option<&str>, + diags: &mut Vec, +) { + match value { + None => diags.push(err(base, &format!("`{key}` is required but missing"))), + Some(v) if v.trim().is_empty() => { + diags.push(err(base, &format!("`{key}` must not be blank"))) + } + _ => {} + } +} diff --git a/src/lib.rs b/src/lib.rs index b4bce7b..20da8a1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,7 @@ use pyo3::prelude::*; -//mod annotation; +pub mod annotation; + //mod filter; //mod reader; //mod walker; @@ -8,6 +9,9 @@ use pyo3::prelude::*; //mod py_bindings; pub mod error; +#[cfg(test)] +mod tests; + #[pyfunction] fn hello() { println!("Hello, OpenVariant!"); diff --git a/src/tests/mod.rs b/src/tests/mod.rs new file mode 100644 index 0000000..21e31c2 --- /dev/null +++ b/src/tests/mod.rs @@ -0,0 +1 @@ +pub mod test_annotation; diff --git a/src/tests/test_annotation/mod.rs b/src/tests/test_annotation/mod.rs new file mode 100644 index 0000000..c73b855 --- /dev/null +++ b/src/tests/test_annotation/mod.rs @@ -0,0 +1,2 @@ +pub mod test_config; +pub mod test_validator; diff --git a/src/tests/test_annotation/test_config.rs b/src/tests/test_annotation/test_config.rs new file mode 100644 index 0000000..0f88e8d --- /dev/null +++ b/src/tests/test_annotation/test_config.rs @@ -0,0 +1,197 @@ +use crate::annotation::{AnnotationConfig, AnnotationDelimiter, AnnotationFormat, AnnotationType}; +use serde::Serialize; +use std::fmt; + +/// Deserialise from YAML, re-serialise, then deserialise again. +/// Asserts the three values are equal and returns the first parsed value. +fn round_trip(yaml: &str) -> T +where + T: serde::de::DeserializeOwned + Serialize + PartialEq + fmt::Debug, +{ + let first: T = yaml_serde::from_str(yaml).expect("initial parse failed"); + let reserialized = yaml_serde::to_string(&first).expect("serialise failed"); + let second: T = yaml_serde::from_str(&reserialized).expect("re-parse failed"); + assert_eq!( + first, second, + "round-trip mismatch:\noriginal = {first:?}\nre-parsed = {second:?}" + ); + first +} + +///ENUMS + +// AnnotationType, round-trip tests +#[test] +fn annotation_type_static_round_trip() { + assert_eq!( + round_trip::("static"), + AnnotationType::Static + ); +} + +#[test] +fn annotation_type_internal_round_trip() { + assert_eq!( + round_trip::("internal"), + AnnotationType::Internal + ); +} + +#[test] +fn annotation_type_dirname_round_trip() { + assert_eq!( + round_trip::("dirname"), + AnnotationType::Dirname + ); +} + +#[test] +fn annotation_type_filename_round_trip() { + assert_eq!( + round_trip::("filename"), + AnnotationType::Filename + ); +} + +#[test] +fn annotation_type_plugin_round_trip() { + assert_eq!( + round_trip::("plugin"), + AnnotationType::Plugin + ); +} + +#[test] +fn annotation_type_mapping_round_trip() { + assert_eq!( + round_trip::("mapping"), + AnnotationType::Mapping + ); +} + +#[test] +fn annotation_type_unknown_variant_is_error() { + let result = yaml_serde::from_str::("INVALID_TYPE"); + assert!(result.is_err(), "unknown variant must fail to parse"); +} + +#[test] +fn annotation_type_display_matches_yaml_key() { + // Display must produce the exact lowercase YAML key serde expects. + let cases = [ + (AnnotationType::Static, "static"), + (AnnotationType::Internal, "internal"), + (AnnotationType::Dirname, "dirname"), + (AnnotationType::Filename, "filename"), + (AnnotationType::Plugin, "plugin"), + (AnnotationType::Mapping, "mapping"), + ]; + for (variant, expected) in cases { + assert_eq!(variant.to_string(), expected); + } +} + +// AnnotationDelimiter, round-trip tests +#[test] +fn delimiter_t_round_trip() { + let d = round_trip::("T"); + assert_eq!(d, AnnotationDelimiter::T); + assert_eq!(d.as_char(), '\t'); +} + +#[test] +fn delimiter_c_round_trip() { + let d = round_trip::("C"); + assert_eq!(d, AnnotationDelimiter::C); + assert_eq!(d.as_char(), ','); +} + +#[test] +fn delimiter_unknown_variant_is_error() { + assert!(yaml_serde::from_str::("X").is_err()); +} + +// AnnotationFormat, round-trip tests +#[test] +fn format_tsv_round_trip() { + let f = round_trip::("TSV"); + assert_eq!(f, AnnotationFormat::Tsv); + assert_eq!(f.separator(), '\t'); + assert_eq!(f.to_string(), "TSV"); +} + +#[test] +fn format_csv_round_trip() { + let f = round_trip::("CSV"); + assert_eq!(f, AnnotationFormat::Csv); + assert_eq!(f.separator(), ','); + assert_eq!(f.to_string(), "CSV"); +} + +#[test] +fn format_lowercase_is_rejected() { + // YAML keys for format are uppercase; lowercase must be rejected. + assert!(yaml_serde::from_str::("tsv").is_err()); +} + +// Annotation Defaults +#[test] +fn config_defaults_are_applied_when_absent() { + let yaml = "pattern:\n - \"**/*.vcf.gz\"\nannotation:\n - type: dirname\n field: DIR\n"; + let cfg: AnnotationConfig = yaml_serde::from_str(yaml).unwrap(); + assert_eq!(cfg.format, AnnotationFormat::Tsv); + assert_eq!(cfg.delimiter, AnnotationDelimiter::T); + assert!(!cfg.recursive); + assert!(cfg.columns.is_empty()); + assert!(cfg.exclude.is_empty()); + assert!(!cfg.pattern.is_empty()); +} + +// Full config round-trip test +#[test] +fn full_config_round_trip() { + let yaml = r#" +pattern: + - "**/*.vcf.gz" +recursive: true +format: CSV +delimiter: C +columns: + - CHROM + - POS + - REF + - ALT +annotation: + - type: static + field: GENOME + value: GRCh38 + - type: internal + field: ALLELE + fieldSource: ALT + - type: dirname + field: STUDY_DIR + - type: filename + field: SAMPLE_FILE + - type: plugin + field: SCORE + plugin: scoring.module + function: compute + - type: mapping + field: GENE + fileMapping: /data/genes.tsv + fieldMapping: ENSEMBL + fieldValue: SYMBOL +exclude: + - field: FILTER + value: FAIL +"#; + let cfg = round_trip::(yaml); + assert_eq!(cfg.pattern.len(), 1); + assert_eq!(cfg.pattern[0], "**/*.vcf.gz"); + assert_eq!(cfg.annotation.len(), 6); + assert_eq!(cfg.format, AnnotationFormat::Csv); + assert_eq!(cfg.delimiter, AnnotationDelimiter::C); + assert_eq!(cfg.columns, vec!["CHROM", "POS", "REF", "ALT"]); + assert!(cfg.recursive); + assert_eq!(cfg.exclude.len(), 1); +} diff --git a/src/tests/test_annotation/test_validator.rs b/src/tests/test_annotation/test_validator.rs new file mode 100644 index 0000000..0abc2fb --- /dev/null +++ b/src/tests/test_annotation/test_validator.rs @@ -0,0 +1,188 @@ +use crate::annotation::validator::{Severity, parse_and_validate}; + +// Test parse_and_validate, non-error cases + +#[test] +fn validate_static_entry_ok() { + let yaml = "pattern:\n - \"**/*.vcf.gz\"\nannotation:\n - type: static\n field: BUILD\n value: GRCh38\n"; + assert!(parse_and_validate(yaml).is_ok()); +} + +#[test] +fn validate_internal_entry_ok() { + let yaml = "pattern:\n - \"**/*.vcf.gz\"\nannotation:\n - type: internal\n field: COPY\n fieldSource: REF\n"; + assert!(parse_and_validate(yaml).is_ok()); +} + +#[test] +fn validate_dirname_entry_ok() { + let yaml = "pattern:\n - \"**/*.vcf.gz\"\nannotation:\n - type: dirname\n field: DIR\n"; + assert!(parse_and_validate(yaml).is_ok()); +} + +#[test] +fn validate_filename_entry_ok() { + let yaml = "pattern:\n - \"**/*.vcf.gz\"\nannotation:\n - type: filename\n field: FILE\n"; + assert!(parse_and_validate(yaml).is_ok()); +} + +#[test] +fn validate_plugin_entry_ok() { + let yaml = "pattern:\n - \"**/*.vcf.gz\"\nannotation:\n - type: plugin\n field: S\n plugin: pkg.mod\n function: fn_name\n"; + assert!(parse_and_validate(yaml).is_ok()); +} + +#[test] +fn validate_mapping_entry_ok() { + let yaml = r#" +pattern: + - "**/*.vcf.gz" +annotation: + - type: mapping + field: GENE + fieldSource: ENSEMBL_ID + fileMapping: /data/map.tsv + fieldMapping: KEY + fieldValue: VAL +"#; + assert!(parse_and_validate(yaml).is_ok()); +} + +// Test parse_and_validate, error cases +#[test] +fn validate_empty_pattern_list_is_error() { + let yaml = "pattern: []\nannotation: []\n"; + let errs = parse_and_validate(yaml).unwrap_err(); + assert!(errs.iter().any(|e| e.severity == Severity::Error)); +} + +#[test] +fn validate_wrong_type_pattern_list_is_error() { + let yaml = "pattern: \"*.vcf\"\nannotation: []\n"; + let errs = parse_and_validate(yaml).unwrap_err(); + assert!(errs.iter().any(|e| e.severity == Severity::Error)); +} + +#[test] +fn validate_empty_annotation_list_is_error() { + let yaml = "pattern:\n - \"**/*.vcf.gz\"\nannotation: []\n"; + let errs = parse_and_validate(yaml).unwrap_err(); + assert!(errs.iter().any(|e| e.severity == Severity::Error)); +} + +#[test] +fn validate_static_missing_value_is_error() { + let yaml = "pattern:\n - \"**/*.vcf.gz\"\nannotation:\n - type: static\n field: BUILD\n"; + let errs = parse_and_validate(yaml).unwrap_err(); + assert!( + errs.iter() + .any(|e| e.message.contains("`value` is required")) + ); +} + +#[test] +#[allow(non_snake_case)] +fn validate_internal_missing_fieldSource_is_error() { + let yaml = "pattern:\n - \"**/*.vcf.gz\"\nannotation:\n - type: internal\n field: X\n"; + let errs = parse_and_validate(yaml).unwrap_err(); + assert!( + errs.iter() + .any(|e| e.message.contains("`fieldSource` is required")) + ); +} + +#[test] +fn validate_plugin_missing_function_is_error() { + let yaml = "pattern:\n - \"**/*.vcf.gz\"\nannotation:\n - type: plugin\n field: S\n plugin: mod\n"; + let errs = parse_and_validate(yaml).unwrap_err(); + assert!( + errs.iter() + .any(|e| e.message.contains("`function` is required")) + ); +} + +#[test] +fn validate_mapping_missing_keys_are_errors() { + let yaml = "pattern:\n - \"**/*.vcf.gz\"\nannotation:\n - type: mapping\n field: G\n fileMapping: /f.tsv\n"; + let errs = parse_and_validate(yaml).unwrap_err(); + let messages: Vec<&str> = errs.iter().map(|e| e.message.as_str()).collect(); + assert!(messages.iter().any(|m| m.contains("fieldMapping"))); + assert!(messages.iter().any(|m| m.contains("fieldValue"))); +} + +#[test] +fn validate_blank_field_is_error() { + let yaml = "pattern:\n - \"**/*.vcf.gz\"\nannotation:\n - type: dirname\n field: ''\n"; + let errs = parse_and_validate(yaml).unwrap_err(); + assert!( + errs.iter() + .any(|e| e.message.contains("`field` must not be blank")) + ); +} + +#[test] +fn validate_duplicate_field_names_are_errors() { + let yaml = r#" +pattern: + - "**/*.vcf.gz" +annotation: + - type: dirname + field: DIR + - type: filename + field: DIR +"#; + let errs = parse_and_validate(yaml).unwrap_err(); + assert!( + errs.iter() + .any(|e| e.message.contains("duplicate `field` name")) + ); +} + +#[test] +fn validate_syntax_error_returned_as_parse_error() { + let yaml = "annotation:\n - type: [\nbad yaml"; + let errs = parse_and_validate(yaml).unwrap_err(); + assert!(errs[0].message.contains("YAML parse error")); +} + +#[test] +fn validate_unused_fields_not_errors() { + // `fieldSource` is irrelevant for `static` entries, but it should not make an + // otherwise-valid config fail validation. + let yaml = r#" +pattern: + - "**/*.vcf.gz" +annotation: + - type: static + field: BUILD + value: GRCh38 + fieldSource: SPURIOUS +"#; + assert!(parse_and_validate(yaml).is_ok()); +} + +#[test] +fn validate_formatting_errors_returned_as_parse_errors() { + let yaml = r#" + pattern: + - "**/*.vcf.gz" +annotation: + - type: dirname + field: DIR +"#; + let errs = parse_and_validate(yaml).unwrap_err(); + assert!(errs[0].message.contains("YAML parse error")); +} + +#[test] +fn validate_formatting_errors_returned_as_parse_errors_annotation_entry() { + let yaml = r#" +pattern: + - "**/*.vcf.gz" +annotation: + - type: dirname + field: DIR +"#; + let errs = parse_and_validate(yaml).unwrap_err(); + assert!(errs[0].message.contains("YAML parse error")); +}