From 64357ce10568cd2f7696b53da5850f1a24bfda77 Mon Sep 17 00:00:00 2001 From: "Mitchell R. Vollger" Date: Thu, 29 Jan 2026 20:58:58 -0700 Subject: [PATCH 1/5] add build.rs to embed Python library rpath for runtime linking Without this, cargo test fails on macOS with "Library not loaded: libpython3.13.dylib" because the dynamic linker can't find the Python shared library. The build script uses pyo3-build-config (already a build-dependency) to discover the Python lib directory and passes it as an rpath to the linker. Co-Authored-By: Claude Opus 4.5 --- build.rs | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 build.rs diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..1865941 --- /dev/null +++ b/build.rs @@ -0,0 +1,8 @@ +fn main() { + // Use pyo3-build-config to get the Python interpreter's configuration, + // then emit an rpath so the test/binary can find libpython at runtime on macOS. + let config = pyo3_build_config::get(); + if let Some(lib_dir) = &config.lib_dir { + println!("cargo:rustc-link-arg=-Wl,-rpath,{}", lib_dir); + } +} From c93317c43ab4ea28321f34a213affec7ee588bfa Mon Sep 17 00:00:00 2001 From: "Mitchell R. Vollger" Date: Thu, 29 Jan 2026 21:48:09 -0700 Subject: [PATCH 2/5] Add map subcommand for aggregating overlapping B values per A interval Implements bedtools-map-style functionality with composable flags: - Default: aggregate all overlapping B intervals (one row per A) - -G/--group-by-b: group B intervals by name and summarize separately - -n/--name-match: only summarize B intervals whose name matches A - Supports multiple comma-separated columns (-c) and operations (-O) with bedtools-compatible pairing rules (1:N, N:1, N:N) - Operations: count, sum, mean, min, max, median --- src/cli/map.rs | 451 +++++++++++++++++++++++++++++++++++++++++ src/cli/mod.rs | 3 +- src/main.rs | 3 + src/position.rs | 30 +++ tests/map_a.bed | 2 + tests/map_b.bed | 4 + tests/map_b_noname.bed | 2 + 7 files changed, 494 insertions(+), 1 deletion(-) create mode 100644 src/cli/map.rs create mode 100644 tests/map_a.bed create mode 100644 tests/map_b.bed create mode 100644 tests/map_b_noname.bed diff --git a/src/cli/map.rs b/src/cli/map.rs new file mode 100644 index 0000000..f0372cc --- /dev/null +++ b/src/cli/map.rs @@ -0,0 +1,451 @@ +use std::collections::HashMap; +use std::path::PathBuf; + +use clap::{Parser, ValueEnum}; + +use crate::cli::shared::HELP_TEMPLATE; + +/// The aggregation operation to apply to B values. +#[derive(Debug, Clone, ValueEnum)] +pub enum AggOp { + Count, + Sum, + Mean, + Min, + Max, + Median, +} + +impl AggOp { + /// Given a vector of f64 values, compute the aggregate. + /// Returns "." for empty data, except Count which returns "0". + pub fn compute(&self, values: &mut Vec) -> String { + if values.is_empty() { + return match self { + AggOp::Count => "0".to_string(), + _ => ".".to_string(), + }; + } + match self { + AggOp::Count => values.len().to_string(), + AggOp::Sum => format_number(values.iter().sum()), + AggOp::Mean => { + let sum: f64 = values.iter().sum(); + format_number(sum / values.len() as f64) + } + AggOp::Min => format_number( + values + .iter() + .cloned() + .fold(f64::INFINITY, f64::min), + ), + AggOp::Max => format_number( + values + .iter() + .cloned() + .fold(f64::NEG_INFINITY, f64::max), + ), + AggOp::Median => { + values + .sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + let mid = values.len() / 2; + let median = if values.len() % 2 == 0 { + (values[mid - 1] + values[mid]) / 2.0 + } else { + values[mid] + }; + format_number(median) + } + } + } +} + +fn format_number(v: f64) -> String { + if v == v.trunc() && v.abs() < 1e15 { + format!("{}", v as i64) + } else { + format!("{}", v) + } +} + +/// Extract a value from a B interval for aggregation. +fn extract_value(b_pos: &bedder::position::Position, operation: &AggOp, column: usize) -> Option { + match operation { + AggOp::Count => Some(0.0), // dummy value for counting + _ => b_pos.column_as_f64(column), + } +} + +/// Expand columns and operations into paired (column, operation) tuples. +/// Rules (matching bedtools map): +/// - len(c) == len(o): zip them +/// - len(c) == 1: replicate c to match len(o) +/// - len(o) == 1: replicate o to match len(c) +/// - otherwise: error +fn expand_ops(columns: &[usize], operations: &[AggOp]) -> Result, Box> { + match (columns.len(), operations.len()) { + (c, o) if c == o => Ok(columns.iter().copied().zip(operations.iter().cloned()).collect()), + (1, _) => Ok(operations.iter().cloned().map(|op| (columns[0], op)).collect()), + (_, 1) => Ok(columns.iter().copied().map(|col| (col, operations[0].clone())).collect()), + (c, o) => Err(format!( + "number of columns ({}) and operations ({}) must match, or one must be 1", + c, o + ).into()), + } +} + +#[derive(Parser, Debug)] +#[command( + author, + version, + about = "Map operation: aggregate values from overlapping B intervals for each A interval.", + long_about = None, + rename_all = "kebab-case", + help_template = HELP_TEMPLATE, + arg_required_else_help = true, + after_long_help = "\ +EXAMPLES: + Given tests/map_a.bed (A): + chr1\t100\t200\tgeneA\t10 + chr1\t300\t400\tgeneB\t20 + + And tests/map_b.bed (B): + chr1\t120\t180\tgeneA\t5 + chr1\t130\t170\tgeneB\t7 + chr1\t150\t190\tgeneA\t3 + chr1\t350\t380\tgeneB\t4 + + 1. Sum scores of all overlapping B intervals (default): + + $ bedder map -a tests/map_a.bed -b tests/map_b.bed -g tests/hg38.small.fai + chr1\t100\t200\tgeneA\t10\t15 + chr1\t300\t400\tgeneB\t20\t4 + + All 3 B intervals overlapping geneA are summed (5+7+3=15). + + 2. Only aggregate B intervals whose name matches A (-n): + + $ bedder map -a tests/map_a.bed -b tests/map_b.bed -g tests/hg38.small.fai -n + chr1\t100\t200\tgeneA\t10\t8 + chr1\t300\t400\tgeneB\t20\t4 + + For geneA only geneA-named B intervals are used (5+3=8). + The geneB-named B interval (score 7) is excluded. + + 3. Group overlapping B by B's name (-G), one row per group: + + $ bedder map -a tests/map_a.bed -b tests/map_b.bed -g tests/hg38.small.fai -G + chr1\t100\t200\tgeneA\t10\tgeneA\t8 + chr1\t100\t200\tgeneA\t10\tgeneB\t7 + chr1\t300\t400\tgeneB\t20\tgeneB\t4 + + geneA's overlapping B intervals are split into two groups. + + 4. Multiple operations on the same column: + + $ bedder map -a tests/map_a.bed -b tests/map_b.bed -g tests/hg38.small.fai -c 5 -O sum,mean,count + chr1\t100\t200\tgeneA\t10\t15\t5\t3 + chr1\t300\t400\tgeneB\t20\t4\t4\t1 + + 5. Combine -G with -n: + + $ bedder map -a tests/map_a.bed -b tests/map_b.bed -g tests/hg38.small.fai -G -n -O sum,count + chr1\t100\t200\tgeneA\t10\tgeneA\t8\t2 + chr1\t300\t400\tgeneB\t20\tgeneB\t4\t1 + + Groups by B name, but only keeps groups matching A's name." +)] +pub struct MapCmdArgs { + #[arg(help = "input A file (query)", short = 'a')] + pub query_path: PathBuf, + + #[arg(help = "input B file (database)", short = 'b')] + pub other_path: PathBuf, + + #[arg( + help = "genome file for chromosome ordering", + short = 'g', + long = "genome", + required = true + )] + pub genome_file: PathBuf, + + #[arg( + help = "1-indexed column(s) of B to aggregate (default: 5 = score). Comma-separated for multiple.", + short = 'c', + long = "column", + default_value = "5", + value_delimiter = ',' + )] + pub columns: Vec, + + #[arg( + help = "aggregation operation(s) to apply. Comma-separated for multiple.", + short = 'O', + long = "operation", + default_value = "sum", + value_delimiter = ',', + value_enum + )] + pub operations: Vec, + + #[arg( + help = "output file (default: stdout)", + short = 'o', + long = "output", + default_value = "-" + )] + pub output_path: PathBuf, + + #[arg( + short = 'G', + long = "group-by-b", + help = "For each A, group its overlapping B intervals by name, then summarize each group separately." + )] + pub group_by_b: bool, + + #[arg( + short = 'n', + long = "name-match", + help = "Only summarize B intervals whose name matches A's name." + )] + pub name_match: bool, +} + +pub fn map_command(args: MapCmdArgs) -> Result<(), Box> { + let ops = expand_ops(&args.columns, &args.operations)?; + + let chrom_order = + bedder::chrom_ordering::parse_genome(std::fs::File::open(&args.genome_file)?)?; + + let a_file = std::io::BufReader::new(std::fs::File::open(&args.query_path)?); + let (a_reader, a_file_type) = bedder::sniff::open(a_file, &args.query_path)?; + + if !matches!(a_file_type, bedder::sniff::FileType::Bed) { + return Err("map currently only supports BED files for -a".into()); + } + + let a_iter = a_reader.into_positioned_iterator(); + + let b_file = std::io::BufReader::new(std::fs::File::open(&args.other_path)?); + let (b_reader, b_file_type) = bedder::sniff::open(b_file, &args.other_path)?; + + if !matches!(b_file_type, bedder::sniff::FileType::Bed) { + return Err("map currently only supports BED files for -b".into()); + } + + let b_iter = b_reader.into_positioned_iterator(); + + let ii = bedder::intersection::IntersectionIterator::new( + a_iter, + vec![b_iter], + &chrom_order, + -1, // max_distance: not used + -1, // n_closest: not used + false, // can_skip_ahead: need every A interval reported + )?; + + let output_path = if args.output_path.to_str() == Some("-") { + "/dev/stdout" + } else { + args.output_path.to_str().unwrap() + }; + let mut bed_writer = bedder::bedder_bed::simplebed::BedWriter::new(output_path) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e.to_string()))?; + + for intersection_result in ii { + let intersection = intersection_result?; + let base = intersection + .base_interval + .try_lock() + .expect("failed to lock base_interval"); + + let a_name: Option = base.name().map(|s| s.to_string()); + + let bed_record = match &*base { + bedder::position::Position::Bed(bed) => &bed.0, + _ => return Err("map only supports BED input".into()), + }; + + if args.group_by_b { + // Group overlapping B intervals by B name. + // Each group accumulates one Vec per operation. + let mut groups: HashMap>> = HashMap::new(); + let mut insertion_order: Vec = Vec::new(); + + for overlap in &intersection.overlapping { + let b_pos = overlap + .interval + .try_lock() + .expect("failed to lock b interval"); + + let b_name = b_pos.name().unwrap_or(".").to_string(); + + if args.name_match { + if let Some(ref an) = a_name { + if b_name != *an { + continue; + } + } + } + + if !groups.contains_key(&b_name) { + insertion_order.push(b_name.clone()); + groups.insert(b_name.clone(), vec![Vec::new(); ops.len()]); + } + let vecs = groups.get_mut(&b_name).unwrap(); + for (i, (col, op)) in ops.iter().enumerate() { + if let Some(val) = extract_value(&b_pos, op, *col) { + vecs[i].push(val); + } + } + } + + if groups.is_empty() { + let mut record = bed_record.clone(); + record.push_field(bedder::bedder_bed::BedValue::String(".".to_string())); + for (_, op) in &ops { + record.push_field(bedder::bedder_bed::BedValue::String( + op.compute(&mut Vec::new()), + )); + } + bed_writer.write_record(&record).map_err(|e| { + std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()) + })?; + } else { + for b_name in &insertion_order { + let vecs = groups.get_mut(b_name).unwrap(); + let mut record = bed_record.clone(); + record.push_field(bedder::bedder_bed::BedValue::String(b_name.clone())); + for (i, (_, op)) in ops.iter().enumerate() { + let agg_result = op.compute(&mut vecs[i]); + record.push_field(bedder::bedder_bed::BedValue::String(agg_result)); + } + bed_writer.write_record(&record).map_err(|e| { + std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()) + })?; + } + } + } else { + // Standard path: one row per A interval with one aggregate column per op + let mut value_vecs: Vec> = vec![Vec::new(); ops.len()]; + + for overlap in &intersection.overlapping { + let b_pos = overlap + .interval + .try_lock() + .expect("failed to lock b interval"); + + if args.name_match { + if let Some(ref an) = a_name { + if let Some(bn) = b_pos.name() { + if bn != an.as_str() { + continue; + } + } + // B has no name → include it + } + } + + for (i, (col, op)) in ops.iter().enumerate() { + if let Some(val) = extract_value(&b_pos, op, *col) { + value_vecs[i].push(val); + } + } + } + + let mut record = bed_record.clone(); + for (i, (_, op)) in ops.iter().enumerate() { + let agg_result = op.compute(&mut value_vecs[i]); + record.push_field(bedder::bedder_bed::BedValue::String(agg_result)); + } + bed_writer.write_record(&record).map_err(|e| { + std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()) + })?; + } + } + + bed_writer.flush().map_err(|e| { + std::io::Error::new(std::io::ErrorKind::Other, e.to_string()) + })?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_agg_count() { + let mut values = vec![1.0, 2.0, 3.0]; + assert_eq!(AggOp::Count.compute(&mut values), "3"); + } + + #[test] + fn test_agg_count_empty() { + let mut values: Vec = vec![]; + assert_eq!(AggOp::Count.compute(&mut values), "0"); + } + + #[test] + fn test_agg_sum() { + let mut values = vec![1.0, 2.0, 3.0]; + assert_eq!(AggOp::Sum.compute(&mut values), "6"); + } + + #[test] + fn test_agg_sum_empty() { + let mut values: Vec = vec![]; + assert_eq!(AggOp::Sum.compute(&mut values), "."); + } + + #[test] + fn test_agg_mean() { + let mut values = vec![1.0, 2.0, 3.0]; + assert_eq!(AggOp::Mean.compute(&mut values), "2"); + } + + #[test] + fn test_agg_min() { + let mut values = vec![3.0, 1.0, 2.0]; + assert_eq!(AggOp::Min.compute(&mut values), "1"); + } + + #[test] + fn test_agg_max() { + let mut values = vec![3.0, 1.0, 2.0]; + assert_eq!(AggOp::Max.compute(&mut values), "3"); + } + + #[test] + fn test_agg_median_odd() { + let mut values = vec![3.0, 1.0, 2.0]; + assert_eq!(AggOp::Median.compute(&mut values), "2"); + } + + #[test] + fn test_agg_median_even() { + let mut values = vec![1.0, 2.0, 3.0, 4.0]; + assert_eq!(AggOp::Median.compute(&mut values), "2.5"); + } + + #[test] + fn test_agg_mean_empty() { + let mut values: Vec = vec![]; + assert_eq!(AggOp::Mean.compute(&mut values), "."); + } + + #[test] + fn test_format_number_integer() { + assert_eq!(format_number(42.0), "42"); + assert_eq!(format_number(-3.0), "-3"); + assert_eq!(format_number(0.0), "0"); + } + + #[test] + fn test_format_number_float() { + assert_eq!(format_number(2.5), "2.5"); + assert_eq!(format_number(1.333), "1.333"); + } +} diff --git a/src/cli/mod.rs b/src/cli/mod.rs index ce519e8..b6fdadc 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -1,4 +1,5 @@ +pub mod closest; pub mod full; +pub mod map; pub mod intersect; -pub mod closest; pub mod shared; \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index ddafa56..76088c3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -18,6 +18,8 @@ enum Commands { Intersect(cli::intersect::IntersectCmdArgs), /// Closest mode - hides overlap requirements Closest(cli::closest::ClosestCmdArgs), + /// Map operation — aggregate overlapping B values per A interval + Map(cli::map::MapCmdArgs), } #[cfg(feature = "mimalloc_allocator")] @@ -37,5 +39,6 @@ pub fn main() -> Result<(), Box> { Commands::Full(args) => cli::full::full_command(args), Commands::Intersect(args) => cli::intersect::intersect_command(args), Commands::Closest(args) => cli::closest::closest_command(args), + Commands::Map(args) => cli::map::map_command(args), } } diff --git a/src/position.rs b/src/position.rs index 6b16c20..7ce1d0a 100644 --- a/src/position.rs +++ b/src/position.rs @@ -167,6 +167,36 @@ impl Position { Position::Other(_o) => unimplemented!("TODO: clone Box"), } } + + /// Get the BED name field (column 4) if this is a BED record with a name. + pub fn name(&self) -> Option<&str> { + match self { + Position::Bed(b) => b.0.name(), + _ => None, + } + } + + /// Get a 1-indexed BED column as f64. + /// 1=chrom (None), 2=start, 3=end, 4=name (None), 5=score, 6+=other_fields + pub fn column_as_f64(&self, col: usize) -> Option { + match self { + Position::Bed(b) => match col { + 2 => Some(b.0.start() as f64), + 3 => Some(b.0.end() as f64), + 5 => b.0.score(), + n if n >= 6 => { + let idx = n - 6; + b.0.other_fields().get(idx).and_then(|v| match v { + crate::bedder_bed::BedValue::Integer(i) => Some(*i as f64), + crate::bedder_bed::BedValue::Float(f) => Some(*f), + crate::bedder_bed::BedValue::String(s) => s.parse::().ok(), + }) + } + _ => None, + }, + _ => None, + } + } } /// PositionedIterator is an iterator over Positioned objects. diff --git a/tests/map_a.bed b/tests/map_a.bed new file mode 100644 index 0000000..7bee9b8 --- /dev/null +++ b/tests/map_a.bed @@ -0,0 +1,2 @@ +chr1 100 200 geneA 10 +chr1 300 400 geneB 20 diff --git a/tests/map_b.bed b/tests/map_b.bed new file mode 100644 index 0000000..98ae5d6 --- /dev/null +++ b/tests/map_b.bed @@ -0,0 +1,4 @@ +chr1 120 180 geneA 5 +chr1 130 170 geneB 7 +chr1 150 190 geneA 3 +chr1 350 380 geneB 4 diff --git a/tests/map_b_noname.bed b/tests/map_b_noname.bed new file mode 100644 index 0000000..56df16a --- /dev/null +++ b/tests/map_b_noname.bed @@ -0,0 +1,2 @@ +chr1 120 180 +chr1 350 380 From 3cb25d5d57d6c881c6c96d0ca4d837ebcb0452b5 Mon Sep 17 00:00:00 2001 From: "Mitchell R. Vollger" Date: Fri, 30 Jan 2026 07:22:55 -0700 Subject: [PATCH 3/5] Address PR review: compute signature, name-match bug, error handling, stdout portability - Change compute() to take &[f64] instead of &mut Vec so Median no longer sorts the caller's data in place; clones to a local vec. - Fix name-match in non-grouped path: B intervals with no name were incorrectly included when A had a name. Now correctly excludes B intervals whose name differs from A's (including when B has no name). - Remove map_err wrappers on write_record/flush/BedWriter::new; BedError implements std::error::Error so ? propagates directly without losing the original error type and context. - Replace /dev/stdout with BedWriter::from_writer(stdout()) for portable stdout handling, and pass PathBuf directly to BedWriter::new to avoid unwrap() on non-UTF-8 paths. Co-Authored-By: Claude Opus 4.5 --- src/cli/map.rs | 92 ++++++++++++++++++++++---------------------------- 1 file changed, 40 insertions(+), 52 deletions(-) diff --git a/src/cli/map.rs b/src/cli/map.rs index f0372cc..c09f6c7 100644 --- a/src/cli/map.rs +++ b/src/cli/map.rs @@ -17,9 +17,9 @@ pub enum AggOp { } impl AggOp { - /// Given a vector of f64 values, compute the aggregate. + /// Given a slice of f64 values, compute the aggregate. /// Returns "." for empty data, except Count which returns "0". - pub fn compute(&self, values: &mut Vec) -> String { + pub fn compute(&self, values: &[f64]) -> String { if values.is_empty() { return match self { AggOp::Count => "0".to_string(), @@ -46,13 +46,14 @@ impl AggOp { .fold(f64::NEG_INFINITY, f64::max), ), AggOp::Median => { - values + let mut sorted = values.to_vec(); + sorted .sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); - let mid = values.len() / 2; - let median = if values.len() % 2 == 0 { - (values[mid - 1] + values[mid]) / 2.0 + let mid = sorted.len() / 2; + let median = if sorted.len() % 2 == 0 { + (sorted[mid - 1] + sorted[mid]) / 2.0 } else { - values[mid] + sorted[mid] }; format_number(median) } @@ -245,13 +246,11 @@ pub fn map_command(args: MapCmdArgs) -> Result<(), Box> { false, // can_skip_ahead: need every A interval reported )?; - let output_path = if args.output_path.to_str() == Some("-") { - "/dev/stdout" + let mut bed_writer = if args.output_path.to_str() == Some("-") { + bedder::bedder_bed::simplebed::BedWriter::from_writer(Box::new(std::io::BufWriter::new(std::io::stdout())))? } else { - args.output_path.to_str().unwrap() + bedder::bedder_bed::simplebed::BedWriter::new(&args.output_path)? }; - let mut bed_writer = bedder::bedder_bed::simplebed::BedWriter::new(output_path) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e.to_string()))?; for intersection_result in ii { let intersection = intersection_result?; @@ -306,24 +305,20 @@ pub fn map_command(args: MapCmdArgs) -> Result<(), Box> { record.push_field(bedder::bedder_bed::BedValue::String(".".to_string())); for (_, op) in &ops { record.push_field(bedder::bedder_bed::BedValue::String( - op.compute(&mut Vec::new()), + op.compute(&[]), )); } - bed_writer.write_record(&record).map_err(|e| { - std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()) - })?; + bed_writer.write_record(&record)?; } else { for b_name in &insertion_order { let vecs = groups.get_mut(b_name).unwrap(); let mut record = bed_record.clone(); record.push_field(bedder::bedder_bed::BedValue::String(b_name.clone())); for (i, (_, op)) in ops.iter().enumerate() { - let agg_result = op.compute(&mut vecs[i]); + let agg_result = op.compute(&vecs[i]); record.push_field(bedder::bedder_bed::BedValue::String(agg_result)); } - bed_writer.write_record(&record).map_err(|e| { - std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()) - })?; + bed_writer.write_record(&record)?; } } } else { @@ -338,12 +333,9 @@ pub fn map_command(args: MapCmdArgs) -> Result<(), Box> { if args.name_match { if let Some(ref an) = a_name { - if let Some(bn) = b_pos.name() { - if bn != an.as_str() { - continue; - } + if b_pos.name() != Some(an.as_str()) { + continue; } - // B has no name → include it } } @@ -356,18 +348,14 @@ pub fn map_command(args: MapCmdArgs) -> Result<(), Box> { let mut record = bed_record.clone(); for (i, (_, op)) in ops.iter().enumerate() { - let agg_result = op.compute(&mut value_vecs[i]); + let agg_result = op.compute(&value_vecs[i]); record.push_field(bedder::bedder_bed::BedValue::String(agg_result)); } - bed_writer.write_record(&record).map_err(|e| { - std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()) - })?; + bed_writer.write_record(&record)?; } } - bed_writer.flush().map_err(|e| { - std::io::Error::new(std::io::ErrorKind::Other, e.to_string()) - })?; + bed_writer.flush()?; Ok(()) } @@ -378,62 +366,62 @@ mod tests { #[test] fn test_agg_count() { - let mut values = vec![1.0, 2.0, 3.0]; - assert_eq!(AggOp::Count.compute(&mut values), "3"); + let values = vec![1.0, 2.0, 3.0]; + assert_eq!(AggOp::Count.compute(&values), "3"); } #[test] fn test_agg_count_empty() { - let mut values: Vec = vec![]; - assert_eq!(AggOp::Count.compute(&mut values), "0"); + let values: Vec = vec![]; + assert_eq!(AggOp::Count.compute(&values), "0"); } #[test] fn test_agg_sum() { - let mut values = vec![1.0, 2.0, 3.0]; - assert_eq!(AggOp::Sum.compute(&mut values), "6"); + let values = vec![1.0, 2.0, 3.0]; + assert_eq!(AggOp::Sum.compute(&values), "6"); } #[test] fn test_agg_sum_empty() { - let mut values: Vec = vec![]; - assert_eq!(AggOp::Sum.compute(&mut values), "."); + let values: Vec = vec![]; + assert_eq!(AggOp::Sum.compute(&values), "."); } #[test] fn test_agg_mean() { - let mut values = vec![1.0, 2.0, 3.0]; - assert_eq!(AggOp::Mean.compute(&mut values), "2"); + let values = vec![1.0, 2.0, 3.0]; + assert_eq!(AggOp::Mean.compute(&values), "2"); } #[test] fn test_agg_min() { - let mut values = vec![3.0, 1.0, 2.0]; - assert_eq!(AggOp::Min.compute(&mut values), "1"); + let values = vec![3.0, 1.0, 2.0]; + assert_eq!(AggOp::Min.compute(&values), "1"); } #[test] fn test_agg_max() { - let mut values = vec![3.0, 1.0, 2.0]; - assert_eq!(AggOp::Max.compute(&mut values), "3"); + let values = vec![3.0, 1.0, 2.0]; + assert_eq!(AggOp::Max.compute(&values), "3"); } #[test] fn test_agg_median_odd() { - let mut values = vec![3.0, 1.0, 2.0]; - assert_eq!(AggOp::Median.compute(&mut values), "2"); + let values = vec![3.0, 1.0, 2.0]; + assert_eq!(AggOp::Median.compute(&values), "2"); } #[test] fn test_agg_median_even() { - let mut values = vec![1.0, 2.0, 3.0, 4.0]; - assert_eq!(AggOp::Median.compute(&mut values), "2.5"); + let values = vec![1.0, 2.0, 3.0, 4.0]; + assert_eq!(AggOp::Median.compute(&values), "2.5"); } #[test] fn test_agg_mean_empty() { - let mut values: Vec = vec![]; - assert_eq!(AggOp::Mean.compute(&mut values), "."); + let values: Vec = vec![]; + assert_eq!(AggOp::Mean.compute(&values), "."); } #[test] From dcbf9ee01c62fe54193f611562cd9cc7c8ae5729 Mon Sep 17 00:00:00 2001 From: "Mitchell R. Vollger" Date: Fri, 30 Jan 2026 07:30:21 -0700 Subject: [PATCH 4/5] Move map integration tests to tests/integration_map.rs Consolidate inline unit tests and move end-to-end map_command tests to a separate integration test file using the Command-based pattern matching integration_skip_position.rs. Inline tests now cover only pure functions (AggOp, format_number, expand_ops). Co-Authored-By: Claude Opus 4.5 --- src/cli/map.rs | 101 ++++++++++++++-------------------- tests/integration_map.rs | 113 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+), 61 deletions(-) create mode 100644 tests/integration_map.rs diff --git a/src/cli/map.rs b/src/cli/map.rs index c09f6c7..03862c0 100644 --- a/src/cli/map.rs +++ b/src/cli/map.rs @@ -365,75 +365,54 @@ mod tests { use super::*; #[test] - fn test_agg_count() { - let values = vec![1.0, 2.0, 3.0]; - assert_eq!(AggOp::Count.compute(&values), "3"); + fn test_agg_ops() { + let vals = vec![3.0, 1.0, 2.0]; + assert_eq!(AggOp::Count.compute(&vals), "3"); + assert_eq!(AggOp::Sum.compute(&vals), "6"); + assert_eq!(AggOp::Mean.compute(&vals), "2"); + assert_eq!(AggOp::Min.compute(&vals), "1"); + assert_eq!(AggOp::Max.compute(&vals), "3"); + assert_eq!(AggOp::Median.compute(&vals), "2"); + + // even-length median + assert_eq!(AggOp::Median.compute(&[1.0, 2.0, 3.0, 4.0]), "2.5"); + + // empty: count returns "0", others return "." + assert_eq!(AggOp::Count.compute(&[]), "0"); + assert_eq!(AggOp::Sum.compute(&[]), "."); + assert_eq!(AggOp::Mean.compute(&[]), "."); + assert_eq!(AggOp::Min.compute(&[]), "."); + assert_eq!(AggOp::Max.compute(&[]), "."); + assert_eq!(AggOp::Median.compute(&[]), "."); } #[test] - fn test_agg_count_empty() { - let values: Vec = vec![]; - assert_eq!(AggOp::Count.compute(&values), "0"); - } - - #[test] - fn test_agg_sum() { - let values = vec![1.0, 2.0, 3.0]; - assert_eq!(AggOp::Sum.compute(&values), "6"); - } - - #[test] - fn test_agg_sum_empty() { - let values: Vec = vec![]; - assert_eq!(AggOp::Sum.compute(&values), "."); - } - - #[test] - fn test_agg_mean() { - let values = vec![1.0, 2.0, 3.0]; - assert_eq!(AggOp::Mean.compute(&values), "2"); - } - - #[test] - fn test_agg_min() { - let values = vec![3.0, 1.0, 2.0]; - assert_eq!(AggOp::Min.compute(&values), "1"); - } - - #[test] - fn test_agg_max() { - let values = vec![3.0, 1.0, 2.0]; - assert_eq!(AggOp::Max.compute(&values), "3"); - } - - #[test] - fn test_agg_median_odd() { - let values = vec![3.0, 1.0, 2.0]; - assert_eq!(AggOp::Median.compute(&values), "2"); - } - - #[test] - fn test_agg_median_even() { - let values = vec![1.0, 2.0, 3.0, 4.0]; - assert_eq!(AggOp::Median.compute(&values), "2.5"); - } - - #[test] - fn test_agg_mean_empty() { - let values: Vec = vec![]; - assert_eq!(AggOp::Mean.compute(&values), "."); - } - - #[test] - fn test_format_number_integer() { + fn test_format_number() { assert_eq!(format_number(42.0), "42"); assert_eq!(format_number(-3.0), "-3"); assert_eq!(format_number(0.0), "0"); + assert_eq!(format_number(2.5), "2.5"); + assert_eq!(format_number(1.333), "1.333"); } #[test] - fn test_format_number_float() { - assert_eq!(format_number(2.5), "2.5"); - assert_eq!(format_number(1.333), "1.333"); + fn test_expand_ops() { + // zip when equal length + let ops = expand_ops(&[4, 5], &[AggOp::Sum, AggOp::Mean]).unwrap(); + assert_eq!(ops.len(), 2); + assert_eq!(ops[0].0, 4); + assert_eq!(ops[1].0, 5); + + // replicate single column + let ops = expand_ops(&[5], &[AggOp::Sum, AggOp::Count]).unwrap(); + assert_eq!(ops.len(), 2); + assert!(ops.iter().all(|(c, _)| *c == 5)); + + // replicate single operation + let ops = expand_ops(&[4, 5], &[AggOp::Sum]).unwrap(); + assert_eq!(ops.len(), 2); + + // mismatch is an error + assert!(expand_ops(&[4, 5], &[AggOp::Sum, AggOp::Mean, AggOp::Count]).is_err()); } } diff --git a/tests/integration_map.rs b/tests/integration_map.rs new file mode 100644 index 0000000..43ea2c5 --- /dev/null +++ b/tests/integration_map.rs @@ -0,0 +1,113 @@ +use std::process::Command; + +/// Run `cargo run -- map` with the given extra args, return stdout lines. +fn run_map(args: &[&str]) -> Vec { + let mut cmd_args = vec![ + "run", "--", "map", + "-a", "tests/map_a.bed", + "-b", "tests/map_b.bed", + "-g", "tests/hg38.small.fai", + ]; + cmd_args.extend_from_slice(args); + + let output = Command::new("cargo") + .args(&cmd_args) + .output() + .expect("failed to execute bedder map"); + + assert!( + output.status.success(), + "bedder map failed:\n{}", + String::from_utf8_lossy(&output.stderr) + ); + + String::from_utf8_lossy(&output.stdout) + .lines() + .filter(|l| !l.is_empty()) + .map(String::from) + .collect() +} + +#[test] +fn test_map_default_sum() { + // All 3 B intervals overlap geneA (5+7+3=15), 1 overlaps geneB (4) + let lines = run_map(&[]); + assert_eq!(lines.len(), 2); + assert_eq!(lines[0], "chr1\t100\t200\tgeneA\t10\t15"); + assert_eq!(lines[1], "chr1\t300\t400\tgeneB\t20\t4"); +} + +#[test] +fn test_map_count() { + let lines = run_map(&["-O", "count"]); + assert_eq!(lines[0], "chr1\t100\t200\tgeneA\t10\t3"); + assert_eq!(lines[1], "chr1\t300\t400\tgeneB\t20\t1"); +} + +#[test] +fn test_map_multiple_ops() { + let lines = run_map(&["-c", "5", "-O", "sum,mean,count"]); + assert_eq!(lines[0], "chr1\t100\t200\tgeneA\t10\t15\t5\t3"); + assert_eq!(lines[1], "chr1\t300\t400\tgeneB\t20\t4\t4\t1"); +} + +#[test] +fn test_map_name_match() { + // geneA: only geneA-named B (5+3=8), geneB: only geneB-named B (4) + let lines = run_map(&["-n"]); + assert_eq!(lines[0], "chr1\t100\t200\tgeneA\t10\t8"); + assert_eq!(lines[1], "chr1\t300\t400\tgeneB\t20\t4"); +} + +#[test] +fn test_map_group_by_b() { + // geneA overlaps: geneA(5+3=8), geneB(7). geneB overlaps: geneB(4) + let lines = run_map(&["-G"]); + assert_eq!(lines.len(), 3); + assert_eq!(lines[0], "chr1\t100\t200\tgeneA\t10\tgeneA\t8"); + assert_eq!(lines[1], "chr1\t100\t200\tgeneA\t10\tgeneB\t7"); + assert_eq!(lines[2], "chr1\t300\t400\tgeneB\t20\tgeneB\t4"); +} + +#[test] +fn test_map_group_by_b_with_name_match() { + // Group by B name, but only keep groups matching A's name + let lines = run_map(&["-G", "-n"]); + assert_eq!(lines.len(), 2); + assert_eq!(lines[0], "chr1\t100\t200\tgeneA\t10\tgeneA\t8"); + assert_eq!(lines[1], "chr1\t300\t400\tgeneB\t20\tgeneB\t4"); +} + +#[test] +fn test_map_group_by_b_with_multiple_ops() { + let lines = run_map(&["-G", "-O", "sum,count"]); + assert_eq!(lines[0], "chr1\t100\t200\tgeneA\t10\tgeneA\t8\t2"); + assert_eq!(lines[1], "chr1\t100\t200\tgeneA\t10\tgeneB\t7\t1"); + assert_eq!(lines[2], "chr1\t300\t400\tgeneB\t20\tgeneB\t4\t1"); +} + +#[test] +fn test_map_b_no_name_with_name_match() { + // B has no name column: name-match should exclude all B (no name != geneA/geneB) + let output = Command::new("cargo") + .args([ + "run", "--", "map", + "-a", "tests/map_a.bed", + "-b", "tests/map_b_noname.bed", + "-g", "tests/hg38.small.fai", + "-n", "-O", "count", + ]) + .output() + .expect("failed to execute bedder map"); + + assert!(output.status.success(), "bedder map failed:\n{}", String::from_utf8_lossy(&output.stderr)); + + let lines: Vec = String::from_utf8_lossy(&output.stdout) + .lines() + .filter(|l| !l.is_empty()) + .map(String::from) + .collect(); + + assert_eq!(lines[0], "chr1\t100\t200\tgeneA\t10\t0"); + assert_eq!(lines[1], "chr1\t300\t400\tgeneB\t20\t0"); +} From b60954f0dd2862e7ea27c20dbb6e35e7ed851066 Mon Sep 17 00:00:00 2001 From: "Mitchell R. Vollger" Date: Fri, 30 Jan 2026 21:23:46 -0700 Subject: [PATCH 5/5] Fix edge cases: name-match with missing names, column validation, non-numeric warnings - Treat missing A name as "." for --name-match (previously silently skipped filtering) - Reject -c 0 with a clear error (columns are 1-indexed, matching bedtools) - Warn once per column on first non-numeric value (matching bedtools behavior) - Add tests for zero overlaps, A-with-no-name + name-match, column 0 error, single-element median Co-Authored-By: Claude Opus 4.5 --- src/cli/map.rs | 50 ++++++++++++++++++--------- tests/integration_map.rs | 73 ++++++++++++++++++++++++++++++++++++++++ tests/map_a_nohit.bed | 2 ++ tests/map_a_noname.bed | 2 ++ 4 files changed, 111 insertions(+), 16 deletions(-) create mode 100644 tests/map_a_nohit.bed create mode 100644 tests/map_a_noname.bed diff --git a/src/cli/map.rs b/src/cli/map.rs index 03862c0..af86572 100644 --- a/src/cli/map.rs +++ b/src/cli/map.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::path::PathBuf; use clap::{Parser, ValueEnum}; @@ -70,10 +70,22 @@ fn format_number(v: f64) -> String { } /// Extract a value from a B interval for aggregation. -fn extract_value(b_pos: &bedder::position::Position, operation: &AggOp, column: usize) -> Option { +/// Logs a warning (once per column) when a non-numeric value is encountered. +fn extract_value( + b_pos: &bedder::position::Position, + operation: &AggOp, + column: usize, + warned_columns: &mut HashSet, +) -> Option { match operation { AggOp::Count => Some(0.0), // dummy value for counting - _ => b_pos.column_as_f64(column), + _ => { + let val = b_pos.column_as_f64(column); + if val.is_none() && warned_columns.insert(column) { + log::warn!("Non-numeric value in column {}.", column); + } + val + } } } @@ -214,6 +226,12 @@ pub struct MapCmdArgs { } pub fn map_command(args: MapCmdArgs) -> Result<(), Box> { + for &col in &args.columns { + if col == 0 { + return Err("column index must be >= 1 (columns are 1-indexed)".into()); + } + } + let ops = expand_ops(&args.columns, &args.operations)?; let chrom_order = @@ -252,6 +270,8 @@ pub fn map_command(args: MapCmdArgs) -> Result<(), Box> { bedder::bedder_bed::simplebed::BedWriter::new(&args.output_path)? }; + let mut warned_columns: HashSet = HashSet::new(); + for intersection_result in ii { let intersection = intersection_result?; let base = intersection @@ -259,7 +279,7 @@ pub fn map_command(args: MapCmdArgs) -> Result<(), Box> { .try_lock() .expect("failed to lock base_interval"); - let a_name: Option = base.name().map(|s| s.to_string()); + let a_name: String = base.name().unwrap_or(".").to_string(); let bed_record = match &*base { bedder::position::Position::Bed(bed) => &bed.0, @@ -280,12 +300,8 @@ pub fn map_command(args: MapCmdArgs) -> Result<(), Box> { let b_name = b_pos.name().unwrap_or(".").to_string(); - if args.name_match { - if let Some(ref an) = a_name { - if b_name != *an { - continue; - } - } + if args.name_match && b_name != a_name { + continue; } if !groups.contains_key(&b_name) { @@ -294,7 +310,7 @@ pub fn map_command(args: MapCmdArgs) -> Result<(), Box> { } let vecs = groups.get_mut(&b_name).unwrap(); for (i, (col, op)) in ops.iter().enumerate() { - if let Some(val) = extract_value(&b_pos, op, *col) { + if let Some(val) = extract_value(&b_pos, op, *col, &mut warned_columns) { vecs[i].push(val); } } @@ -332,15 +348,14 @@ pub fn map_command(args: MapCmdArgs) -> Result<(), Box> { .expect("failed to lock b interval"); if args.name_match { - if let Some(ref an) = a_name { - if b_pos.name() != Some(an.as_str()) { - continue; - } + let b_name = b_pos.name().unwrap_or("."); + if b_name != a_name { + continue; } } for (i, (col, op)) in ops.iter().enumerate() { - if let Some(val) = extract_value(&b_pos, op, *col) { + if let Some(val) = extract_value(&b_pos, op, *col, &mut warned_columns) { value_vecs[i].push(val); } } @@ -374,6 +389,9 @@ mod tests { assert_eq!(AggOp::Max.compute(&vals), "3"); assert_eq!(AggOp::Median.compute(&vals), "2"); + // single-element median + assert_eq!(AggOp::Median.compute(&[42.0]), "42"); + // even-length median assert_eq!(AggOp::Median.compute(&[1.0, 2.0, 3.0, 4.0]), "2.5"); diff --git a/tests/integration_map.rs b/tests/integration_map.rs index 43ea2c5..c43ec54 100644 --- a/tests/integration_map.rs +++ b/tests/integration_map.rs @@ -111,3 +111,76 @@ fn test_map_b_no_name_with_name_match() { assert_eq!(lines[0], "chr1\t100\t200\tgeneA\t10\t0"); assert_eq!(lines[1], "chr1\t300\t400\tgeneB\t20\t0"); } + +#[test] +fn test_map_zero_overlaps() { + // geneC at chr1:500-600 has no overlapping B intervals + let output = Command::new("cargo") + .args([ + "run", "--", "map", + "-a", "tests/map_a_nohit.bed", + "-b", "tests/map_b.bed", + "-g", "tests/hg38.small.fai", + "-O", "sum,count", + ]) + .output() + .expect("failed to execute bedder map"); + + assert!(output.status.success(), "bedder map failed:\n{}", String::from_utf8_lossy(&output.stderr)); + + let lines: Vec = String::from_utf8_lossy(&output.stdout) + .lines() + .filter(|l| !l.is_empty()) + .map(String::from) + .collect(); + + assert_eq!(lines.len(), 2); + assert_eq!(lines[0], "chr1\t100\t200\tgeneA\t10\t15\t3"); + assert_eq!(lines[1], "chr1\t500\t600\tgeneC\t30\t.\t0"); +} + +#[test] +fn test_map_a_noname_with_name_match() { + // A has no name column; with -n, A name is treated as "." + // B intervals have names (geneA, geneB), so none match "." → all excluded + let output = Command::new("cargo") + .args([ + "run", "--", "map", + "-a", "tests/map_a_noname.bed", + "-b", "tests/map_b.bed", + "-g", "tests/hg38.small.fai", + "-n", "-O", "sum,count", + ]) + .output() + .expect("failed to execute bedder map"); + + assert!(output.status.success(), "bedder map failed:\n{}", String::from_utf8_lossy(&output.stderr)); + + let lines: Vec = String::from_utf8_lossy(&output.stdout) + .lines() + .filter(|l| !l.is_empty()) + .map(String::from) + .collect(); + + assert_eq!(lines.len(), 2); + // A name is "." which doesn't match any B name → empty aggregates + assert_eq!(lines[0], "chr1\t100\t200\t.\t0"); + assert_eq!(lines[1], "chr1\t300\t400\t.\t0"); +} + +#[test] +fn test_map_column_zero_error() { + // -c 0 should be rejected (columns are 1-indexed) + let output = Command::new("cargo") + .args([ + "run", "--", "map", + "-a", "tests/map_a.bed", + "-b", "tests/map_b.bed", + "-g", "tests/hg38.small.fai", + "-c", "0", + ]) + .output() + .expect("failed to execute bedder map"); + + assert!(!output.status.success(), "bedder map should have failed with -c 0"); +} diff --git a/tests/map_a_nohit.bed b/tests/map_a_nohit.bed new file mode 100644 index 0000000..1613c8a --- /dev/null +++ b/tests/map_a_nohit.bed @@ -0,0 +1,2 @@ +chr1 100 200 geneA 10 +chr1 500 600 geneC 30 diff --git a/tests/map_a_noname.bed b/tests/map_a_noname.bed new file mode 100644 index 0000000..9e5b74e --- /dev/null +++ b/tests/map_a_noname.bed @@ -0,0 +1,2 @@ +chr1 100 200 +chr1 300 400