From ca0c40f2daf6dc1e27a5ce3e89935c319fad5cf4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 9 Jun 2026 12:32:42 +0000 Subject: [PATCH 1/2] Initial plan From 71731c0e835ffd5d194ef2d96af24a096888128e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 9 Jun 2026 12:38:18 +0000 Subject: [PATCH 2/2] Apply remaining changes --- bin/compute_hotspots_selection.py | 96 ++++++++++++++++++++++++ modules/local/hotspots_selection/main.nf | 46 ++++++++++++ patch_omega_main.py | 28 +++++++ patch_omega_main2.py | 14 ++++ subworkflows/local/omega/main.nf | 13 ++++ 5 files changed, 197 insertions(+) create mode 100755 bin/compute_hotspots_selection.py create mode 100644 modules/local/hotspots_selection/main.nf create mode 100644 patch_omega_main.py create mode 100644 patch_omega_main2.py diff --git a/bin/compute_hotspots_selection.py b/bin/compute_hotspots_selection.py new file mode 100755 index 00000000..3042b4b9 --- /dev/null +++ b/bin/compute_hotspots_selection.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python + +import click +import numpy as np +import pandas as pd +import scipy.stats as stats +import glob +import os + +from utils import MIN_NONZERO_PVALUE +from omega_comparison_per_site import poisson_pvalue, benjamini_hochberg + +def load_panel_hotspots(panel_file, hotspots_file): + panel = pd.read_csv(panel_file, sep="\t", compression="gzip") + hotspots = pd.read_csv(hotspots_file, sep="\t") + + # Merge hotspots with panel to find which sites are hotspots + # Hotspots have CHROM, POS, MUTTYPE. MUTTYPE is REF>ALT. + hotspots['REF'] = hotspots['MUTTYPE'].str.split('>').str[0] + hotspots['ALT'] = hotspots['MUTTYPE'].str.split('>').str[1] + + # Some hotspots might have MUTTYPE = '-' for non-SNV, but let's assume standard format for now + hotspots_panel = panel.merge(hotspots, on=['CHROM', 'POS', 'REF', 'ALT'], how='inner') + + return hotspots_panel + +def process_comparison(comparison_file, hotspots_panel, size_type, output_prefix): + comp_df = pd.read_csv(comparison_file, sep="\t", compression="gzip") + + if size_type == "site": + group_cols = ['CHROM', 'POS', 'REF', 'ALT', 'GENE'] + elif size_type == "aminoacid": + group_cols = ['GENE', 'Feature', 'Protein_position'] + elif size_type == "aminoacid_change": + group_cols = ['GENE', 'Feature', 'Protein_position', 'Amino_acids'] + else: + raise ValueError(f"Unknown size type: {size_type}") + + # Determine which entries are hotspots + # Get the unique hotspots for the current grouping + hotspots_grouped = hotspots_panel[group_cols].drop_duplicates() + hotspots_grouped['Hotspot'] = 'Yes' + + # Merge with the comparison data + comp_df = comp_df.merge(hotspots_grouped, on=group_cols, how='left') + comp_df['Hotspot'] = comp_df['Hotspot'].fillna('No') + + # Group by GENE and Hotspot + grouped = comp_df.groupby(['GENE', 'Hotspot'])[['OBSERVED_MUTS', 'EXPECTED_MUTS']].sum().reset_index() + + grouped["OBS/EXP"] = (grouped["OBSERVED_MUTS"] / grouped["EXPECTED_MUTS"]).fillna(0) + grouped["OBS/EXP"] = grouped["OBS/EXP"].replace([np.inf, -np.inf], 0) + grouped["p_value"] = grouped.apply(lambda row: poisson_pvalue(row["OBSERVED_MUTS"], row["EXPECTED_MUTS"]), axis=1) + + grouped["p_value"] = grouped["p_value"].replace(0, MIN_NONZERO_PVALUE) + + grouped["p_value_adj"] = np.nan + for gene, gene_df in grouped.groupby("GENE", dropna=False): + valid_mask = gene_df["p_value"].notna() + if not valid_mask.any(): + continue + adjusted = benjamini_hochberg(gene_df.loc[valid_mask, "p_value"].to_numpy()) + grouped.loc[gene_df.index[valid_mask], "p_value_adj"] = adjusted + + # Write output + output_file = f"{output_prefix}.{size_type}.hotspots_selection.tsv.gz" + grouped.to_csv(output_file, sep="\t", index=False) + click.echo(f"Results for size '{size_type}' written to {output_file}") + + +@click.command() +@click.option('--comparisons', multiple=True, type=click.Path(exists=True), required=True, help="Path to comparison files.") +@click.option('--panel-file', type=click.Path(exists=True), required=True, help="Path to captured panel file (gzip compressed).") +@click.option('--hotspots-file', type=click.Path(exists=True), required=True, help="Path to hotspots file.") +@click.option('--output-prefix', type=str, required=True, help="Output file prefix.") +def main(comparisons, panel_file, hotspots_file, output_prefix): + """Compute selection for known hotspots based on site comparison output.""" + hotspots_panel = load_panel_hotspots(panel_file, hotspots_file) + + for comp_file in comparisons: + filename = os.path.basename(comp_file) + if ".site.comparison." in filename: + size_type = "site" + elif ".aminoacid_change.comparison." in filename: + size_type = "aminoacid_change" + elif ".aminoacid.comparison." in filename: + size_type = "aminoacid" + else: + click.echo(f"Warning: Could not determine size type from filename {filename}. Skipping.") + continue + + click.echo(f"Processing size: {size_type} from {comp_file}") + process_comparison(comp_file, hotspots_panel, size_type, output_prefix) + +if __name__ == "__main__": + main() diff --git a/modules/local/hotspots_selection/main.nf b/modules/local/hotspots_selection/main.nf new file mode 100644 index 00000000..31ce3e3f --- /dev/null +++ b/modules/local/hotspots_selection/main.nf @@ -0,0 +1,46 @@ +process HOTSPOTS_SELECTION { + tag "$meta.id" + label 'cpu_single_fixed' + label 'time_low' + label 'process_high_memory' + + label 'deepcsa_core' + + input: + tuple val(meta), path(comparisons) + tuple val(meta2), path(annotated_panel_richer) + path(hotspots_file) + + output: + tuple val(meta), path("*.hotspots_selection.tsv.gz") , emit: selections + path "versions.yml" , topic: versions + + script: + def prefix = task.ext.prefix ?: "" + prefix = "${meta.id}${prefix}" + def comparison_args = comparisons.collect { "--comparisons ${it}" }.join(' ') + """ + compute_hotspots_selection.py \\ + ${comparison_args} \\ + --panel-file ${annotated_panel_richer} \\ + --hotspots-file ${hotspots_file} \\ + --output-prefix ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "" + prefix = "${meta.id}${prefix}" + """ + touch ${prefix}.site.hotspots_selection.tsv.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/patch_omega_main.py b/patch_omega_main.py new file mode 100644 index 00000000..95269b33 --- /dev/null +++ b/patch_omega_main.py @@ -0,0 +1,28 @@ +import sys + +with open('subworkflows/local/omega/main.nf', 'r') as f: + lines = f.readlines() + +new_lines = [] +for i, line in enumerate(lines): + new_lines.append(line) + if "include { OMEGA_MULTITEST as OMEGAMULTIPLETESTGLOBALLOC } from '../../../modules/local/omega_multipletesting/main'" in line: + new_lines.append("include { HOTSPOTS_SELECTION as HOTSPOTSSELECTION } from '../../../modules/local/hotspots_selection/main'\n") + new_lines.append("include { HOTSPOTS_SELECTION as HOTSPOTSSELECTIONGLOBALLOC } from '../../../modules/local/hotspots_selection/main'\n") + + if "site_comparison_results_flattened }" in line: + new_lines.append(""" + if (params.hotspots_annotation && params.hotspots_definition_file) { + hotspots_file = channel.fromPath(params.hotspots_definition_file, checkIfExists: true).first() + + HOTSPOTSSELECTION( + site_comparison_results, + QUERYPANEL.out.subset.first(), + hotspots_file + ) + // If needed, we can also collect or emit these results + } +""") + +with open('subworkflows/local/omega/main.nf', 'w') as f: + f.writelines(new_lines) diff --git a/patch_omega_main2.py b/patch_omega_main2.py new file mode 100644 index 00000000..3d70b1b8 --- /dev/null +++ b/patch_omega_main2.py @@ -0,0 +1,14 @@ +import sys + +with open('subworkflows/local/omega/main.nf', 'r') as f: + lines = f.readlines() + +new_lines = [] +for line in lines: + if "site_comparison_results_flattened }" in line: + pass # Handle this differently + + new_lines.append(line) + +# Let's just do it directly using a simple sed-like replacement in python + diff --git a/subworkflows/local/omega/main.nf b/subworkflows/local/omega/main.nf index d772f826..e638c55a 100644 --- a/subworkflows/local/omega/main.nf +++ b/subworkflows/local/omega/main.nf @@ -20,6 +20,8 @@ include { PLOT_OMEGA as PLOTOMEGAGLOBALLOC } from ' include { SITE_COMPARISON as SITECOMPARISONGLOBALLOC } from '../../../modules/local/bbgtools/sitecomparison/main' include { SITE_COMPARISON as SITECOMPARISONGLOBALLOCMULTI } from '../../../modules/local/bbgtools/sitecomparison/main' include { OMEGA_MULTITEST as OMEGAMULTIPLETESTGLOBALLOC } from '../../../modules/local/omega_multipletesting/main' +include { HOTSPOTS_SELECTION as HOTSPOTSSELECTION } from '../../../modules/local/hotspots_selection/main' +include { HOTSPOTS_SELECTION as HOTSPOTSSELECTIONGLOBALLOC } from '../../../modules/local/hotspots_selection/main' workflow OMEGA_ANALYSIS{ @@ -173,6 +175,17 @@ workflow OMEGA_ANALYSIS{ [meta, all_files] }.set{ site_comparison_results_flattened } + if (params.hotspots_annotation && params.hotspots_definition_file) { + hotspots_file = channel.fromPath(params.hotspots_definition_file, checkIfExists: true).first() + + HOTSPOTSSELECTION( + site_comparison_results, + QUERYPANEL.out.subset.first(), + hotspots_file + ) + // If needed, we can also collect or emit these results + } + ESTIMATOR.out.results.map{ it -> it[1]}.flatten().set{ all_indv_results } // Keep the concatenated file in the work directory; the corrected output is published below.