bbglab · Copilot · Jun 9, 2026 · Jun 9, 2026
diff --git a/bin/compute_hotspots_selection.py b/bin/compute_hotspots_selection.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+
+import click
+import numpy as np
+import pandas as pd
+import scipy.stats as stats
+import glob
+import os
+
+from utils import MIN_NONZERO_PVALUE
+from omega_comparison_per_site import poisson_pvalue, benjamini_hochberg
+
+def load_panel_hotspots(panel_file, hotspots_file):
+    panel = pd.read_csv(panel_file, sep="\t", compression="gzip")
+    hotspots = pd.read_csv(hotspots_file, sep="\t")
+
+    # Merge hotspots with panel to find which sites are hotspots
+    # Hotspots have CHROM, POS, MUTTYPE. MUTTYPE is REF>ALT.
+    hotspots['REF'] = hotspots['MUTTYPE'].str.split('>').str[0]
+    hotspots['ALT'] = hotspots['MUTTYPE'].str.split('>').str[1]
+
+    # Some hotspots might have MUTTYPE = '-' for non-SNV, but let's assume standard format for now
+    hotspots_panel = panel.merge(hotspots, on=['CHROM', 'POS', 'REF', 'ALT'], how='inner')
+
+    return hotspots_panel
+
+def process_comparison(comparison_file, hotspots_panel, size_type, output_prefix):
+    comp_df = pd.read_csv(comparison_file, sep="\t", compression="gzip")
+
+    if size_type == "site":
+        group_cols = ['CHROM', 'POS', 'REF', 'ALT', 'GENE']
+    elif size_type == "aminoacid":
+        group_cols = ['GENE', 'Feature', 'Protein_position']
+    elif size_type == "aminoacid_change":
+        group_cols = ['GENE', 'Feature', 'Protein_position', 'Amino_acids']
+    else:
+        raise ValueError(f"Unknown size type: {size_type}")
+
+    # Determine which entries are hotspots
+    # Get the unique hotspots for the current grouping
+    hotspots_grouped = hotspots_panel[group_cols].drop_duplicates()
+    hotspots_grouped['Hotspot'] = 'Yes'
+
+    # Merge with the comparison data
+    comp_df = comp_df.merge(hotspots_grouped, on=group_cols, how='left')
+    comp_df['Hotspot'] = comp_df['Hotspot'].fillna('No')
+
+    # Group by GENE and Hotspot
+    grouped = comp_df.groupby(['GENE', 'Hotspot'])[['OBSERVED_MUTS', 'EXPECTED_MUTS']].sum().reset_index()
+
+    grouped["OBS/EXP"] = (grouped["OBSERVED_MUTS"] / grouped["EXPECTED_MUTS"]).fillna(0)
+    grouped["OBS/EXP"] = grouped["OBS/EXP"].replace([np.inf, -np.inf], 0)
+    grouped["p_value"] = grouped.apply(lambda row: poisson_pvalue(row["OBSERVED_MUTS"], row["EXPECTED_MUTS"]), axis=1)
+
+    grouped["p_value"] = grouped["p_value"].replace(0, MIN_NONZERO_PVALUE)
+
+    grouped["p_value_adj"] = np.nan
+    for gene, gene_df in grouped.groupby("GENE", dropna=False):
+        valid_mask = gene_df["p_value"].notna()
+        if not valid_mask.any():
+            continue
+        adjusted = benjamini_hochberg(gene_df.loc[valid_mask, "p_value"].to_numpy())
+        grouped.loc[gene_df.index[valid_mask], "p_value_adj"] = adjusted
+
+    # Write output
+    output_file = f"{output_prefix}.{size_type}.hotspots_selection.tsv.gz"
+    grouped.to_csv(output_file, sep="\t", index=False)
+    click.echo(f"Results for size '{size_type}' written to {output_file}")
+
+
+@click.command()
+@click.option('--comparisons', multiple=True, type=click.Path(exists=True), required=True, help="Path to comparison files.")
+@click.option('--panel-file', type=click.Path(exists=True), required=True, help="Path to captured panel file (gzip compressed).")
+@click.option('--hotspots-file', type=click.Path(exists=True), required=True, help="Path to hotspots file.")
+@click.option('--output-prefix', type=str, required=True, help="Output file prefix.")
+def main(comparisons, panel_file, hotspots_file, output_prefix):
+    """Compute selection for known hotspots based on site comparison output."""
+    hotspots_panel = load_panel_hotspots(panel_file, hotspots_file)
+
+    for comp_file in comparisons:
+        filename = os.path.basename(comp_file)
+        if ".site.comparison." in filename:
+            size_type = "site"
+        elif ".aminoacid_change.comparison." in filename:
+            size_type = "aminoacid_change"
+        elif ".aminoacid.comparison." in filename:
+            size_type = "aminoacid"
+        else:
+            click.echo(f"Warning: Could not determine size type from filename {filename}. Skipping.")
+            continue
+
+        click.echo(f"Processing size: {size_type} from {comp_file}")
+        process_comparison(comp_file, hotspots_panel, size_type, output_prefix)
+
+if __name__ == "__main__":
+    main()
diff --git a/modules/local/hotspots_selection/main.nf b/modules/local/hotspots_selection/main.nf
@@ -0,0 +1,46 @@
+process HOTSPOTS_SELECTION {
+    tag "$meta.id"
+    label 'cpu_single_fixed'
+    label 'time_low'
+    label 'process_high_memory'
+
+    label 'deepcsa_core'
+
+    input:
+    tuple val(meta), path(comparisons)
+    tuple val(meta2), path(annotated_panel_richer)
+    path(hotspots_file)
+
+    output:
+    tuple val(meta), path("*.hotspots_selection.tsv.gz") , emit: selections
+    path "versions.yml"                                  , topic: versions
+
+    script:
+    def prefix = task.ext.prefix ?: ""
+    prefix = "${meta.id}${prefix}"
+    def comparison_args = comparisons.collect { "--comparisons ${it}" }.join(' ')
+    """
+    compute_hotspots_selection.py \\
+        ${comparison_args} \\
+        --panel-file ${annotated_panel_richer} \\
+        --hotspots-file ${hotspots_file} \\
+        --output-prefix ${prefix}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: ""
+    prefix = "${meta.id}${prefix}"
+    """
+    touch ${prefix}.site.hotspots_selection.tsv.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+}
diff --git a/patch_omega_main.py b/patch_omega_main.py
@@ -0,0 +1,28 @@
+import sys
+
+with open('subworkflows/local/omega/main.nf', 'r') as f:
+    lines = f.readlines()
+
+new_lines = []
+for i, line in enumerate(lines):
+    new_lines.append(line)
+    if "include { OMEGA_MULTITEST           as OMEGAMULTIPLETESTGLOBALLOC       } from '../../../modules/local/omega_multipletesting/main'" in line:
+        new_lines.append("include { HOTSPOTS_SELECTION        as HOTSPOTSSELECTION                } from '../../../modules/local/hotspots_selection/main'\n")
+        new_lines.append("include { HOTSPOTS_SELECTION        as HOTSPOTSSELECTIONGLOBALLOC       } from '../../../modules/local/hotspots_selection/main'\n")
+
+    if "site_comparison_results_flattened }" in line:
+        new_lines.append("""
+    if (params.hotspots_annotation && params.hotspots_definition_file) {
+        hotspots_file = channel.fromPath(params.hotspots_definition_file, checkIfExists: true).first()
+
+        HOTSPOTSSELECTION(
+            site_comparison_results,
+            QUERYPANEL.out.subset.first(),
+            hotspots_file
+        )
+        // If needed, we can also collect or emit these results
+    }
+""")
+
+with open('subworkflows/local/omega/main.nf', 'w') as f:
+    f.writelines(new_lines)
diff --git a/patch_omega_main2.py b/patch_omega_main2.py
@@ -0,0 +1,14 @@
+import sys
+
+with open('subworkflows/local/omega/main.nf', 'r') as f:
+    lines = f.readlines()
+
+new_lines = []
+for line in lines:
+    if "site_comparison_results_flattened }" in line:
+        pass # Handle this differently
+
+    new_lines.append(line)
+
+# Let's just do it directly using a simple sed-like replacement in python
+
diff --git a/subworkflows/local/omega/main.nf b/subworkflows/local/omega/main.nf
@@ -20,6 +20,8 @@ include { PLOT_OMEGA                as PLOTOMEGAGLOBALLOC               } from '
 include { SITE_COMPARISON           as SITECOMPARISONGLOBALLOC          } from '../../../modules/local/bbgtools/sitecomparison/main'
 include { SITE_COMPARISON           as SITECOMPARISONGLOBALLOCMULTI     } from '../../../modules/local/bbgtools/sitecomparison/main'
 include { OMEGA_MULTITEST           as OMEGAMULTIPLETESTGLOBALLOC       } from '../../../modules/local/omega_multipletesting/main'
+include { HOTSPOTS_SELECTION        as HOTSPOTSSELECTION                } from '../../../modules/local/hotspots_selection/main'
+include { HOTSPOTS_SELECTION        as HOTSPOTSSELECTIONGLOBALLOC       } from '../../../modules/local/hotspots_selection/main'
 
 workflow OMEGA_ANALYSIS{
 
@@ -173,6 +175,17 @@ workflow OMEGA_ANALYSIS{
         [meta, all_files]
     }.set{ site_comparison_results_flattened }
 
+    if (params.hotspots_annotation && params.hotspots_definition_file) {
+        hotspots_file = channel.fromPath(params.hotspots_definition_file, checkIfExists: true).first()
+
+        HOTSPOTSSELECTION(
+            site_comparison_results,
+            QUERYPANEL.out.subset.first(),
+            hotspots_file
+        )
+        // If needed, we can also collect or emit these results
+    }
+
 
     ESTIMATOR.out.results.map{ it -> it[1]}.flatten().set{ all_indv_results }
     // Keep the concatenated file in the work directory; the corrected output is published below.