Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions bin/compute_hotspots_selection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env python

import click
import numpy as np
import pandas as pd
import scipy.stats as stats
import glob
import os

from utils import MIN_NONZERO_PVALUE
from omega_comparison_per_site import poisson_pvalue, benjamini_hochberg

def load_panel_hotspots(panel_file, hotspots_file):
panel = pd.read_csv(panel_file, sep="\t", compression="gzip")
hotspots = pd.read_csv(hotspots_file, sep="\t")

# Merge hotspots with panel to find which sites are hotspots
# Hotspots have CHROM, POS, MUTTYPE. MUTTYPE is REF>ALT.
hotspots['REF'] = hotspots['MUTTYPE'].str.split('>').str[0]
hotspots['ALT'] = hotspots['MUTTYPE'].str.split('>').str[1]

# Some hotspots might have MUTTYPE = '-' for non-SNV, but let's assume standard format for now
hotspots_panel = panel.merge(hotspots, on=['CHROM', 'POS', 'REF', 'ALT'], how='inner')

return hotspots_panel

def process_comparison(comparison_file, hotspots_panel, size_type, output_prefix):
comp_df = pd.read_csv(comparison_file, sep="\t", compression="gzip")

if size_type == "site":
group_cols = ['CHROM', 'POS', 'REF', 'ALT', 'GENE']
elif size_type == "aminoacid":
group_cols = ['GENE', 'Feature', 'Protein_position']
elif size_type == "aminoacid_change":
group_cols = ['GENE', 'Feature', 'Protein_position', 'Amino_acids']
else:
raise ValueError(f"Unknown size type: {size_type}")

# Determine which entries are hotspots
# Get the unique hotspots for the current grouping
hotspots_grouped = hotspots_panel[group_cols].drop_duplicates()
hotspots_grouped['Hotspot'] = 'Yes'

# Merge with the comparison data
comp_df = comp_df.merge(hotspots_grouped, on=group_cols, how='left')
comp_df['Hotspot'] = comp_df['Hotspot'].fillna('No')

# Group by GENE and Hotspot
grouped = comp_df.groupby(['GENE', 'Hotspot'])[['OBSERVED_MUTS', 'EXPECTED_MUTS']].sum().reset_index()

grouped["OBS/EXP"] = (grouped["OBSERVED_MUTS"] / grouped["EXPECTED_MUTS"]).fillna(0)
grouped["OBS/EXP"] = grouped["OBS/EXP"].replace([np.inf, -np.inf], 0)
grouped["p_value"] = grouped.apply(lambda row: poisson_pvalue(row["OBSERVED_MUTS"], row["EXPECTED_MUTS"]), axis=1)

grouped["p_value"] = grouped["p_value"].replace(0, MIN_NONZERO_PVALUE)

grouped["p_value_adj"] = np.nan
for gene, gene_df in grouped.groupby("GENE", dropna=False):
valid_mask = gene_df["p_value"].notna()
if not valid_mask.any():
continue
adjusted = benjamini_hochberg(gene_df.loc[valid_mask, "p_value"].to_numpy())
grouped.loc[gene_df.index[valid_mask], "p_value_adj"] = adjusted

# Write output
output_file = f"{output_prefix}.{size_type}.hotspots_selection.tsv.gz"
grouped.to_csv(output_file, sep="\t", index=False)
click.echo(f"Results for size '{size_type}' written to {output_file}")


@click.command()
@click.option('--comparisons', multiple=True, type=click.Path(exists=True), required=True, help="Path to comparison files.")
@click.option('--panel-file', type=click.Path(exists=True), required=True, help="Path to captured panel file (gzip compressed).")
@click.option('--hotspots-file', type=click.Path(exists=True), required=True, help="Path to hotspots file.")
@click.option('--output-prefix', type=str, required=True, help="Output file prefix.")
def main(comparisons, panel_file, hotspots_file, output_prefix):
"""Compute selection for known hotspots based on site comparison output."""
hotspots_panel = load_panel_hotspots(panel_file, hotspots_file)

for comp_file in comparisons:
filename = os.path.basename(comp_file)
if ".site.comparison." in filename:
size_type = "site"
elif ".aminoacid_change.comparison." in filename:
size_type = "aminoacid_change"
elif ".aminoacid.comparison." in filename:
size_type = "aminoacid"
else:
click.echo(f"Warning: Could not determine size type from filename {filename}. Skipping.")
continue

click.echo(f"Processing size: {size_type} from {comp_file}")
process_comparison(comp_file, hotspots_panel, size_type, output_prefix)

if __name__ == "__main__":
main()
46 changes: 46 additions & 0 deletions modules/local/hotspots_selection/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
process HOTSPOTS_SELECTION {
tag "$meta.id"
label 'cpu_single_fixed'
label 'time_low'
label 'process_high_memory'

label 'deepcsa_core'

input:
tuple val(meta), path(comparisons)
tuple val(meta2), path(annotated_panel_richer)
path(hotspots_file)

output:
tuple val(meta), path("*.hotspots_selection.tsv.gz") , emit: selections
path "versions.yml" , topic: versions

script:
def prefix = task.ext.prefix ?: ""
prefix = "${meta.id}${prefix}"
def comparison_args = comparisons.collect { "--comparisons ${it}" }.join(' ')
"""
compute_hotspots_selection.py \\
${comparison_args} \\
--panel-file ${annotated_panel_richer} \\
--hotspots-file ${hotspots_file} \\
--output-prefix ${prefix}

cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | sed 's/Python //g')
END_VERSIONS
"""

stub:
def prefix = task.ext.prefix ?: ""
prefix = "${meta.id}${prefix}"
"""
touch ${prefix}.site.hotspots_selection.tsv.gz

cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | sed 's/Python //g')
END_VERSIONS
"""
}
28 changes: 28 additions & 0 deletions patch_omega_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import sys

with open('subworkflows/local/omega/main.nf', 'r') as f:
lines = f.readlines()

new_lines = []
for i, line in enumerate(lines):
new_lines.append(line)
if "include { OMEGA_MULTITEST as OMEGAMULTIPLETESTGLOBALLOC } from '../../../modules/local/omega_multipletesting/main'" in line:
new_lines.append("include { HOTSPOTS_SELECTION as HOTSPOTSSELECTION } from '../../../modules/local/hotspots_selection/main'\n")
new_lines.append("include { HOTSPOTS_SELECTION as HOTSPOTSSELECTIONGLOBALLOC } from '../../../modules/local/hotspots_selection/main'\n")

if "site_comparison_results_flattened }" in line:
new_lines.append("""
if (params.hotspots_annotation && params.hotspots_definition_file) {
hotspots_file = channel.fromPath(params.hotspots_definition_file, checkIfExists: true).first()

HOTSPOTSSELECTION(
site_comparison_results,
QUERYPANEL.out.subset.first(),
hotspots_file
)
// If needed, we can also collect or emit these results
}
""")

with open('subworkflows/local/omega/main.nf', 'w') as f:
f.writelines(new_lines)
14 changes: 14 additions & 0 deletions patch_omega_main2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import sys

with open('subworkflows/local/omega/main.nf', 'r') as f:
lines = f.readlines()

new_lines = []
for line in lines:
if "site_comparison_results_flattened }" in line:
pass # Handle this differently

new_lines.append(line)

# Let's just do it directly using a simple sed-like replacement in python

13 changes: 13 additions & 0 deletions subworkflows/local/omega/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ include { PLOT_OMEGA as PLOTOMEGAGLOBALLOC } from '
include { SITE_COMPARISON as SITECOMPARISONGLOBALLOC } from '../../../modules/local/bbgtools/sitecomparison/main'
include { SITE_COMPARISON as SITECOMPARISONGLOBALLOCMULTI } from '../../../modules/local/bbgtools/sitecomparison/main'
include { OMEGA_MULTITEST as OMEGAMULTIPLETESTGLOBALLOC } from '../../../modules/local/omega_multipletesting/main'
include { HOTSPOTS_SELECTION as HOTSPOTSSELECTION } from '../../../modules/local/hotspots_selection/main'
include { HOTSPOTS_SELECTION as HOTSPOTSSELECTIONGLOBALLOC } from '../../../modules/local/hotspots_selection/main'

workflow OMEGA_ANALYSIS{

Expand Down Expand Up @@ -173,6 +175,17 @@ workflow OMEGA_ANALYSIS{
[meta, all_files]
}.set{ site_comparison_results_flattened }

if (params.hotspots_annotation && params.hotspots_definition_file) {
hotspots_file = channel.fromPath(params.hotspots_definition_file, checkIfExists: true).first()

HOTSPOTSSELECTION(
site_comparison_results,
QUERYPANEL.out.subset.first(),
hotspots_file
)
// If needed, we can also collect or emit these results
}


ESTIMATOR.out.results.map{ it -> it[1]}.flatten().set{ all_indv_results }
// Keep the concatenated file in the work directory; the corrected output is published below.
Expand Down