Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 31 additions & 3 deletions bin/omega_comparison_per_site.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,14 @@
import pandas as pd
import scipy.stats as stats

def load_data(panel_file, mutations_file, mutabilities_file):
from utils import MIN_NONZERO_PVALUE

def load_data(panel_file, mutations_file, mutabilities_file, genes_subset=None):
# Load captured panel
panel = pd.read_csv(panel_file, sep="\t", compression="gzip")
if genes_subset:
genes_list = [gene.strip() for gene in genes_subset.split(',')]
panel = panel[panel['GENE'].isin(genes_list)].reset_index(drop=True)
# Load mutations
mutations = pd.read_csv(mutations_file, sep="\t")
if "EFFECTIVE_MUTS" not in mutations.columns:
Expand All @@ -32,6 +37,17 @@ def poisson_pvalue(observed, expected):
"""
return 1 - stats.poisson.cdf(observed - 1, expected)

def benjamini_hochberg(pvals):
if pvals.size == 0:
return pvals
order = np.argsort(pvals)
ranked = np.arange(1, pvals.size + 1)
adjusted = np.empty_like(pvals, dtype=float)
adjusted[order] = pvals[order] * pvals.size / ranked
adjusted_sorted = np.minimum.accumulate(adjusted[order][::-1])[::-1]
adjusted[order] = adjusted_sorted
return np.clip(adjusted, 0.0, 1.0)

def compute_by_size(panel, mutations, mutabilities, size, sample_column):
# Merge mutations and panel to associate protein positions
mutations = mutations.merge(panel[['CHROM', 'POS', 'REF', 'ALT', 'Protein_position', 'Amino_acids', 'GENE', 'Feature']],
Expand Down Expand Up @@ -66,6 +82,17 @@ def compute_by_size(panel, mutations, mutabilities, size, sample_column):
result["OBS/EXP"] = (result["OBSERVED_MUTS"] / result["EXPECTED_MUTS"]).fillna(0)
result["OBS/EXP"] = result["OBS/EXP"].replace([np.inf, -np.inf], 0)
result["p_value"] = result[["OBSERVED_MUTS", "EXPECTED_MUTS"]].apply(lambda row: poisson_pvalue(row["OBSERVED_MUTS"], row["EXPECTED_MUTS"]), axis=1)

# fill pvalues == 0 with the value or resolution limit
result["p_value"] = result["p_value"].replace(0, MIN_NONZERO_PVALUE)

result["p_value_adj"] = np.nan
for gene, gene_df in result.groupby("GENE", dropna=False):
valid_mask = gene_df["p_value"].notna()
if not valid_mask.any():
continue
adjusted = benjamini_hochberg(gene_df.loc[valid_mask, "p_value"].to_numpy())
result.loc[gene_df.index[valid_mask], "p_value_adj"] = adjusted

return result

Expand All @@ -78,9 +105,10 @@ def compute_by_size(panel, mutations, mutabilities, size, sample_column):
@click.option('--panel-file', type=click.Path(exists=True), required=True, help="Path to captured panel file (gzip compressed).")
@click.option('--size', type=str, default='aminoacid_change', help="Comma-separated list of sizes or 'all' for all sizes. The options are: 'site', 'aminoacid', 'aminoacid_change'")
@click.option('--output-prefix', type=str, required=True, help="Output file prefix.")
def main(panel_file, mutations_file, mutabilities_file, size, output_prefix):
@click.option('--genes', type=str, default= '', required=False, help="List of genes to subset (comma-separated). If not provided, all genes will be included.")
def main(panel_file, mutations_file, mutabilities_file, size, output_prefix, genes):
"""Compute comparison between observed mutations and accumulated mutabilities."""
panel, mutations, mutabilities = load_data(panel_file, mutations_file, mutabilities_file)
panel, mutations, mutabilities = load_data(panel_file, mutations_file, mutabilities_file, genes if genes != '' else None)

# Get sample column dynamically
sample_column = get_sample_column(mutabilities)
Expand Down
26 changes: 14 additions & 12 deletions bin/plot_gene_saturation.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ def plot_gene_selection(mut_count_df,
plot_pars,
title,
ddg_df=None,
thr_selection=1e-5,
thr_selection=0.05,
lst_tracks=["Mut_count", "Site_selection", "Res_depth", "Domain"],
default_track_order=False,
save=False,
Expand Down Expand Up @@ -517,7 +517,7 @@ def plot_gene_selection(mut_count_df,
)

site_selection_hits_df = pd.DataFrame({"Protein_position" : np.arange(protein_len)+1}).merge(
site_selection_df[site_selection_df["p_value"] < thr_selection].reset_index(drop=True), how="left")
site_selection_df[site_selection_df["p_value_adj"] < thr_selection].reset_index(drop=True), how="left")
axes[ax].fill_between(
site_selection_hits_df.Protein_position, 0, site_selection_hits_df.Selection,
color=plot_pars["colors"]["site_selection_hits"], alpha=1, zorder=2, lw=1.5, label="Significant"
Expand Down Expand Up @@ -583,7 +583,7 @@ def plot_gene_selection(mut_count_df,
for impact in ["missense", "truncating"]:

exon_selection_impact = exon_selection_df[exon_selection_df["impact"] == impact]
exon_selection_impact_hits = exon_selection_impact[exon_selection_impact["pvalue"] < thr_selection].reset_index(drop=True)
exon_selection_impact_hits = exon_selection_impact[exon_selection_impact["pvalue_adj"] < thr_selection].reset_index(drop=True)
axes[ax].scatter(
exon_selection_impact["MID_PROT_POS"], exon_selection_impact["dnds"],
zorder=3, color=plot_pars["colors"]["exon_selection"][impact], s=60, lw=0.1, ec="black"
Expand Down Expand Up @@ -984,7 +984,7 @@ def plot_domain_selection(
# this dictionary is defined in the gene plot with the gene--domain name
domain_selection_in_gene["color"] = domain_selection_in_gene["DOMAIN_ID"].map(color_map)

domain_selection_in_gene["edge_width"] = domain_selection_in_gene["pvalue"].apply(lambda p: 1.5 if p < pvalue_threshold else 0.2)
domain_selection_in_gene["edge_width"] = domain_selection_in_gene["pvalue_adj"].apply(lambda p: 1.5 if p < pvalue_threshold else 0.2)

fig = plt.figure(figsize=(5, 2.5))

Expand Down Expand Up @@ -1085,9 +1085,9 @@ def get_selection_groups(df, thr_selection):

df = df.copy().rename(columns={"Protein_position": "Pos"})
g0 = df[df["Selection"] == 0].copy()
g1 = df[(df["Selection"] != 0) & (df["p_value"] >= thr_selection)].copy()
g1 = df[(df["Selection"] != 0) & (df["p_value_adj"] >= thr_selection)].copy()

g2 = df[df["p_value"] < thr_selection].copy()
g2 = df[df["p_value_adj"] < thr_selection].copy()
g2 = g2.sort_values("Selection").reset_index(drop=True)

g0["Group"] = "G0"
Expand Down Expand Up @@ -1159,7 +1159,7 @@ def plot_all_domain_selection(df, color_map, figsize=(10, 3), show_domain_legend
custom_markers = {"missense": "o", "truncating": "D"} # 'o' = Circle, 'D' = Rotated Square
custom_size = {"missense": 150, "truncating": 100} # Larger for missense
df["color"] = df["domain_id"].map(color_map)
df["edge_width"] = df["pvalue"].apply(lambda p: 1.5 if p < 0.05 else 0.2)
df["edge_width"] = df["pvalue_adj"].apply(lambda p: 1.5 if p < 0.05 else 0.2)

fig = plt.figure(figsize=figsize)

Expand Down Expand Up @@ -1268,7 +1268,8 @@ def plotting_wrapper(maf, exons_depth, o3d_df, exon_selection,
def plotting_single_gene(gene, maf, exons_depth, o3d_df,
exon_selection, domain_selection, site_selection,
output_dir, o3d_seq_df, o3d_pdb_tool_df, domain,
lst_tracks
lst_tracks,
p_value_threshold = 0.05
):

# Data
Expand Down Expand Up @@ -1311,7 +1312,7 @@ def plotting_single_gene(gene, maf, exons_depth, o3d_df,

exon_selection_gene = exon_selection[exon_selection["gene"].str.startswith(gene)]
exon_selection_gene = exon_selection_gene.sort_values("exon_rank").reset_index(drop=True).rename(columns={"exon_rank" : "EXON_RANK"})
exon_selection_gene = exon_selection_gene[["EXON_RANK", "impact", "dnds", "lower", "upper", "pvalue"]]
exon_selection_gene = exon_selection_gene[["EXON_RANK", "impact", "dnds", "lower", "upper", "pvalue_adj"]]

site_selection_gene = site_selection[site_selection["GENE"] == gene].sort_values("Protein_position").reset_index(drop=True)

Expand Down Expand Up @@ -1360,7 +1361,7 @@ def plotting_single_gene(gene, maf, exons_depth, o3d_df,
plot_pars=plot_pars,
title=f"{gene}",
lst_tracks=lst_tracks,
thr_selection=0.00001,
thr_selection=p_value_threshold,
default_track_order=False,
save=True,
filename=f"{output_dir}/{gene}.saturation_all.png"
Expand Down Expand Up @@ -1397,7 +1398,7 @@ def plotting_single_gene(gene, maf, exons_depth, o3d_df,

# Selection groups feat
# =====================
site_selection_gene_grouped = get_selection_groups(site_selection_gene, thr_selection=0.00001)
site_selection_gene_grouped = get_selection_groups(site_selection_gene, thr_selection=p_value_threshold)
site_selection_gene_grouped = site_selection_gene_grouped.merge(pdb_tool_gene[["Pos", "pACC"]])

if ddg_gene is not None:
Expand All @@ -1417,14 +1418,15 @@ def data_loading(sample_name, domain, exons_depth, track_list):
# helper: check for files and call readers only when present

## Positive selection files
omega_file = f"output_mle.{sample_name}.global_loc.tsv"
omega_file = f"all_omegas_global_loc.tsv"
site_selection_file = f"{sample_name}.aminoacid.comparison.tsv.gz"
o3d_df_file = f"{sample_name}.3d_clustering_pos.csv"

# Omega data (may be missing)
if os.path.isfile(omega_file):
try:
omega_table = pd.read_table(omega_file)
omega_table = omega_table[omega_table["sample"] == sample_name]
except Exception as e:
print(f"Could not read omega file {omega_file}: {e}")
omega_table = pd.DataFrame()
Expand Down
2 changes: 2 additions & 0 deletions bin/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import json
import pandas as pd

MIN_NONZERO_PVALUE = 1.17e-38


def add_filter(old_filt, add_filt, filt_name):
"""
Expand Down
1 change: 1 addition & 0 deletions conf/tools/omega.config
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ process {

withName: 'SITECOMPARISON.*' {
ext.size = params.site_comparison_grouping
ext.genes_subset = params.selected_genes
}


Expand Down
5 changes: 4 additions & 1 deletion modules/local/bbgtools/sitecomparison/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,15 @@ process SITE_COMPARISON {
def prefix = task.ext.prefix ?: ""
prefix = "${meta.id}${prefix}"
def size = task.ext.size ?: "all" // other options are 'site', 'aa_change', 'aa', '3aa', '3aa_rolling' // think if is worth having 'Naa', 'Naa_rolling'
def genes_subset = task.ext.genes_subset ?: ""
target_genes = genes_subset != "" ? "--genes ${genes_subset}": ""
"""
omega_comparison_per_site.py --mutations-file ${mutations} \\
--panel-file ${annotated_panel_richer} \\
--mutabilities-file ${mutabilities_per_site} \\
--size ${size} \\
--output-prefix ${prefix}
--output-prefix ${prefix} \\
${target_genes}
Comment thread
FerriolCalvet marked this conversation as resolved.

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
3 changes: 3 additions & 0 deletions workflows/deepcsa.nf
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,9 @@ workflow DEEPCSA {
if (params.omega || params.oncodrive3d || params.oncodrivefml || params.indels || run_mutdensity) {
if (params.omega){
positive_selection_results = positive_selection_results.combine(PLOTTINGQC.out.flagged_omegas)
if (params.omega_globalloc){
positive_selection_results = positive_selection_results.combine(all_compiled_omegasgloballoc)
}
}
positive_selection_results_ready = positive_selection_results.map { element -> [element[0], element[1..-1]] }
PLOTTINGSUMMARY(positive_selection_results_ready,
Expand Down