bbglab · FerriolCalvet · Jun 1, 2026 · May 21, 2026 · May 21, 2026 · May 31, 2026
diff --git a/bin/omega_comparison_per_site.py b/bin/omega_comparison_per_site.py
@@ -6,9 +6,14 @@
 import pandas as pd
 import scipy.stats as stats
 
-def load_data(panel_file, mutations_file, mutabilities_file):
+from utils import MIN_NONZERO_PVALUE
+
+def load_data(panel_file, mutations_file, mutabilities_file, genes_subset=None):
     # Load captured panel
     panel = pd.read_csv(panel_file, sep="\t", compression="gzip")
+    if genes_subset:
+        genes_list = [gene.strip() for gene in genes_subset.split(',')]
+        panel = panel[panel['GENE'].isin(genes_list)].reset_index(drop=True)
     # Load mutations
     mutations = pd.read_csv(mutations_file, sep="\t")
     if "EFFECTIVE_MUTS" not in mutations.columns:
@@ -32,6 +37,17 @@ def poisson_pvalue(observed, expected):
     """
     return 1 - stats.poisson.cdf(observed - 1, expected)
 
+def benjamini_hochberg(pvals):
+    if pvals.size == 0:
+        return pvals
+    order = np.argsort(pvals)
+    ranked = np.arange(1, pvals.size + 1)
+    adjusted = np.empty_like(pvals, dtype=float)
+    adjusted[order] = pvals[order] * pvals.size / ranked
+    adjusted_sorted = np.minimum.accumulate(adjusted[order][::-1])[::-1]
+    adjusted[order] = adjusted_sorted
+    return np.clip(adjusted, 0.0, 1.0)
+
 def compute_by_size(panel, mutations, mutabilities, size, sample_column):
     # Merge mutations and panel to associate protein positions
     mutations = mutations.merge(panel[['CHROM', 'POS', 'REF', 'ALT', 'Protein_position', 'Amino_acids', 'GENE', 'Feature']],
@@ -66,6 +82,17 @@ def compute_by_size(panel, mutations, mutabilities, size, sample_column):
     result["OBS/EXP"] = (result["OBSERVED_MUTS"] / result["EXPECTED_MUTS"]).fillna(0)
     result["OBS/EXP"] = result["OBS/EXP"].replace([np.inf, -np.inf], 0)
     result["p_value"] = result[["OBSERVED_MUTS", "EXPECTED_MUTS"]].apply(lambda row: poisson_pvalue(row["OBSERVED_MUTS"], row["EXPECTED_MUTS"]), axis=1)
+
+    # fill pvalues == 0 with the value or resolution limit
+    result["p_value"] = result["p_value"].replace(0, MIN_NONZERO_PVALUE)
+
+    result["p_value_adj"] = np.nan
+    for gene, gene_df in result.groupby("GENE", dropna=False):
+        valid_mask = gene_df["p_value"].notna()
+        if not valid_mask.any():
+            continue
+        adjusted = benjamini_hochberg(gene_df.loc[valid_mask, "p_value"].to_numpy())
+        result.loc[gene_df.index[valid_mask], "p_value_adj"] = adjusted
 
     return result
 
@@ -78,9 +105,10 @@ def compute_by_size(panel, mutations, mutabilities, size, sample_column):
 @click.option('--panel-file', type=click.Path(exists=True), required=True, help="Path to captured panel file (gzip compressed).")
 @click.option('--size', type=str, default='aminoacid_change', help="Comma-separated list of sizes or 'all' for all sizes. The options are: 'site', 'aminoacid', 'aminoacid_change'")
 @click.option('--output-prefix', type=str, required=True, help="Output file prefix.")
-def main(panel_file, mutations_file, mutabilities_file, size, output_prefix):
+@click.option('--genes', type=str, default= '', required=False, help="List of genes to subset (comma-separated). If not provided, all genes will be included.")
+def main(panel_file, mutations_file, mutabilities_file, size, output_prefix, genes):
     """Compute comparison between observed mutations and accumulated mutabilities."""
-    panel, mutations, mutabilities = load_data(panel_file, mutations_file, mutabilities_file)
+    panel, mutations, mutabilities = load_data(panel_file, mutations_file, mutabilities_file, genes if genes != '' else None)
 
     # Get sample column dynamically
     sample_column = get_sample_column(mutabilities)

diff --git a/bin/plot_gene_saturation.py b/bin/plot_gene_saturation.py
@@ -447,7 +447,7 @@ def plot_gene_selection(mut_count_df,
                         plot_pars,
                         title,
                         ddg_df=None,
-                        thr_selection=1e-5,
+                        thr_selection=0.05,
                         lst_tracks=["Mut_count", "Site_selection", "Res_depth", "Domain"],
                         default_track_order=False,
                         save=False,
@@ -517,7 +517,7 @@ def plot_gene_selection(mut_count_df,
             )
 
         site_selection_hits_df = pd.DataFrame({"Protein_position" : np.arange(protein_len)+1}).merge(
-            site_selection_df[site_selection_df["p_value"] < thr_selection].reset_index(drop=True), how="left")
+            site_selection_df[site_selection_df["p_value_adj"] < thr_selection].reset_index(drop=True), how="left")
         axes[ax].fill_between(
             site_selection_hits_df.Protein_position, 0, site_selection_hits_df.Selection,
             color=plot_pars["colors"]["site_selection_hits"], alpha=1, zorder=2, lw=1.5, label="Significant"
@@ -583,7 +583,7 @@ def plot_gene_selection(mut_count_df,
         for impact in ["missense", "truncating"]:
 
             exon_selection_impact = exon_selection_df[exon_selection_df["impact"] == impact]
-            exon_selection_impact_hits = exon_selection_impact[exon_selection_impact["pvalue"] < thr_selection].reset_index(drop=True)
+            exon_selection_impact_hits = exon_selection_impact[exon_selection_impact["pvalue_adj"] < thr_selection].reset_index(drop=True)
             axes[ax].scatter(
                 exon_selection_impact["MID_PROT_POS"], exon_selection_impact["dnds"],
                 zorder=3, color=plot_pars["colors"]["exon_selection"][impact], s=60, lw=0.1, ec="black"
@@ -984,7 +984,7 @@ def plot_domain_selection(
     # this dictionary is defined in the gene plot with the gene--domain name
     domain_selection_in_gene["color"] = domain_selection_in_gene["DOMAIN_ID"].map(color_map)
 
-    domain_selection_in_gene["edge_width"] = domain_selection_in_gene["pvalue"].apply(lambda p: 1.5 if p < pvalue_threshold else 0.2)
+    domain_selection_in_gene["edge_width"] = domain_selection_in_gene["pvalue_adj"].apply(lambda p: 1.5 if p < pvalue_threshold else 0.2)
 
     fig = plt.figure(figsize=(5, 2.5))
 
@@ -1085,9 +1085,9 @@ def get_selection_groups(df, thr_selection):
 
     df = df.copy().rename(columns={"Protein_position": "Pos"})
     g0 = df[df["Selection"] == 0].copy()
-    g1 = df[(df["Selection"] != 0) & (df["p_value"] >= thr_selection)].copy()
+    g1 = df[(df["Selection"] != 0) & (df["p_value_adj"] >= thr_selection)].copy()
 
-    g2 = df[df["p_value"] < thr_selection].copy()
+    g2 = df[df["p_value_adj"] < thr_selection].copy()
     g2 = g2.sort_values("Selection").reset_index(drop=True)
 
     g0["Group"] = "G0"
@@ -1159,7 +1159,7 @@ def plot_all_domain_selection(df, color_map, figsize=(10, 3), show_domain_legend
     custom_markers = {"missense": "o", "truncating": "D"}  # 'o' = Circle, 'D' = Rotated Square
     custom_size = {"missense": 150, "truncating": 100}  # Larger for missense
     df["color"] = df["domain_id"].map(color_map) 
-    df["edge_width"] = df["pvalue"].apply(lambda p: 1.5 if p < 0.05 else 0.2)
+    df["edge_width"] = df["pvalue_adj"].apply(lambda p: 1.5 if p < 0.05 else 0.2)
 
     fig = plt.figure(figsize=figsize)
 
@@ -1268,7 +1268,8 @@ def plotting_wrapper(maf, exons_depth, o3d_df, exon_selection,
 def plotting_single_gene(gene, maf, exons_depth, o3d_df,
                             exon_selection, domain_selection, site_selection,
                             output_dir, o3d_seq_df, o3d_pdb_tool_df, domain,
-                            lst_tracks
+                            lst_tracks,
+                            p_value_threshold = 0.05
                             ):
 
     # Data
@@ -1311,7 +1312,7 @@ def plotting_single_gene(gene, maf, exons_depth, o3d_df,
 
     exon_selection_gene = exon_selection[exon_selection["gene"].str.startswith(gene)]
     exon_selection_gene = exon_selection_gene.sort_values("exon_rank").reset_index(drop=True).rename(columns={"exon_rank" : "EXON_RANK"})
-    exon_selection_gene = exon_selection_gene[["EXON_RANK", "impact", "dnds", "lower", "upper", "pvalue"]]
+    exon_selection_gene = exon_selection_gene[["EXON_RANK", "impact", "dnds", "lower", "upper", "pvalue_adj"]]
 
     site_selection_gene = site_selection[site_selection["GENE"] == gene].sort_values("Protein_position").reset_index(drop=True)
 
@@ -1360,7 +1361,7 @@ def plotting_single_gene(gene, maf, exons_depth, o3d_df,
         plot_pars=plot_pars,
         title=f"{gene}",
         lst_tracks=lst_tracks,
-        thr_selection=0.00001,
+        thr_selection=p_value_threshold,
         default_track_order=False,
         save=True,
         filename=f"{output_dir}/{gene}.saturation_all.png"
@@ -1397,7 +1398,7 @@ def plotting_single_gene(gene, maf, exons_depth, o3d_df,
 
     # Selection groups feat
     # =====================
-    site_selection_gene_grouped = get_selection_groups(site_selection_gene, thr_selection=0.00001)
+    site_selection_gene_grouped = get_selection_groups(site_selection_gene, thr_selection=p_value_threshold)
     site_selection_gene_grouped = site_selection_gene_grouped.merge(pdb_tool_gene[["Pos", "pACC"]])
 
     if ddg_gene is not None:
@@ -1417,14 +1418,15 @@ def data_loading(sample_name, domain, exons_depth, track_list):
     # helper: check for files and call readers only when present
 
     ## Positive selection files
-    omega_file = f"output_mle.{sample_name}.global_loc.tsv"
+    omega_file = f"all_omegas_global_loc.tsv"
     site_selection_file = f"{sample_name}.aminoacid.comparison.tsv.gz"    
     o3d_df_file = f"{sample_name}.3d_clustering_pos.csv"
 
     # Omega data (may be missing)
     if os.path.isfile(omega_file):
         try:
             omega_table = pd.read_table(omega_file)
+            omega_table = omega_table[omega_table["sample"] == sample_name]
         except Exception as e:
             print(f"Could not read omega file {omega_file}: {e}")
             omega_table = pd.DataFrame()

diff --git a/bin/utils.py b/bin/utils.py
@@ -2,6 +2,8 @@
 import json
 import pandas as pd
 
+MIN_NONZERO_PVALUE = 1.17e-38
+
 
 def add_filter(old_filt, add_filt, filt_name):
     """

diff --git a/conf/tools/omega.config b/conf/tools/omega.config
@@ -14,6 +14,7 @@ process {
 
     withName: 'SITECOMPARISON.*' {
         ext.size = params.site_comparison_grouping
+        ext.genes_subset = params.selected_genes
     }
 
 

diff --git a/modules/local/bbgtools/sitecomparison/main.nf b/modules/local/bbgtools/sitecomparison/main.nf
@@ -19,12 +19,15 @@ process SITE_COMPARISON {
     def prefix = task.ext.prefix ?: ""
     prefix = "${meta.id}${prefix}"
     def size = task.ext.size ?: "all" // other options are 'site', 'aa_change', 'aa', '3aa', '3aa_rolling' // think if is worth having 'Naa', 'Naa_rolling'
+    def genes_subset = task.ext.genes_subset ?: ""
+    target_genes = genes_subset != "" ? "--genes ${genes_subset}": ""
     """
     omega_comparison_per_site.py --mutations-file ${mutations} \\
                                     --panel-file ${annotated_panel_richer} \\
                                     --mutabilities-file ${mutabilities_per_site} \\
                                     --size ${size} \\
-                                    --output-prefix ${prefix}
+                                    --output-prefix ${prefix} \\
+                                    ${target_genes}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/workflows/deepcsa.nf b/workflows/deepcsa.nf
@@ -623,6 +623,9 @@ workflow DEEPCSA {
     if (params.omega || params.oncodrive3d || params.oncodrivefml || params.indels || run_mutdensity) {
         if (params.omega){
             positive_selection_results = positive_selection_results.combine(PLOTTINGQC.out.flagged_omegas)
+            if (params.omega_globalloc){
+                positive_selection_results = positive_selection_results.combine(all_compiled_omegasgloballoc)
+            }
         }
         positive_selection_results_ready = positive_selection_results.map { element -> [element[0], element[1..-1]] }
         PLOTTINGSUMMARY(positive_selection_results_ready,
-Original file line number
+Diff line change
@@ Expand Up / @@ -14,6 +14,7 @@ process { @@
         withName: 'SITECOMPARISON.*' {
             ext.size = params.site_comparison_grouping
+            ext.genes_subset = params.selected_genes
         }
@@ Expand Down @@