MannLabs · ammarcsj · May 25, 2026 · May 25, 2026 · May 25, 2026 · May 25, 2026
diff --git a/.gitignore b/.gitignore
@@ -229,3 +229,5 @@ paper_analyses
 
 alphaquant/resources/reference_databases
 alphaquant/resources/phosphopred_databases
+.claude/settings.local.json
+.claude/settings.local.json
diff --git a/alphaquant/classify/ml_info_table.py b/alphaquant/classify/ml_info_table.py
@@ -64,10 +64,12 @@ def _adapt_precursor_name_to_modification_type(self):
 
 
     def _define_ml_info_filename(self):
-        self.ml_info_filename = aq_utils.get_progress_folder_filename(self._input_file, ".ml_info_table.tsv")
+        self.ml_info_filename = aq_utils.get_progress_folder_filename(self._input_file, ".ml_info_table.tsv.zip")
 
     def _write_ml_info_table(self):
-        self._ml_info_df.to_csv(self.ml_info_filename, sep="\t", index=False)
+        archive_name = self.ml_info_filename.split("/")[-1].removesuffix(".zip")
+        compression = {"method": "zip", "archive_name": archive_name}
+        self._ml_info_df.to_csv(self.ml_info_filename, sep="\t", index=False, compression=compression)
 
 
 class MLInfoTableLoader():
@@ -81,4 +83,3 @@ def __init__(self, ml_info_file, samples_used):
     def _subset_df_to_relevant_samples(self):
         self.ml_info_df = self.ml_info_df[self.ml_info_df["sample_ID"].isin(self._samples_used)]
         self.ml_info_df = self.ml_info_df.drop(columns=["sample_ID"])
-
diff --git a/alphaquant/cluster/cluster_ions.py b/alphaquant/cluster/cluster_ions.py
@@ -1,3 +1,28 @@
+"""Hierarchical proteoform clustering and differential-expression aggregation.
+
+This module builds a hierarchical tree for each protein (fragments → peptides
+→ modified peptides → unmodified peptides → protein) and performs two
+*independent* statistical procedures at each level:
+
+1. **Proteoform clustering** — pairwise double-differential tests assess
+   whether sibling ions share the same fold change (null hypothesis: "ions
+   have equal regulation"). The resulting similarity p-values are corrected
+   with Benjamini-Yekutieli (appropriate for the intrinsic dependencies among
+   pairwise comparisons) and then used for hierarchical clustering to separate
+   proteoforms.
+
+2. **Differential-expression aggregation** — the per-ion z-values from
+   individual differential-expression tests (null hypothesis: "no change
+   between conditions") are combined into parent-level z-values via Stouffer's
+   method. This aggregation propagates evidence of regulation up the tree.
+
+The two sets of p-values address fundamentally different questions and are
+corrected separately. The Benjamini-Yekutieli correction in step 1 applies
+*within* a single protein to the similarity matrix; the final protein-level
+p-values produced by step 2 are later corrected across all proteins with
+Benjamini-Hochberg (see ``diffquant_table.py``).
+"""
+
 import scipy.spatial.distance
 import scipy.cluster.hierarchy
 import alphaquant.cluster.cluster_utils as aqcluster_utils
@@ -28,7 +53,7 @@
 
 
 
-def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fcdiff_cutoff_clustermerge, fragment_outlier_filtering=True):
+def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fcdiff_cutoff_clustermerge, aggregation_mode="stouffer_decorrelation", cluster_threshold_ion_type=0.01):
     """Main entry point for hierarchical clustering and tree-based quantification of a protein.
 
     This function creates a hierarchical tree structure from fragment ions up to the protein level
@@ -47,7 +72,7 @@ def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, i
         fcfc_threshold: Fold-change difference threshold for clustering
         take_median_ion: If True, use median-centered ions for clustering
         fcdiff_cutoff_clustermerge: Fold-change threshold for merging similar clusters
-        fragment_outlier_filtering: Whether to filter outlier fragments when aggregating to peptides
+        aggregation_mode: Strategy for combining child z-values (see cluster_utils.AGGREGATION_MODES)
 
     Returns:
         anytree.Node: Root node of the hierarchical tree containing all statistics and clustering results
@@ -63,7 +88,7 @@ def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, i
     root_node = create_hierarchical_ion_grouping(gene_name, diffions)
     add_reduced_names_to_root(root_node)
     #LOGGER.info(anytree.RenderTree(root_node))
-    root_node_clust = cluster_along_specified_levels(root_node, name2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fragment_outlier_filtering)
+    root_node_clust = cluster_along_specified_levels(root_node, name2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, aggregation_mode=aggregation_mode, cluster_threshold_ion_type=cluster_threshold_ion_type)
 
     level_sorted_nodes = [[node for node in children] for children in anytree.ZigZagGroupIter(root_node_clust)]
     level_sorted_nodes.reverse() #the base nodes are first
@@ -114,7 +139,7 @@ def add_reduced_names_to_root(node):
 
 
 import pandas as pd
-def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fragment_outlier_filtering=True):#~60% of overall runtime
+def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, aggregation_mode="stouffer_decorrelation", cluster_threshold_ion_type=0.01):#~60% of overall runtime
     """Performs hierarchical clustering at each level of the tree from bottom to top.
 
     Starting from base ions (fragments/MS1), this function iterates through each level
@@ -123,6 +148,31 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
     pairwise for consistent fold-change differences, clustered hierarchically, and
     statistics are aggregated to parent nodes.
 
+    Important: this function interleaves two *independent* statistical procedures
+    that address different questions:
+
+    1. **Proteoform clustering** (``find_fold_change_clusters``): tests whether
+       pairs of sibling ions (e.g. two peptides of the same protein) have
+       *different* fold changes, i.e. whether they belong to different
+       proteoforms. The resulting pairwise p-values form a dependent similarity
+       matrix that is corrected for multiple testing with Benjamini-Yekutieli
+       (appropriate for dependent tests) *before* hierarchical clustering.
+
+    2. **Differential-expression aggregation** (``aggregate_node_properties``):
+       combines the per-ion z-values (derived from individual differential
+       expression tests) into a single parent-level z-value via Stouffer's
+       method. These z-values quantify *how much* each ion changes between
+       conditions and are unrelated to the proteoform-similarity p-values
+       from step 1.
+
+    The Benjamini-Yekutieli correction in step 1 therefore precedes the
+    Stouffer aggregation in step 2 by design, because they operate on
+    different null hypotheses: step 1 asks "do these two peptides differ
+    from each other?" while step 2 asks "does this protein change between
+    conditions?". The final protein-level p-values produced in step 2
+    are later corrected with Benjamini-Hochberg across all proteins (see
+    ``diffquant_table.py``).
+
     Args:
         root_node: Root of the hierarchical tree (protein level)
         ionname2diffion: Dictionary mapping ion names to DifferentialIon objects
@@ -134,7 +184,7 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
         pval_threshold_basis: P-value threshold for clustering decisions
         fcfc_threshold: Fold-change threshold for clustering
         take_median_ion: Whether to use median-centered ions
-        fragment_outlier_filtering: Whether to filter fragment outliers
+        aggregation_mode: Strategy for combining child z-values (see cluster_utils.AGGREGATION_MODES)
 
     Returns:
         anytree.Node: The root node with all clustering annotations and aggregated statistics
@@ -164,15 +214,15 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
                     if take_median_ion:
                         grouped_mainclust_leafs = aqcluster_utils.select_median_fc_leafs(grouped_mainclust_leafs)
                     diffions = aqcluster_utils.map_grouped_leafs_to_diffions(grouped_mainclust_leafs, ionname2diffion) #the diffions are the ions that are actually compared
-                    childnode2clust = find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold) #the clustering is performed on the child nodes
+                    childnode2clust = find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, cluster_threshold_ion_type=cluster_threshold_ion_type) #the clustering is performed on the child nodes
                     childnode2clust = merge_similar_clusters_if_applicable(childnode2clust, type_node, fcdiff_cutoff_clustermerge = FCDIFF_CUTOFF_CLUSTERMERGE)
                     childnode2clust = aq_cluster_sorting.decide_cluster_order(childnode2clust)
 
                 aq_cluster_pfstats.add_proteoform_statistics_to_nodes(childnode2clust, take_median_ion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist)
                 aqcluster_utils.assign_clusterstats_to_type_node(type_node, childnode2clust)
                 aqcluster_utils.annotate_mainclust_leaves(childnode2clust)
                 aqcluster_utils.assign_cluster_number(type_node, childnode2clust)
-                aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False, fragment_outlier_filtering=fragment_outlier_filtering)
+                aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False, aggregation_mode=aggregation_mode)
 
     return root_node
 
@@ -183,21 +233,46 @@ def get_childnode2clust_for_single_ion(type_node):
     return {type_node.children[0]: 0}
 
 
-def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold):
-    """Compares the fold changes of the ions corresponding to the nodes that are compared and returns the set of ions with consistent fold changes.
+def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, cluster_threshold_ion_type=0.01):
+    """Cluster sibling ions by the similarity of their fold changes (proteoform inference).
+
+    For each pair of sibling ion groups, a *double-differential* test
+    (``evaluate_similarity``) computes a p-value for the null hypothesis that
+    their fold changes are equal (i.e. they belong to the same proteoform).
+    These pairwise p-values form a condensed similarity matrix that is
+    corrected for multiple testing with the Benjamini-Yekutieli procedure
+    (``get_multiple_testing_corrected_condensed_similarity_matrix``), which is
+    appropriate here because the pairwise comparisons are intrinsically
+    dependent. The corrected matrix is then converted to a distance matrix and
+    subjected to Ward hierarchical clustering.
+
+    Note: the p-values computed and corrected here test *inter-ion similarity*
+    for proteoform grouping. They are entirely separate from the per-ion
+    differential-expression p-values that are later aggregated via Stouffer's
+    method in ``aggregate_node_properties``.
 
     Args:
-        diffions (list[list[ionnames]]): contains the sets of ions to be tested, for example [[fragion1_precursor1, fragion2_precursor1, fragion3_precursor1],[fragion1_precursor2],[fragion1_precursor3, fragion2_precursor3]]. The ions are assumed to be similar in type (e.g. fragment, precursor)!
-        normed_c1 (ConditionBackground): [description]
-        normed_c2 (ConditionBackground): [description]
-        ion2diffDist (dict(ion : SubtractedBackground)): [description]
-        p2z ([type]): [description]
-        deedpair2doublediffdist ([type]): [description]
-        fc_threshold (float, optional): [description]. Defaults to 0.
-        pval_threshold_basis (float, optional): the threshold at which to merge peptides at the gene level. Defaults to 0.01
+        type_node: Parent node whose children are being clustered
+        diffions: List of lists of DifferentialIon objects, one sublist per
+            child node, e.g. ``[[fragion1_prec1, fragion2_prec1], [fragion1_prec2]]``
+        normed_c1: ConditionBackgrounds for condition 1
+        normed_c2: ConditionBackgrounds for condition 2
+        ion2diffDist: Mapping of ion pairs to differential background distributions
+        p2z: Cache for p-value to z-value conversions
+        deedpair2doublediffdist: Cache for double-differential distributions
+        pval_threshold_basis: P-value threshold at the gene level for cutting the
+            clustering dendrogram; thresholds for lower levels are looked up in
+            ``LEVEL2PVALTHRESH``
+        fcfc_threshold: Minimum fold-change difference below which two ion groups
+            are assumed similar without a formal test
+        cluster_threshold_ion_type: P-value threshold at the ion_type level
+
+    Returns:
+        list[tuple[Node, int]]: Pairs of (child node, cluster index) sorted by
+            node name for reproducibility
     """
 
-    pval_threshold_basis = get_pval_threshold_basis(type_node, pval_threshold_basis)
+    pval_threshold_basis = get_pval_threshold_basis(type_node, pval_threshold_basis, cluster_threshold_ion_type=cluster_threshold_ion_type)
     diffions_idxs = [[x] for x in range(len(diffions))]
     diffions_fcs = aqcluster_utils.get_fcs_ions(diffions)
     #mt_corrected_pval_thresh = pval_threshold_basis/len(diffions)
@@ -216,26 +291,36 @@ def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2dif
 
     return childnode2clust
 
-def get_pval_threshold_basis(type_node, pval_threshold_basis): #the pval threshold is only set at the gene level, the rest of the levels are set as specified in the LEVEL2PVALTHRESH dictionary
+def get_pval_threshold_basis(type_node, pval_threshold_basis, cluster_threshold_ion_type=0.01): #the pval threshold is only set at the gene level, the rest of the levels are set as specified in the LEVEL2PVALTHRESH dictionary
     if type_node.level == "gene":
         return pval_threshold_basis
-    else:
-        return LEVEL2PVALTHRESH.get(type_node.level, 0.2)
+    if type_node.level == "ion_type":
+        return cluster_threshold_ion_type
+    return LEVEL2PVALTHRESH.get(type_node.level, 0.2)
 
 def get_multiple_testing_corrected_condensed_similarity_matrix(condensed_distance_matrix: np.array):
-    """
-    condensed_distance_matrix contains all p-values of the pairwise comparisons of the ions. They are by definition dependent.
+    """Apply Benjamini-Yekutieli FDR correction to pairwise ion-similarity p-values.
+
+    The condensed matrix contains p-values from *double-differential* tests
+    between all pairs of sibling ions (see ``evaluate_similarity``). Each
+    p-value tests the null hypothesis that two ions share the same fold
+    change. Because every ion appears in multiple pairwise comparisons, the
+    tests are intrinsically dependent, which is why the Benjamini-Yekutieli
+    (``fdr_by``) procedure is used instead of Benjamini-Hochberg.
+
+    These corrected p-values are used exclusively for proteoform clustering
+    (deciding which ions belong together) and are unrelated to the per-ion
+    differential-expression p-values that are later aggregated via Stouffer's
+    method.
 
     Args:
-    condensed_distance_matrix (np.array): Condensed distance matrix containing p-values of pairwise comparisons.
+        condensed_distance_matrix: 1-D array of pairwise similarity p-values
+            in scipy condensed-matrix format.
 
     Returns:
-    np.array: Corrected condensed distance matrix.
+        np.array: Corrected p-values in the same condensed-matrix layout.
     """
-    # Apply Benjamini-Yekutieli correction
     _, corrected_pvalues, _, _ = multitest.multipletests(condensed_distance_matrix, method='fdr_by')
-
-    # Return the corrected condensed matrix
     return corrected_pvalues
 
 
@@ -341,4 +426,3 @@ def exclude_node(node):
     node.is_included = False
     for descendant in node.descendants:
         descendant.is_included = False
-