Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
f868690
Add graphviz pure-Python fallback layout for treeviz
ammarcsj May 25, 2026
6b6172d
Minor fcviz cleanup
ammarcsj May 25, 2026
ff6ba0e
Update quant reader config with new input types
ammarcsj May 25, 2026
fa84b3d
Update alphadia reader
ammarcsj May 25, 2026
3e1f1ea
Add split ion backgrounds and new z-aggregation modes to background d…
ammarcsj May 25, 2026
46dfd6b
Add missing value clustering updates
ammarcsj May 25, 2026
c18a375
Add variance predictor module and tests
ammarcsj May 25, 2026
c5b30a6
Add intensity summarization module and tests
ammarcsj May 25, 2026
d81b6d8
Add ICC correction module and tests
ammarcsj May 25, 2026
09ca634
Add residual decorrelation module and tests
ammarcsj May 25, 2026
6633f16
Update cluster utils with new aggregation modes and fragment filtering
ammarcsj May 25, 2026
e420be3
Update config variables for new features
ammarcsj May 25, 2026
c4941d9
Wire new features into condpair analysis
ammarcsj May 25, 2026
7feed67
Update run_pipeline, dashboard and PTM wiring
ammarcsj May 25, 2026
a9b9758
Add module docstring, structure overview and inline comments to resid…
ammarcsj May 25, 2026
b018098
Add .claude/settings.local.json to .gitignore
ammarcsj May 25, 2026
01fc591
Restore notebook plotting functions removed by dead code cleanup
ammarcsj May 25, 2026
c277629
Merge pull request #130 from MannLabs/add_background_split_and_z_aggr…
ammarcsj May 25, 2026
977cc95
Merge pull request #131 from MannLabs/add_variance_predictor
ammarcsj May 25, 2026
2e9a75d
Merge pull request #132 from MannLabs/add_intensity_summarization
ammarcsj May 25, 2026
9701197
Merge pull request #133 from MannLabs/add_icc_correction
ammarcsj May 25, 2026
af965ff
Merge pull request #134 from MannLabs/add_residual_decorrelation
ammarcsj May 25, 2026
050d664
Merge pull request #135 from MannLabs/wire_new_aggregation_features
ammarcsj May 25, 2026
a3aa371
Merge pull request #136 from MannLabs/document_residual_decorrelation
ammarcsj May 25, 2026
c70ab0e
Merge pull request #137 from MannLabs/fix_notebook_plotting_functions
ammarcsj May 25, 2026
8dab154
Resolve merge conflict: take main's version of cluster_missingval.py
ammarcsj May 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -229,3 +229,5 @@ paper_analyses

alphaquant/resources/reference_databases
alphaquant/resources/phosphopred_databases
.claude/settings.local.json
.claude/settings.local.json
7 changes: 4 additions & 3 deletions alphaquant/classify/ml_info_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,12 @@ def _adapt_precursor_name_to_modification_type(self):


def _define_ml_info_filename(self):
self.ml_info_filename = aq_utils.get_progress_folder_filename(self._input_file, ".ml_info_table.tsv")
self.ml_info_filename = aq_utils.get_progress_folder_filename(self._input_file, ".ml_info_table.tsv.zip")

def _write_ml_info_table(self):
self._ml_info_df.to_csv(self.ml_info_filename, sep="\t", index=False)
archive_name = self.ml_info_filename.split("/")[-1].removesuffix(".zip")
compression = {"method": "zip", "archive_name": archive_name}
self._ml_info_df.to_csv(self.ml_info_filename, sep="\t", index=False, compression=compression)


class MLInfoTableLoader():
Expand All @@ -81,4 +83,3 @@ def __init__(self, ml_info_file, samples_used):
def _subset_df_to_relevant_samples(self):
self.ml_info_df = self.ml_info_df[self.ml_info_df["sample_ID"].isin(self._samples_used)]
self.ml_info_df = self.ml_info_df.drop(columns=["sample_ID"])

142 changes: 113 additions & 29 deletions alphaquant/cluster/cluster_ions.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,28 @@
"""Hierarchical proteoform clustering and differential-expression aggregation.

This module builds a hierarchical tree for each protein (fragments → peptides
→ modified peptides → unmodified peptides → protein) and performs two
*independent* statistical procedures at each level:

1. **Proteoform clustering** — pairwise double-differential tests assess
whether sibling ions share the same fold change (null hypothesis: "ions
have equal regulation"). The resulting similarity p-values are corrected
with Benjamini-Yekutieli (appropriate for the intrinsic dependencies among
pairwise comparisons) and then used for hierarchical clustering to separate
proteoforms.

2. **Differential-expression aggregation** — the per-ion z-values from
individual differential-expression tests (null hypothesis: "no change
between conditions") are combined into parent-level z-values via Stouffer's
method. This aggregation propagates evidence of regulation up the tree.

The two sets of p-values address fundamentally different questions and are
corrected separately. The Benjamini-Yekutieli correction in step 1 applies
*within* a single protein to the similarity matrix; the final protein-level
p-values produced by step 2 are later corrected across all proteins with
Benjamini-Hochberg (see ``diffquant_table.py``).
"""

import scipy.spatial.distance
import scipy.cluster.hierarchy
import alphaquant.cluster.cluster_utils as aqcluster_utils
Expand Down Expand Up @@ -28,7 +53,7 @@



def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fcdiff_cutoff_clustermerge, fragment_outlier_filtering=True):
def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fcdiff_cutoff_clustermerge, aggregation_mode="stouffer_decorrelation", cluster_threshold_ion_type=0.01):
"""Main entry point for hierarchical clustering and tree-based quantification of a protein.

This function creates a hierarchical tree structure from fragment ions up to the protein level
Expand All @@ -47,7 +72,7 @@ def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, i
fcfc_threshold: Fold-change difference threshold for clustering
take_median_ion: If True, use median-centered ions for clustering
fcdiff_cutoff_clustermerge: Fold-change threshold for merging similar clusters
fragment_outlier_filtering: Whether to filter outlier fragments when aggregating to peptides
aggregation_mode: Strategy for combining child z-values (see cluster_utils.AGGREGATION_MODES)

Returns:
anytree.Node: Root node of the hierarchical tree containing all statistics and clustering results
Expand All @@ -63,7 +88,7 @@ def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, i
root_node = create_hierarchical_ion_grouping(gene_name, diffions)
add_reduced_names_to_root(root_node)
#LOGGER.info(anytree.RenderTree(root_node))
root_node_clust = cluster_along_specified_levels(root_node, name2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fragment_outlier_filtering)
root_node_clust = cluster_along_specified_levels(root_node, name2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, aggregation_mode=aggregation_mode, cluster_threshold_ion_type=cluster_threshold_ion_type)

level_sorted_nodes = [[node for node in children] for children in anytree.ZigZagGroupIter(root_node_clust)]
level_sorted_nodes.reverse() #the base nodes are first
Expand Down Expand Up @@ -114,7 +139,7 @@ def add_reduced_names_to_root(node):


import pandas as pd
def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fragment_outlier_filtering=True):#~60% of overall runtime
def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, aggregation_mode="stouffer_decorrelation", cluster_threshold_ion_type=0.01):#~60% of overall runtime
"""Performs hierarchical clustering at each level of the tree from bottom to top.

Starting from base ions (fragments/MS1), this function iterates through each level
Expand All @@ -123,6 +148,31 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
pairwise for consistent fold-change differences, clustered hierarchically, and
statistics are aggregated to parent nodes.

Important: this function interleaves two *independent* statistical procedures
that address different questions:

1. **Proteoform clustering** (``find_fold_change_clusters``): tests whether
pairs of sibling ions (e.g. two peptides of the same protein) have
*different* fold changes, i.e. whether they belong to different
proteoforms. The resulting pairwise p-values form a dependent similarity
matrix that is corrected for multiple testing with Benjamini-Yekutieli
(appropriate for dependent tests) *before* hierarchical clustering.

2. **Differential-expression aggregation** (``aggregate_node_properties``):
combines the per-ion z-values (derived from individual differential
expression tests) into a single parent-level z-value via Stouffer's
method. These z-values quantify *how much* each ion changes between
conditions and are unrelated to the proteoform-similarity p-values
from step 1.

The Benjamini-Yekutieli correction in step 1 therefore precedes the
Stouffer aggregation in step 2 by design, because they operate on
different null hypotheses: step 1 asks "do these two peptides differ
from each other?" while step 2 asks "does this protein change between
conditions?". The final protein-level p-values produced in step 2
are later corrected with Benjamini-Hochberg across all proteins (see
``diffquant_table.py``).

Args:
root_node: Root of the hierarchical tree (protein level)
ionname2diffion: Dictionary mapping ion names to DifferentialIon objects
Expand All @@ -134,7 +184,7 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
pval_threshold_basis: P-value threshold for clustering decisions
fcfc_threshold: Fold-change threshold for clustering
take_median_ion: Whether to use median-centered ions
fragment_outlier_filtering: Whether to filter fragment outliers
aggregation_mode: Strategy for combining child z-values (see cluster_utils.AGGREGATION_MODES)

Returns:
anytree.Node: The root node with all clustering annotations and aggregated statistics
Expand Down Expand Up @@ -164,15 +214,15 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
if take_median_ion:
grouped_mainclust_leafs = aqcluster_utils.select_median_fc_leafs(grouped_mainclust_leafs)
diffions = aqcluster_utils.map_grouped_leafs_to_diffions(grouped_mainclust_leafs, ionname2diffion) #the diffions are the ions that are actually compared
childnode2clust = find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold) #the clustering is performed on the child nodes
childnode2clust = find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, cluster_threshold_ion_type=cluster_threshold_ion_type) #the clustering is performed on the child nodes
childnode2clust = merge_similar_clusters_if_applicable(childnode2clust, type_node, fcdiff_cutoff_clustermerge = FCDIFF_CUTOFF_CLUSTERMERGE)
childnode2clust = aq_cluster_sorting.decide_cluster_order(childnode2clust)

aq_cluster_pfstats.add_proteoform_statistics_to_nodes(childnode2clust, take_median_ion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist)
aqcluster_utils.assign_clusterstats_to_type_node(type_node, childnode2clust)
aqcluster_utils.annotate_mainclust_leaves(childnode2clust)
aqcluster_utils.assign_cluster_number(type_node, childnode2clust)
aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False, fragment_outlier_filtering=fragment_outlier_filtering)
aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False, aggregation_mode=aggregation_mode)

return root_node

Expand All @@ -183,21 +233,46 @@ def get_childnode2clust_for_single_ion(type_node):
return {type_node.children[0]: 0}


def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold):
"""Compares the fold changes of the ions corresponding to the nodes that are compared and returns the set of ions with consistent fold changes.
def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, cluster_threshold_ion_type=0.01):
"""Cluster sibling ions by the similarity of their fold changes (proteoform inference).

For each pair of sibling ion groups, a *double-differential* test
(``evaluate_similarity``) computes a p-value for the null hypothesis that
their fold changes are equal (i.e. they belong to the same proteoform).
These pairwise p-values form a condensed similarity matrix that is
corrected for multiple testing with the Benjamini-Yekutieli procedure
(``get_multiple_testing_corrected_condensed_similarity_matrix``), which is
appropriate here because the pairwise comparisons are intrinsically
dependent. The corrected matrix is then converted to a distance matrix and
subjected to Ward hierarchical clustering.

Note: the p-values computed and corrected here test *inter-ion similarity*
for proteoform grouping. They are entirely separate from the per-ion
differential-expression p-values that are later aggregated via Stouffer's
method in ``aggregate_node_properties``.

Args:
diffions (list[list[ionnames]]): contains the sets of ions to be tested, for example [[fragion1_precursor1, fragion2_precursor1, fragion3_precursor1],[fragion1_precursor2],[fragion1_precursor3, fragion2_precursor3]]. The ions are assumed to be similar in type (e.g. fragment, precursor)!
normed_c1 (ConditionBackground): [description]
normed_c2 (ConditionBackground): [description]
ion2diffDist (dict(ion : SubtractedBackground)): [description]
p2z ([type]): [description]
deedpair2doublediffdist ([type]): [description]
fc_threshold (float, optional): [description]. Defaults to 0.
pval_threshold_basis (float, optional): the threshold at which to merge peptides at the gene level. Defaults to 0.01
type_node: Parent node whose children are being clustered
diffions: List of lists of DifferentialIon objects, one sublist per
child node, e.g. ``[[fragion1_prec1, fragion2_prec1], [fragion1_prec2]]``
normed_c1: ConditionBackgrounds for condition 1
normed_c2: ConditionBackgrounds for condition 2
ion2diffDist: Mapping of ion pairs to differential background distributions
p2z: Cache for p-value to z-value conversions
deedpair2doublediffdist: Cache for double-differential distributions
pval_threshold_basis: P-value threshold at the gene level for cutting the
clustering dendrogram; thresholds for lower levels are looked up in
``LEVEL2PVALTHRESH``
fcfc_threshold: Minimum fold-change difference below which two ion groups
are assumed similar without a formal test
cluster_threshold_ion_type: P-value threshold at the ion_type level

Returns:
list[tuple[Node, int]]: Pairs of (child node, cluster index) sorted by
node name for reproducibility
"""

pval_threshold_basis = get_pval_threshold_basis(type_node, pval_threshold_basis)
pval_threshold_basis = get_pval_threshold_basis(type_node, pval_threshold_basis, cluster_threshold_ion_type=cluster_threshold_ion_type)
diffions_idxs = [[x] for x in range(len(diffions))]
diffions_fcs = aqcluster_utils.get_fcs_ions(diffions)
#mt_corrected_pval_thresh = pval_threshold_basis/len(diffions)
Expand All @@ -216,26 +291,36 @@ def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2dif

return childnode2clust

def get_pval_threshold_basis(type_node, pval_threshold_basis): #the pval threshold is only set at the gene level, the rest of the levels are set as specified in the LEVEL2PVALTHRESH dictionary
def get_pval_threshold_basis(type_node, pval_threshold_basis, cluster_threshold_ion_type=0.01): #the pval threshold is only set at the gene level, the rest of the levels are set as specified in the LEVEL2PVALTHRESH dictionary
if type_node.level == "gene":
return pval_threshold_basis
else:
return LEVEL2PVALTHRESH.get(type_node.level, 0.2)
if type_node.level == "ion_type":
return cluster_threshold_ion_type
return LEVEL2PVALTHRESH.get(type_node.level, 0.2)

def get_multiple_testing_corrected_condensed_similarity_matrix(condensed_distance_matrix: np.array):
"""
condensed_distance_matrix contains all p-values of the pairwise comparisons of the ions. They are by definition dependent.
"""Apply Benjamini-Yekutieli FDR correction to pairwise ion-similarity p-values.

The condensed matrix contains p-values from *double-differential* tests
between all pairs of sibling ions (see ``evaluate_similarity``). Each
p-value tests the null hypothesis that two ions share the same fold
change. Because every ion appears in multiple pairwise comparisons, the
tests are intrinsically dependent, which is why the Benjamini-Yekutieli
(``fdr_by``) procedure is used instead of Benjamini-Hochberg.

These corrected p-values are used exclusively for proteoform clustering
(deciding which ions belong together) and are unrelated to the per-ion
differential-expression p-values that are later aggregated via Stouffer's
method.

Args:
condensed_distance_matrix (np.array): Condensed distance matrix containing p-values of pairwise comparisons.
condensed_distance_matrix: 1-D array of pairwise similarity p-values
in scipy condensed-matrix format.

Returns:
np.array: Corrected condensed distance matrix.
np.array: Corrected p-values in the same condensed-matrix layout.
"""
# Apply Benjamini-Yekutieli correction
_, corrected_pvalues, _, _ = multitest.multipletests(condensed_distance_matrix, method='fdr_by')

# Return the corrected condensed matrix
return corrected_pvalues


Expand Down Expand Up @@ -341,4 +426,3 @@ def exclude_node(node):
node.is_included = False
for descendant in node.descendants:
descendant.is_included = False

Loading
Loading