From e1b4dc8f7bf49dc5ec023c97bf6839d96b0cc1c7 Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Wed, 11 Sep 2024 17:05:39 -0400 Subject: [PATCH 01/13] Create files for running LD on gnomAD v4 SNVs/Indels --- gnomad_qc/v4/assessment/generate_ld_data.py | 339 ++++++++++++++++++++ gnomad_qc/v4/resources/ld_resources.py | 62 ++++ 2 files changed, 401 insertions(+) create mode 100644 gnomad_qc/v4/assessment/generate_ld_data.py create mode 100644 gnomad_qc/v4/resources/ld_resources.py diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py new file mode 100644 index 000000000..983444bae --- /dev/null +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -0,0 +1,339 @@ +import argparse +import sys + +import gnomad.resources.grch37.gnomad_ld as ld_resources +from gnomad.utils.slack import slack_notifications +from hail.linalg import BlockMatrix +from hail.utils import new_temp_file +# from hail.utils.java import Env + +from gnomad_qc.slack_creds import slack_token +from gnomad_qc.v4.resources import * +from gnomad_qc.v3.resources.release import hgdp_tgp_subset +# this includes ld_resources + +HGDP_TGP_RELEASES + +COMMON_FREQ = 0.005 +RARE_FREQ = 0.0005 + +def get_pop_and_subpop_counters(mt): + cut_dict = { + "pop": hl.agg.filter( + hl.is_defined(mt.meta.pop) & (mt.meta.pop != "oth"), + hl.agg.counter(mt.meta.pop), + ), + } + cut_data = mt.aggregate_cols(hl.struct(**cut_dict)) + return cut_data + + +def filter_mt_for_ld( + mt, label, pop, common_only: bool = True, re_call_stats: bool = False +): + frequency = COMMON_FREQ if common_only else RARE_FREQ + # At the moment, ld_matrix OOMs above ~30M variants, so filtering all populations to 0.05% to cut down on variants + # All work on standard machines except AFR + # Also creating `common_only` ones for most practical uses (above 0.5%) + pop_mt = mt.filter_cols(mt.meta[label] == pop) + meta_index = ( + hl.enumerate(pop_mt.freq_meta) + .find(lambda f: (f[1].get(label) == pop)) + 
.collect()[0][0] + ) + # No support for SV datasets in gnomAD v4 Production team code + # Have to re-do callstats when noted frequencies don't 100% reflect what you are calculating on + # This is for test sets or 1kg_tdpg + if re_call_stats: + call_stats = hl.agg.call_stats(pop_mt.GT, pop_mt.alleles) + call_stats_bind = hl.bind( + lambda cs: cs.annotate( + AC=cs.AC[1], AF=cs.AF[1], homozygote_count=cs.homozygote_count[1] + ), + call_stats, + ) + pop_freq = call_stats_bind + else: + pop_mt = pop_mt.filter_rows((hl.len(pop_mt.filters) == 0)) + pop_freq = pop_mt.freq[meta_index] + + pop_mt = pop_mt.filter_rows((hl.len(pop_mt.filters) == 0)) + pop_mt = pop_mt.annotate_rows(pop_freq=pop_freq) + pop_mt = pop_mt.filter_rows( + (pop_mt.pop_freq.AC > 1) + & (pop_mt.pop_freq.AN - pop_mt.pop_freq.AC > 1) + & (pop_mt.pop_freq.AF > frequency) + & + # the following filter is needed for "het-only" monomorphic sites + # as otherwise variance is 0, and so, row_correlation errors out + ~((pop_mt.pop_freq.AF == 0.5) & (pop_mt.pop_freq.homozygote_count == 0)) + ) + + return pop_mt + + +def generate_ld_pruned_set( + mt: hl.MatrixTable, + pop_data: dict, + data_type: str, + r2: str = "0.2", + radius: int = 1000000, + overwrite: bool = False, +): + for label, pops in dict(pop_data).items(): + for pop in pops: + pop_mt = filter_mt_for_ld(mt, label, pop) + + pruned_ht = hl.ld_prune(pop_mt.GT, r2=float(r2), bp_window_size=radius) + + ht = pop_mt.rows().select("pop_freq") + ht = ht.filter(hl.is_defined(pruned_ht[ht.key])) + ht.write(ld_pruned_path(data_type, pop, r2), overwrite) + + +def generate_ld_matrix( + mt, + pop_data, + data_type, + radius: int = 1000000, + common_only: bool = True, + adj: bool = False, + overwrite: bool = False, +): + # Takes about 4 hours on 20 n1-standard-8 nodes (with SSD - not sure if necessary) per population + # Total of ~37 hours ($400) + for label, pops in dict(pop_data).items(): + for pop in pops: + if data_type == "genomes_snv_sv" and pop not in 
SNV_SV_POPS: + continue + pop_mt = filter_mt_for_ld( + mt, label, pop, common_only, sv_dataset=data_type == "genomes_snv_sv" + ) + + pop_mt.rows().select("pop_freq").add_index().write( + ld_resources._ld_index_path(data_type, pop, common_only, adj), overwrite + ) + ld = hl.ld_matrix(pop_mt.GT.n_alt_alleles(), pop_mt.locus, radius) + if data_type != "genomes_snv_sv": + ld = ld.sparsify_triangle() + ld.write( + ld_resources._ld_matrix_path(data_type, pop, common_only, adj), + overwrite, + ) + + + + + +def generate_ld_scores_from_ld_matrix( + pop_data, + data_type, + min_frequency=0.01, + call_rate_cutoff=0.8, + adj: bool = False, + radius: int = 1000000, + overwrite=False, +): + # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR + # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 + # mins to compute LD scores per population (~$150 total) + for label, pops in dict(pop_data).items(): + for pop, n in pops.items(): + ht = hl.read_table(ld_resources._ld_index_path(data_type, pop, adj=adj)) + ht = ht.filter( + (ht.pop_freq.AF >= min_frequency) + & (ht.pop_freq.AF <= 1 - min_frequency) + & (ht.pop_freq.AN / n >= 2 * call_rate_cutoff) + ).add_index(name="new_idx") + + indices = ht.idx.collect() + + r2 = BlockMatrix.read( + ld_resources._ld_matrix_path( + data_type, pop, min_frequency >= COMMON_FREQ, adj=adj + ) + ) + r2 = r2.filter(indices, indices) ** 2 + r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0)) + + out_name = ld_resources._ld_scores_path(data_type, pop, adj) + compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) + + + + + +def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite): + starts_and_stops = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False) + r2_adj = r2_adj._sparsify_row_intervals_expr(starts_and_stops, blocks_only=False) + + l2row = r2_adj.sum(axis=0).T + l2col = r2_adj.sum(axis=1) + l2 = l2row + l2col + 1 + 
l2_bm_tmp = new_temp_file() + l2_tsv_tmp = new_temp_file() + + l2.write(l2_bm_tmp, force_row_major=True) + BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp) + + ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True) + ht_scores = ht_scores.add_index().rename({"f0": "ld_score"}) + ht_scores = ht_scores.key_by("idx") + ht = ht.annotate(**ht_scores[ht.new_idx]).select_globals() + ht.filter(hl.is_defined(ht.ld_score)).write(out_name, overwrite) + + +def main(args): + hl.init(log="/ld_annotations.log",tmp_dir="gs://gnomad-tmp-4day/ld_tmp/") + hl._set_flags(use_new_shuffle="1") + + # Only run on genomes so far, no choice to run on exomes, too sparse + data_type = "genomes" + if args.exomes: + raise NotImplementedError( + "LD Code does not work for exomes yet. Too sparse:(" + ) + + + + if args.hgdp_subset: + mt = hgdp_tgp_subset(dense=True,public=True).mt() + if args.test: + mt = mt.filter_rows(mt.locus.contig=="chr22") + mt = mt.annotate_globals( + freq_meta = mt.hgdp_tgp_freq_meta, + freq_index_dict = mt.hgdp_tgp_freq_index_dict + ) + + mt = mt.annotate_rows( + freq = mt.hgdp_tgp_freq + ) + + mt = mt.annotate_cols( + pop = mt.hgdp_tgp_meta.population + ) + else: + ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) + vds = basics.get_gnomad_v4_genomes_vds(test=args.test,annotate_meta=True,release_only=True,split=True) + mt_vds = vds.variant_data + mt = mt_vds.annotate( + **hl.eval(v4_release_ht.globals) + ) + + mt = mt.annotate( + pop = mt.meta.population_inference.pop + ) + + + + pop_data = get_pop_and_subpop_counters(mt) + + if args.generate_ld_pruned_set: + generate_ld_pruned_set( + mt, pop_data, data_type, args.r2, args.radius, args.overwrite + ) + + if args.generate_ld_matrix: + generate_ld_matrix( + mt, + pop_data, + data_type, + args.radius, + args.common_only, + args.adj, + args.overwrite, + ) + + if args.generate_ld_scores: + generate_ld_scores_from_ld_matrix( + pop_data, + data_type, + args.min_frequency, + args.min_call_rate, + 
args.adj, + overwrite=args.overwrite, + ) + + if args.cross_pop_ld_scores: + generate_all_cross_pop_ld_scores( + pop_data, + data_type, + args.min_frequency, + args.min_call_rate, + args.adj, + overwrite=args.overwrite, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--exomes", + help="Input MT is exomes. One of --exomes or --genomes is required.", + action="store_true", + ) + parser.add_argument( + "--generate_ld_pruned_set", + help="Calculates LD pruned set of variants", + action="store_true", + ) + parser.add_argument( + "--generate_ld_matrix", help="Calculates LD matrix", action="store_true" + ) + parser.add_argument( + "--common_only", + help="Calculates LD matrix only on common variants (above 0.5%)", + action="store_true", + ) + parser.add_argument( + "--adj", + help="Calculates LD matrix only on using high-quality genotypes", + action="store_true", + ) + parser.add_argument( + "--generate_ld_scores", + help="Calculates LD scores from LD matrix", + action="store_true", + ) + parser.add_argument( + "--min_frequency", + help="Minimum allele frequency to compute LD scores (default 0.01)", + default=0.01, + type=float, + ) + parser.add_argument( + "--min_call_rate", + help="Minimum call rate to compute LD scores (default 0.8)", + default=0.8, + type=float, + ) + parser.add_argument( + "--r2", help="r-squared to which to prune LD (default 0.2)", default="0.2" + ) + parser.add_argument( + "--radius", + help="Radius at which to calculate LD information (bp; default 1e6)", + default=1000000, + type=int, + ) + parser.add_argument( + "--test", + help="Use test dataset for whichever callset requested.", + action="store_true" + ) + parser.add_argument( + "--hgdp_subset", + help="Use hgdp dataset for callset.", + action="store_true" + ) + parser.add_argument( + "--slack_channel", help="Slack channel to post results and notifications to." 
+ ) + parser.add_argument("--overwrite", help="Overwrite data", action="store_true") + args = parser.parse_args() + + if args.slack_channel: + with slack_notifications(slack_token, args.slack_channel): + main(args) + else: + main(args) diff --git a/gnomad_qc/v4/resources/ld_resources.py b/gnomad_qc/v4/resources/ld_resources.py new file mode 100644 index 000000000..b98ca649c --- /dev/null +++ b/gnomad_qc/v4/resources/ld_resources.py @@ -0,0 +1,62 @@ +"""Script containing annotation related resources.""" + +from typing import Optional + +from gnomad.resources.grch38.gnomad import CURRENT_EXOME_RELEASE, CURRENT_GENOME_RELEASE +from gnomad.resources.resource_utils import ( + GnomadPublicBlockMatrixResource, + GnomadPublicTableResource, +) + +def ld_matrix_path( + data_type: str, + pop: str, + common_only: bool = True, + adj: bool = True, + version: Optional[str] = None, +): + if version is None: + version = ( + CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE + ) + return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.r{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.bm' + +def ld_index_path( + data_type: str, + pop: str, + common_only: bool = True, + adj: bool = True, + version: Optional[str] = None, +): + if version is None: + version = ( + CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE + ) + return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.r{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.variant_indices.ht' +# +def ld_scores_path( + data_type: str, pop: str, adj: bool = True, version: Optional[str] = None +): + if version is None: + version = ( + CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE + ) + return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.r{version}.{pop}.{"adj." 
if adj else ""}ld_scores.ht' + + +def ld_pruned_path(data_type: str, pop: str, r2: str, version: str = CURRENT_RELEASE): + return f"gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.r{version}.{pop}.ld.pruned_set.r2_{r2}.ht" + +def ld_matrix(pop: str) -> GnomadPublicBlockMatrixResource: + """Get resource for the LD matrix for the given population.""" + return GnomadPublicBlockMatrixResource(path=ld_matrix_path("genomes", pop)) + + +def ld_index(pop: str) -> GnomadPublicTableResource: + """Get resource for the LD indices for the given population.""" + return GnomadPublicTableResource(path=ld_index_path("genomes", pop)) + + +def ld_scores(pop: str) -> GnomadPublicTableResource: + """Get resource for the LD scores for the given population.""" + return GnomadPublicTableResource(path=ld_scores_path("genomes", pop)) \ No newline at end of file From 31b495d08f4164c87fbfdafa7e386a11c6794ab8 Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Wed, 11 Sep 2024 17:06:23 -0400 Subject: [PATCH 02/13] black formatting --- gnomad_qc/v4/assessment/generate_ld_data.py | 62 ++++++++------------- gnomad_qc/v4/resources/ld_resources.py | 9 ++- 2 files changed, 29 insertions(+), 42 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 983444bae..2bda418a6 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -5,11 +5,13 @@ from gnomad.utils.slack import slack_notifications from hail.linalg import BlockMatrix from hail.utils import new_temp_file -# from hail.utils.java import Env + +# from hail.utils.java import Env from gnomad_qc.slack_creds import slack_token from gnomad_qc.v4.resources import * from gnomad_qc.v3.resources.release import hgdp_tgp_subset + # this includes ld_resources HGDP_TGP_RELEASES @@ -17,6 +19,7 @@ COMMON_FREQ = 0.005 RARE_FREQ = 0.0005 + def get_pop_and_subpop_counters(mt): cut_dict = { "pop": hl.agg.filter( @@ -41,8 +44,8 @@ def 
filter_mt_for_ld( .find(lambda f: (f[1].get(label) == pop)) .collect()[0][0] ) - # No support for SV datasets in gnomAD v4 Production team code - # Have to re-do callstats when noted frequencies don't 100% reflect what you are calculating on + # No support for SV datasets in gnomAD v4 Production team code + # Have to re-do callstats when noted frequencies don't 100% reflect what you are calculating on # This is for test sets or 1kg_tdpg if re_call_stats: call_stats = hl.agg.call_stats(pop_mt.GT, pop_mt.alleles) @@ -122,9 +125,6 @@ def generate_ld_matrix( ) - - - def generate_ld_scores_from_ld_matrix( pop_data, data_type, @@ -160,9 +160,6 @@ def generate_ld_scores_from_ld_matrix( compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) - - - def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite): starts_and_stops = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False) r2_adj = r2_adj._sparsify_row_intervals_expr(starts_and_stops, blocks_only=False) @@ -184,47 +181,34 @@ def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite): def main(args): - hl.init(log="/ld_annotations.log",tmp_dir="gs://gnomad-tmp-4day/ld_tmp/") + hl.init(log="/ld_annotations.log", tmp_dir="gs://gnomad-tmp-4day/ld_tmp/") hl._set_flags(use_new_shuffle="1") - # Only run on genomes so far, no choice to run on exomes, too sparse - data_type = "genomes" + # Only run on genomes so far, no choice to run on exomes, too sparse + data_type = "genomes" if args.exomes: - raise NotImplementedError( - "LD Code does not work for exomes yet. Too sparse:(" - ) - - + raise NotImplementedError("LD Code does not work for exomes yet. 
Too sparse:(") if args.hgdp_subset: - mt = hgdp_tgp_subset(dense=True,public=True).mt() + mt = hgdp_tgp_subset(dense=True, public=True).mt() if args.test: - mt = mt.filter_rows(mt.locus.contig=="chr22") + mt = mt.filter_rows(mt.locus.contig == "chr22") mt = mt.annotate_globals( - freq_meta = mt.hgdp_tgp_freq_meta, - freq_index_dict = mt.hgdp_tgp_freq_index_dict + freq_meta=mt.hgdp_tgp_freq_meta, freq_index_dict=mt.hgdp_tgp_freq_index_dict ) - mt = mt.annotate_rows( - freq = mt.hgdp_tgp_freq - ) + mt = mt.annotate_rows(freq=mt.hgdp_tgp_freq) - mt = mt.annotate_cols( - pop = mt.hgdp_tgp_meta.population - ) + mt = mt.annotate_cols(pop=mt.hgdp_tgp_meta.population) else: ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) - vds = basics.get_gnomad_v4_genomes_vds(test=args.test,annotate_meta=True,release_only=True,split=True) - mt_vds = vds.variant_data - mt = mt_vds.annotate( - **hl.eval(v4_release_ht.globals) + vds = basics.get_gnomad_v4_genomes_vds( + test=args.test, annotate_meta=True, release_only=True, split=True ) + mt_vds = vds.variant_data + mt = mt_vds.annotate(**hl.eval(v4_release_ht.globals)) - mt = mt.annotate( - pop = mt.meta.population_inference.pop - ) - - + mt = mt.annotate(pop=mt.meta.population_inference.pop) pop_data = get_pop_and_subpop_counters(mt) @@ -319,12 +303,10 @@ def main(args): parser.add_argument( "--test", help="Use test dataset for whichever callset requested.", - action="store_true" + action="store_true", ) parser.add_argument( - "--hgdp_subset", - help="Use hgdp dataset for callset.", - action="store_true" + "--hgdp_subset", help="Use hgdp dataset for callset.", action="store_true" ) parser.add_argument( "--slack_channel", help="Slack channel to post results and notifications to." 
diff --git a/gnomad_qc/v4/resources/ld_resources.py b/gnomad_qc/v4/resources/ld_resources.py index b98ca649c..38ab3b8c8 100644 --- a/gnomad_qc/v4/resources/ld_resources.py +++ b/gnomad_qc/v4/resources/ld_resources.py @@ -8,6 +8,7 @@ GnomadPublicTableResource, ) + def ld_matrix_path( data_type: str, pop: str, @@ -21,6 +22,7 @@ def ld_matrix_path( ) return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.r{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.bm' + def ld_index_path( data_type: str, pop: str, @@ -33,7 +35,9 @@ def ld_index_path( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.r{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.variant_indices.ht' -# + + +# def ld_scores_path( data_type: str, pop: str, adj: bool = True, version: Optional[str] = None ): @@ -47,6 +51,7 @@ def ld_scores_path( def ld_pruned_path(data_type: str, pop: str, r2: str, version: str = CURRENT_RELEASE): return f"gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.r{version}.{pop}.ld.pruned_set.r2_{r2}.ht" + def ld_matrix(pop: str) -> GnomadPublicBlockMatrixResource: """Get resource for the LD matrix for the given population.""" return GnomadPublicBlockMatrixResource(path=ld_matrix_path("genomes", pop)) @@ -59,4 +64,4 @@ def ld_index(pop: str) -> GnomadPublicTableResource: def ld_scores(pop: str) -> GnomadPublicTableResource: """Get resource for the LD scores for the given population.""" - return GnomadPublicTableResource(path=ld_scores_path("genomes", pop)) \ No newline at end of file + return GnomadPublicTableResource(path=ld_scores_path("genomes", pop)) From d5a5dfe8e1a43ef359bdb4706cceef6fa3c39e59 Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Thu, 12 Sep 2024 15:51:51 -0400 Subject: [PATCH 03/13] testing, further params and labels --- gnomad_qc/v4/assessment/generate_ld_data.py | 120 ++++++++++++-------- 
gnomad_qc/v4/resources/ld_resources.py | 2 +- 2 files changed, 76 insertions(+), 46 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 2bda418a6..770735ea3 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -10,6 +10,7 @@ from gnomad_qc.slack_creds import slack_token from gnomad_qc.v4.resources import * +from gnomad_qc.v4.resources.ld_resources import * from gnomad_qc.v3.resources.release import hgdp_tgp_subset # this includes ld_resources @@ -20,14 +21,15 @@ RARE_FREQ = 0.0005 -def get_pop_and_subpop_counters(mt): +def get_pop_and_subpop_counters(mt,label): cut_dict = { - "pop": hl.agg.filter( - hl.is_defined(mt.meta.pop) & (mt.meta.pop != "oth"), - hl.agg.counter(mt.meta.pop), + f"{label}": hl.agg.filter( + hl.is_defined(mt.meta.population_inference.pop) & (mt.meta.population_inference.pop != "oth"), + hl.agg.counter(mt.meta.population_inference.pop), ), } cut_data = mt.aggregate_cols(hl.struct(**cut_dict)) + logger.info(f'Counts: {cut_data}') return cut_data @@ -38,7 +40,7 @@ def filter_mt_for_ld( # At the moment, ld_matrix OOMs above ~30M variants, so filtering all populations to 0.05% to cut down on variants # All work on standard machines except AFR # Also creating `common_only` ones for most practical uses (above 0.5%) - pop_mt = mt.filter_cols(mt.meta[label] == pop) + pop_mt = mt.filter_cols(mt.meta.population_inference.pop == pop) meta_index = ( hl.enumerate(pop_mt.freq_meta) .find(lambda f: (f[1].get(label) == pop)) @@ -56,12 +58,14 @@ def filter_mt_for_ld( call_stats, ) pop_freq = call_stats_bind + pop_mt = pop_mt.annotate_rows(pop_freq=pop_freq) + else: pop_mt = pop_mt.filter_rows((hl.len(pop_mt.filters) == 0)) pop_freq = pop_mt.freq[meta_index] + pop_mt = pop_mt.annotate_rows(pop_freq=pop_freq) pop_mt = pop_mt.filter_rows((hl.len(pop_mt.filters) == 0)) - pop_mt = pop_mt.annotate_rows(pop_freq=pop_freq) pop_mt = 
pop_mt.filter_rows( (pop_mt.pop_freq.AC > 1) & (pop_mt.pop_freq.AN - pop_mt.pop_freq.AC > 1) @@ -82,16 +86,25 @@ def generate_ld_pruned_set( r2: str = "0.2", radius: int = 1000000, overwrite: bool = False, + re_call_stats: bool = False, + version: str = None, ): + for label, pops in dict(pop_data).items(): for pop in pops: - pop_mt = filter_mt_for_ld(mt, label, pop) + logger.info(f"Filtering for {pop}...") + pop_mt = filter_mt_for_ld(mt, label, pop, re_call_stats) + logger.info(f"Count after filter_mt_for_ld() : {pop_mt.count()}") pruned_ht = hl.ld_prune(pop_mt.GT, r2=float(r2), bp_window_size=radius) + logger.info(f"Count after hl.ld_prune() : {pruned_ht.count()}") + ht = pop_mt.rows().select("pop_freq") ht = ht.filter(hl.is_defined(pruned_ht[ht.key])) - ht.write(ld_pruned_path(data_type, pop, r2), overwrite) + ld_path = ld_pruned_path(data_type, pop, r2, version=version) + logger.info(f"Writing pruned ht for {pop} to {ld_path} ...") + ht.write(ld_path, overwrite) def generate_ld_matrix( @@ -102,25 +115,27 @@ def generate_ld_matrix( common_only: bool = True, adj: bool = False, overwrite: bool = False, + re_call_stats: bool = False, + version: str = None, ): # Takes about 4 hours on 20 n1-standard-8 nodes (with SSD - not sure if necessary) per population # Total of ~37 hours ($400) + for label, pops in dict(pop_data).items(): for pop in pops: - if data_type == "genomes_snv_sv" and pop not in SNV_SV_POPS: - continue + pop_mt = filter_mt_for_ld( - mt, label, pop, common_only, sv_dataset=data_type == "genomes_snv_sv" + mt, label, pop, common_only, re_call_stats ) pop_mt.rows().select("pop_freq").add_index().write( - ld_resources._ld_index_path(data_type, pop, common_only, adj), overwrite + ld_resources._ld_index_path(data_type, pop, common_only, adj, version=version), overwrite ) ld = hl.ld_matrix(pop_mt.GT.n_alt_alleles(), pop_mt.locus, radius) if data_type != "genomes_snv_sv": ld = ld.sparsify_triangle() ld.write( - ld_resources._ld_matrix_path(data_type, pop, 
common_only, adj), + ld_resources._ld_matrix_path(data_type, pop, common_only, adj, version=version), overwrite, ) @@ -132,14 +147,15 @@ def generate_ld_scores_from_ld_matrix( call_rate_cutoff=0.8, adj: bool = False, radius: int = 1000000, - overwrite=False, + overwrite: bool = False, + version: str = None ): # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 # mins to compute LD scores per population (~$150 total) for label, pops in dict(pop_data).items(): for pop, n in pops.items(): - ht = hl.read_table(ld_resources._ld_index_path(data_type, pop, adj=adj)) + ht = hl.read_table(ld_resources._ld_index_path(data_type, pop, adj=adj, version=version)) ht = ht.filter( (ht.pop_freq.AF >= min_frequency) & (ht.pop_freq.AF <= 1 - min_frequency) @@ -150,17 +166,17 @@ def generate_ld_scores_from_ld_matrix( r2 = BlockMatrix.read( ld_resources._ld_matrix_path( - data_type, pop, min_frequency >= COMMON_FREQ, adj=adj + data_type, pop, min_frequency >= COMMON_FREQ, adj=adj, version=version ) ) r2 = r2.filter(indices, indices) ** 2 r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0)) - out_name = ld_resources._ld_scores_path(data_type, pop, adj) + out_name = ld_scores_path(data_type, pop, adj,version=version) compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) -def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite): +def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite,version): starts_and_stops = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False) r2_adj = r2_adj._sparsify_row_intervals_expr(starts_and_stops, blocks_only=False) @@ -181,7 +197,7 @@ def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite): def main(args): - hl.init(log="/ld_annotations.log", tmp_dir="gs://gnomad-tmp-4day/ld_tmp/") + hl.init(log="/ld_assessment.log", 
tmp_dir="gs://gnomad-tmp-4day/ld_tmp/") hl._set_flags(use_new_shuffle="1") # Only run on genomes so far, no choice to run on exomes, too sparse @@ -191,30 +207,48 @@ def main(args): if args.hgdp_subset: mt = hgdp_tgp_subset(dense=True, public=True).mt() - if args.test: - mt = mt.filter_rows(mt.locus.contig == "chr22") mt = mt.annotate_globals( - freq_meta=mt.hgdp_tgp_freq_meta, freq_index_dict=mt.hgdp_tgp_freq_index_dict + freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict ) + mt = mt.annotate_rows(freq=mt.gnomad_freq) + mt = mt.annotate_cols(meta=hl.struct( + population_inference = hl.struct( + pop=mt.gnomad_population_inference.pop))) - mt = mt.annotate_rows(freq=mt.hgdp_tgp_freq) + if args.test: + mt = mt.filter_rows(mt.locus.contig == "chr22").sample_rows(0.1) + mt = mt.filter_cols(mt.meta.population_inference.pop == "afr") - mt = mt.annotate_cols(pop=mt.hgdp_tgp_meta.population) else: ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) vds = basics.get_gnomad_v4_genomes_vds( test=args.test, annotate_meta=True, release_only=True, split=True ) mt_vds = vds.variant_data - mt = mt_vds.annotate(**hl.eval(v4_release_ht.globals)) + mt = mt_vds.annotate_globals( + **hl.eval(ht_release.globals) + ) + + mt = mt.annotate_rows( + freq = ht_release[mt.row_key].freq, + filters = ht_release[mt.row_key].filters + ) - mt = mt.annotate(pop=mt.meta.population_inference.pop) + if args.test: + mt = mt.filter_rows(mt.locus.contig == "chr22").sample_rows(0.1) + mt = mt.filter_cols(mt.meta.population_inference.pop == "afr") + + # it is already in the right place for the gnomAD MT , or whatever + # mt = mt.annotate(pop=mt.meta.population_inference.pop) + label = 'gen_anc' if not args.hgdp_subset else 'pop' + pop_data = get_pop_and_subpop_counters(mt,label=label) - pop_data = get_pop_and_subpop_counters(mt) + # Version is populated via ld_resources.py if None + version = None if not args.hgdp_subset else 'hgdp' if args.generate_ld_pruned_set: 
generate_ld_pruned_set( - mt, pop_data, data_type, args.r2, args.radius, args.overwrite + mt, pop_data, data_type, args.r2, args.radius, args.overwrite, re_call_stats=args.re_call_stats, version=version ) if args.generate_ld_matrix: @@ -226,6 +260,8 @@ def main(args): args.common_only, args.adj, args.overwrite, + re_call_stats=args.re_call_stats, + version=version ) if args.generate_ld_scores: @@ -238,15 +274,6 @@ def main(args): overwrite=args.overwrite, ) - if args.cross_pop_ld_scores: - generate_all_cross_pop_ld_scores( - pop_data, - data_type, - args.min_frequency, - args.min_call_rate, - args.adj, - overwrite=args.overwrite, - ) if __name__ == "__main__": @@ -257,15 +284,15 @@ def main(args): action="store_true", ) parser.add_argument( - "--generate_ld_pruned_set", + "--generate-ld-pruned-set", help="Calculates LD pruned set of variants", action="store_true", ) parser.add_argument( - "--generate_ld_matrix", help="Calculates LD matrix", action="store_true" + "--generate-ld-matrix", help="Calculates LD matrix", action="store_true" ) parser.add_argument( - "--common_only", + "--common-only", help="Calculates LD matrix only on common variants (above 0.5%)", action="store_true", ) @@ -275,18 +302,18 @@ def main(args): action="store_true", ) parser.add_argument( - "--generate_ld_scores", + "--generate-ld-scores", help="Calculates LD scores from LD matrix", action="store_true", ) parser.add_argument( - "--min_frequency", + "--min-frequency", help="Minimum allele frequency to compute LD scores (default 0.01)", default=0.01, type=float, ) parser.add_argument( - "--min_call_rate", + "--min-call-rate", help="Minimum call rate to compute LD scores (default 0.8)", default=0.8, type=float, @@ -306,10 +333,13 @@ def main(args): action="store_true", ) parser.add_argument( - "--hgdp_subset", help="Use hgdp dataset for callset.", action="store_true" + "--hgdp-subset", help="Use hgdp dataset for callset.", action="store_true" + ) + parser.add_argument( + "--re-call-stats", 
help="Regenerate callstats for LD work. Can be useful for some subsets.", action="store_true" ) parser.add_argument( - "--slack_channel", help="Slack channel to post results and notifications to." + "--slack-channel", help="Slack channel to post results and notifications to." ) parser.add_argument("--overwrite", help="Overwrite data", action="store_true") args = parser.parse_args() diff --git a/gnomad_qc/v4/resources/ld_resources.py b/gnomad_qc/v4/resources/ld_resources.py index 38ab3b8c8..dd0dd274e 100644 --- a/gnomad_qc/v4/resources/ld_resources.py +++ b/gnomad_qc/v4/resources/ld_resources.py @@ -48,7 +48,7 @@ def ld_scores_path( return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.r{version}.{pop}.{"adj." if adj else ""}ld_scores.ht' -def ld_pruned_path(data_type: str, pop: str, r2: str, version: str = CURRENT_RELEASE): +def ld_pruned_path(data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE): return f"gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.r{version}.{pop}.ld.pruned_set.r2_{r2}.ht" From 0f5bd4e43914caa08491c901379cee554804cebb Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Thu, 12 Sep 2024 16:12:17 -0400 Subject: [PATCH 04/13] Update generate_ld_data.py --- gnomad_qc/v4/assessment/generate_ld_data.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 770735ea3..d2c8bd366 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -50,6 +50,7 @@ def filter_mt_for_ld( # Have to re-do callstats when noted frequencies don't 100% reflect what you are calculating on # This is for test sets or 1kg_tdpg if re_call_stats: + logger.info("Regenerating call_stats...") call_stats = hl.agg.call_stats(pop_mt.GT, pop_mt.alleles) call_stats_bind = hl.bind( lambda cs: cs.annotate( @@ -240,6 +241,10 @@ def main(args): # it is already in the right place for the gnomAD MT , or whatever # 
mt = mt.annotate(pop=mt.meta.population_inference.pop) + + if args.pop: + mt = mt.filter_cols(mt.meta.population_inference.pop==args.pop) + label = 'gen_anc' if not args.hgdp_subset else 'pop' pop_data = get_pop_and_subpop_counters(mt,label=label) @@ -338,6 +343,9 @@ def main(args): parser.add_argument( "--re-call-stats", help="Regenerate callstats for LD work. Can be useful for some subsets.", action="store_true" ) + parser.add_argument( + "--pop", help="Filter to, and run on, one individual pop", type=str, + ) parser.add_argument( "--slack-channel", help="Slack channel to post results and notifications to." ) From 2d48987d767c91744de5068d29639132fea2493f Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Thu, 12 Sep 2024 16:16:47 -0400 Subject: [PATCH 05/13] run Black for formatting --- gnomad_qc/v4/assessment/generate_ld_data.py | 81 +++++++++++++-------- gnomad_qc/v4/resources/ld_resources.py | 4 +- 2 files changed, 53 insertions(+), 32 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index d2c8bd366..7fb399fb4 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -21,15 +21,16 @@ RARE_FREQ = 0.0005 -def get_pop_and_subpop_counters(mt,label): +def get_pop_and_subpop_counters(mt, label): cut_dict = { f"{label}": hl.agg.filter( - hl.is_defined(mt.meta.population_inference.pop) & (mt.meta.population_inference.pop != "oth"), + hl.is_defined(mt.meta.population_inference.pop) + & (mt.meta.population_inference.pop != "oth"), hl.agg.counter(mt.meta.population_inference.pop), ), } cut_data = mt.aggregate_cols(hl.struct(**cut_dict)) - logger.info(f'Counts: {cut_data}') + logger.info(f"Counts: {cut_data}") return cut_data @@ -125,18 +126,21 @@ def generate_ld_matrix( for label, pops in dict(pop_data).items(): for pop in pops: - pop_mt = filter_mt_for_ld( - mt, label, pop, common_only, re_call_stats - ) + pop_mt = filter_mt_for_ld(mt, label, pop, 
common_only, re_call_stats) pop_mt.rows().select("pop_freq").add_index().write( - ld_resources._ld_index_path(data_type, pop, common_only, adj, version=version), overwrite + ld_resources._ld_index_path( + data_type, pop, common_only, adj, version=version + ), + overwrite, ) ld = hl.ld_matrix(pop_mt.GT.n_alt_alleles(), pop_mt.locus, radius) if data_type != "genomes_snv_sv": ld = ld.sparsify_triangle() ld.write( - ld_resources._ld_matrix_path(data_type, pop, common_only, adj, version=version), + ld_resources._ld_matrix_path( + data_type, pop, common_only, adj, version=version + ), overwrite, ) @@ -149,14 +153,16 @@ def generate_ld_scores_from_ld_matrix( adj: bool = False, radius: int = 1000000, overwrite: bool = False, - version: str = None + version: str = None, ): # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 # mins to compute LD scores per population (~$150 total) for label, pops in dict(pop_data).items(): for pop, n in pops.items(): - ht = hl.read_table(ld_resources._ld_index_path(data_type, pop, adj=adj, version=version)) + ht = hl.read_table( + ld_resources._ld_index_path(data_type, pop, adj=adj, version=version) + ) ht = ht.filter( (ht.pop_freq.AF >= min_frequency) & (ht.pop_freq.AF <= 1 - min_frequency) @@ -167,17 +173,21 @@ def generate_ld_scores_from_ld_matrix( r2 = BlockMatrix.read( ld_resources._ld_matrix_path( - data_type, pop, min_frequency >= COMMON_FREQ, adj=adj, version=version + data_type, + pop, + min_frequency >= COMMON_FREQ, + adj=adj, + version=version, ) ) r2 = r2.filter(indices, indices) ** 2 r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0)) - out_name = ld_scores_path(data_type, pop, adj,version=version) + out_name = ld_scores_path(data_type, pop, adj, version=version) compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) -def compute_and_annotate_ld_score(ht, r2_adj, radius, 
out_name, overwrite,version): +def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite, version): starts_and_stops = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False) r2_adj = r2_adj._sparsify_row_intervals_expr(starts_and_stops, blocks_only=False) @@ -212,9 +222,11 @@ def main(args): freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict ) mt = mt.annotate_rows(freq=mt.gnomad_freq) - mt = mt.annotate_cols(meta=hl.struct( - population_inference = hl.struct( - pop=mt.gnomad_population_inference.pop))) + mt = mt.annotate_cols( + meta=hl.struct( + population_inference=hl.struct(pop=mt.gnomad_population_inference.pop) + ) + ) if args.test: mt = mt.filter_rows(mt.locus.contig == "chr22").sample_rows(0.1) @@ -226,13 +238,10 @@ def main(args): test=args.test, annotate_meta=True, release_only=True, split=True ) mt_vds = vds.variant_data - mt = mt_vds.annotate_globals( - **hl.eval(ht_release.globals) - ) + mt = mt_vds.annotate_globals(**hl.eval(ht_release.globals)) mt = mt.annotate_rows( - freq = ht_release[mt.row_key].freq, - filters = ht_release[mt.row_key].filters + freq=ht_release[mt.row_key].freq, filters=ht_release[mt.row_key].filters ) if args.test: @@ -243,17 +252,24 @@ def main(args): # mt = mt.annotate(pop=mt.meta.population_inference.pop) if args.pop: - mt = mt.filter_cols(mt.meta.population_inference.pop==args.pop) + mt = mt.filter_cols(mt.meta.population_inference.pop == args.pop) - label = 'gen_anc' if not args.hgdp_subset else 'pop' - pop_data = get_pop_and_subpop_counters(mt,label=label) + label = "gen_anc" if not args.hgdp_subset else "pop" + pop_data = get_pop_and_subpop_counters(mt, label=label) # Version is populated via ld_resources.py if None - version = None if not args.hgdp_subset else 'hgdp' + version = None if not args.hgdp_subset else "hgdp" if args.generate_ld_pruned_set: generate_ld_pruned_set( - mt, pop_data, data_type, args.r2, args.radius, args.overwrite, re_call_stats=args.re_call_stats, 
version=version + mt, + pop_data, + data_type, + args.r2, + args.radius, + args.overwrite, + re_call_stats=args.re_call_stats, + version=version, ) if args.generate_ld_matrix: @@ -265,8 +281,8 @@ def main(args): args.common_only, args.adj, args.overwrite, - re_call_stats=args.re_call_stats, - version=version + re_call_stats=args.re_call_stats, + version=version, ) if args.generate_ld_scores: @@ -280,7 +296,6 @@ def main(args): ) - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -341,10 +356,14 @@ def main(args): "--hgdp-subset", help="Use hgdp dataset for callset.", action="store_true" ) parser.add_argument( - "--re-call-stats", help="Regenerate callstats for LD work. Can be useful for some subsets.", action="store_true" + "--re-call-stats", + help="Regenerate callstats for LD work. Can be useful for some subsets.", + action="store_true", ) parser.add_argument( - "--pop", help="Filter to, and run on, one individual pop", type=str, + "--pop", + help="Filter to, and run on, one individual pop", + type=str, ) parser.add_argument( "--slack-channel", help="Slack channel to post results and notifications to." diff --git a/gnomad_qc/v4/resources/ld_resources.py b/gnomad_qc/v4/resources/ld_resources.py index dd0dd274e..c35cfebed 100644 --- a/gnomad_qc/v4/resources/ld_resources.py +++ b/gnomad_qc/v4/resources/ld_resources.py @@ -48,7 +48,9 @@ def ld_scores_path( return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.r{version}.{pop}.{"adj." 
if adj else ""}ld_scores.ht' -def ld_pruned_path(data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE): +def ld_pruned_path( + data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE +): return f"gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.r{version}.{pop}.ld.pruned_set.r2_{r2}.ht" From 7714d5219edb68ce8127d6f4d2291b964c37713a Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Tue, 24 Sep 2024 15:34:42 -0400 Subject: [PATCH 06/13] Documentation & Paths --- gnomad_qc/v4/assessment/generate_ld_data.py | 302 +++++++++++++++----- gnomad_qc/v4/resources/ld_resources.py | 40 ++- 2 files changed, 245 insertions(+), 97 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 7fb399fb4..18a12f157 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -1,27 +1,38 @@ +""" +Script to generate LD scores for gnomAD v4 Genome SNPs & Indels. 
Example usage: + +hailctl dataproc submit cluster_name generate_ld_data.py --test --overwrite --hgdp-subset --pop afr --re-call-stats \ + --generate-ld-mt \ + --generate-ld-pruned-set \ + --generate-ld-matrix \ + --generate-ld-scores +""" + import argparse import sys -import gnomad.resources.grch37.gnomad_ld as ld_resources from gnomad.utils.slack import slack_notifications from hail.linalg import BlockMatrix from hail.utils import new_temp_file -# from hail.utils.java import Env - from gnomad_qc.slack_creds import slack_token +from gnomad_qc.v3.resources.release import hgdp_tgp_subset from gnomad_qc.v4.resources import * from gnomad_qc.v4.resources.ld_resources import * -from gnomad_qc.v3.resources.release import hgdp_tgp_subset - -# this includes ld_resources - -HGDP_TGP_RELEASES COMMON_FREQ = 0.005 RARE_FREQ = 0.0005 -def get_pop_and_subpop_counters(mt, label): +def get_pop_counters(mt, label) -> dict: + """ + Calculate and return genetic ancestry counts given an mt and column label. + + :param mt: Input MT + :param label: Col field containing genetic ancestry information + :return: Dict of pop counts + """ + cut_dict = { f"{label}": hl.agg.filter( hl.is_defined(mt.meta.population_inference.pop) @@ -36,7 +47,18 @@ def get_pop_and_subpop_counters(mt, label): def filter_mt_for_ld( mt, label, pop, common_only: bool = True, re_call_stats: bool = False -): +) -> hl.MatrixTable: + """ + Filter MT to only variants and samples appropriate for LD analyses + + :param mt: Input MT to filter + :param label: Col field containing genetic ancestry information + :param pop: Given genetic ancestry group to filter to + :param common_only: Bool of whether to filter to only common variants (>0.5%) + :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. 
+ :return: MT filtered to appropriate variants and samples + """ + + frequency = COMMON_FREQ if common_only else RARE_FREQ # At the moment, ld_matrix OOMs above ~30M variants, so filtering all populations to 0.05% to cut down on variants # All work on standard machines except AFR @@ -90,7 +112,19 @@ def generate_ld_pruned_set( overwrite: bool = False, re_call_stats: bool = False, version: str = None, -): +) -> None: + """ + Generate and write set of variants uncorrelated with each other. Wrapper for hl.ld_prune() + + :param mt: Input MT to filter + :param pop_data: Dict of population counts + :param data_type: Genetic data type for exomes or genomes. Exomes are currently too sparse, only runs on genomes. + :param r2: via Hail: Squared correlation threshold (exclusive upper bound). Must be in the range [0.0, 1.0]. + :param radius: Used for bp_window_size, via Hail: Window size in base pairs (inclusive upper bound). + :param overwrite: Bool to write over previous outputs or not + :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. + :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. + """ for label, pops in dict(pop_data).items(): for pop in pops: @@ -119,7 +153,20 @@ def generate_ld_matrix( overwrite: bool = False, re_call_stats: bool = False, version: str = None, -): +) -> None: + """ + Generate and write matrix of LD correlations as Hail BlockMatrix using hl.ld_matrix. Read by generate_ld_scores_from_ld_matrix() + + :param mt: Input MT to filter and generate LDs for + :param pop_data: Dict of population counts + :param data_type: Genetic data type for exomes or genomes. Exomes are currently too sparse, only runs on genomes. + :param radius: Used for bp_window_size, via Hail: Window size in base pairs (inclusive upper bound).
+ :param common_only: Bool of whether to filter to only common variants (>0.5%) + :param adj: Whether to use adj ("adjusted" or "passing") frequency + :param overwrite: Bool to write over previous outputs or not + :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. + :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. + """ # Takes about 4 hours on 20 n1-standard-8 nodes (with SSD - not sure if necessary) per population # Total of ~37 hours ($400) @@ -129,8 +176,12 @@ def generate_ld_matrix( pop_mt = filter_mt_for_ld(mt, label, pop, common_only, re_call_stats) pop_mt.rows().select("pop_freq").add_index().write( - ld_resources._ld_index_path( - data_type, pop, common_only, adj, version=version + ld_index_path( + data_type=data_type, + pop=pop, + common_only=common_only, + adj=adj, + version=version, ), overwrite, ) @@ -138,9 +189,7 @@ def generate_ld_matrix( if data_type != "genomes_snv_sv": ld = ld.sparsify_triangle() ld.write( - ld_resources._ld_matrix_path( - data_type, pop, common_only, adj, version=version - ), + ld_matrix_path(data_type, pop, common_only, adj, version=version), overwrite, ) @@ -150,18 +199,38 @@ def generate_ld_scores_from_ld_matrix( data_type, min_frequency=0.01, call_rate_cutoff=0.8, + common_only: bool = True, adj: bool = False, radius: int = 1000000, overwrite: bool = False, version: str = None, -): +) -> None: + """ + Generate LD scores from Hail BlockMatrix written by generate_ld_scores_from_ld_matrix(). + + :param pop_data: Dict of population counts + :param data_type: Genetic data type for exomes or genomes. Exomes are currently too sparse, only runs on genomes. + :param min_frequency: Lowest frequency for variants to calculate LD scores of. + :param call_rate_cutoff: Lowest call rate for variants to calculate LD scores of. 
+ :param common_only: Bool of whether to filter to only common variants (>0.5%) + :param adj: Whether to use adj ("adjusted" or "passing") frequency + :param radius: Used for bp_window_size, via Hail: Window size in base pairs (inclusive upper bound). + :param overwrite: Bool to write over previous outputs or not + :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. + """ # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 # mins to compute LD scores per population (~$150 total) for label, pops in dict(pop_data).items(): for pop, n in pops.items(): ht = hl.read_table( - ld_resources._ld_index_path(data_type, pop, adj=adj, version=version) + ld_index_path( + data_type=data_type, + pop=pop, + common_only=common_only, + adj=adj, + version=version, + ) ) ht = ht.filter( (ht.pop_freq.AF >= min_frequency) @@ -172,7 +241,7 @@ def generate_ld_scores_from_ld_matrix( indices = ht.idx.collect() r2 = BlockMatrix.read( - ld_resources._ld_matrix_path( + ld_matrix_path( data_type, pop, min_frequency >= COMMON_FREQ, @@ -184,10 +253,21 @@ def generate_ld_scores_from_ld_matrix( r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0)) out_name = ld_scores_path(data_type, pop, adj, version=version) - compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) + compute_and_annotate_ld_score( + ht, r2_adj, radius, out_name, overwrite, version=version + ) + +def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) -> None: + """ + Annotate LD scores onto Hail Table containing variants and write to path. -def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite, version): + :param ht: Hail Table of variants and LD indices + :param r2: via Hail: Squared correlation threshold (exclusive upper bound). Must be in the range [0.0, 1.0]. 
+ :param radius: Used for bp_window_size, via Hail: Window size in base pairs (inclusive upper bound). + :param out_name: Path to write outputs to. Designed to be output of ld_scores_path() + :param overwrite: Bool to write over previous outputs or not + """ starts_and_stops = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False) r2_adj = r2_adj._sparsify_row_intervals_expr(starts_and_stops, blocks_only=False) @@ -208,7 +288,11 @@ def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite, versi def main(args): - hl.init(log="/ld_assessment.log", tmp_dir="gs://gnomad-tmp-4day/ld_tmp/") + hl.init( + log="/ld_assessment.log", + tmp_dir="gs://gnomad-tmp-4day/ld_tmp/", + gcs_requester_pays_configuration="broad-mpg-gnomad", + ) hl._set_flags(use_new_shuffle="1") # Only run on genomes so far, no choice to run on exomes, too sparse @@ -216,83 +300,148 @@ def main(args): if args.exomes: raise NotImplementedError("LD Code does not work for exomes yet. Too sparse:(") - if args.hgdp_subset: - mt = hgdp_tgp_subset(dense=True, public=True).mt() - mt = mt.annotate_globals( - freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict + # Define command line args used more than once + test = args.test + is_hgdp = args.hgdp_subset + overwrite = args.overwrite + + if test and not args.re_call_stats: + logger.info( + "WARNING: Test subsets without re-called stats may result in None or inaccurate LD scores." 
) - mt = mt.annotate_rows(freq=mt.gnomad_freq) - mt = mt.annotate_cols( - meta=hl.struct( - population_inference=hl.struct(pop=mt.gnomad_population_inference.pop) + + # Version is populated via ld_resources.py if None + version = None if not is_hgdp else "hgdp" + + # Path for LD MT includes all pops by design + ld_mt_path = ld_mt_checkpoint_path( + data_type="genomes", pops="all_pops", version=version, test=test + ) + + def _ld_test_mt( + mt: hl.MatrixTable, + filter_contig: bool = False, + sample_cols: bool = False, + ): + if filter_contig: + mt = mt.filter_rows(mt.locus.contig == "chr22") + if sample_cols: + mt = mt.sample_cols(0.1) + + return mt + + def _generate_ld_mt( + version: str = version, + is_hgdp: bool = False, + test: bool = False, + ld_mt_path: str = ld_mt_path, + overwrite=overwrite, + ) -> hl.MatrixTable: + + if is_hgdp: + mt = hgdp_tgp_subset(dense=True, public=True).mt() + mt = mt.annotate_globals( + freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict + ) + mt = mt.annotate_rows(freq=mt.gnomad_freq) + mt = mt.annotate_cols( + meta=hl.struct( + population_inference=hl.struct( + pop=mt.gnomad_population_inference.pop + ) + ) ) - ) - if args.test: - mt = mt.filter_rows(mt.locus.contig == "chr22").sample_rows(0.1) - mt = mt.filter_cols(mt.meta.population_inference.pop == "afr") + if test: + logger.info( + "Filter HGDP to Test, to chr22, and downsample cols by 0.1..." 
+ ) + mt = _ld_test_mt(mt, filter_contig=True, sample_cols=True) - else: - ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) - vds = basics.get_gnomad_v4_genomes_vds( - test=args.test, annotate_meta=True, release_only=True, split=True - ) - mt_vds = vds.variant_data - mt = mt_vds.annotate_globals(**hl.eval(ht_release.globals)) + else: + ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) + vds = basics.get_gnomad_v4_genomes_vds( + test=test, annotate_meta=True, release_only=True, split=True + ) + mt_vds = vds.variant_data + mt = mt_vds.annotate_globals(**hl.eval(ht_release.globals)) - mt = mt.annotate_rows( - freq=ht_release[mt.row_key].freq, filters=ht_release[mt.row_key].filters - ) + if test: + logger.info( + "Filter Test VDS to Chr22, do not touch sample level information, then naive coalesce to 1000 partitions..." + ) + mt = _ld_test_mt(mt, filter_contig=True, sample_cols=False) + mt = mt.naive_coalesce(1000) + + mt = mt.annotate_rows( + freq=ht_release[mt.row_key].freq, filters=ht_release[mt.row_key].filters + ) - if args.test: - mt = mt.filter_rows(mt.locus.contig == "chr22").sample_rows(0.1) - mt = mt.filter_cols(mt.meta.population_inference.pop == "afr") + # Select and checkpoint to speed up calculations + mt = mt.select_rows(mt.freq, mt.filters) + mt = mt.select_cols(mt.meta) - # it is already in the right place for the gnomAD MT , or whatever - # mt = mt.annotate(pop=mt.meta.population_inference.pop) + logger.info(f"Checkpoint MT to {ld_mt_path} ...") + mt = mt.checkpoint(ld_mt_path, overwrite=overwrite) + + return mt + + # Read in Hail MatrixTable for analysis + mt = None + if args.generate_ld_mt: + mt = _generate_ld_mt( + version=version, is_hgdp=is_hgdp, test=test, ld_mt_path=ld_mt_path + ) + else: + mt = hl.read_matrix_table(ld_mt_path) if args.pop: mt = mt.filter_cols(mt.meta.population_inference.pop == args.pop) + mt = mt.checkpoint( + ld_mt_checkpoint_path( + data_type="genomes", pops=args.pop, 
version=version, test=test + ), + overwrite=overwrite, + ) - label = "gen_anc" if not args.hgdp_subset else "pop" - pop_data = get_pop_and_subpop_counters(mt, label=label) - - # Version is populated via ld_resources.py if None - version = None if not args.hgdp_subset else "hgdp" + label = "gen_anc" if not is_hgdp else "pop" + pop_data = get_pop_counters(mt, label=label) if args.generate_ld_pruned_set: generate_ld_pruned_set( - mt, - pop_data, - data_type, - args.r2, - args.radius, - args.overwrite, + mt=mt, + pop_data=pop_data, + data_type=data_type, + r2=args.r2, + radius=args.radius, + overwrite=overwrite, re_call_stats=args.re_call_stats, version=version, ) if args.generate_ld_matrix: generate_ld_matrix( - mt, - pop_data, - data_type, - args.radius, - args.common_only, - args.adj, - args.overwrite, + mt=mt, + pop_data=pop_data, + data_type=data_type, + radius=args.radius, + common_only=args.common_only, + adj=args.adj, + overwrite=overwrite, re_call_stats=args.re_call_stats, version=version, ) if args.generate_ld_scores: generate_ld_scores_from_ld_matrix( - pop_data, - data_type, - args.min_frequency, - args.min_call_rate, - args.adj, - overwrite=args.overwrite, + pop_data=pop_data, + data_type=data_type, + min_frequency=args.min_frequency, + call_rate_cutoff=args.min_call_rate, + common_only=args.common_only, + adj=args.adj, + version=version, + overwrite=overwrite, ) @@ -303,6 +452,11 @@ def main(args): help="Input MT is exomes. One of --exomes or --genomes is required.", action="store_true", ) + parser.add_argument( + "--generate-ld-mt", + help="Generate Hail MatrixTable of variants and calls to calculate LD for. 
", + action="store_true", + ) parser.add_argument( "--generate-ld-pruned-set", help="Calculates LD pruned set of variants", diff --git a/gnomad_qc/v4/resources/ld_resources.py b/gnomad_qc/v4/resources/ld_resources.py index c35cfebed..7eeda4a88 100644 --- a/gnomad_qc/v4/resources/ld_resources.py +++ b/gnomad_qc/v4/resources/ld_resources.py @@ -3,10 +3,6 @@ from typing import Optional from gnomad.resources.grch38.gnomad import CURRENT_EXOME_RELEASE, CURRENT_GENOME_RELEASE -from gnomad.resources.resource_utils import ( - GnomadPublicBlockMatrixResource, - GnomadPublicTableResource, -) def ld_matrix_path( @@ -20,7 +16,7 @@ def ld_matrix_path( version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.r{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.bm' + return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.bm' def ld_index_path( @@ -34,7 +30,7 @@ def ld_index_path( version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.r{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.variant_indices.ht' + return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.variant_indices.ht' # @@ -45,25 +41,23 @@ def ld_scores_path( version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.r{version}.{pop}.{"adj." if adj else ""}ld_scores.ht' + return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.{version}.{pop}.{"adj." 
if adj else ""}ld_scores.ht' -def ld_pruned_path( - data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE +def ld_mt_checkpoint_path( + data_type: str, + pops: str = None, + version: str = CURRENT_GENOME_RELEASE, + test: bool = False, ): - return f"gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.r{version}.{pop}.ld.pruned_set.r2_{r2}.ht" - - -def ld_matrix(pop: str) -> GnomadPublicBlockMatrixResource: - """Get resource for the LD matrix for the given population.""" - return GnomadPublicBlockMatrixResource(path=ld_matrix_path("genomes", pop)) - - -def ld_index(pop: str) -> GnomadPublicTableResource: - """Get resource for the LD indices for the given population.""" - return GnomadPublicTableResource(path=ld_index_path("genomes", pop)) + if version is None: + version = CURRENT_GENOME_RELEASE + return f'gs://gnomad-tmp-30day/ld/gnomad.{data_type}.{"all_pops" if not pops else f"{pops}"}.{version}{f".test" if test else ""}.mt' -def ld_scores(pop: str) -> GnomadPublicTableResource: - """Get resource for the LD scores for the given population.""" - return GnomadPublicTableResource(path=ld_scores_path("genomes", pop)) +def ld_pruned_path( + data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE +): + if version is None: + version = CURRENT_GENOME_RELEASE + return f"gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.{version}.{pop}.ld.pruned_set.r2_{r2}.ht" From 331cd125d8e57fb2f269dd2b69ad9d1286ee878f Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Tue, 24 Sep 2024 16:19:49 -0400 Subject: [PATCH 07/13] added better test params --- gnomad_qc/v4/assessment/generate_ld_data.py | 32 ++++++++++++++++----- gnomad_qc/v4/resources/ld_resources.py | 14 +++++---- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 18a12f157..22f6aec81 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ 
b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -112,6 +112,7 @@ def generate_ld_pruned_set( overwrite: bool = False, re_call_stats: bool = False, version: str = None, + test: bool = False, ) -> None: """ Generate and write set of variants uncorrelated with eachother. Wrapper for hl.ld_prune() @@ -124,6 +125,7 @@ def generate_ld_pruned_set( :param overwrite: Bool to write over previous outputs or not :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. + :param test: Filter to test versions and/or write to test paths. """ for label, pops in dict(pop_data).items(): @@ -138,7 +140,7 @@ def generate_ld_pruned_set( ht = pop_mt.rows().select("pop_freq") ht = ht.filter(hl.is_defined(pruned_ht[ht.key])) - ld_path = ld_pruned_path(data_type, pop, r2, version=version) + ld_path = ld_pruned_path(data_type, pop, r2, version=version, test=test) logger.info(f"Writing pruned ht for {pop} to {ld_path} ...") ht.write(ld_path, overwrite) @@ -153,6 +155,7 @@ def generate_ld_matrix( overwrite: bool = False, re_call_stats: bool = False, version: str = None, + test: bool = False, ) -> None: """ Generate and write matrix of LD correlcations as Hail BlockMatrix using hl.ld_matrix. Read by generate_ld_scores_from_ld_matrix() @@ -166,6 +169,7 @@ def generate_ld_matrix( :param overwrite: Bool to write over previous outputs or not :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. + :param test: Filter to test versions and/or write to test paths. 
""" # Takes about 4 hours on 20 n1-standard-8 nodes (with SSD - not sure if necessary) per population # Total of ~37 hours ($400) @@ -182,6 +186,7 @@ def generate_ld_matrix( common_only=common_only, adj=adj, version=version, + test=test, ), overwrite, ) @@ -189,7 +194,7 @@ def generate_ld_matrix( if data_type != "genomes_snv_sv": ld = ld.sparsify_triangle() ld.write( - ld_matrix_path(data_type, pop, common_only, adj, version=version), + ld_matrix_path(data_type, pop, common_only, adj, version=version, test=test), overwrite, ) @@ -204,6 +209,7 @@ def generate_ld_scores_from_ld_matrix( radius: int = 1000000, overwrite: bool = False, version: str = None, + test: bool = False, ) -> None: """ Generate LD scores from Hail BlockMatrix written by generate_ld_scores_from_ld_matrix(). @@ -217,6 +223,7 @@ def generate_ld_scores_from_ld_matrix( :param radius: Used for bp_window_size, via Hail: Window size in base pairs (inclusive upper bound). :param overwrite: Bool to write over previous outputs or not :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. + :param test: Filter to test versions and/or write to test paths. 
""" # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 @@ -230,6 +237,7 @@ def generate_ld_scores_from_ld_matrix( common_only=common_only, adj=adj, version=version, + test=test ) ) ht = ht.filter( @@ -247,14 +255,15 @@ def generate_ld_scores_from_ld_matrix( min_frequency >= COMMON_FREQ, adj=adj, version=version, + test=test ) ) r2 = r2.filter(indices, indices) ** 2 r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0)) - out_name = ld_scores_path(data_type, pop, adj, version=version) + out_name = ld_scores_path(data_type, pop, adj, version=version, test=test) compute_and_annotate_ld_score( - ht, r2_adj, radius, out_name, overwrite, version=version + ht, r2_adj, radius, out_name, overwrite ) @@ -324,8 +333,10 @@ def _ld_test_mt( sample_cols: bool = False, ): if filter_contig: + logger.info('Filtering to chr22...') mt = mt.filter_rows(mt.locus.contig == "chr22") if sample_cols: + logger.info('Downsampling all cols by 0.1...') mt = mt.sample_cols(0.1) return mt @@ -339,6 +350,7 @@ def _generate_ld_mt( ) -> hl.MatrixTable: if is_hgdp: + logger.info('Reading in HGDP_TGP Subset...') mt = hgdp_tgp_subset(dense=True, public=True).mt() mt = mt.annotate_globals( freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict @@ -353,12 +365,10 @@ def _generate_ld_mt( ) if test: - logger.info( - "Filter HGDP to Test, to chr22, and downsample cols by 0.1..." 
- ) mt = _ld_test_mt(mt, filter_contig=True, sample_cols=True) else: + logger.info('Reading in gnomAD release HT and VDS...') ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) vds = basics.get_gnomad_v4_genomes_vds( test=test, annotate_meta=True, release_only=True, split=True @@ -389,6 +399,7 @@ def _generate_ld_mt( # Read in Hail MatrixTable for analysis mt = None if args.generate_ld_mt: + logger.info('Generating MT for LD with all genetic ancestry groups...') mt = _generate_ld_mt( version=version, is_hgdp=is_hgdp, test=test, ld_mt_path=ld_mt_path ) @@ -396,6 +407,7 @@ def _generate_ld_mt( mt = hl.read_matrix_table(ld_mt_path) if args.pop: + logger.info(f'Filtering to {args.pop}...') mt = mt.filter_cols(mt.meta.population_inference.pop == args.pop) mt = mt.checkpoint( ld_mt_checkpoint_path( @@ -408,6 +420,7 @@ def _generate_ld_mt( pop_data = get_pop_counters(mt, label=label) if args.generate_ld_pruned_set: + logger.info('Generating LD Pruned Set of uncorrelated variants, using hl.ld_prune()...') generate_ld_pruned_set( mt=mt, pop_data=pop_data, @@ -417,9 +430,11 @@ def _generate_ld_mt( overwrite=overwrite, re_call_stats=args.re_call_stats, version=version, + test=test, ) if args.generate_ld_matrix: + logger.info('Generating Hail BlockMatrix of variants correlations to other variants, using hl.ld_matrix()...') generate_ld_matrix( mt=mt, pop_data=pop_data, @@ -430,9 +445,11 @@ def _generate_ld_mt( overwrite=overwrite, re_call_stats=args.re_call_stats, version=version, + test=test ) if args.generate_ld_scores: + logger.info('Generating in LD scores and annotating onto variant HT...') generate_ld_scores_from_ld_matrix( pop_data=pop_data, data_type=data_type, @@ -442,6 +459,7 @@ def _generate_ld_mt( adj=args.adj, version=version, overwrite=overwrite, + test=test ) diff --git a/gnomad_qc/v4/resources/ld_resources.py b/gnomad_qc/v4/resources/ld_resources.py index 7eeda4a88..db0a880b5 100644 --- a/gnomad_qc/v4/resources/ld_resources.py +++ 
b/gnomad_qc/v4/resources/ld_resources.py @@ -11,12 +11,13 @@ def ld_matrix_path( common_only: bool = True, adj: bool = True, version: Optional[str] = None, + test: bool = False ): if version is None: version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.bm' + return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.bm' def ld_index_path( @@ -25,23 +26,24 @@ def ld_index_path( common_only: bool = True, adj: bool = True, version: Optional[str] = None, + test: bool = False ): if version is None: version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.variant_indices.ht' + return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.variant_indices.ht' # def ld_scores_path( - data_type: str, pop: str, adj: bool = True, version: Optional[str] = None + data_type: str, pop: str, adj: bool = True, version: Optional[str] = None, test: bool = False ): if version is None: version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.{version}.{pop}.{"adj." if adj else ""}ld_scores.ht' + return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.{"adj." 
if adj else ""}ld_scores.ht' def ld_mt_checkpoint_path( @@ -56,8 +58,8 @@ def ld_mt_checkpoint_path( def ld_pruned_path( - data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE + data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE, test: bool = False ): if version is None: version = CURRENT_GENOME_RELEASE - return f"gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.{version}.{pop}.ld.pruned_set.r2_{r2}.ht" + return f'gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.ld.pruned_set.r2_{r2}.ht' From 5f0ecdd661b9b0a89c37fb01b166e055c38f7219 Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Tue, 24 Sep 2024 16:37:03 -0400 Subject: [PATCH 08/13] common_only logic, formatting... --- gnomad_qc/v4/assessment/generate_ld_data.py | 62 +++++++++++++-------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 22f6aec81..a00d2ff3b 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -125,7 +125,7 @@ def generate_ld_pruned_set( :param overwrite: Bool to write over previous outputs or not :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. - :param test: Filter to test versions and/or write to test paths. + :param test: Filter to test versions and/or write to test paths. """ for label, pops in dict(pop_data).items(): @@ -169,7 +169,7 @@ def generate_ld_matrix( :param overwrite: Bool to write over previous outputs or not :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. 
- :param test: Filter to test versions and/or write to test paths. + :param test: Filter to test versions and/or write to test paths. """ # Takes about 4 hours on 20 n1-standard-8 nodes (with SSD - not sure if necessary) per population # Total of ~37 hours ($400) @@ -194,7 +194,14 @@ def generate_ld_matrix( if data_type != "genomes_snv_sv": ld = ld.sparsify_triangle() ld.write( - ld_matrix_path(data_type, pop, common_only, adj, version=version, test=test), + ld_matrix_path( + data_type, + pop, + common_only=common_only, + adj=adj, + version=version, + test=test, + ), overwrite, ) @@ -223,11 +230,18 @@ def generate_ld_scores_from_ld_matrix( :param radius: Used for bp_window_size, via Hail: Window size in base pairs (inclusive upper bound). :param overwrite: Bool to write over previous outputs or not :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. - :param test: Filter to test versions and/or write to test paths. + :param test: Filter to test versions and/or write to test paths. """ # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 # mins to compute LD scores per population (~$150 total) + + freq_cutoff = COMMON_FREQ if common_only else RARE_FREQ + + logger.info( + f'Min frequency of {min_frequency} is {">=" if min_frequency >= my_cutoff else "<"} cutoff of {freq_cutoff}...' 
+ ) + for label, pops in dict(pop_data).items(): for pop, n in pops.items(): ht = hl.read_table( @@ -237,7 +251,7 @@ def generate_ld_scores_from_ld_matrix( common_only=common_only, adj=adj, version=version, - test=test + test=test, ) ) ht = ht.filter( @@ -250,21 +264,19 @@ def generate_ld_scores_from_ld_matrix( r2 = BlockMatrix.read( ld_matrix_path( - data_type, - pop, - min_frequency >= COMMON_FREQ, + data_type=data_type, + pop=pop, + common_only=common_only, adj=adj, version=version, - test=test + test=test, ) ) r2 = r2.filter(indices, indices) ** 2 r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0)) out_name = ld_scores_path(data_type, pop, adj, version=version, test=test) - compute_and_annotate_ld_score( - ht, r2_adj, radius, out_name, overwrite - ) + compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) -> None: @@ -333,10 +345,10 @@ def _ld_test_mt( sample_cols: bool = False, ): if filter_contig: - logger.info('Filtering to chr22...') + logger.info("Filtering to chr22...") mt = mt.filter_rows(mt.locus.contig == "chr22") if sample_cols: - logger.info('Downsampling all cols by 0.1...') + logger.info("Downsampling all cols by 0.1...") mt = mt.sample_cols(0.1) return mt @@ -350,7 +362,7 @@ def _generate_ld_mt( ) -> hl.MatrixTable: if is_hgdp: - logger.info('Reading in HGDP_TGP Subset...') + logger.info("Reading in HGDP_TGP Subset...") mt = hgdp_tgp_subset(dense=True, public=True).mt() mt = mt.annotate_globals( freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict @@ -368,7 +380,7 @@ def _generate_ld_mt( mt = _ld_test_mt(mt, filter_contig=True, sample_cols=True) else: - logger.info('Reading in gnomAD release HT and VDS...') + logger.info("Reading in gnomAD release HT and VDS...") ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) vds = basics.get_gnomad_v4_genomes_vds( test=test, annotate_meta=True, release_only=True, 
split=True @@ -399,7 +411,7 @@ def _generate_ld_mt( # Read in Hail MatrixTable for analysis mt = None if args.generate_ld_mt: - logger.info('Generating MT for LD with all genetic ancestry groups...') + logger.info("Generating MT for LD with all genetic ancestry groups...") mt = _generate_ld_mt( version=version, is_hgdp=is_hgdp, test=test, ld_mt_path=ld_mt_path ) @@ -407,7 +419,7 @@ def _generate_ld_mt( mt = hl.read_matrix_table(ld_mt_path) if args.pop: - logger.info(f'Filtering to {args.pop}...') + logger.info(f"Filtering to {args.pop}...") mt = mt.filter_cols(mt.meta.population_inference.pop == args.pop) mt = mt.checkpoint( ld_mt_checkpoint_path( @@ -420,7 +432,9 @@ def _generate_ld_mt( pop_data = get_pop_counters(mt, label=label) if args.generate_ld_pruned_set: - logger.info('Generating LD Pruned Set of uncorrelated variants, using hl.ld_prune()...') + logger.info( + "Generating LD Pruned Set of uncorrelated variants, using hl.ld_prune()..." + ) generate_ld_pruned_set( mt=mt, pop_data=pop_data, @@ -434,7 +448,9 @@ def _generate_ld_mt( ) if args.generate_ld_matrix: - logger.info('Generating Hail BlockMatrix of variants correlations to other variants, using hl.ld_matrix()...') + logger.info( + "Generating Hail BlockMatrix of variants correlations to other variants, using hl.ld_matrix()..." 
+ ) generate_ld_matrix( mt=mt, pop_data=pop_data, @@ -445,11 +461,11 @@ def _generate_ld_mt( overwrite=overwrite, re_call_stats=args.re_call_stats, version=version, - test=test + test=test, ) if args.generate_ld_scores: - logger.info('Generating in LD scores and annotating onto variant HT...') + logger.info("Generating in LD scores and annotating onto variant HT...") generate_ld_scores_from_ld_matrix( pop_data=pop_data, data_type=data_type, @@ -459,7 +475,7 @@ def _generate_ld_mt( adj=args.adj, version=version, overwrite=overwrite, - test=test + test=test, ) From 49e9733843b65f7edaad3077e45721c08ff588a0 Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Tue, 24 Sep 2024 16:50:32 -0400 Subject: [PATCH 09/13] cutoff variable --- gnomad_qc/v4/assessment/generate_ld_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index a00d2ff3b..fa4b6acc2 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -239,7 +239,7 @@ def generate_ld_scores_from_ld_matrix( freq_cutoff = COMMON_FREQ if common_only else RARE_FREQ logger.info( - f'Min frequency of {min_frequency} is {">=" if min_frequency >= my_cutoff else "<"} cutoff of {freq_cutoff}...' + f'Min frequency of {min_frequency} is {">=" if min_frequency >= freq_cutoff else "<"} cutoff of {freq_cutoff}...' 
) for label, pops in dict(pop_data).items(): From e25706a7051e97cb26dc6e34b0d55194bad033f2 Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Tue, 28 Jan 2025 15:55:46 -0500 Subject: [PATCH 10/13] better gnomAD VDS performance still pretty slow checkpointing chr22 before join/annotation , and does not work for chr1 (stalls out) --- gnomad_qc/v4/assessment/generate_ld_data.py | 160 ++++++++++++++++---- 1 file changed, 127 insertions(+), 33 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index fa4b6acc2..9766c210d 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -45,6 +45,55 @@ def get_pop_counters(mt, label) -> dict: return cut_data +def _filter_ht_for_ld( + ht, + label="gen_anc", + pop="eas", + common_only: bool = True, + re_call_stats: bool = False, +) -> hl.MatrixTable: + """ + Filter HT to only variants appropriate for LD analyses for a given genetic ancestry. + + :param mt: Input HT to filter + :param label: Col field containing genetic ancestry information + :param pop: Given genetic ancestry group to filter to + :param common_only: Bool of whether to filter to only common variants (>0.5%) + :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. 
+ :return: MT filtered to appropriate variants and samples + """ + + frequency = COMMON_FREQ if common_only else RARE_FREQ + # At the moment, ld_matrix OOMs above ~30M variants, so filtering all populations to 0.05% to cut down on variants + # All work on standard machines except AFR + # Also creating `common_only` ones for most practical uses (above 0.5%) + # pop_mt = mt.filter_cols(mt.meta.population_inference.pop == pop) # irrelevant for HT + meta_index = ( + hl.enumerate(ht.freq_meta) + .find(lambda f: (f[1].get(label) == pop)) + .collect()[0][0] + ) + # No support for SV datasets in gnomAD v4 Production team code + # Have to re-do callstats when noted frequencies don't 100% reflect what you are calculating on + # This is for test sets or 1kg_tdpg + # CANNOT re_call stats without GT + ht = ht.filter((hl.len(ht.filters) == 0)) + pop_freq = ht.freq[meta_index] + ht = ht.annotate(pop_freq=pop_freq) + + ht = ht.filter( + (ht.pop_freq.AC > 1) + & (ht.pop_freq.AN - ht.pop_freq.AC > 1) + & (ht.pop_freq.AF > frequency) + & + # the following filter is needed for "het-only" monomorphic sites + # as otherwise variance is 0, and so, row_correlation errors out + ~((ht.pop_freq.AF == 0.5) & (ht.pop_freq.homozygote_count == 0)) + ) + + return ht + + def filter_mt_for_ld( mt, label, pop, common_only: bool = True, re_call_stats: bool = False ) -> hl.MatrixTable: @@ -134,6 +183,14 @@ def generate_ld_pruned_set( pop_mt = filter_mt_for_ld(mt, label, pop, re_call_stats) logger.info(f"Count after filter_mt_for_ld() : {pop_mt.count()}") + # now we checkpoint + # pop_mt = pop_mt.checkpoint( + # ld_mt_checkpoint_path( + # data_type="genomes", pops=pop, version=version, test=test + # ), + # overwrite=overwrite, + # ) + pruned_ht = hl.ld_prune(pop_mt.GT, r2=float(r2), bp_window_size=radius) logger.info(f"Count after hl.ld_prune() : {pruned_ht.count()}") @@ -325,6 +382,8 @@ def main(args): test = args.test is_hgdp = args.hgdp_subset overwrite = args.overwrite + pop = args.pop + chrom = 
args.chrom if test and not args.re_call_stats: logger.info( @@ -336,7 +395,7 @@ def main(args): # Path for LD MT includes all pops by design ld_mt_path = ld_mt_checkpoint_path( - data_type="genomes", pops="all_pops", version=version, test=test + data_type="genomes", pops=pop, version=version, test=test ) def _ld_test_mt( @@ -358,6 +417,7 @@ def _generate_ld_mt( is_hgdp: bool = False, test: bool = False, ld_mt_path: str = ld_mt_path, + pop: str = "eas", overwrite=overwrite, ) -> hl.MatrixTable: @@ -376,61 +436,88 @@ def _generate_ld_mt( ) ) + mt = mt.filter_cols(mt.meta.population_inference.pop == "pop") + if test: mt = _ld_test_mt(mt, filter_contig=True, sample_cols=True) + mt = mt.checkpoint( + ld_mt_path, + overwrite=overwrite, + ) + else: logger.info("Reading in gnomAD release HT and VDS...") + filter_chrom = None + if test: + filter_chrom = "chr22" + elif chrom: + filter_chrom = chrom + ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) + ht_filter = _filter_ht_for_ld(ht_release,label='gen_anc',pop=pop) + if filter_chrom: + ht_filter = ht_filter.filter(ht_filter.locus.contig==filter_chrom) + + ht_filter = ht_filter.select('freq','filters').checkpoint(new_temp_file()) + vds = basics.get_gnomad_v4_genomes_vds( - test=test, annotate_meta=True, release_only=True, split=True + test=test, + annotate_meta=True, + release_only=True, + split=True, + chrom=filter_chrom, + filter_variant_ht=ht_filter, + entries_to_keep=["GT"], ) - mt_vds = vds.variant_data - mt = mt_vds.annotate_globals(**hl.eval(ht_release.globals)) - if test: + if test or chrom: logger.info( - "Filter Test VDS to Chr22, do not touch sample level information, then naive coalesce to 1000 partitions..." 
+ f"Due to Test or Chrom behavior, only working on {filter_chrom} - do naive coalesce to 1,000 partitions" ) - mt = _ld_test_mt(mt, filter_contig=True, sample_cols=False) - mt = mt.naive_coalesce(1000) + mt = vds.variant_data.naive_coalesce(1000) + else: + mt = vds.variant_data.naive_coalesce(9800) + logger.info( + f"Filtering to relevant cols - only meta - and no VDS/MT rows..." + ) + mt = mt.select_cols(mt.meta) + mt = mt.select_rows() + logger.info(f"Filtering to {args.pop}...") + mt = mt.filter_cols(mt.meta.population_inference.pop == pop) + + # NOTE: QUESTION IF THIS SHOULD BE RAN OR NOT + logger.info(f"Checkpointing {pop} on {chrom if chrom else default_str}") + mt = mt.checkpoint(new_temp_file()) + + # This part begins potential performance concerns - what is up here? + # Ideally, would checkpoint both of these beforehand. + # Is very slow if MT isn't checkpointed + # Checkpointing said MT is also very slow... mt = mt.annotate_rows( - freq=ht_release[mt.row_key].freq, filters=ht_release[mt.row_key].filters + freq=ht_filter[mt.row_key].freq, filters=ht_filter[mt.row_key].filters ) + mt = mt.annotate_globals(**hl.eval(ht_filter.globals)) # Select and checkpoint to speed up calculations - mt = mt.select_rows(mt.freq, mt.filters) - mt = mt.select_cols(mt.meta) - logger.info(f"Checkpoint MT to {ld_mt_path} ...") - mt = mt.checkpoint(ld_mt_path, overwrite=overwrite) + # logger.info(f"DO NOT Checkpoint MT to {ld_mt_path} ...") + # mt = mt.checkpoint(ld_mt_path, overwrite=overwrite) return mt - # Read in Hail MatrixTable for analysis - mt = None - if args.generate_ld_mt: - logger.info("Generating MT for LD with all genetic ancestry groups...") - mt = _generate_ld_mt( - version=version, is_hgdp=is_hgdp, test=test, ld_mt_path=ld_mt_path - ) - else: - mt = hl.read_matrix_table(ld_mt_path) - - if args.pop: - logger.info(f"Filtering to {args.pop}...") - mt = mt.filter_cols(mt.meta.population_inference.pop == args.pop) - mt = mt.checkpoint( - ld_mt_checkpoint_path( - 
data_type="genomes", pops=args.pop, version=version, test=test - ), - overwrite=overwrite, - ) + mt = _generate_ld_mt( + version=version, is_hgdp=is_hgdp, test=test, ld_mt_path=ld_mt_path + ) + + # Something needs checkpointed before heading into generate_ld_pruned_set() + mt = mt.checkpoint(new_temp_file()) # skip if prior step is being checkpointed label = "gen_anc" if not is_hgdp else "pop" pop_data = get_pop_counters(mt, label=label) + # Previously established that THIS needs a checkpoint to have been ran to work effectively if args.generate_ld_pruned_set: logger.info( "Generating LD Pruned Set of uncorrelated variants, using hl.ld_prune()..." @@ -550,8 +637,15 @@ def _generate_ld_mt( ) parser.add_argument( "--pop", - help="Filter to, and run on, one individual pop", + help="Which individual pop to run LD on", + type=str, + required=True, + ) + parser.add_argument( + "--chrom", + help="Which individual chromosome to run LD on", type=str, + required=False, ) parser.add_argument( "--slack-channel", help="Slack channel to post results and notifications to." 
From b127899018e19d9ca459b236f77089f541c4f750 Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Mon, 7 Apr 2025 10:13:27 -0400 Subject: [PATCH 11/13] April LD Code Code for LD, pending VDS performance investigation --- gnomad_qc/v4/assessment/generate_ld_data.py | 711 +++++++++++++------- gnomad_qc/v4/resources/ld_resources.py | 50 +- 2 files changed, 511 insertions(+), 250 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 9766c210d..6b2f9abf8 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -11,6 +11,7 @@ import argparse import sys +from gnomad.utils.annotations import get_adj_expr from gnomad.utils.slack import slack_notifications from hail.linalg import BlockMatrix from hail.utils import new_temp_file @@ -18,10 +19,15 @@ from gnomad_qc.slack_creds import slack_token from gnomad_qc.v3.resources.release import hgdp_tgp_subset from gnomad_qc.v4.resources import * -from gnomad_qc.v4.resources.ld_resources import * -COMMON_FREQ = 0.005 -RARE_FREQ = 0.0005 +EXPECTED_CONTIGS = [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY"] + +from typing import Optional + +from gnomad.resources.grch38.gnomad import CURRENT_EXOME_RELEASE, CURRENT_GENOME_RELEASE + +# Importing paths and Common+Rare Freqs +from gnomad_qc.v4.resources.ld_resources import * def get_pop_counters(mt, label) -> dict: @@ -45,12 +51,32 @@ def get_pop_counters(mt, label) -> dict: return cut_data +def _filter_call_stats( + ht, + freq: float = COMMON_FREQ, + ac_cutoff: int = 1, +) -> hl.Table: + """ + Filter ht to relevant variants. Must have field popfreq. 
This code is used for filtering ht and mt + """ + + ht = ht.filter( + (ht.pop_freq.AC > ac_cutoff) + & (ht.pop_freq.AN - ht.pop_freq.AC > 1) + & (ht.pop_freq.AF > freq) + & ~((ht.pop_freq.AF == 0.5) & (ht.pop_freq.homozygote_count == 0)) + # the above filter is needed for "het-only" monomorphic sites + # as otherwise variance is 0, and so, row_correlation errors out + ) + return ht + + def _filter_ht_for_ld( ht, + freq: float = COMMON_FREQ, label="gen_anc", pop="eas", - common_only: bool = True, - re_call_stats: bool = False, + ac_cutoff: int = 1, ) -> hl.MatrixTable: """ Filter HT to only variants appropriate for LD analyses for a given genetic ancestry. @@ -62,17 +88,12 @@ def _filter_ht_for_ld( :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. :return: MT filtered to appropriate variants and samples """ - - frequency = COMMON_FREQ if common_only else RARE_FREQ - # At the moment, ld_matrix OOMs above ~30M variants, so filtering all populations to 0.05% to cut down on variants - # All work on standard machines except AFR - # Also creating `common_only` ones for most practical uses (above 0.5%) - # pop_mt = mt.filter_cols(mt.meta.population_inference.pop == pop) # irrelevant for HT meta_index = ( hl.enumerate(ht.freq_meta) .find(lambda f: (f[1].get(label) == pop)) .collect()[0][0] ) + # No support for SV datasets in gnomAD v4 Production team code # Have to re-do callstats when noted frequencies don't 100% reflect what you are calculating on # This is for test sets or 1kg_tdpg @@ -81,21 +102,18 @@ def _filter_ht_for_ld( pop_freq = ht.freq[meta_index] ht = ht.annotate(pop_freq=pop_freq) - ht = ht.filter( - (ht.pop_freq.AC > 1) - & (ht.pop_freq.AN - ht.pop_freq.AC > 1) - & (ht.pop_freq.AF > frequency) - & - # the following filter is needed for "het-only" monomorphic sites - # as otherwise variance is 0, and so, row_correlation errors out - ~((ht.pop_freq.AF == 0.5) & (ht.pop_freq.homozygote_count == 0)) - ) - - return ht 
+ return _filter_call_stats(ht, freq=freq, ac_cutoff=ac_cutoff) def filter_mt_for_ld( - mt, label, pop, common_only: bool = True, re_call_stats: bool = False + mt, + label, + pop, + freq: float = COMMON_FREQ, + re_call_stats: bool = False, + ld_pruned_path: str = None, + ac_cutoff: int = 1, + call_rate_cutoff: float = 0.8, ) -> hl.MatrixTable: """ Filter MT to only variants and samples appropriate for LD analyses @@ -107,17 +125,30 @@ def filter_mt_for_ld( :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. :return: MT filtered to appropriate variants and samples """ + logger.info( + f"With label: {label} and pop: {pop} and freq: {freq} and ac_cutoff: {ac_cutoff} and call_rate_cutoff: {call_rate_cutoff}" + ) - frequency = COMMON_FREQ if common_only else RARE_FREQ - # At the moment, ld_matrix OOMs above ~30M variants, so filtering all populations to 0.05% to cut down on variants - # All work on standard machines except AFR - # Also creating `common_only` ones for most practical uses (above 0.5%) pop_mt = mt.filter_cols(mt.meta.population_inference.pop == pop) meta_index = ( hl.enumerate(pop_mt.freq_meta) .find(lambda f: (f[1].get(label) == pop)) .collect()[0][0] ) + + pop_count_int = pop_mt.count_cols() + logger.info(f"Count1: {pop_mt.count()}") + + # If passed, read in pruned variants and remove them from our analyses + # Do this here to save having to compute on them + # ... 
and nothing here is sample-level, so we can count out + if ld_pruned_path: + logger.info("Filtering out LD pruned variants...") + ld_ht = hl.read_table(ld_pruned_path) + pop_mt = pop_mt.filter_rows(~hl.is_defined(ld_ht[pop_mt.row_key])) + + logger.info(f"Count2: {pop_mt.count()}") + # No support for SV datasets in gnomAD v4 Production team code # Have to re-do callstats when noted frequencies don't 100% reflect what you are calculating on # This is for test sets or 1kg_tdpg @@ -133,21 +164,21 @@ def filter_mt_for_ld( pop_freq = call_stats_bind pop_mt = pop_mt.annotate_rows(pop_freq=pop_freq) + logger.info(f"Count_recallstats: {pop_mt.count()}") + else: - pop_mt = pop_mt.filter_rows((hl.len(pop_mt.filters) == 0)) pop_freq = pop_mt.freq[meta_index] pop_mt = pop_mt.annotate_rows(pop_freq=pop_freq) + logger.info(f"Count_not_recall_stats: {pop_mt.count()}") + + # Annotate callrate to check + pop_mt = pop_mt.annotate_rows(callrate=pop_mt.pop_freq.AN / (pop_count_int * 2)) pop_mt = pop_mt.filter_rows((hl.len(pop_mt.filters) == 0)) - pop_mt = pop_mt.filter_rows( - (pop_mt.pop_freq.AC > 1) - & (pop_mt.pop_freq.AN - pop_mt.pop_freq.AC > 1) - & (pop_mt.pop_freq.AF > frequency) - & - # the following filter is needed for "het-only" monomorphic sites - # as otherwise variance is 0, and so, row_correlation errors out - ~((pop_mt.pop_freq.AF == 0.5) & (pop_mt.pop_freq.homozygote_count == 0)) - ) + pop_ht_rows = _filter_call_stats(pop_mt.rows(), freq=freq, ac_cutoff=ac_cutoff) + pop_mt = pop_mt.filter_rows(hl.is_defined(pop_ht_rows[pop_mt.row_key])) + + logger.info(f"Count_final: {pop_mt.count()}") return pop_mt @@ -157,11 +188,16 @@ def generate_ld_pruned_set( pop_data: dict, data_type: str, r2: str = "0.2", + freq: float = COMMON_FREQ, radius: int = 1000000, overwrite: bool = False, re_call_stats: bool = False, version: str = None, test: bool = False, + ld_contig: str = None, + ac_cutoff: int = 1, + call_rate_cutoff: float = 0.8, + adj: bool = False, ) -> None: """ Generate and 
write set of variants uncorrelated with eachother. Wrapper for hl.ld_prune() @@ -180,24 +216,36 @@ def generate_ld_pruned_set( for label, pops in dict(pop_data).items(): for pop in pops: logger.info(f"Filtering for {pop}...") - pop_mt = filter_mt_for_ld(mt, label, pop, re_call_stats) + pop_mt = filter_mt_for_ld( + mt=mt, + label=label, + pop=pop, + freq=freq, + re_call_stats=re_call_stats, + ld_pruned_path=None, + ac_cutoff=ac_cutoff, + call_rate_cutoff=call_rate_cutoff, + ) logger.info(f"Count after filter_mt_for_ld() : {pop_mt.count()}") - # now we checkpoint - # pop_mt = pop_mt.checkpoint( - # ld_mt_checkpoint_path( - # data_type="genomes", pops=pop, version=version, test=test - # ), - # overwrite=overwrite, - # ) - pruned_ht = hl.ld_prune(pop_mt.GT, r2=float(r2), bp_window_size=radius) - logger.info(f"Count after hl.ld_prune() : {pruned_ht.count()}") + logger.info(f"Count of pruned from hl.ld_prune() : {pruned_ht.count()}") ht = pop_mt.rows().select("pop_freq") ht = ht.filter(hl.is_defined(pruned_ht[ht.key])) - ld_path = ld_pruned_path(data_type, pop, r2, version=version, test=test) + + # Write out pruned set + ld_path = ld_pruned_path( + data_type, + pop, + r2, + version=version, + test=test, + freq=freq, + ld_contig=ld_contig, + adj=adj, + ) logger.info(f"Writing pruned ht for {pop} to {ld_path} ...") ht.write(ld_path, overwrite) @@ -207,12 +255,17 @@ def generate_ld_matrix( pop_data, data_type, radius: int = 1000000, - common_only: bool = True, + freq: float = COMMON_FREQ, adj: bool = False, overwrite: bool = False, re_call_stats: bool = False, version: str = None, test: bool = False, + ld_contig: str = None, + r2: str = "0.2", + custom_suffix: str = None, + ac_cutoff: int = 1, + call_rate_cutoff: float = 0.8, ) -> None: """ Generate and write matrix of LD correlcations as Hail BlockMatrix using hl.ld_matrix. 
Read by generate_ld_scores_from_ld_matrix() @@ -234,16 +287,36 @@ def generate_ld_matrix( for label, pops in dict(pop_data).items(): for pop in pops: - pop_mt = filter_mt_for_ld(mt, label, pop, common_only, re_call_stats) + pop_mt = filter_mt_for_ld( + mt, + label, + pop, + freq=freq, + re_call_stats=re_call_stats, + ac_cutoff=ac_cutoff, + call_rate_cutoff=call_rate_cutoff, + ld_pruned_path=ld_pruned_path( + data_type, + pop, + r2=r2, + version=version, + test=test, + freq=freq, + ld_contig=ld_contig, + adj=adj, + ), + ) pop_mt.rows().select("pop_freq").add_index().write( ld_index_path( data_type=data_type, pop=pop, - common_only=common_only, + freq=freq, adj=adj, version=version, test=test, + ld_contig=ld_contig, + custom_suffix=custom_suffix, ), overwrite, ) @@ -254,10 +327,12 @@ def generate_ld_matrix( ld_matrix_path( data_type, pop, - common_only=common_only, + freq=freq, adj=adj, version=version, test=test, + ld_contig=ld_contig, + custom_suffix=custom_suffix, ), overwrite, ) @@ -266,14 +341,15 @@ def generate_ld_matrix( def generate_ld_scores_from_ld_matrix( pop_data, data_type, - min_frequency=0.01, + freq: float = COMMON_FREQ, call_rate_cutoff=0.8, - common_only: bool = True, adj: bool = False, radius: int = 1000000, overwrite: bool = False, version: str = None, test: bool = False, + ld_contig: str = None, + custom_suffix: str = None, ) -> None: """ Generate LD scores from Hail BlockMatrix written by generate_ld_scores_from_ld_matrix(). @@ -293,11 +369,9 @@ def generate_ld_scores_from_ld_matrix( # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 # mins to compute LD scores per population (~$150 total) - freq_cutoff = COMMON_FREQ if common_only else RARE_FREQ - - logger.info( - f'Min frequency of {min_frequency} is {">=" if min_frequency >= freq_cutoff else "<"} cutoff of {freq_cutoff}...' - ) + # logger.info( + # f'Min frequency of {min_frequency} is {">=" if min_frequency >= freq_cutoff else "<"} cutoff of {freq_cutoff}...' 
+ # ) for label, pops in dict(pop_data).items(): for pop, n in pops.items(): @@ -305,15 +379,20 @@ def generate_ld_scores_from_ld_matrix( ld_index_path( data_type=data_type, pop=pop, - common_only=common_only, + freq=freq, adj=adj, version=version, test=test, + ld_contig=ld_contig, + custom_suffix=custom_suffix, ) ) + + ht = ht.annotate(callrate=(ht.pop_freq.AN / n) / 2) + ht = ht.filter( - (ht.pop_freq.AF >= min_frequency) - & (ht.pop_freq.AF <= 1 - min_frequency) + (ht.pop_freq.AF >= freq) + & (ht.pop_freq.AF <= 1 - freq) & (ht.pop_freq.AN / n >= 2 * call_rate_cutoff) ).add_index(name="new_idx") @@ -323,16 +402,28 @@ def generate_ld_scores_from_ld_matrix( ld_matrix_path( data_type=data_type, pop=pop, - common_only=common_only, + freq=freq, adj=adj, version=version, test=test, + ld_contig=ld_contig, + custom_suffix=custom_suffix, ) ) r2 = r2.filter(indices, indices) ** 2 r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0)) - out_name = ld_scores_path(data_type, pop, adj, version=version, test=test) + out_name = ld_scores_path( + data_type, + pop, + adj, + version=version, + test=test, + freq=freq, + ld_contig=ld_contig, + custom_suffix=custom_suffix, + call_rate_cutoff=call_rate_cutoff, + ) compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) @@ -365,6 +456,152 @@ def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) -> No ht.filter(hl.is_defined(ht.ld_score)).write(out_name, overwrite) +def _ld_test_mt( + mt: hl.MatrixTable, + mt_contig: bool = False, + sample_cols: bool = False, +) -> hl.MatrixTable: + if mt_contig: + logger.info("Filtering to chr22...") + mt = mt.filter_rows(mt.locus.contig == "chr22") + if sample_cols: + logger.info("Downsampling all cols by 0.1...") + mt = mt.sample_cols(0.1) + + return mt + + +def generate_ld_mt( + ld_mt_path: str, + is_hgdp: bool = False, + test: bool = False, + pop: str = "eas", + adj: bool = False, + freq: float = COMMON_FREQ, + mt_contig: str = None, + do_v2_samples: bool = 
False, + hapmap: bool = False, + overwrite=False, +) -> hl.MatrixTable: + + if is_hgdp: + logger.info("Reading in HGDP_TGP Subset...") + mt = hgdp_tgp_subset(dense=True, public=True).mt() + mt = mt.annotate_globals( + freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict + ) + mt = mt.annotate_rows(freq=mt.gnomad_freq) + mt = mt.annotate_cols( + meta=hl.struct( + population_inference=hl.struct(pop=mt.gnomad_population_inference.pop) + ) + ) + + # Filter to appropriate genetic ancestry and frequency b efore checkpointing + mt = mt.filter_cols(mt.meta.population_inference.pop == pop) + mt = mt.filter_rows(mt.freq[mt.freq_index_dict[f"gnomad_{pop}"]].AF > freq) + + if mt_contig: + mt = mt.filter_rows(mt.locus.contig == mt_contig) + + if test: + mt = _ld_test_mt(mt, filter_contig=True, sample_cols=True) + + logger.info( + f"Checkpointing HGDP_TGP Subset on {mt_contig if mt_contig else 'full'}" + ) + mt = mt.checkpoint(ld_mt_path, overwrite=overwrite) + + else: + logger.info("Reading in gnomAD release HT and VDS...") + + ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) + ht_filter = _filter_ht_for_ld(ht_release, freq=freq, label="gen_anc", pop=pop) + if mt_contig: + ht_filter = ht_filter.filter(ht_filter.locus.contig == mt_contig) + + ht_filter = ht_filter.select("freq", "filters").checkpoint(new_temp_file()) + + filter_samples_ht = meta(version="4.0", data_type="genomes").ht() + filter_samples_ht = filter_samples_ht.filter( + filter_samples_ht.population_inference.pop == pop + ) + + if do_v2_samples: + logger.info("Filtering to only v2 samples...") + filter_samples_ht = filter_samples_ht.filter( + (filter_samples_ht.project_meta.v2_pop == pop) + & (filter_samples_ht.project_meta.v2_release) + & (filter_samples_ht.project_meta.v2_high_quality) + ) + + naive_coalesce_partitions = 9800 if mt_contig is None else 1000 + + if mt_contig == "chr1": + naive_coalesce_partitions = 2500 + + if hapmap: + logger.info("Filtering to only 
hapmap variants...") + ht_hm3 = hl.read_table( + f"gs://gnomad-tmp/ld/HM3.UKBB.{pop.upper()}.qc.snplist.grch38.ht" + ) + ht_filter = ht_filter.filter(hl.is_defined(ht_hm3[ht_filter.key])) + logger.info( + f"Count after filtering to hapmap variants: {ht_filter.count()}" + ) + + entries_to_keep = ["GT"] if not adj else ["GT", "GQ", "DP", "AD"] + + vds = basics.get_gnomad_v4_genomes_vds( + test=test, + annotate_meta=True, + release_only=True, + split=True, + chrom=[mt_contig] if mt_contig else None, + filter_variant_ht=ht_filter, + naive_coalesce_partitions=naive_coalesce_partitions, + entries_to_keep=entries_to_keep, + filter_samples_ht=filter_samples_ht, + ) + + logger.info("From DENSE MT, performance beware...") + mt = hl.vds.to_dense_mt(vds) + + logger.info(f"Filtering to relevant cols - only meta - and no VDS/MT rows...") + mt = mt.select_cols(mt.meta) + mt = mt.select_rows() + + # NOTE: QUESTION IF THIS SHOULD BE RAN OR NOT + # NOTE: need to checkpoint either here or step below where noted + logger.info( + f"Checkpointing unannotated {pop} on {mt_contig if mt_contig else 'full'}" + ) + mt = mt.checkpoint(new_temp_file()) + + if adj: + logger.info("Filtering to adj sites only...") + adj_expr = get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD) + mt = mt.filter_entries(adj_expr) + mt = mt.select_entries("GT") + + # logger.info('CUSTOM READ FROM PRIOR CHECKPOINT..') + # mt = hl.read_matrix_table('gs://gnomad-tmp-4day/ld_tmp//XLmYz1uA7NYEY2jgB55Yuc') + # This part begins potential performance concerns - what is up here? + # Ideally, would checkpoint both of these beforehand. + # Is very slow if MT isn't checkpointed + # Checkpointing said MT is also very slow... 
+ logger.info("Annotating information from release HT...") + mt = mt.annotate_rows( + freq=ht_filter[mt.row_key].freq, filters=ht_filter[mt.row_key].filters + ) + mt = mt.annotate_globals(**hl.eval(ht_filter.globals)) + + logger.info("Checkpointing annotated MT...") + mt = mt.checkpoint(ld_mt_path, overwrite=overwrite) + + return mt + + def main(args): hl.init( log="/ld_assessment.log", @@ -380,10 +617,34 @@ def main(args): # Define command line args used more than once test = args.test + adj = args.adj is_hgdp = args.hgdp_subset overwrite = args.overwrite pop = args.pop - chrom = args.chrom + custom_mt = args.custom_mt + do_v2_samples = args.do_v2_samples + hapmap = args.do_hapmap + + # Chromosome filter logic + mt_contig = args.mt_contig + ld_contig = args.ld_contig + + if test: + mt_contig = "chr22" + + if mt_contig: + ld_contig = mt_contig + + # Get correct AF based on input logic... + vds_freq = args.vds_af + + ld_freq = args.ld_af + ld_ac = args.ld_ac + + if vds_freq < ld_freq: + pass # this is the desired behavior, for the vds freq to be more permisive + if ld_freq < vds_freq: + ld_freq = vds_freq if test and not args.re_call_stats: logger.info( @@ -391,179 +652,117 @@ def main(args): ) # Version is populated via ld_resources.py if None - version = None if not is_hgdp else "hgdp" + version = None + if is_hgdp: + version = "hgdp" + elif do_v2_samples: + version = "v2samples" + + if version: + version += "hapmap" + else: + version = "hapmap" - # Path for LD MT includes all pops by design ld_mt_path = ld_mt_checkpoint_path( - data_type="genomes", pops=pop, version=version, test=test + data_type="genomes", + freq=vds_freq, + pop=pop, + version=version, + test=test, + mt_contig=mt_contig, + adj=adj, ) - def _ld_test_mt( - mt: hl.MatrixTable, - filter_contig: bool = False, - sample_cols: bool = False, - ): - if filter_contig: - logger.info("Filtering to chr22...") - mt = mt.filter_rows(mt.locus.contig == "chr22") - if sample_cols: - logger.info("Downsampling all 
cols by 0.1...") - mt = mt.sample_cols(0.1) - - return mt - - def _generate_ld_mt( - version: str = version, - is_hgdp: bool = False, - test: bool = False, - ld_mt_path: str = ld_mt_path, - pop: str = "eas", - overwrite=overwrite, - ) -> hl.MatrixTable: - - if is_hgdp: - logger.info("Reading in HGDP_TGP Subset...") - mt = hgdp_tgp_subset(dense=True, public=True).mt() - mt = mt.annotate_globals( - freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict - ) - mt = mt.annotate_rows(freq=mt.gnomad_freq) - mt = mt.annotate_cols( - meta=hl.struct( - population_inference=hl.struct( - pop=mt.gnomad_population_inference.pop - ) - ) - ) - - mt = mt.filter_cols(mt.meta.population_inference.pop == "pop") - - if test: - mt = _ld_test_mt(mt, filter_contig=True, sample_cols=True) - - mt = mt.checkpoint( - ld_mt_path, - overwrite=overwrite, - ) - + if args.generate_ld_mt: + mt = generate_ld_mt( + is_hgdp=is_hgdp, + test=test, + pop=pop, + mt_contig=mt_contig, + freq=vds_freq, + ld_mt_path=ld_mt_path, + overwrite=overwrite, + do_v2_samples=do_v2_samples, + adj=adj, + hapmap=hapmap, + ) + else: + if not custom_mt: + mt = hl.read_matrix_table(ld_mt_path) else: - logger.info("Reading in gnomAD release HT and VDS...") - filter_chrom = None - if test: - filter_chrom = "chr22" - elif chrom: - filter_chrom = chrom + mt = hl.read_matrix_table(custom_mt) - ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) - ht_filter = _filter_ht_for_ld(ht_release,label='gen_anc',pop=pop) - if filter_chrom: - ht_filter = ht_filter.filter(ht_filter.locus.contig==filter_chrom) - - ht_filter = ht_filter.select('freq','filters').checkpoint(new_temp_file()) - - vds = basics.get_gnomad_v4_genomes_vds( + label = "gen_anc" if not is_hgdp else "pop" + pop_data = get_pop_counters( + mt, label=label + ) # NOTE: could be irrelevant later? 
tbd if needed + + contig_list = EXPECTED_CONTIGS if not ld_contig else [ld_contig] + + for ld_contig in contig_list: + # NOTE: This FULLY serializes the code, but hopefully it works + # Would we save time without so many separate filter_mt_for_ld calls ? + # The only point of concern is hl.ld_prune() and the size of the matrix + # Need to filter the mt to only the contig we are working on + # This is because the LD matrix is too large otherwise + mt = mt.filter_rows(mt.locus.contig == ld_contig) + logger.info(f"Filtering to {ld_contig}...") + + # Previously established that THIS needs a checkpoint to have been ran to work effectively + if args.generate_ld_pruned_set: + logger.info( + "Generating LD Pruned Set of uncorrelated variants, using hl.ld_prune()..." + ) + generate_ld_pruned_set( + mt=mt, + pop_data=pop_data, + data_type=data_type, + r2=args.r2, + freq=ld_freq, + ac_cutoff=ld_ac, + radius=args.radius, + overwrite=overwrite, + re_call_stats=args.re_call_stats, + version=version, test=test, - annotate_meta=True, - release_only=True, - split=True, - chrom=filter_chrom, - filter_variant_ht=ht_filter, - entries_to_keep=["GT"], + ld_contig=ld_contig, + adj=adj, ) - if test or chrom: - logger.info( - f"Due to Test or Chrom behavior, only working on {filter_chrom} - do naive coalesce to 1,000 partitions" - ) - mt = vds.variant_data.naive_coalesce(1000) - else: - mt = vds.variant_data.naive_coalesce(9800) - + if args.generate_ld_matrix: logger.info( - f"Filtering to relevant cols - only meta - and no VDS/MT rows..." + "Generating Hail BlockMatrix of variants correlations to other variants, using hl.ld_matrix()..." 
) - mt = mt.select_cols(mt.meta) - mt = mt.select_rows() - logger.info(f"Filtering to {args.pop}...") - mt = mt.filter_cols(mt.meta.population_inference.pop == pop) - - # NOTE: QUESTION IF THIS SHOULD BE RAN OR NOT - logger.info(f"Checkpointing {pop} on {chrom if chrom else default_str}") - mt = mt.checkpoint(new_temp_file()) - - # This part begins potential performance concerns - what is up here? - # Ideally, would checkpoint both of these beforehand. - # Is very slow if MT isn't checkpointed - # Checkpointing said MT is also very slow... - mt = mt.annotate_rows( - freq=ht_filter[mt.row_key].freq, filters=ht_filter[mt.row_key].filters + generate_ld_matrix( + mt=mt, + pop_data=pop_data, + data_type=data_type, + radius=args.radius, + freq=ld_freq, + ac_cutoff=ld_ac, + adj=args.adj, + overwrite=overwrite, + re_call_stats=args.re_call_stats, + version=version, + test=test, + ld_contig=ld_contig, + custom_suffix=args.custom_suffix, ) - mt = mt.annotate_globals(**hl.eval(ht_filter.globals)) - - # Select and checkpoint to speed up calculations - - # logger.info(f"DO NOT Checkpoint MT to {ld_mt_path} ...") - # mt = mt.checkpoint(ld_mt_path, overwrite=overwrite) - - return mt - - mt = _generate_ld_mt( - version=version, is_hgdp=is_hgdp, test=test, ld_mt_path=ld_mt_path - ) - - # Something needs checkpointed before heading into generate_ld_pruned_set() - mt = mt.checkpoint(new_temp_file()) # skip if prior step is being checkpointed - - label = "gen_anc" if not is_hgdp else "pop" - pop_data = get_pop_counters(mt, label=label) - # Previously established that THIS needs a checkpoint to have been ran to work effectively - if args.generate_ld_pruned_set: - logger.info( - "Generating LD Pruned Set of uncorrelated variants, using hl.ld_prune()..." 
- ) - generate_ld_pruned_set( - mt=mt, - pop_data=pop_data, - data_type=data_type, - r2=args.r2, - radius=args.radius, - overwrite=overwrite, - re_call_stats=args.re_call_stats, - version=version, - test=test, - ) - - if args.generate_ld_matrix: - logger.info( - "Generating Hail BlockMatrix of variants correlations to other variants, using hl.ld_matrix()..." - ) - generate_ld_matrix( - mt=mt, - pop_data=pop_data, - data_type=data_type, - radius=args.radius, - common_only=args.common_only, - adj=args.adj, - overwrite=overwrite, - re_call_stats=args.re_call_stats, - version=version, - test=test, - ) - - if args.generate_ld_scores: - logger.info("Generating in LD scores and annotating onto variant HT...") - generate_ld_scores_from_ld_matrix( - pop_data=pop_data, - data_type=data_type, - min_frequency=args.min_frequency, - call_rate_cutoff=args.min_call_rate, - common_only=args.common_only, - adj=args.adj, - version=version, - overwrite=overwrite, - test=test, - ) + if args.generate_ld_scores: + logger.info("Generating in LD scores and annotating onto variant HT...") + generate_ld_scores_from_ld_matrix( + pop_data=pop_data, + data_type=data_type, + freq=ld_freq, + call_rate_cutoff=args.min_call_rate, + adj=args.adj, + version=version, + overwrite=overwrite, + test=test, + ld_contig=ld_contig, + custom_suffix=args.custom_suffix, + ) if __name__ == "__main__": @@ -575,7 +774,7 @@ def _generate_ld_mt( ) parser.add_argument( "--generate-ld-mt", - help="Generate Hail MatrixTable of variants and calls to calculate LD for. ", + help="Generate pop-specific Hail MatrixTable of variants and calls to calculate LD for. 
", action="store_true", ) parser.add_argument( @@ -587,9 +786,10 @@ def _generate_ld_mt( "--generate-ld-matrix", help="Calculates LD matrix", action="store_true" ) parser.add_argument( - "--common-only", - help="Calculates LD matrix only on common variants (above 0.5%)", - action="store_true", + "--vds-af", + help="Minimum genetic ancestry-specific allele frequency to pull from for VDS.", + type=float, + default=RARE_FREQ, ) parser.add_argument( "--adj", @@ -601,18 +801,24 @@ def _generate_ld_mt( help="Calculates LD scores from LD matrix", action="store_true", ) - parser.add_argument( - "--min-frequency", - help="Minimum allele frequency to compute LD scores (default 0.01)", - default=0.01, - type=float, - ) parser.add_argument( "--min-call-rate", help="Minimum call rate to compute LD scores (default 0.8)", default=0.8, type=float, ) + parser.add_argument( + "--ld-ac", + help="Minimum allele count to compute LD scores (default 1).", + default=1, + type=int, + ) + parser.add_argument( + "--ld-af", + help="Minimum allele frequency to compute LD scores (default 0.0005).", + default=RARE_FREQ, + type=float, + ) parser.add_argument( "--r2", help="r-squared to which to prune LD (default 0.2)", default="0.2" ) @@ -637,19 +843,48 @@ def _generate_ld_mt( ) parser.add_argument( "--pop", - help="Which individual pop to run LD on", + help="Which individual pop to run LD on.", type=str, required=True, ) parser.add_argument( - "--chrom", - help="Which individual chromosome to run LD on", + "--mt-contig", + help="Which chromosome to pull VDS from. Applies for reading and writing.", + type=str, + required=False, + ) + parser.add_argument( + "--ld-contig", + help="Which chromosome to run LD on. Applies for reading and writing.", type=str, required=False, ) parser.add_argument( "--slack-channel", help="Slack channel to post results and notifications to." 
) + parser.add_argument( + "--custom-mt", + type=str, + default=None, + help="Custom MT to run LD on", + ) + parser.add_argument( + "--do-v2-samples", + action="store_true", + help="Only use samples also in v2. Good for verification!", + ) + parser.add_argument( + "--custom-suffix", + type=str, + help="Custom string to append to output files", + required=False, + ) + parser.add_argument( + "--do-hapmap", + action="store_true", + help="Filter to HapMap3 SNPs when pulling VDS.", + ) + parser.add_argument("--overwrite", help="Overwrite data", action="store_true") args = parser.parse_args() diff --git a/gnomad_qc/v4/resources/ld_resources.py b/gnomad_qc/v4/resources/ld_resources.py index db0a880b5..290820578 100644 --- a/gnomad_qc/v4/resources/ld_resources.py +++ b/gnomad_qc/v4/resources/ld_resources.py @@ -4,62 +4,88 @@ from gnomad.resources.grch38.gnomad import CURRENT_EXOME_RELEASE, CURRENT_GENOME_RELEASE +COMMON_FREQ = 0.005 +RARE_FREQ = 0.0005 + def ld_matrix_path( data_type: str, pop: str, - common_only: bool = True, + freq: float = COMMON_FREQ, adj: bool = True, + ld_contig: str = None, version: Optional[str] = None, - test: bool = False + test: bool = False, + custom_suffix: str = None, ): if version is None: version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.bm' + return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.{version}{f".test" if test else ""}.{"all_contigs" if not ld_contig else f"{ld_contig}"}.{pop}.{f"af{freq}"}.{"adj." 
if adj else ""}ld{custom_suffix if custom_suffix else ""}.bm' def ld_index_path( data_type: str, pop: str, - common_only: bool = True, + freq: float = COMMON_FREQ, adj: bool = True, + ld_contig: str = None, version: Optional[str] = None, - test: bool = False + test: bool = False, + custom_suffix: str = None, ): if version is None: version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.variant_indices.ht' + return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.{version}{f".test" if test else ""}.{"all_contigs" if not ld_contig else f"{ld_contig}"}.{pop}.{f"af{freq}"}.{"adj." if adj else ""}ld.variant_indices{custom_suffix if custom_suffix else ""}.ht' # def ld_scores_path( - data_type: str, pop: str, adj: bool = True, version: Optional[str] = None, test: bool = False + data_type: str, + pop: str, + adj: bool = True, + ld_contig: str = None, + freq: float = COMMON_FREQ, + version: Optional[str] = None, + test: bool = False, + custom_suffix: str = None, + call_rate_cutoff: float = 0.8, ): if version is None: version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.{"adj." if adj else ""}ld_scores.ht' + return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.{version}{f".test" if test else ""}.{"all_contigs" if not ld_contig else f"{ld_contig}"}.{pop}.{f"af{freq}"}.{f"callrate{call_rate_cutoff}"}.{"adj." 
if adj else ""}ld_scores{f"{custom_suffix}" if custom_suffix else ""}.ht' def ld_mt_checkpoint_path( data_type: str, - pops: str = None, + freq: float = COMMON_FREQ, + pop: str = None, version: str = CURRENT_GENOME_RELEASE, + mt_contig: str = None, test: bool = False, + adj: bool = False, ): if version is None: version = CURRENT_GENOME_RELEASE - return f'gs://gnomad-tmp-30day/ld/gnomad.{data_type}.{"all_pops" if not pops else f"{pops}"}.{version}{f".test" if test else ""}.mt' + return f'gs://gnomad-tmp-30day/ld/gnomad.{data_type}.{"all_pops" if not pop else f"{pop}"}.{f"af{freq}"}.{"all_contigs" if not mt_contig else f"{mt_contig}"}.{version}{f".adj" if adj else ""}{f".test" if test else ""}.mt' def ld_pruned_path( - data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE, test: bool = False + data_type: str, + pop: str, + r2: str, + freq: float = COMMON_FREQ, + ld_contig: str = None, + version: str = CURRENT_GENOME_RELEASE, + test: bool = False, + ld_set: bool = False, + adj: bool = False, ): if version is None: version = CURRENT_GENOME_RELEASE - return f'gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.ld.pruned_set.r2_{r2}.ht' + return f'gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.{version}{f".test" if test else ""}.{f"af{freq}"}.{"all_contigs" if not ld_contig else f"{ld_contig}"}.{pop}.ld.{f"pruned_set" if not ld_set else "ld_set"}{f".adj" if adj else ""}.r2_{r2}.ht' From 691b531ea241783e35639421ecb9e87cc3a9a6fd Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Tue, 8 Apr 2025 13:42:49 -0400 Subject: [PATCH 12/13] VDS checkpointing & linalg --- gnomad_qc/v4/assessment/generate_ld_data.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 6b2f9abf8..bd97a1a44 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -442,7 
+442,7 @@ def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) -> No l2row = r2_adj.sum(axis=0).T l2col = r2_adj.sum(axis=1) - l2 = l2row + l2col + 1 + l2 = l2row + l2col - 1 l2_bm_tmp = new_temp_file() l2_tsv_tmp = new_temp_file() @@ -564,6 +564,9 @@ def generate_ld_mt( filter_samples_ht=filter_samples_ht, ) + vds.variant_data = vds.variant_data.checkpoint(new_temp_file()) + vds.reference_data = vds.reference_data.checkpoint(new_temp_file()) + logger.info("From DENSE MT, performance beware...") mt = hl.vds.to_dense_mt(vds) @@ -576,7 +579,6 @@ def generate_ld_mt( logger.info( f"Checkpointing unannotated {pop} on {mt_contig if mt_contig else 'full'}" ) - mt = mt.checkpoint(new_temp_file()) if adj: logger.info("Filtering to adj sites only...") @@ -584,6 +586,8 @@ def generate_ld_mt( mt = mt.filter_entries(adj_expr) mt = mt.select_entries("GT") + # mt = mt.checkpoint(new_temp_file()) + # logger.info('CUSTOM READ FROM PRIOR CHECKPOINT..') # mt = hl.read_matrix_table('gs://gnomad-tmp-4day/ld_tmp//XLmYz1uA7NYEY2jgB55Yuc') # This part begins potential performance concerns - what is up here? From f6b92fb7b302fc26eb8ec55b5ee500f7432a625f Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Thu, 8 May 2025 12:08:05 -0400 Subject: [PATCH 13/13] Small fixes, cleaning up --- gnomad_qc/v4/assessment/generate_ld_data.py | 48 +++++++++------------ 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index bd97a1a44..fc1b692c9 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -1,11 +1,7 @@ -""" +""" Script to generate LD scores for gnomAD v4 Genome SNPs & Indels. 
Example usage: -hailctl dataproc submit cluster_name generate_ld_data.py --test --overwrite --hgdp-subset --pop afr --re-call-stats \ - --generate-ld-mt \ - --generate-ld-pruned-set \ - --generate-ld-matrix \ - --generate-ld-scores +hailctl dataproc submit dmld10 ~/Documents/GitHub/gnomad_qc/gnomad_qc/v4/assessment/generate_ld_data.py --generate-ld-mt --custom-suffix alleassample --pop eas --generate-ld-pruned-set --generate-ld-matrix --generate-ld-scores --adj --overwrite """ import argparse @@ -281,6 +277,7 @@ def generate_ld_matrix( :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. :param test: Filter to test versions and/or write to test paths. """ + # From gnomAD v2 run: # Takes about 4 hours on 20 n1-standard-8 nodes (with SSD - not sure if necessary) per population # Total of ~37 hours ($400) @@ -365,6 +362,7 @@ def generate_ld_scores_from_ld_matrix( :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. :param test: Filter to test versions and/or write to test paths. 
""" + # From gnomAD v2 run: # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 # mins to compute LD scores per population (~$150 total) @@ -564,7 +562,14 @@ def generate_ld_mt( filter_samples_ht=filter_samples_ht, ) + logger.info("New checkpointing logic...") vds.variant_data = vds.variant_data.checkpoint(new_temp_file()) + + # From a prior run, where it failed (or was failed) after writing the Variant Data, it can be read back in as follows: + # with args: hailctl dataproc submit dmld50 generate_ld_data.py --overwrite --pop eas --ld-contig chr22 --adj --generate-ld-mt --generate-ld-pruned-set --generate-ld-matrix --generate-ld-scores --custom-suffix alleassamples + # READ VARIANT DATA FROM THIS + # vds.variant_data = hl.read_matrix_table('gs://gnomad-tmp-4day/ld_tmp/Sms092zqq8FbN9QXweJnzA') # FOR EAS RUN + vds.reference_data = vds.reference_data.checkpoint(new_temp_file()) logger.info("From DENSE MT, performance beware...") @@ -574,26 +579,12 @@ def generate_ld_mt( mt = mt.select_cols(mt.meta) mt = mt.select_rows() - # NOTE: QUESTION IF THIS SHOULD BE RAN OR NOT - # NOTE: need to checkpoint either here or step below where noted - logger.info( - f"Checkpointing unannotated {pop} on {mt_contig if mt_contig else 'full'}" - ) - if adj: logger.info("Filtering to adj sites only...") adj_expr = get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD) mt = mt.filter_entries(adj_expr) mt = mt.select_entries("GT") - # mt = mt.checkpoint(new_temp_file()) - - # logger.info('CUSTOM READ FROM PRIOR CHECKPOINT..') - # mt = hl.read_matrix_table('gs://gnomad-tmp-4day/ld_tmp//XLmYz1uA7NYEY2jgB55Yuc') - # This part begins potential performance concerns - what is up here? - # Ideally, would checkpoint both of these beforehand. - # Is very slow if MT isn't checkpointed - # Checkpointing said MT is also very slow... 
logger.info("Annotating information from release HT...") mt = mt.annotate_rows( freq=ht_filter[mt.row_key].freq, filters=ht_filter[mt.row_key].filters @@ -646,7 +637,7 @@ def main(args): ld_ac = args.ld_ac if vds_freq < ld_freq: - pass # this is the desired behavior, for the vds freq to be more permisive + pass if ld_freq < vds_freq: ld_freq = vds_freq @@ -662,10 +653,11 @@ def main(args): elif do_v2_samples: version = "v2samples" - if version: - version += "hapmap" - else: - version = "hapmap" + if hapmap == True: + if version: + version += "hapmap" + else: + version = "hapmap" ld_mt_path = ld_mt_checkpoint_path( data_type="genomes", @@ -709,8 +701,8 @@ def main(args): # The only point of concern is hl.ld_prune() and the size of the matrix # Need to filter the mt to only the contig we are working on # This is because the LD matrix is too large otherwise - mt = mt.filter_rows(mt.locus.contig == ld_contig) logger.info(f"Filtering to {ld_contig}...") + mt_ld = mt.filter_rows(mt.locus.contig == ld_contig) # Previously established that THIS needs a checkpoint to have been ran to work effectively if args.generate_ld_pruned_set: @@ -718,7 +710,7 @@ def main(args): "Generating LD Pruned Set of uncorrelated variants, using hl.ld_prune()..." ) generate_ld_pruned_set( - mt=mt, + mt=mt_ld, pop_data=pop_data, data_type=data_type, r2=args.r2, @@ -738,7 +730,7 @@ def main(args): "Generating Hail BlockMatrix of variants correlations to other variants, using hl.ld_matrix()..." ) generate_ld_matrix( - mt=mt, + mt=mt_ld, pop_data=pop_data, data_type=data_type, radius=args.radius,