From e1b4dc8f7bf49dc5ec023c97bf6839d96b0cc1c7 Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Wed, 11 Sep 2024 17:05:39 -0400 Subject: [PATCH 01/13] Create files for running LD on gnomAD v4 SNVs/Indels --- gnomad_qc/v4/assessment/generate_ld_data.py | 339 ++++++++++++++++++++ gnomad_qc/v4/resources/ld_resources.py | 62 ++++ 2 files changed, 401 insertions(+) create mode 100644 gnomad_qc/v4/assessment/generate_ld_data.py create mode 100644 gnomad_qc/v4/resources/ld_resources.py diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py new file mode 100644 index 000000000..983444bae --- /dev/null +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -0,0 +1,339 @@ +import argparse +import sys + +import gnomad.resources.grch37.gnomad_ld as ld_resources +from gnomad.utils.slack import slack_notifications +from hail.linalg import BlockMatrix +from hail.utils import new_temp_file +# from hail.utils.java import Env + +from gnomad_qc.slack_creds import slack_token +from gnomad_qc.v4.resources import * +from gnomad_qc.v3.resources.release import hgdp_tgp_subset +# this includes ld_resources + +HGDP_TGP_RELEASES + +COMMON_FREQ = 0.005 +RARE_FREQ = 0.0005 + +def get_pop_and_subpop_counters(mt): + cut_dict = { + "pop": hl.agg.filter( + hl.is_defined(mt.meta.pop) & (mt.meta.pop != "oth"), + hl.agg.counter(mt.meta.pop), + ), + } + cut_data = mt.aggregate_cols(hl.struct(**cut_dict)) + return cut_data + + +def filter_mt_for_ld( + mt, label, pop, common_only: bool = True, re_call_stats: bool = False +): + frequency = COMMON_FREQ if common_only else RARE_FREQ + # At the moment, ld_matrix OOMs above ~30M variants, so filtering all populations to 0.05% to cut down on variants + # All work on standard machines except AFR + # Also creating `common_only` ones for most practical uses (above 0.5%) + pop_mt = mt.filter_cols(mt.meta[label] == pop) + meta_index = ( + hl.enumerate(pop_mt.freq_meta) + .find(lambda f: (f[1].get(label) == pop)) + 
.collect()[0][0] + ) + # No support for SV datasets in gnomAD v4 Production team code + # Have to re-do callstats when noted frequencies don't 100% reflect what you are calculating on + # This is for test sets or 1kg_tdpg + if re_call_stats: + call_stats = hl.agg.call_stats(pop_mt.GT, pop_mt.alleles) + call_stats_bind = hl.bind( + lambda cs: cs.annotate( + AC=cs.AC[1], AF=cs.AF[1], homozygote_count=cs.homozygote_count[1] + ), + call_stats, + ) + pop_freq = call_stats_bind + else: + pop_mt = pop_mt.filter_rows((hl.len(pop_mt.filters) == 0)) + pop_freq = pop_mt.freq[meta_index] + + pop_mt = pop_mt.filter_rows((hl.len(pop_mt.filters) == 0)) + pop_mt = pop_mt.annotate_rows(pop_freq=pop_freq) + pop_mt = pop_mt.filter_rows( + (pop_mt.pop_freq.AC > 1) + & (pop_mt.pop_freq.AN - pop_mt.pop_freq.AC > 1) + & (pop_mt.pop_freq.AF > frequency) + & + # the following filter is needed for "het-only" monomorphic sites + # as otherwise variance is 0, and so, row_correlation errors out + ~((pop_mt.pop_freq.AF == 0.5) & (pop_mt.pop_freq.homozygote_count == 0)) + ) + + return pop_mt + + +def generate_ld_pruned_set( + mt: hl.MatrixTable, + pop_data: dict, + data_type: str, + r2: str = "0.2", + radius: int = 1000000, + overwrite: bool = False, +): + for label, pops in dict(pop_data).items(): + for pop in pops: + pop_mt = filter_mt_for_ld(mt, label, pop) + + pruned_ht = hl.ld_prune(pop_mt.GT, r2=float(r2), bp_window_size=radius) + + ht = pop_mt.rows().select("pop_freq") + ht = ht.filter(hl.is_defined(pruned_ht[ht.key])) + ht.write(ld_pruned_path(data_type, pop, r2), overwrite) + + +def generate_ld_matrix( + mt, + pop_data, + data_type, + radius: int = 1000000, + common_only: bool = True, + adj: bool = False, + overwrite: bool = False, +): + # Takes about 4 hours on 20 n1-standard-8 nodes (with SSD - not sure if necessary) per population + # Total of ~37 hours ($400) + for label, pops in dict(pop_data).items(): + for pop in pops: + if data_type == "genomes_snv_sv" and pop not in 
SNV_SV_POPS: + continue + pop_mt = filter_mt_for_ld( + mt, label, pop, common_only, sv_dataset=data_type == "genomes_snv_sv" + ) + + pop_mt.rows().select("pop_freq").add_index().write( + ld_resources._ld_index_path(data_type, pop, common_only, adj), overwrite + ) + ld = hl.ld_matrix(pop_mt.GT.n_alt_alleles(), pop_mt.locus, radius) + if data_type != "genomes_snv_sv": + ld = ld.sparsify_triangle() + ld.write( + ld_resources._ld_matrix_path(data_type, pop, common_only, adj), + overwrite, + ) + + + + + +def generate_ld_scores_from_ld_matrix( + pop_data, + data_type, + min_frequency=0.01, + call_rate_cutoff=0.8, + adj: bool = False, + radius: int = 1000000, + overwrite=False, +): + # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR + # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 + # mins to compute LD scores per population (~$150 total) + for label, pops in dict(pop_data).items(): + for pop, n in pops.items(): + ht = hl.read_table(ld_resources._ld_index_path(data_type, pop, adj=adj)) + ht = ht.filter( + (ht.pop_freq.AF >= min_frequency) + & (ht.pop_freq.AF <= 1 - min_frequency) + & (ht.pop_freq.AN / n >= 2 * call_rate_cutoff) + ).add_index(name="new_idx") + + indices = ht.idx.collect() + + r2 = BlockMatrix.read( + ld_resources._ld_matrix_path( + data_type, pop, min_frequency >= COMMON_FREQ, adj=adj + ) + ) + r2 = r2.filter(indices, indices) ** 2 + r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0)) + + out_name = ld_resources._ld_scores_path(data_type, pop, adj) + compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) + + + + + +def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite): + starts_and_stops = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False) + r2_adj = r2_adj._sparsify_row_intervals_expr(starts_and_stops, blocks_only=False) + + l2row = r2_adj.sum(axis=0).T + l2col = r2_adj.sum(axis=1) + l2 = l2row + l2col + 1 + 
l2_bm_tmp = new_temp_file() + l2_tsv_tmp = new_temp_file() + + l2.write(l2_bm_tmp, force_row_major=True) + BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp) + + ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True) + ht_scores = ht_scores.add_index().rename({"f0": "ld_score"}) + ht_scores = ht_scores.key_by("idx") + ht = ht.annotate(**ht_scores[ht.new_idx]).select_globals() + ht.filter(hl.is_defined(ht.ld_score)).write(out_name, overwrite) + + +def main(args): + hl.init(log="/ld_annotations.log",tmp_dir="gs://gnomad-tmp-4day/ld_tmp/") + hl._set_flags(use_new_shuffle="1") + + # Only run on genomes so far, no choice to run on exomes, too sparse + data_type = "genomes" + if args.exomes: + raise NotImplementedError( + "LD Code does not work for exomes yet. Too sparse:(" + ) + + + + if args.hgdp_subset: + mt = hgdp_tgp_subset(dense=True,public=True).mt() + if args.test: + mt = mt.filter_rows(mt.locus.contig=="chr22") + mt = mt.annotate_globals( + freq_meta = mt.hgdp_tgp_freq_meta, + freq_index_dict = mt.hgdp_tgp_freq_index_dict + ) + + mt = mt.annotate_rows( + freq = mt.hgdp_tgp_freq + ) + + mt = mt.annotate_cols( + pop = mt.hgdp_tgp_meta.population + ) + else: + ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) + vds = basics.get_gnomad_v4_genomes_vds(test=args.test,annotate_meta=True,release_only=True,split=True) + mt_vds = vds.variant_data + mt = mt_vds.annotate( + **hl.eval(v4_release_ht.globals) + ) + + mt = mt.annotate( + pop = mt.meta.population_inference.pop + ) + + + + pop_data = get_pop_and_subpop_counters(mt) + + if args.generate_ld_pruned_set: + generate_ld_pruned_set( + mt, pop_data, data_type, args.r2, args.radius, args.overwrite + ) + + if args.generate_ld_matrix: + generate_ld_matrix( + mt, + pop_data, + data_type, + args.radius, + args.common_only, + args.adj, + args.overwrite, + ) + + if args.generate_ld_scores: + generate_ld_scores_from_ld_matrix( + pop_data, + data_type, + args.min_frequency, + args.min_call_rate, + 
args.adj, + overwrite=args.overwrite, + ) + + if args.cross_pop_ld_scores: + generate_all_cross_pop_ld_scores( + pop_data, + data_type, + args.min_frequency, + args.min_call_rate, + args.adj, + overwrite=args.overwrite, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--exomes", + help="Input MT is exomes. One of --exomes or --genomes is required.", + action="store_true", + ) + parser.add_argument( + "--generate_ld_pruned_set", + help="Calculates LD pruned set of variants", + action="store_true", + ) + parser.add_argument( + "--generate_ld_matrix", help="Calculates LD matrix", action="store_true" + ) + parser.add_argument( + "--common_only", + help="Calculates LD matrix only on common variants (above 0.5%)", + action="store_true", + ) + parser.add_argument( + "--adj", + help="Calculates LD matrix only on using high-quality genotypes", + action="store_true", + ) + parser.add_argument( + "--generate_ld_scores", + help="Calculates LD scores from LD matrix", + action="store_true", + ) + parser.add_argument( + "--min_frequency", + help="Minimum allele frequency to compute LD scores (default 0.01)", + default=0.01, + type=float, + ) + parser.add_argument( + "--min_call_rate", + help="Minimum call rate to compute LD scores (default 0.8)", + default=0.8, + type=float, + ) + parser.add_argument( + "--r2", help="r-squared to which to prune LD (default 0.2)", default="0.2" + ) + parser.add_argument( + "--radius", + help="Radius at which to calculate LD information (bp; default 1e6)", + default=1000000, + type=int, + ) + parser.add_argument( + "--test", + help="Use test dataset for whichever callset requested.", + action="store_true" + ) + parser.add_argument( + "--hgdp_subset", + help="Use hgdp dataset for callset.", + action="store_true" + ) + parser.add_argument( + "--slack_channel", help="Slack channel to post results and notifications to." 
+ ) + parser.add_argument("--overwrite", help="Overwrite data", action="store_true") + args = parser.parse_args() + + if args.slack_channel: + with slack_notifications(slack_token, args.slack_channel): + main(args) + else: + main(args) diff --git a/gnomad_qc/v4/resources/ld_resources.py b/gnomad_qc/v4/resources/ld_resources.py new file mode 100644 index 000000000..b98ca649c --- /dev/null +++ b/gnomad_qc/v4/resources/ld_resources.py @@ -0,0 +1,62 @@ +"""Script containing annotation related resources.""" + +from typing import Optional + +from gnomad.resources.grch38.gnomad import CURRENT_EXOME_RELEASE, CURRENT_GENOME_RELEASE +from gnomad.resources.resource_utils import ( + GnomadPublicBlockMatrixResource, + GnomadPublicTableResource, +) + +def ld_matrix_path( + data_type: str, + pop: str, + common_only: bool = True, + adj: bool = True, + version: Optional[str] = None, +): + if version is None: + version = ( + CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE + ) + return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.r{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.bm' + +def ld_index_path( + data_type: str, + pop: str, + common_only: bool = True, + adj: bool = True, + version: Optional[str] = None, +): + if version is None: + version = ( + CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE + ) + return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.r{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.variant_indices.ht' +# +def ld_scores_path( + data_type: str, pop: str, adj: bool = True, version: Optional[str] = None +): + if version is None: + version = ( + CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE + ) + return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.r{version}.{pop}.{"adj." 
if adj else ""}ld_scores.ht' + + +def ld_pruned_path(data_type: str, pop: str, r2: str, version: str = CURRENT_RELEASE): + return f"gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.r{version}.{pop}.ld.pruned_set.r2_{r2}.ht" + +def ld_matrix(pop: str) -> GnomadPublicBlockMatrixResource: + """Get resource for the LD matrix for the given population.""" + return GnomadPublicBlockMatrixResource(path=ld_matrix_path("genomes", pop)) + + +def ld_index(pop: str) -> GnomadPublicTableResource: + """Get resource for the LD indices for the given population.""" + return GnomadPublicTableResource(path=ld_index_path("genomes", pop)) + + +def ld_scores(pop: str) -> GnomadPublicTableResource: + """Get resource for the LD scores for the given population.""" + return GnomadPublicTableResource(path=ld_scores_path("genomes", pop)) \ No newline at end of file From 31b495d08f4164c87fbfdafa7e386a11c6794ab8 Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Wed, 11 Sep 2024 17:06:23 -0400 Subject: [PATCH 02/13] black formatting --- gnomad_qc/v4/assessment/generate_ld_data.py | 62 ++++++++------------- gnomad_qc/v4/resources/ld_resources.py | 9 ++- 2 files changed, 29 insertions(+), 42 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 983444bae..2bda418a6 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -5,11 +5,13 @@ from gnomad.utils.slack import slack_notifications from hail.linalg import BlockMatrix from hail.utils import new_temp_file -# from hail.utils.java import Env + +# from hail.utils.java import Env from gnomad_qc.slack_creds import slack_token from gnomad_qc.v4.resources import * from gnomad_qc.v3.resources.release import hgdp_tgp_subset + # this includes ld_resources HGDP_TGP_RELEASES @@ -17,6 +19,7 @@ COMMON_FREQ = 0.005 RARE_FREQ = 0.0005 + def get_pop_and_subpop_counters(mt): cut_dict = { "pop": hl.agg.filter( @@ -41,8 +44,8 @@ def 
filter_mt_for_ld( .find(lambda f: (f[1].get(label) == pop)) .collect()[0][0] ) - # No support for SV datasets in gnomAD v4 Production team code - # Have to re-do callstats when noted frequencies don't 100% reflect what you are calculating on + # No support for SV datasets in gnomAD v4 Production team code + # Have to re-do callstats when noted frequencies don't 100% reflect what you are calculating on # This is for test sets or 1kg_tdpg if re_call_stats: call_stats = hl.agg.call_stats(pop_mt.GT, pop_mt.alleles) @@ -122,9 +125,6 @@ def generate_ld_matrix( ) - - - def generate_ld_scores_from_ld_matrix( pop_data, data_type, @@ -160,9 +160,6 @@ def generate_ld_scores_from_ld_matrix( compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) - - - def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite): starts_and_stops = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False) r2_adj = r2_adj._sparsify_row_intervals_expr(starts_and_stops, blocks_only=False) @@ -184,47 +181,34 @@ def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite): def main(args): - hl.init(log="/ld_annotations.log",tmp_dir="gs://gnomad-tmp-4day/ld_tmp/") + hl.init(log="/ld_annotations.log", tmp_dir="gs://gnomad-tmp-4day/ld_tmp/") hl._set_flags(use_new_shuffle="1") - # Only run on genomes so far, no choice to run on exomes, too sparse - data_type = "genomes" + # Only run on genomes so far, no choice to run on exomes, too sparse + data_type = "genomes" if args.exomes: - raise NotImplementedError( - "LD Code does not work for exomes yet. Too sparse:(" - ) - - + raise NotImplementedError("LD Code does not work for exomes yet. 
Too sparse:(") if args.hgdp_subset: - mt = hgdp_tgp_subset(dense=True,public=True).mt() + mt = hgdp_tgp_subset(dense=True, public=True).mt() if args.test: - mt = mt.filter_rows(mt.locus.contig=="chr22") + mt = mt.filter_rows(mt.locus.contig == "chr22") mt = mt.annotate_globals( - freq_meta = mt.hgdp_tgp_freq_meta, - freq_index_dict = mt.hgdp_tgp_freq_index_dict + freq_meta=mt.hgdp_tgp_freq_meta, freq_index_dict=mt.hgdp_tgp_freq_index_dict ) - mt = mt.annotate_rows( - freq = mt.hgdp_tgp_freq - ) + mt = mt.annotate_rows(freq=mt.hgdp_tgp_freq) - mt = mt.annotate_cols( - pop = mt.hgdp_tgp_meta.population - ) + mt = mt.annotate_cols(pop=mt.hgdp_tgp_meta.population) else: ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) - vds = basics.get_gnomad_v4_genomes_vds(test=args.test,annotate_meta=True,release_only=True,split=True) - mt_vds = vds.variant_data - mt = mt_vds.annotate( - **hl.eval(v4_release_ht.globals) + vds = basics.get_gnomad_v4_genomes_vds( + test=args.test, annotate_meta=True, release_only=True, split=True ) + mt_vds = vds.variant_data + mt = mt_vds.annotate(**hl.eval(v4_release_ht.globals)) - mt = mt.annotate( - pop = mt.meta.population_inference.pop - ) - - + mt = mt.annotate(pop=mt.meta.population_inference.pop) pop_data = get_pop_and_subpop_counters(mt) @@ -319,12 +303,10 @@ def main(args): parser.add_argument( "--test", help="Use test dataset for whichever callset requested.", - action="store_true" + action="store_true", ) parser.add_argument( - "--hgdp_subset", - help="Use hgdp dataset for callset.", - action="store_true" + "--hgdp_subset", help="Use hgdp dataset for callset.", action="store_true" ) parser.add_argument( "--slack_channel", help="Slack channel to post results and notifications to." 
diff --git a/gnomad_qc/v4/resources/ld_resources.py b/gnomad_qc/v4/resources/ld_resources.py index b98ca649c..38ab3b8c8 100644 --- a/gnomad_qc/v4/resources/ld_resources.py +++ b/gnomad_qc/v4/resources/ld_resources.py @@ -8,6 +8,7 @@ GnomadPublicTableResource, ) + def ld_matrix_path( data_type: str, pop: str, @@ -21,6 +22,7 @@ def ld_matrix_path( ) return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.r{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.bm' + def ld_index_path( data_type: str, pop: str, @@ -33,7 +35,9 @@ def ld_index_path( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.r{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.variant_indices.ht' -# + + +# def ld_scores_path( data_type: str, pop: str, adj: bool = True, version: Optional[str] = None ): @@ -47,6 +51,7 @@ def ld_scores_path( def ld_pruned_path(data_type: str, pop: str, r2: str, version: str = CURRENT_RELEASE): return f"gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.r{version}.{pop}.ld.pruned_set.r2_{r2}.ht" + def ld_matrix(pop: str) -> GnomadPublicBlockMatrixResource: """Get resource for the LD matrix for the given population.""" return GnomadPublicBlockMatrixResource(path=ld_matrix_path("genomes", pop)) @@ -59,4 +64,4 @@ def ld_index(pop: str) -> GnomadPublicTableResource: def ld_scores(pop: str) -> GnomadPublicTableResource: """Get resource for the LD scores for the given population.""" - return GnomadPublicTableResource(path=ld_scores_path("genomes", pop)) \ No newline at end of file + return GnomadPublicTableResource(path=ld_scores_path("genomes", pop)) From d5a5dfe8e1a43ef359bdb4706cceef6fa3c39e59 Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Thu, 12 Sep 2024 15:51:51 -0400 Subject: [PATCH 03/13] testing, further params and labels --- gnomad_qc/v4/assessment/generate_ld_data.py | 120 ++++++++++++-------- 
gnomad_qc/v4/resources/ld_resources.py | 2 +- 2 files changed, 76 insertions(+), 46 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 2bda418a6..770735ea3 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -10,6 +10,7 @@ from gnomad_qc.slack_creds import slack_token from gnomad_qc.v4.resources import * +from gnomad_qc.v4.resources.ld_resources import * from gnomad_qc.v3.resources.release import hgdp_tgp_subset # this includes ld_resources @@ -20,14 +21,15 @@ RARE_FREQ = 0.0005 -def get_pop_and_subpop_counters(mt): +def get_pop_and_subpop_counters(mt,label): cut_dict = { - "pop": hl.agg.filter( - hl.is_defined(mt.meta.pop) & (mt.meta.pop != "oth"), - hl.agg.counter(mt.meta.pop), + f"{label}": hl.agg.filter( + hl.is_defined(mt.meta.population_inference.pop) & (mt.meta.population_inference.pop != "oth"), + hl.agg.counter(mt.meta.population_inference.pop), ), } cut_data = mt.aggregate_cols(hl.struct(**cut_dict)) + logger.info(f'Counts: {cut_data}') return cut_data @@ -38,7 +40,7 @@ def filter_mt_for_ld( # At the moment, ld_matrix OOMs above ~30M variants, so filtering all populations to 0.05% to cut down on variants # All work on standard machines except AFR # Also creating `common_only` ones for most practical uses (above 0.5%) - pop_mt = mt.filter_cols(mt.meta[label] == pop) + pop_mt = mt.filter_cols(mt.meta.population_inference.pop == pop) meta_index = ( hl.enumerate(pop_mt.freq_meta) .find(lambda f: (f[1].get(label) == pop)) @@ -56,12 +58,14 @@ def filter_mt_for_ld( call_stats, ) pop_freq = call_stats_bind + pop_mt = pop_mt.annotate_rows(pop_freq=pop_freq) + else: pop_mt = pop_mt.filter_rows((hl.len(pop_mt.filters) == 0)) pop_freq = pop_mt.freq[meta_index] + pop_mt = pop_mt.annotate_rows(pop_freq=pop_freq) pop_mt = pop_mt.filter_rows((hl.len(pop_mt.filters) == 0)) - pop_mt = pop_mt.annotate_rows(pop_freq=pop_freq) pop_mt = 
pop_mt.filter_rows( (pop_mt.pop_freq.AC > 1) & (pop_mt.pop_freq.AN - pop_mt.pop_freq.AC > 1) @@ -82,16 +86,25 @@ def generate_ld_pruned_set( r2: str = "0.2", radius: int = 1000000, overwrite: bool = False, + re_call_stats: bool = False, + version: str = None, ): + for label, pops in dict(pop_data).items(): for pop in pops: - pop_mt = filter_mt_for_ld(mt, label, pop) + logger.info(f"Filtering for {pop}...") + pop_mt = filter_mt_for_ld(mt, label, pop, re_call_stats) + logger.info(f"Count after filter_mt_for_ld() : {pop_mt.count()}") pruned_ht = hl.ld_prune(pop_mt.GT, r2=float(r2), bp_window_size=radius) + logger.info(f"Count after hl.ld_prune() : {pruned_ht.count()}") + ht = pop_mt.rows().select("pop_freq") ht = ht.filter(hl.is_defined(pruned_ht[ht.key])) - ht.write(ld_pruned_path(data_type, pop, r2), overwrite) + ld_path = ld_pruned_path(data_type, pop, r2, version=version) + logger.info(f"Writing pruned ht for {pop} to {ld_path} ...") + ht.write(ld_path, overwrite) def generate_ld_matrix( @@ -102,25 +115,27 @@ def generate_ld_matrix( common_only: bool = True, adj: bool = False, overwrite: bool = False, + re_call_stats: bool = False, + version: str = None, ): # Takes about 4 hours on 20 n1-standard-8 nodes (with SSD - not sure if necessary) per population # Total of ~37 hours ($400) + for label, pops in dict(pop_data).items(): for pop in pops: - if data_type == "genomes_snv_sv" and pop not in SNV_SV_POPS: - continue + pop_mt = filter_mt_for_ld( - mt, label, pop, common_only, sv_dataset=data_type == "genomes_snv_sv" + mt, label, pop, common_only, re_call_stats ) pop_mt.rows().select("pop_freq").add_index().write( - ld_resources._ld_index_path(data_type, pop, common_only, adj), overwrite + ld_resources._ld_index_path(data_type, pop, common_only, adj, version=version), overwrite ) ld = hl.ld_matrix(pop_mt.GT.n_alt_alleles(), pop_mt.locus, radius) if data_type != "genomes_snv_sv": ld = ld.sparsify_triangle() ld.write( - ld_resources._ld_matrix_path(data_type, pop, 
common_only, adj), + ld_resources._ld_matrix_path(data_type, pop, common_only, adj, version=version), overwrite, ) @@ -132,14 +147,15 @@ def generate_ld_scores_from_ld_matrix( call_rate_cutoff=0.8, adj: bool = False, radius: int = 1000000, - overwrite=False, + overwrite: bool = False, + version: str = None ): # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 # mins to compute LD scores per population (~$150 total) for label, pops in dict(pop_data).items(): for pop, n in pops.items(): - ht = hl.read_table(ld_resources._ld_index_path(data_type, pop, adj=adj)) + ht = hl.read_table(ld_resources._ld_index_path(data_type, pop, adj=adj, version=version)) ht = ht.filter( (ht.pop_freq.AF >= min_frequency) & (ht.pop_freq.AF <= 1 - min_frequency) @@ -150,17 +166,17 @@ def generate_ld_scores_from_ld_matrix( r2 = BlockMatrix.read( ld_resources._ld_matrix_path( - data_type, pop, min_frequency >= COMMON_FREQ, adj=adj + data_type, pop, min_frequency >= COMMON_FREQ, adj=adj, version=version ) ) r2 = r2.filter(indices, indices) ** 2 r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0)) - out_name = ld_resources._ld_scores_path(data_type, pop, adj) + out_name = ld_scores_path(data_type, pop, adj,version=version) compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) -def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite): +def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite,version): starts_and_stops = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False) r2_adj = r2_adj._sparsify_row_intervals_expr(starts_and_stops, blocks_only=False) @@ -181,7 +197,7 @@ def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite): def main(args): - hl.init(log="/ld_annotations.log", tmp_dir="gs://gnomad-tmp-4day/ld_tmp/") + hl.init(log="/ld_assessment.log", 
tmp_dir="gs://gnomad-tmp-4day/ld_tmp/") hl._set_flags(use_new_shuffle="1") # Only run on genomes so far, no choice to run on exomes, too sparse @@ -191,30 +207,48 @@ def main(args): if args.hgdp_subset: mt = hgdp_tgp_subset(dense=True, public=True).mt() - if args.test: - mt = mt.filter_rows(mt.locus.contig == "chr22") mt = mt.annotate_globals( - freq_meta=mt.hgdp_tgp_freq_meta, freq_index_dict=mt.hgdp_tgp_freq_index_dict + freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict ) + mt = mt.annotate_rows(freq=mt.gnomad_freq) + mt = mt.annotate_cols(meta=hl.struct( + population_inference = hl.struct( + pop=mt.gnomad_population_inference.pop))) - mt = mt.annotate_rows(freq=mt.hgdp_tgp_freq) + if args.test: + mt = mt.filter_rows(mt.locus.contig == "chr22").sample_rows(0.1) + mt = mt.filter_cols(mt.meta.population_inference.pop == "afr") - mt = mt.annotate_cols(pop=mt.hgdp_tgp_meta.population) else: ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) vds = basics.get_gnomad_v4_genomes_vds( test=args.test, annotate_meta=True, release_only=True, split=True ) mt_vds = vds.variant_data - mt = mt_vds.annotate(**hl.eval(v4_release_ht.globals)) + mt = mt_vds.annotate_globals( + **hl.eval(ht_release.globals) + ) + + mt = mt.annotate_rows( + freq = ht_release[mt.row_key].freq, + filters = ht_release[mt.row_key].filters + ) - mt = mt.annotate(pop=mt.meta.population_inference.pop) + if args.test: + mt = mt.filter_rows(mt.locus.contig == "chr22").sample_rows(0.1) + mt = mt.filter_cols(mt.meta.population_inference.pop == "afr") + + # it is already in the right place for the gnomAD MT , or whatever + # mt = mt.annotate(pop=mt.meta.population_inference.pop) + label = 'gen_anc' if not args.hgdp_subset else 'pop' + pop_data = get_pop_and_subpop_counters(mt,label=label) - pop_data = get_pop_and_subpop_counters(mt) + # Version is populated via ld_resources.py if None + version = None if not args.hgdp_subset else 'hgdp' if args.generate_ld_pruned_set: 
generate_ld_pruned_set( - mt, pop_data, data_type, args.r2, args.radius, args.overwrite + mt, pop_data, data_type, args.r2, args.radius, args.overwrite, re_call_stats=args.re_call_stats, version=version ) if args.generate_ld_matrix: @@ -226,6 +260,8 @@ def main(args): args.common_only, args.adj, args.overwrite, + re_call_stats=args.re_call_stats, + version=version ) if args.generate_ld_scores: @@ -238,15 +274,6 @@ def main(args): overwrite=args.overwrite, ) - if args.cross_pop_ld_scores: - generate_all_cross_pop_ld_scores( - pop_data, - data_type, - args.min_frequency, - args.min_call_rate, - args.adj, - overwrite=args.overwrite, - ) if __name__ == "__main__": @@ -257,15 +284,15 @@ def main(args): action="store_true", ) parser.add_argument( - "--generate_ld_pruned_set", + "--generate-ld-pruned-set", help="Calculates LD pruned set of variants", action="store_true", ) parser.add_argument( - "--generate_ld_matrix", help="Calculates LD matrix", action="store_true" + "--generate-ld-matrix", help="Calculates LD matrix", action="store_true" ) parser.add_argument( - "--common_only", + "--common-only", help="Calculates LD matrix only on common variants (above 0.5%)", action="store_true", ) @@ -275,18 +302,18 @@ def main(args): action="store_true", ) parser.add_argument( - "--generate_ld_scores", + "--generate-ld-scores", help="Calculates LD scores from LD matrix", action="store_true", ) parser.add_argument( - "--min_frequency", + "--min-frequency", help="Minimum allele frequency to compute LD scores (default 0.01)", default=0.01, type=float, ) parser.add_argument( - "--min_call_rate", + "--min-call-rate", help="Minimum call rate to compute LD scores (default 0.8)", default=0.8, type=float, @@ -306,10 +333,13 @@ def main(args): action="store_true", ) parser.add_argument( - "--hgdp_subset", help="Use hgdp dataset for callset.", action="store_true" + "--hgdp-subset", help="Use hgdp dataset for callset.", action="store_true" + ) + parser.add_argument( + "--re-call-stats", 
help="Regenerate callstats for LD work. Can be useful for some subsets.", action="store_true" ) parser.add_argument( - "--slack_channel", help="Slack channel to post results and notifications to." + "--slack-channel", help="Slack channel to post results and notifications to." ) parser.add_argument("--overwrite", help="Overwrite data", action="store_true") args = parser.parse_args() diff --git a/gnomad_qc/v4/resources/ld_resources.py b/gnomad_qc/v4/resources/ld_resources.py index 38ab3b8c8..dd0dd274e 100644 --- a/gnomad_qc/v4/resources/ld_resources.py +++ b/gnomad_qc/v4/resources/ld_resources.py @@ -48,7 +48,7 @@ def ld_scores_path( return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.r{version}.{pop}.{"adj." if adj else ""}ld_scores.ht' -def ld_pruned_path(data_type: str, pop: str, r2: str, version: str = CURRENT_RELEASE): +def ld_pruned_path(data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE): return f"gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.r{version}.{pop}.ld.pruned_set.r2_{r2}.ht" From 0f5bd4e43914caa08491c901379cee554804cebb Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Thu, 12 Sep 2024 16:12:17 -0400 Subject: [PATCH 04/13] Update generate_ld_data.py --- gnomad_qc/v4/assessment/generate_ld_data.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 770735ea3..d2c8bd366 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -50,6 +50,7 @@ def filter_mt_for_ld( # Have to re-do callstats when noted frequencies don't 100% reflect what you are calculating on # This is for test sets or 1kg_tdpg if re_call_stats: + logger.info("Regenerating call_stats...") call_stats = hl.agg.call_stats(pop_mt.GT, pop_mt.alleles) call_stats_bind = hl.bind( lambda cs: cs.annotate( @@ -240,6 +241,10 @@ def main(args): # it is already in the right place for the gnomAD MT , or whatever # 
mt = mt.annotate(pop=mt.meta.population_inference.pop) + + if args.pop: + mt = mt.filter_cols(mt.meta.population_inference.pop==args.pop) + label = 'gen_anc' if not args.hgdp_subset else 'pop' pop_data = get_pop_and_subpop_counters(mt,label=label) @@ -338,6 +343,9 @@ def main(args): parser.add_argument( "--re-call-stats", help="Regenerate callstats for LD work. Can be useful for some subsets.", action="store_true" ) + parser.add_argument( + "--pop", help="Filter to, and run on, one individual pop", type=str, + ) parser.add_argument( "--slack-channel", help="Slack channel to post results and notifications to." ) From 2d48987d767c91744de5068d29639132fea2493f Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Thu, 12 Sep 2024 16:16:47 -0400 Subject: [PATCH 05/13] run Black for formatting --- gnomad_qc/v4/assessment/generate_ld_data.py | 81 +++++++++++++-------- gnomad_qc/v4/resources/ld_resources.py | 4 +- 2 files changed, 53 insertions(+), 32 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index d2c8bd366..7fb399fb4 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -21,15 +21,16 @@ RARE_FREQ = 0.0005 -def get_pop_and_subpop_counters(mt,label): +def get_pop_and_subpop_counters(mt, label): cut_dict = { f"{label}": hl.agg.filter( - hl.is_defined(mt.meta.population_inference.pop) & (mt.meta.population_inference.pop != "oth"), + hl.is_defined(mt.meta.population_inference.pop) + & (mt.meta.population_inference.pop != "oth"), hl.agg.counter(mt.meta.population_inference.pop), ), } cut_data = mt.aggregate_cols(hl.struct(**cut_dict)) - logger.info(f'Counts: {cut_data}') + logger.info(f"Counts: {cut_data}") return cut_data @@ -125,18 +126,21 @@ def generate_ld_matrix( for label, pops in dict(pop_data).items(): for pop in pops: - pop_mt = filter_mt_for_ld( - mt, label, pop, common_only, re_call_stats - ) + pop_mt = filter_mt_for_ld(mt, label, pop, 
common_only, re_call_stats) pop_mt.rows().select("pop_freq").add_index().write( - ld_resources._ld_index_path(data_type, pop, common_only, adj, version=version), overwrite + ld_resources._ld_index_path( + data_type, pop, common_only, adj, version=version + ), + overwrite, ) ld = hl.ld_matrix(pop_mt.GT.n_alt_alleles(), pop_mt.locus, radius) if data_type != "genomes_snv_sv": ld = ld.sparsify_triangle() ld.write( - ld_resources._ld_matrix_path(data_type, pop, common_only, adj, version=version), + ld_resources._ld_matrix_path( + data_type, pop, common_only, adj, version=version + ), overwrite, ) @@ -149,14 +153,16 @@ def generate_ld_scores_from_ld_matrix( adj: bool = False, radius: int = 1000000, overwrite: bool = False, - version: str = None + version: str = None, ): # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 # mins to compute LD scores per population (~$150 total) for label, pops in dict(pop_data).items(): for pop, n in pops.items(): - ht = hl.read_table(ld_resources._ld_index_path(data_type, pop, adj=adj, version=version)) + ht = hl.read_table( + ld_resources._ld_index_path(data_type, pop, adj=adj, version=version) + ) ht = ht.filter( (ht.pop_freq.AF >= min_frequency) & (ht.pop_freq.AF <= 1 - min_frequency) @@ -167,17 +173,21 @@ def generate_ld_scores_from_ld_matrix( r2 = BlockMatrix.read( ld_resources._ld_matrix_path( - data_type, pop, min_frequency >= COMMON_FREQ, adj=adj, version=version + data_type, + pop, + min_frequency >= COMMON_FREQ, + adj=adj, + version=version, ) ) r2 = r2.filter(indices, indices) ** 2 r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0)) - out_name = ld_scores_path(data_type, pop, adj,version=version) + out_name = ld_scores_path(data_type, pop, adj, version=version) compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) -def compute_and_annotate_ld_score(ht, r2_adj, radius, 
out_name, overwrite,version): +def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite, version): starts_and_stops = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False) r2_adj = r2_adj._sparsify_row_intervals_expr(starts_and_stops, blocks_only=False) @@ -212,9 +222,11 @@ def main(args): freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict ) mt = mt.annotate_rows(freq=mt.gnomad_freq) - mt = mt.annotate_cols(meta=hl.struct( - population_inference = hl.struct( - pop=mt.gnomad_population_inference.pop))) + mt = mt.annotate_cols( + meta=hl.struct( + population_inference=hl.struct(pop=mt.gnomad_population_inference.pop) + ) + ) if args.test: mt = mt.filter_rows(mt.locus.contig == "chr22").sample_rows(0.1) @@ -226,13 +238,10 @@ def main(args): test=args.test, annotate_meta=True, release_only=True, split=True ) mt_vds = vds.variant_data - mt = mt_vds.annotate_globals( - **hl.eval(ht_release.globals) - ) + mt = mt_vds.annotate_globals(**hl.eval(ht_release.globals)) mt = mt.annotate_rows( - freq = ht_release[mt.row_key].freq, - filters = ht_release[mt.row_key].filters + freq=ht_release[mt.row_key].freq, filters=ht_release[mt.row_key].filters ) if args.test: @@ -243,17 +252,24 @@ def main(args): # mt = mt.annotate(pop=mt.meta.population_inference.pop) if args.pop: - mt = mt.filter_cols(mt.meta.population_inference.pop==args.pop) + mt = mt.filter_cols(mt.meta.population_inference.pop == args.pop) - label = 'gen_anc' if not args.hgdp_subset else 'pop' - pop_data = get_pop_and_subpop_counters(mt,label=label) + label = "gen_anc" if not args.hgdp_subset else "pop" + pop_data = get_pop_and_subpop_counters(mt, label=label) # Version is populated via ld_resources.py if None - version = None if not args.hgdp_subset else 'hgdp' + version = None if not args.hgdp_subset else "hgdp" if args.generate_ld_pruned_set: generate_ld_pruned_set( - mt, pop_data, data_type, args.r2, args.radius, args.overwrite, re_call_stats=args.re_call_stats, 
version=version + mt, + pop_data, + data_type, + args.r2, + args.radius, + args.overwrite, + re_call_stats=args.re_call_stats, + version=version, ) if args.generate_ld_matrix: @@ -265,8 +281,8 @@ def main(args): args.common_only, args.adj, args.overwrite, - re_call_stats=args.re_call_stats, - version=version + re_call_stats=args.re_call_stats, + version=version, ) if args.generate_ld_scores: @@ -280,7 +296,6 @@ def main(args): ) - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -341,10 +356,14 @@ def main(args): "--hgdp-subset", help="Use hgdp dataset for callset.", action="store_true" ) parser.add_argument( - "--re-call-stats", help="Regenerate callstats for LD work. Can be useful for some subsets.", action="store_true" + "--re-call-stats", + help="Regenerate callstats for LD work. Can be useful for some subsets.", + action="store_true", ) parser.add_argument( - "--pop", help="Filter to, and run on, one individual pop", type=str, + "--pop", + help="Filter to, and run on, one individual pop", + type=str, ) parser.add_argument( "--slack-channel", help="Slack channel to post results and notifications to." diff --git a/gnomad_qc/v4/resources/ld_resources.py b/gnomad_qc/v4/resources/ld_resources.py index dd0dd274e..c35cfebed 100644 --- a/gnomad_qc/v4/resources/ld_resources.py +++ b/gnomad_qc/v4/resources/ld_resources.py @@ -48,7 +48,9 @@ def ld_scores_path( return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.r{version}.{pop}.{"adj." 
if adj else ""}ld_scores.ht' -def ld_pruned_path(data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE): +def ld_pruned_path( + data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE +): return f"gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.r{version}.{pop}.ld.pruned_set.r2_{r2}.ht" From 7714d5219edb68ce8127d6f4d2291b964c37713a Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Tue, 24 Sep 2024 15:34:42 -0400 Subject: [PATCH 06/13] Documentation & Paths --- gnomad_qc/v4/assessment/generate_ld_data.py | 302 +++++++++++++++----- gnomad_qc/v4/resources/ld_resources.py | 40 ++- 2 files changed, 245 insertions(+), 97 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 7fb399fb4..18a12f157 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -1,27 +1,38 @@ +""" +Script to generate LD scores for gnomAD v4 Genome SNPs & Indels. 
Example usage: + +hailctl dataproc submit cluster_name generate_ld_data.py --test --overwrite --hgdp-subset --pop afr --re-call-stats \ + --generate-ld-mt \ + --generate-ld-pruned-set \ + --generate-ld-matrix \ + --generate-ld-scores +""" + import argparse import sys -import gnomad.resources.grch37.gnomad_ld as ld_resources from gnomad.utils.slack import slack_notifications from hail.linalg import BlockMatrix from hail.utils import new_temp_file -# from hail.utils.java import Env - from gnomad_qc.slack_creds import slack_token +from gnomad_qc.v3.resources.release import hgdp_tgp_subset from gnomad_qc.v4.resources import * from gnomad_qc.v4.resources.ld_resources import * -from gnomad_qc.v3.resources.release import hgdp_tgp_subset - -# this includes ld_resources - -HGDP_TGP_RELEASES COMMON_FREQ = 0.005 RARE_FREQ = 0.0005 -def get_pop_and_subpop_counters(mt, label): +def get_pop_counters(mt, label) -> dict: + """ + Calculate and return genetic ancestry counts given an mt and column label. + + :param mt: Input MT + :param label: Col field containing genetic ancestry information + :return: Dict of pop counts + """ + cut_dict = { f"{label}": hl.agg.filter( hl.is_defined(mt.meta.population_inference.pop) @@ -36,7 +47,18 @@ def get_pop_and_subpop_counters(mt, label): def filter_mt_for_ld( mt, label, pop, common_only: bool = True, re_call_stats: bool = False -): +) -> hl.MatrixTable: + """ + Filter MT to only variants and samples appropriate for LD analyses + + :param mt: Input MT to filter + :param label: Col field containing genetic ancestry information + :param pop: Given genetic ancestry group to filter to + :param common_only: Bool of whether to filter to only common variants (>0.5%) + :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. 
+ :return: MT filtered to appropriate variants and samples + """ + + frequency = COMMON_FREQ if common_only else RARE_FREQ # At the moment, ld_matrix OOMs above ~30M variants, so filtering all populations to 0.05% to cut down on variants # All work on standard machines except AFR @@ -90,7 +112,19 @@ def generate_ld_pruned_set( overwrite: bool = False, re_call_stats: bool = False, version: str = None, -): +) -> None: + """ + Generate and write set of variants uncorrelated with each other. Wrapper for hl.ld_prune() + + :param mt: Input MT to filter + :param pop_data: Dict of population counts + :param data_type: Genetic data type for exomes or genomes. Exomes are currently too sparse, only runs on genomes. + :param r2: via Hail: Squared correlation threshold (exclusive upper bound). Must be in the range [0.0, 1.0]. + :param radius: Used for bp_window_size, via Hail: Window size in base pairs (inclusive upper bound). + :param overwrite: Bool to write over previous outputs or not + :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. + :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. + """ for label, pops in dict(pop_data).items(): for pop in pops: @@ -119,7 +153,20 @@ def generate_ld_matrix( overwrite: bool = False, re_call_stats: bool = False, version: str = None, -): +) -> None: + """ + Generate and write matrix of LD correlations as Hail BlockMatrix using hl.ld_matrix. Read by generate_ld_scores_from_ld_matrix() + + :param mt: Input MT to filter and generate LDs for + :param pop_data: Dict of population counts + :param data_type: Genetic data type for exomes or genomes. Exomes are currently too sparse, only runs on genomes. + :param radius: Used for bp_window_size, via Hail: Window size in base pairs (inclusive upper bound).
+ :param common_only: Bool of whether to filter to only common variants (>0.5%) + :param adj: Whether to use adj ("adjusted" or "passing") frequency + :param overwrite: Bool to write over previous outputs or not + :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. + :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. + """ # Takes about 4 hours on 20 n1-standard-8 nodes (with SSD - not sure if necessary) per population # Total of ~37 hours ($400) @@ -129,8 +176,12 @@ def generate_ld_matrix( pop_mt = filter_mt_for_ld(mt, label, pop, common_only, re_call_stats) pop_mt.rows().select("pop_freq").add_index().write( - ld_resources._ld_index_path( - data_type, pop, common_only, adj, version=version + ld_index_path( + data_type=data_type, + pop=pop, + common_only=common_only, + adj=adj, + version=version, ), overwrite, ) @@ -138,9 +189,7 @@ def generate_ld_matrix( if data_type != "genomes_snv_sv": ld = ld.sparsify_triangle() ld.write( - ld_resources._ld_matrix_path( - data_type, pop, common_only, adj, version=version - ), + ld_matrix_path(data_type, pop, common_only, adj, version=version), overwrite, ) @@ -150,18 +199,38 @@ def generate_ld_scores_from_ld_matrix( data_type, min_frequency=0.01, call_rate_cutoff=0.8, + common_only: bool = True, adj: bool = False, radius: int = 1000000, overwrite: bool = False, version: str = None, -): +) -> None: + """ + Generate LD scores from Hail BlockMatrix written by generate_ld_scores_from_ld_matrix(). + + :param pop_data: Dict of population counts + :param data_type: Genetic data type for exomes or genomes. Exomes are currently too sparse, only runs on genomes. + :param min_frequency: Lowest frequency for variants to calculate LD scores of. + :param call_rate_cutoff: Lowest call rate for variants to calculate LD scores of. 
+ :param common_only: Bool of whether to filter to only common variants (>0.5%) + :param adj: Whether to use adj ("adjusted" or "passing") frequency + :param radius: Used for bp_window_size, via Hail: Window size in base pairs (inclusive upper bound). + :param overwrite: Bool to write over previous outputs or not + :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. + """ # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 # mins to compute LD scores per population (~$150 total) for label, pops in dict(pop_data).items(): for pop, n in pops.items(): ht = hl.read_table( - ld_resources._ld_index_path(data_type, pop, adj=adj, version=version) + ld_index_path( + data_type=data_type, + pop=pop, + common_only=common_only, + adj=adj, + version=version, + ) ) ht = ht.filter( (ht.pop_freq.AF >= min_frequency) @@ -172,7 +241,7 @@ def generate_ld_scores_from_ld_matrix( indices = ht.idx.collect() r2 = BlockMatrix.read( - ld_resources._ld_matrix_path( + ld_matrix_path( data_type, pop, min_frequency >= COMMON_FREQ, @@ -184,10 +253,21 @@ def generate_ld_scores_from_ld_matrix( r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0)) out_name = ld_scores_path(data_type, pop, adj, version=version) - compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) + compute_and_annotate_ld_score( + ht, r2_adj, radius, out_name, overwrite, version=version + ) + +def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) -> None: + """ + Annotate LD scores onto Hail Table containing variants and write to path. -def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite, version): + :param ht: Hail Table of variants and LD indices + :param r2: via Hail: Squared correlation threshold (exclusive upper bound). Must be in the range [0.0, 1.0]. 
+ :param radius: Used for bp_window_size, via Hail: Window size in base pairs (inclusive upper bound). + :param out_name: Path to write outputs to. Designed to be output of ld_scores_path() + :param overwrite: Bool to write over previous outputs or not + """ starts_and_stops = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False) r2_adj = r2_adj._sparsify_row_intervals_expr(starts_and_stops, blocks_only=False) @@ -208,7 +288,11 @@ def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite, versi def main(args): - hl.init(log="/ld_assessment.log", tmp_dir="gs://gnomad-tmp-4day/ld_tmp/") + hl.init( + log="/ld_assessment.log", + tmp_dir="gs://gnomad-tmp-4day/ld_tmp/", + gcs_requester_pays_configuration="broad-mpg-gnomad", + ) hl._set_flags(use_new_shuffle="1") # Only run on genomes so far, no choice to run on exomes, too sparse @@ -216,83 +300,148 @@ def main(args): if args.exomes: raise NotImplementedError("LD Code does not work for exomes yet. Too sparse:(") - if args.hgdp_subset: - mt = hgdp_tgp_subset(dense=True, public=True).mt() - mt = mt.annotate_globals( - freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict + # Define command line args used more than once + test = args.test + is_hgdp = args.hgdp_subset + overwrite = args.overwrite + + if test and not args.re_call_stats: + logger.info( + "WARNING: Test subsets without re-called stats may result in None or inaccurate LD scores." 
) - mt = mt.annotate_rows(freq=mt.gnomad_freq) - mt = mt.annotate_cols( - meta=hl.struct( - population_inference=hl.struct(pop=mt.gnomad_population_inference.pop) + + # Version is populated via ld_resources.py if None + version = None if not is_hgdp else "hgdp" + + # Path for LD MT includes all pops by design + ld_mt_path = ld_mt_checkpoint_path( + data_type="genomes", pops="all_pops", version=version, test=test + ) + + def _ld_test_mt( + mt: hl.MatrixTable, + filter_contig: bool = False, + sample_cols: bool = False, + ): + if filter_contig: + mt = mt.filter_rows(mt.locus.contig == "chr22") + if sample_cols: + mt = mt.sample_cols(0.1) + + return mt + + def _generate_ld_mt( + version: str = version, + is_hgdp: bool = False, + test: bool = False, + ld_mt_path: str = ld_mt_path, + overwrite=overwrite, + ) -> hl.MatrixTable: + + if is_hgdp: + mt = hgdp_tgp_subset(dense=True, public=True).mt() + mt = mt.annotate_globals( + freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict + ) + mt = mt.annotate_rows(freq=mt.gnomad_freq) + mt = mt.annotate_cols( + meta=hl.struct( + population_inference=hl.struct( + pop=mt.gnomad_population_inference.pop + ) + ) ) - ) - if args.test: - mt = mt.filter_rows(mt.locus.contig == "chr22").sample_rows(0.1) - mt = mt.filter_cols(mt.meta.population_inference.pop == "afr") + if test: + logger.info( + "Filter HGDP to Test, to chr22, and downsample cols by 0.1..." 
+ ) + mt = _ld_test_mt(mt, filter_contig=True, sample_cols=True) - else: - ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) - vds = basics.get_gnomad_v4_genomes_vds( - test=args.test, annotate_meta=True, release_only=True, split=True - ) - mt_vds = vds.variant_data - mt = mt_vds.annotate_globals(**hl.eval(ht_release.globals)) + else: + ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) + vds = basics.get_gnomad_v4_genomes_vds( + test=test, annotate_meta=True, release_only=True, split=True + ) + mt_vds = vds.variant_data + mt = mt_vds.annotate_globals(**hl.eval(ht_release.globals)) - mt = mt.annotate_rows( - freq=ht_release[mt.row_key].freq, filters=ht_release[mt.row_key].filters - ) + if test: + logger.info( + "Filter Test VDS to Chr22, do not touch sample level information, then naive coalesce to 1000 partitions..." + ) + mt = _ld_test_mt(mt, filter_contig=True, sample_cols=False) + mt = mt.naive_coalesce(1000) + + mt = mt.annotate_rows( + freq=ht_release[mt.row_key].freq, filters=ht_release[mt.row_key].filters + ) - if args.test: - mt = mt.filter_rows(mt.locus.contig == "chr22").sample_rows(0.1) - mt = mt.filter_cols(mt.meta.population_inference.pop == "afr") + # Select and checkpoint to speed up calculations + mt = mt.select_rows(mt.freq, mt.filters) + mt = mt.select_cols(mt.meta) - # it is already in the right place for the gnomAD MT , or whatever - # mt = mt.annotate(pop=mt.meta.population_inference.pop) + logger.info(f"Checkpoint MT to {ld_mt_path} ...") + mt = mt.checkpoint(ld_mt_path, overwrite=overwrite) + + return mt + + # Read in Hail MatrixTable for analysis + mt = None + if args.generate_ld_mt: + mt = _generate_ld_mt( + version=version, is_hgdp=is_hgdp, test=test, ld_mt_path=ld_mt_path + ) + else: + mt = hl.read_matrix_table(ld_mt_path) if args.pop: mt = mt.filter_cols(mt.meta.population_inference.pop == args.pop) + mt = mt.checkpoint( + ld_mt_checkpoint_path( + data_type="genomes", pops=args.pop, 
version=version, test=test + ), + overwrite=overwrite, + ) - label = "gen_anc" if not args.hgdp_subset else "pop" - pop_data = get_pop_and_subpop_counters(mt, label=label) - - # Version is populated via ld_resources.py if None - version = None if not args.hgdp_subset else "hgdp" + label = "gen_anc" if not is_hgdp else "pop" + pop_data = get_pop_counters(mt, label=label) if args.generate_ld_pruned_set: generate_ld_pruned_set( - mt, - pop_data, - data_type, - args.r2, - args.radius, - args.overwrite, + mt=mt, + pop_data=pop_data, + data_type=data_type, + r2=args.r2, + radius=args.radius, + overwrite=overwrite, re_call_stats=args.re_call_stats, version=version, ) if args.generate_ld_matrix: generate_ld_matrix( - mt, - pop_data, - data_type, - args.radius, - args.common_only, - args.adj, - args.overwrite, + mt=mt, + pop_data=pop_data, + data_type=data_type, + radius=args.radius, + common_only=args.common_only, + adj=args.adj, + overwrite=overwrite, re_call_stats=args.re_call_stats, version=version, ) if args.generate_ld_scores: generate_ld_scores_from_ld_matrix( - pop_data, - data_type, - args.min_frequency, - args.min_call_rate, - args.adj, - overwrite=args.overwrite, + pop_data=pop_data, + data_type=data_type, + min_frequency=args.min_frequency, + call_rate_cutoff=args.min_call_rate, + common_only=args.common_only, + adj=args.adj, + version=version, + overwrite=overwrite, ) @@ -303,6 +452,11 @@ def main(args): help="Input MT is exomes. One of --exomes or --genomes is required.", action="store_true", ) + parser.add_argument( + "--generate-ld-mt", + help="Generate Hail MatrixTable of variants and calls to calculate LD for. 
", + action="store_true", + ) parser.add_argument( "--generate-ld-pruned-set", help="Calculates LD pruned set of variants", diff --git a/gnomad_qc/v4/resources/ld_resources.py b/gnomad_qc/v4/resources/ld_resources.py index c35cfebed..7eeda4a88 100644 --- a/gnomad_qc/v4/resources/ld_resources.py +++ b/gnomad_qc/v4/resources/ld_resources.py @@ -3,10 +3,6 @@ from typing import Optional from gnomad.resources.grch38.gnomad import CURRENT_EXOME_RELEASE, CURRENT_GENOME_RELEASE -from gnomad.resources.resource_utils import ( - GnomadPublicBlockMatrixResource, - GnomadPublicTableResource, -) def ld_matrix_path( @@ -20,7 +16,7 @@ def ld_matrix_path( version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.r{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.bm' + return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.bm' def ld_index_path( @@ -34,7 +30,7 @@ def ld_index_path( version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.r{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.variant_indices.ht' + return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.variant_indices.ht' # @@ -45,25 +41,23 @@ def ld_scores_path( version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.r{version}.{pop}.{"adj." if adj else ""}ld_scores.ht' + return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.{version}.{pop}.{"adj." 
if adj else ""}ld_scores.ht' -def ld_pruned_path( - data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE +def ld_mt_checkpoint_path( + data_type: str, + pops: str = None, + version: str = CURRENT_GENOME_RELEASE, + test: bool = False, ): - return f"gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.r{version}.{pop}.ld.pruned_set.r2_{r2}.ht" - - -def ld_matrix(pop: str) -> GnomadPublicBlockMatrixResource: - """Get resource for the LD matrix for the given population.""" - return GnomadPublicBlockMatrixResource(path=ld_matrix_path("genomes", pop)) - - -def ld_index(pop: str) -> GnomadPublicTableResource: - """Get resource for the LD indices for the given population.""" - return GnomadPublicTableResource(path=ld_index_path("genomes", pop)) + if version is None: + version = CURRENT_GENOME_RELEASE + return f'gs://gnomad-tmp-30day/ld/gnomad.{data_type}.{"all_pops" if not pops else f"{pops}"}.{version}{f".test" if test else ""}.mt' -def ld_scores(pop: str) -> GnomadPublicTableResource: - """Get resource for the LD scores for the given population.""" - return GnomadPublicTableResource(path=ld_scores_path("genomes", pop)) +def ld_pruned_path( + data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE +): + if version is None: + version = CURRENT_GENOME_RELEASE + return f"gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.{version}.{pop}.ld.pruned_set.r2_{r2}.ht" From 331cd125d8e57fb2f269dd2b69ad9d1286ee878f Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Tue, 24 Sep 2024 16:19:49 -0400 Subject: [PATCH 07/13] added better test params --- gnomad_qc/v4/assessment/generate_ld_data.py | 32 ++++++++++++++++----- gnomad_qc/v4/resources/ld_resources.py | 14 +++++---- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 18a12f157..22f6aec81 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ 
b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -112,6 +112,7 @@ def generate_ld_pruned_set( overwrite: bool = False, re_call_stats: bool = False, version: str = None, + test: bool = False, ) -> None: """ Generate and write set of variants uncorrelated with eachother. Wrapper for hl.ld_prune() @@ -124,6 +125,7 @@ def generate_ld_pruned_set( :param overwrite: Bool to write over previous outputs or not :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. + :param test: Filter to test versions and/or write to test paths. """ for label, pops in dict(pop_data).items(): @@ -138,7 +140,7 @@ def generate_ld_pruned_set( ht = pop_mt.rows().select("pop_freq") ht = ht.filter(hl.is_defined(pruned_ht[ht.key])) - ld_path = ld_pruned_path(data_type, pop, r2, version=version) + ld_path = ld_pruned_path(data_type, pop, r2, version=version, test=test) logger.info(f"Writing pruned ht for {pop} to {ld_path} ...") ht.write(ld_path, overwrite) @@ -153,6 +155,7 @@ def generate_ld_matrix( overwrite: bool = False, re_call_stats: bool = False, version: str = None, + test: bool = False, ) -> None: """ Generate and write matrix of LD correlcations as Hail BlockMatrix using hl.ld_matrix. Read by generate_ld_scores_from_ld_matrix() @@ -166,6 +169,7 @@ def generate_ld_matrix( :param overwrite: Bool to write over previous outputs or not :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. + :param test: Filter to test versions and/or write to test paths. 
""" # Takes about 4 hours on 20 n1-standard-8 nodes (with SSD - not sure if necessary) per population # Total of ~37 hours ($400) @@ -182,6 +186,7 @@ def generate_ld_matrix( common_only=common_only, adj=adj, version=version, + test=test, ), overwrite, ) @@ -189,7 +194,7 @@ def generate_ld_matrix( if data_type != "genomes_snv_sv": ld = ld.sparsify_triangle() ld.write( - ld_matrix_path(data_type, pop, common_only, adj, version=version), + ld_matrix_path(data_type, pop, common_only, adj, version=version, test=test), overwrite, ) @@ -204,6 +209,7 @@ def generate_ld_scores_from_ld_matrix( radius: int = 1000000, overwrite: bool = False, version: str = None, + test: bool = False, ) -> None: """ Generate LD scores from Hail BlockMatrix written by generate_ld_scores_from_ld_matrix(). @@ -217,6 +223,7 @@ def generate_ld_scores_from_ld_matrix( :param radius: Used for bp_window_size, via Hail: Window size in base pairs (inclusive upper bound). :param overwrite: Bool to write over previous outputs or not :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. + :param test: Filter to test versions and/or write to test paths. 
""" # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 @@ -230,6 +237,7 @@ def generate_ld_scores_from_ld_matrix( common_only=common_only, adj=adj, version=version, + test=test ) ) ht = ht.filter( @@ -247,14 +255,15 @@ def generate_ld_scores_from_ld_matrix( min_frequency >= COMMON_FREQ, adj=adj, version=version, + test=test ) ) r2 = r2.filter(indices, indices) ** 2 r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0)) - out_name = ld_scores_path(data_type, pop, adj, version=version) + out_name = ld_scores_path(data_type, pop, adj, version=version, test=test) compute_and_annotate_ld_score( - ht, r2_adj, radius, out_name, overwrite, version=version + ht, r2_adj, radius, out_name, overwrite ) @@ -324,8 +333,10 @@ def _ld_test_mt( sample_cols: bool = False, ): if filter_contig: + logger.info('Filtering to chr22...') mt = mt.filter_rows(mt.locus.contig == "chr22") if sample_cols: + logger.info('Downsampling all cols by 0.1...') mt = mt.sample_cols(0.1) return mt @@ -339,6 +350,7 @@ def _generate_ld_mt( ) -> hl.MatrixTable: if is_hgdp: + logger.info('Reading in HGDP_TGP Subset...') mt = hgdp_tgp_subset(dense=True, public=True).mt() mt = mt.annotate_globals( freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict @@ -353,12 +365,10 @@ def _generate_ld_mt( ) if test: - logger.info( - "Filter HGDP to Test, to chr22, and downsample cols by 0.1..." 
- ) mt = _ld_test_mt(mt, filter_contig=True, sample_cols=True) else: + logger.info('Reading in gnomAD release HT and VDS...') ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) vds = basics.get_gnomad_v4_genomes_vds( test=test, annotate_meta=True, release_only=True, split=True @@ -389,6 +399,7 @@ def _generate_ld_mt( # Read in Hail MatrixTable for analysis mt = None if args.generate_ld_mt: + logger.info('Generating MT for LD with all genetic ancestry groups...') mt = _generate_ld_mt( version=version, is_hgdp=is_hgdp, test=test, ld_mt_path=ld_mt_path ) @@ -396,6 +407,7 @@ def _generate_ld_mt( mt = hl.read_matrix_table(ld_mt_path) if args.pop: + logger.info(f'Filtering to {args.pop}...') mt = mt.filter_cols(mt.meta.population_inference.pop == args.pop) mt = mt.checkpoint( ld_mt_checkpoint_path( @@ -408,6 +420,7 @@ def _generate_ld_mt( pop_data = get_pop_counters(mt, label=label) if args.generate_ld_pruned_set: + logger.info('Generating LD Pruned Set of uncorrelated variants, using hl.ld_prune()...') generate_ld_pruned_set( mt=mt, pop_data=pop_data, @@ -417,9 +430,11 @@ def _generate_ld_mt( overwrite=overwrite, re_call_stats=args.re_call_stats, version=version, + test=test, ) if args.generate_ld_matrix: + logger.info('Generating Hail BlockMatrix of variants correlations to other variants, using hl.ld_matrix()...') generate_ld_matrix( mt=mt, pop_data=pop_data, @@ -430,9 +445,11 @@ def _generate_ld_mt( overwrite=overwrite, re_call_stats=args.re_call_stats, version=version, + test=test ) if args.generate_ld_scores: + logger.info('Generating in LD scores and annotating onto variant HT...') generate_ld_scores_from_ld_matrix( pop_data=pop_data, data_type=data_type, @@ -442,6 +459,7 @@ def _generate_ld_mt( adj=args.adj, version=version, overwrite=overwrite, + test=test ) diff --git a/gnomad_qc/v4/resources/ld_resources.py b/gnomad_qc/v4/resources/ld_resources.py index 7eeda4a88..db0a880b5 100644 --- a/gnomad_qc/v4/resources/ld_resources.py +++ 
b/gnomad_qc/v4/resources/ld_resources.py @@ -11,12 +11,13 @@ def ld_matrix_path( common_only: bool = True, adj: bool = True, version: Optional[str] = None, + test: bool = False ): if version is None: version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.bm' + return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.bm' def ld_index_path( @@ -25,23 +26,24 @@ def ld_index_path( common_only: bool = True, adj: bool = True, version: Optional[str] = None, + test: bool = False ): if version is None: version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.{version}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.variant_indices.ht' + return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.variant_indices.ht' # def ld_scores_path( - data_type: str, pop: str, adj: bool = True, version: Optional[str] = None + data_type: str, pop: str, adj: bool = True, version: Optional[str] = None, test: bool = False ): if version is None: version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.{version}.{pop}.{"adj." if adj else ""}ld_scores.ht' + return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.{"adj." 
if adj else ""}ld_scores.ht' def ld_mt_checkpoint_path( @@ -56,8 +58,8 @@ def ld_mt_checkpoint_path( def ld_pruned_path( - data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE + data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE, test: bool = False ): if version is None: version = CURRENT_GENOME_RELEASE - return f"gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.{version}.{pop}.ld.pruned_set.r2_{r2}.ht" + return f'gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.ld.pruned_set.r2_{r2}.ht' From 5f0ecdd661b9b0a89c37fb01b166e055c38f7219 Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Tue, 24 Sep 2024 16:37:03 -0400 Subject: [PATCH 08/13] common_only logic, formatting... --- gnomad_qc/v4/assessment/generate_ld_data.py | 62 +++++++++++++-------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 22f6aec81..a00d2ff3b 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -125,7 +125,7 @@ def generate_ld_pruned_set( :param overwrite: Bool to write over previous outputs or not :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. - :param test: Filter to test versions and/or write to test paths. + :param test: Filter to test versions and/or write to test paths. """ for label, pops in dict(pop_data).items(): @@ -169,7 +169,7 @@ def generate_ld_matrix( :param overwrite: Bool to write over previous outputs or not :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. 
- :param test: Filter to test versions and/or write to test paths. + :param test: Filter to test versions and/or write to test paths. """ # Takes about 4 hours on 20 n1-standard-8 nodes (with SSD - not sure if necessary) per population # Total of ~37 hours ($400) @@ -194,7 +194,14 @@ def generate_ld_matrix( if data_type != "genomes_snv_sv": ld = ld.sparsify_triangle() ld.write( - ld_matrix_path(data_type, pop, common_only, adj, version=version, test=test), + ld_matrix_path( + data_type, + pop, + common_only=common_only, + adj=adj, + version=version, + test=test, + ), overwrite, ) @@ -223,11 +230,18 @@ def generate_ld_scores_from_ld_matrix( :param radius: Used for bp_window_size, via Hail: Window size in base pairs (inclusive upper bound). :param overwrite: Bool to write over previous outputs or not :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. - :param test: Filter to test versions and/or write to test paths. + :param test: Filter to test versions and/or write to test paths. """ # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 # mins to compute LD scores per population (~$150 total) + + freq_cutoff = COMMON_FREQ if common_only else RARE_FREQ + + logger.info( + f'Min frequency of {min_frequency} is {">=" if min_frequency >= my_cutoff else "<"} cutoff of {freq_cutoff}...' 
+ ) + for label, pops in dict(pop_data).items(): for pop, n in pops.items(): ht = hl.read_table( @@ -237,7 +251,7 @@ def generate_ld_scores_from_ld_matrix( common_only=common_only, adj=adj, version=version, - test=test + test=test, ) ) ht = ht.filter( @@ -250,21 +264,19 @@ def generate_ld_scores_from_ld_matrix( r2 = BlockMatrix.read( ld_matrix_path( - data_type, - pop, - min_frequency >= COMMON_FREQ, + data_type=data_type, + pop=pop, + common_only=common_only, adj=adj, version=version, - test=test + test=test, ) ) r2 = r2.filter(indices, indices) ** 2 r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0)) out_name = ld_scores_path(data_type, pop, adj, version=version, test=test) - compute_and_annotate_ld_score( - ht, r2_adj, radius, out_name, overwrite - ) + compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) -> None: @@ -333,10 +345,10 @@ def _ld_test_mt( sample_cols: bool = False, ): if filter_contig: - logger.info('Filtering to chr22...') + logger.info("Filtering to chr22...") mt = mt.filter_rows(mt.locus.contig == "chr22") if sample_cols: - logger.info('Downsampling all cols by 0.1...') + logger.info("Downsampling all cols by 0.1...") mt = mt.sample_cols(0.1) return mt @@ -350,7 +362,7 @@ def _generate_ld_mt( ) -> hl.MatrixTable: if is_hgdp: - logger.info('Reading in HGDP_TGP Subset...') + logger.info("Reading in HGDP_TGP Subset...") mt = hgdp_tgp_subset(dense=True, public=True).mt() mt = mt.annotate_globals( freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict @@ -368,7 +380,7 @@ def _generate_ld_mt( mt = _ld_test_mt(mt, filter_contig=True, sample_cols=True) else: - logger.info('Reading in gnomAD release HT and VDS...') + logger.info("Reading in gnomAD release HT and VDS...") ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) vds = basics.get_gnomad_v4_genomes_vds( test=test, annotate_meta=True, release_only=True, 
split=True @@ -399,7 +411,7 @@ def _generate_ld_mt( # Read in Hail MatrixTable for analysis mt = None if args.generate_ld_mt: - logger.info('Generating MT for LD with all genetic ancestry groups...') + logger.info("Generating MT for LD with all genetic ancestry groups...") mt = _generate_ld_mt( version=version, is_hgdp=is_hgdp, test=test, ld_mt_path=ld_mt_path ) @@ -407,7 +419,7 @@ def _generate_ld_mt( mt = hl.read_matrix_table(ld_mt_path) if args.pop: - logger.info(f'Filtering to {args.pop}...') + logger.info(f"Filtering to {args.pop}...") mt = mt.filter_cols(mt.meta.population_inference.pop == args.pop) mt = mt.checkpoint( ld_mt_checkpoint_path( @@ -420,7 +432,9 @@ def _generate_ld_mt( pop_data = get_pop_counters(mt, label=label) if args.generate_ld_pruned_set: - logger.info('Generating LD Pruned Set of uncorrelated variants, using hl.ld_prune()...') + logger.info( + "Generating LD Pruned Set of uncorrelated variants, using hl.ld_prune()..." + ) generate_ld_pruned_set( mt=mt, pop_data=pop_data, @@ -434,7 +448,9 @@ def _generate_ld_mt( ) if args.generate_ld_matrix: - logger.info('Generating Hail BlockMatrix of variants correlations to other variants, using hl.ld_matrix()...') + logger.info( + "Generating Hail BlockMatrix of variants correlations to other variants, using hl.ld_matrix()..." 
+ ) generate_ld_matrix( mt=mt, pop_data=pop_data, @@ -445,11 +461,11 @@ def _generate_ld_mt( overwrite=overwrite, re_call_stats=args.re_call_stats, version=version, - test=test + test=test, ) if args.generate_ld_scores: - logger.info('Generating in LD scores and annotating onto variant HT...') + logger.info("Generating in LD scores and annotating onto variant HT...") generate_ld_scores_from_ld_matrix( pop_data=pop_data, data_type=data_type, @@ -459,7 +475,7 @@ def _generate_ld_mt( adj=args.adj, version=version, overwrite=overwrite, - test=test + test=test, ) From 49e9733843b65f7edaad3077e45721c08ff588a0 Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Tue, 24 Sep 2024 16:50:32 -0400 Subject: [PATCH 09/13] cutoff variable --- gnomad_qc/v4/assessment/generate_ld_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index a00d2ff3b..fa4b6acc2 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -239,7 +239,7 @@ def generate_ld_scores_from_ld_matrix( freq_cutoff = COMMON_FREQ if common_only else RARE_FREQ logger.info( - f'Min frequency of {min_frequency} is {">=" if min_frequency >= my_cutoff else "<"} cutoff of {freq_cutoff}...' + f'Min frequency of {min_frequency} is {">=" if min_frequency >= freq_cutoff else "<"} cutoff of {freq_cutoff}...' 
) for label, pops in dict(pop_data).items(): From e25706a7051e97cb26dc6e34b0d55194bad033f2 Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Tue, 28 Jan 2025 15:55:46 -0500 Subject: [PATCH 10/13] better gnomAD VDS performance still pretty slow checkpointing chr22 before join/annotation , and does not work for chr1 (stalls out) --- gnomad_qc/v4/assessment/generate_ld_data.py | 160 ++++++++++++++++---- 1 file changed, 127 insertions(+), 33 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index fa4b6acc2..9766c210d 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -45,6 +45,55 @@ def get_pop_counters(mt, label) -> dict: return cut_data +def _filter_ht_for_ld( + ht, + label="gen_anc", + pop="eas", + common_only: bool = True, + re_call_stats: bool = False, +) -> hl.MatrixTable: + """ + Filter HT to only variants appropriate for LD analyses for a given genetic ancestry. + + :param mt: Input HT to filter + :param label: Col field containing genetic ancestry information + :param pop: Given genetic ancestry group to filter to + :param common_only: Bool of whether to filter to only common variants (>0.5%) + :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. 
+ :return: MT filtered to appropriate variants and samples + """ + + frequency = COMMON_FREQ if common_only else RARE_FREQ + # At the moment, ld_matrix OOMs above ~30M variants, so filtering all populations to 0.05% to cut down on variants + # All work on standard machines except AFR + # Also creating `common_only` ones for most practical uses (above 0.5%) + # pop_mt = mt.filter_cols(mt.meta.population_inference.pop == pop) # irrelevant for HT + meta_index = ( + hl.enumerate(ht.freq_meta) + .find(lambda f: (f[1].get(label) == pop)) + .collect()[0][0] + ) + # No support for SV datasets in gnomAD v4 Production team code + # Have to re-do callstats when noted frequencies don't 100% reflect what you are calculating on + # This is for test sets or 1kg_tdpg + # CANNOT re_call stats without GT + ht = ht.filter((hl.len(ht.filters) == 0)) + pop_freq = ht.freq[meta_index] + ht = ht.annotate(pop_freq=pop_freq) + + ht = ht.filter( + (ht.pop_freq.AC > 1) + & (ht.pop_freq.AN - ht.pop_freq.AC > 1) + & (ht.pop_freq.AF > frequency) + & + # the following filter is needed for "het-only" monomorphic sites + # as otherwise variance is 0, and so, row_correlation errors out + ~((ht.pop_freq.AF == 0.5) & (ht.pop_freq.homozygote_count == 0)) + ) + + return ht + + def filter_mt_for_ld( mt, label, pop, common_only: bool = True, re_call_stats: bool = False ) -> hl.MatrixTable: @@ -134,6 +183,14 @@ def generate_ld_pruned_set( pop_mt = filter_mt_for_ld(mt, label, pop, re_call_stats) logger.info(f"Count after filter_mt_for_ld() : {pop_mt.count()}") + # now we checkpoint + # pop_mt = pop_mt.checkpoint( + # ld_mt_checkpoint_path( + # data_type="genomes", pops=pop, version=version, test=test + # ), + # overwrite=overwrite, + # ) + pruned_ht = hl.ld_prune(pop_mt.GT, r2=float(r2), bp_window_size=radius) logger.info(f"Count after hl.ld_prune() : {pruned_ht.count()}") @@ -325,6 +382,8 @@ def main(args): test = args.test is_hgdp = args.hgdp_subset overwrite = args.overwrite + pop = args.pop + chrom = 
args.chrom if test and not args.re_call_stats: logger.info( @@ -336,7 +395,7 @@ def main(args): # Path for LD MT includes all pops by design ld_mt_path = ld_mt_checkpoint_path( - data_type="genomes", pops="all_pops", version=version, test=test + data_type="genomes", pops=pop, version=version, test=test ) def _ld_test_mt( @@ -358,6 +417,7 @@ def _generate_ld_mt( is_hgdp: bool = False, test: bool = False, ld_mt_path: str = ld_mt_path, + pop: str = "eas", overwrite=overwrite, ) -> hl.MatrixTable: @@ -376,61 +436,88 @@ def _generate_ld_mt( ) ) + mt = mt.filter_cols(mt.meta.population_inference.pop == "pop") + if test: mt = _ld_test_mt(mt, filter_contig=True, sample_cols=True) + mt = mt.checkpoint( + ld_mt_path, + overwrite=overwrite, + ) + else: logger.info("Reading in gnomAD release HT and VDS...") + filter_chrom = None + if test: + filter_chrom = "chr22" + elif chrom: + filter_chrom = chrom + ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) + ht_filter = _filter_ht_for_ld(ht_release,label='gen_anc',pop=pop) + if filter_chrom: + ht_filter = ht_filter.filter(ht_filter.locus.contig==filter_chrom) + + ht_filter = ht_filter.select('freq','filters').checkpoint(new_temp_file()) + vds = basics.get_gnomad_v4_genomes_vds( - test=test, annotate_meta=True, release_only=True, split=True + test=test, + annotate_meta=True, + release_only=True, + split=True, + chrom=filter_chrom, + filter_variant_ht=ht_filter, + entries_to_keep=["GT"], ) - mt_vds = vds.variant_data - mt = mt_vds.annotate_globals(**hl.eval(ht_release.globals)) - if test: + if test or chrom: logger.info( - "Filter Test VDS to Chr22, do not touch sample level information, then naive coalesce to 1000 partitions..." 
+ f"Due to Test or Chrom behavior, only working on {filter_chrom} - do naive coalesce to 1,000 partitions" ) - mt = _ld_test_mt(mt, filter_contig=True, sample_cols=False) - mt = mt.naive_coalesce(1000) + mt = vds.variant_data.naive_coalesce(1000) + else: + mt = vds.variant_data.naive_coalesce(9800) + logger.info( + f"Filtering to relevant cols - only meta - and no VDS/MT rows..." + ) + mt = mt.select_cols(mt.meta) + mt = mt.select_rows() + logger.info(f"Filtering to {args.pop}...") + mt = mt.filter_cols(mt.meta.population_inference.pop == pop) + + # NOTE: QUESTION IF THIS SHOULD BE RAN OR NOT + logger.info(f"Checkpointing {pop} on {chrom if chrom else default_str}") + mt = mt.checkpoint(new_temp_file()) + + # This part begins potential performance concerns - what is up here? + # Ideally, would checkpoint both of these beforehand. + # Is very slow if MT isn't checkpointed + # Checkpointing said MT is also very slow... mt = mt.annotate_rows( - freq=ht_release[mt.row_key].freq, filters=ht_release[mt.row_key].filters + freq=ht_filter[mt.row_key].freq, filters=ht_filter[mt.row_key].filters ) + mt = mt.annotate_globals(**hl.eval(ht_filter.globals)) # Select and checkpoint to speed up calculations - mt = mt.select_rows(mt.freq, mt.filters) - mt = mt.select_cols(mt.meta) - logger.info(f"Checkpoint MT to {ld_mt_path} ...") - mt = mt.checkpoint(ld_mt_path, overwrite=overwrite) + # logger.info(f"DO NOT Checkpoint MT to {ld_mt_path} ...") + # mt = mt.checkpoint(ld_mt_path, overwrite=overwrite) return mt - # Read in Hail MatrixTable for analysis - mt = None - if args.generate_ld_mt: - logger.info("Generating MT for LD with all genetic ancestry groups...") - mt = _generate_ld_mt( - version=version, is_hgdp=is_hgdp, test=test, ld_mt_path=ld_mt_path - ) - else: - mt = hl.read_matrix_table(ld_mt_path) - - if args.pop: - logger.info(f"Filtering to {args.pop}...") - mt = mt.filter_cols(mt.meta.population_inference.pop == args.pop) - mt = mt.checkpoint( - ld_mt_checkpoint_path( - 
data_type="genomes", pops=args.pop, version=version, test=test - ), - overwrite=overwrite, - ) + mt = _generate_ld_mt( + version=version, is_hgdp=is_hgdp, test=test, ld_mt_path=ld_mt_path + ) + + # Something needs checkpointed before heading into generate_ld_pruned_set() + mt = mt.checkpoint(new_temp_file()) # skip if prior step is being checkpointed label = "gen_anc" if not is_hgdp else "pop" pop_data = get_pop_counters(mt, label=label) + # Previously established that THIS needs a checkpoint to have been ran to work effectively if args.generate_ld_pruned_set: logger.info( "Generating LD Pruned Set of uncorrelated variants, using hl.ld_prune()..." @@ -550,8 +637,15 @@ def _generate_ld_mt( ) parser.add_argument( "--pop", - help="Filter to, and run on, one individual pop", + help="Which individual pop to run LD on", + type=str, + required=True, + ) + parser.add_argument( + "--chrom", + help="Which individual chromosome to run LD on", type=str, + required=False, ) parser.add_argument( "--slack-channel", help="Slack channel to post results and notifications to." 
From b127899018e19d9ca459b236f77089f541c4f750 Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Mon, 7 Apr 2025 10:13:27 -0400 Subject: [PATCH 11/13] April LD Code Code for LD, pending VDS performance investigation --- gnomad_qc/v4/assessment/generate_ld_data.py | 711 +++++++++++++------- gnomad_qc/v4/resources/ld_resources.py | 50 +- 2 files changed, 511 insertions(+), 250 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 9766c210d..6b2f9abf8 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -11,6 +11,7 @@ import argparse import sys +from gnomad.utils.annotations import get_adj_expr from gnomad.utils.slack import slack_notifications from hail.linalg import BlockMatrix from hail.utils import new_temp_file @@ -18,10 +19,15 @@ from gnomad_qc.slack_creds import slack_token from gnomad_qc.v3.resources.release import hgdp_tgp_subset from gnomad_qc.v4.resources import * -from gnomad_qc.v4.resources.ld_resources import * -COMMON_FREQ = 0.005 -RARE_FREQ = 0.0005 +EXPECTED_CONTIGS = [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY"] + +from typing import Optional + +from gnomad.resources.grch38.gnomad import CURRENT_EXOME_RELEASE, CURRENT_GENOME_RELEASE + +# Importing paths and Common+Rare Freqs +from gnomad_qc.v4.resources.ld_resources import * def get_pop_counters(mt, label) -> dict: @@ -45,12 +51,32 @@ def get_pop_counters(mt, label) -> dict: return cut_data +def _filter_call_stats( + ht, + freq: float = COMMON_FREQ, + ac_cutoff: int = 1, +) -> hl.Table: + """ + Filter ht to relevant variants. Must have field popfreq. 
This code is used for filtering ht and mt + """ + + ht = ht.filter( + (ht.pop_freq.AC > ac_cutoff) + & (ht.pop_freq.AN - ht.pop_freq.AC > 1) + & (ht.pop_freq.AF > freq) + & ~((ht.pop_freq.AF == 0.5) & (ht.pop_freq.homozygote_count == 0)) + # the above filter is needed for "het-only" monomorphic sites + # as otherwise variance is 0, and so, row_correlation errors out + ) + return ht + + def _filter_ht_for_ld( ht, + freq: float = COMMON_FREQ, label="gen_anc", pop="eas", - common_only: bool = True, - re_call_stats: bool = False, + ac_cutoff: int = 1, ) -> hl.MatrixTable: """ Filter HT to only variants appropriate for LD analyses for a given genetic ancestry. @@ -62,17 +88,12 @@ def _filter_ht_for_ld( :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. :return: MT filtered to appropriate variants and samples """ - - frequency = COMMON_FREQ if common_only else RARE_FREQ - # At the moment, ld_matrix OOMs above ~30M variants, so filtering all populations to 0.05% to cut down on variants - # All work on standard machines except AFR - # Also creating `common_only` ones for most practical uses (above 0.5%) - # pop_mt = mt.filter_cols(mt.meta.population_inference.pop == pop) # irrelevant for HT meta_index = ( hl.enumerate(ht.freq_meta) .find(lambda f: (f[1].get(label) == pop)) .collect()[0][0] ) + # No support for SV datasets in gnomAD v4 Production team code # Have to re-do callstats when noted frequencies don't 100% reflect what you are calculating on # This is for test sets or 1kg_tdpg @@ -81,21 +102,18 @@ def _filter_ht_for_ld( pop_freq = ht.freq[meta_index] ht = ht.annotate(pop_freq=pop_freq) - ht = ht.filter( - (ht.pop_freq.AC > 1) - & (ht.pop_freq.AN - ht.pop_freq.AC > 1) - & (ht.pop_freq.AF > frequency) - & - # the following filter is needed for "het-only" monomorphic sites - # as otherwise variance is 0, and so, row_correlation errors out - ~((ht.pop_freq.AF == 0.5) & (ht.pop_freq.homozygote_count == 0)) - ) - - return ht 
+ return _filter_call_stats(ht, freq=freq, ac_cutoff=ac_cutoff) def filter_mt_for_ld( - mt, label, pop, common_only: bool = True, re_call_stats: bool = False + mt, + label, + pop, + freq: float = COMMON_FREQ, + re_call_stats: bool = False, + ld_pruned_path: str = None, + ac_cutoff: int = 1, + call_rate_cutoff: float = 0.8, ) -> hl.MatrixTable: """ Filter MT to only variants and samples appropriate for LD analyses @@ -107,17 +125,30 @@ def filter_mt_for_ld( :param re_call_stats: Bool to re-calculate callstats. Needed for subsets and when sampling cols. :return: MT filtered to appropriate variants and samples """ + logger.info( + f"With label: {label} and pop: {pop} and freq: {freq} and ac_cutoff: {ac_cutoff} and call_rate_cutoff: {call_rate_cutoff}" + ) - frequency = COMMON_FREQ if common_only else RARE_FREQ - # At the moment, ld_matrix OOMs above ~30M variants, so filtering all populations to 0.05% to cut down on variants - # All work on standard machines except AFR - # Also creating `common_only` ones for most practical uses (above 0.5%) pop_mt = mt.filter_cols(mt.meta.population_inference.pop == pop) meta_index = ( hl.enumerate(pop_mt.freq_meta) .find(lambda f: (f[1].get(label) == pop)) .collect()[0][0] ) + + pop_count_int = pop_mt.count_cols() + logger.info(f"Count1: {pop_mt.count()}") + + # If passed, read in pruned variants and remove them from our analyses + # Do this here to save having to compute on them + # ... 
and nothing here is sample-level, so we can count out + if ld_pruned_path: + logger.info("Filtering out LD pruned variants...") + ld_ht = hl.read_table(ld_pruned_path) + pop_mt = pop_mt.filter_rows(~hl.is_defined(ld_ht[pop_mt.row_key])) + + logger.info(f"Count2: {pop_mt.count()}") + # No support for SV datasets in gnomAD v4 Production team code # Have to re-do callstats when noted frequencies don't 100% reflect what you are calculating on # This is for test sets or 1kg_tdpg @@ -133,21 +164,21 @@ def filter_mt_for_ld( pop_freq = call_stats_bind pop_mt = pop_mt.annotate_rows(pop_freq=pop_freq) + logger.info(f"Count_recallstats: {pop_mt.count()}") + else: - pop_mt = pop_mt.filter_rows((hl.len(pop_mt.filters) == 0)) pop_freq = pop_mt.freq[meta_index] pop_mt = pop_mt.annotate_rows(pop_freq=pop_freq) + logger.info(f"Count_not_recall_stats: {pop_mt.count()}") + + # Annotate callrate to check + pop_mt = pop_mt.annotate_rows(callrate=pop_mt.pop_freq.AN / (pop_count_int * 2)) pop_mt = pop_mt.filter_rows((hl.len(pop_mt.filters) == 0)) - pop_mt = pop_mt.filter_rows( - (pop_mt.pop_freq.AC > 1) - & (pop_mt.pop_freq.AN - pop_mt.pop_freq.AC > 1) - & (pop_mt.pop_freq.AF > frequency) - & - # the following filter is needed for "het-only" monomorphic sites - # as otherwise variance is 0, and so, row_correlation errors out - ~((pop_mt.pop_freq.AF == 0.5) & (pop_mt.pop_freq.homozygote_count == 0)) - ) + pop_ht_rows = _filter_call_stats(pop_mt.rows(), freq=freq, ac_cutoff=ac_cutoff) + pop_mt = pop_mt.filter_rows(hl.is_defined(pop_ht_rows[pop_mt.row_key])) + + logger.info(f"Count_final: {pop_mt.count()}") return pop_mt @@ -157,11 +188,16 @@ def generate_ld_pruned_set( pop_data: dict, data_type: str, r2: str = "0.2", + freq: float = COMMON_FREQ, radius: int = 1000000, overwrite: bool = False, re_call_stats: bool = False, version: str = None, test: bool = False, + ld_contig: str = None, + ac_cutoff: int = 1, + call_rate_cutoff: float = 0.8, + adj: bool = False, ) -> None: """ Generate and 
write set of variants uncorrelated with eachother. Wrapper for hl.ld_prune() @@ -180,24 +216,36 @@ def generate_ld_pruned_set( for label, pops in dict(pop_data).items(): for pop in pops: logger.info(f"Filtering for {pop}...") - pop_mt = filter_mt_for_ld(mt, label, pop, re_call_stats) + pop_mt = filter_mt_for_ld( + mt=mt, + label=label, + pop=pop, + freq=freq, + re_call_stats=re_call_stats, + ld_pruned_path=None, + ac_cutoff=ac_cutoff, + call_rate_cutoff=call_rate_cutoff, + ) logger.info(f"Count after filter_mt_for_ld() : {pop_mt.count()}") - # now we checkpoint - # pop_mt = pop_mt.checkpoint( - # ld_mt_checkpoint_path( - # data_type="genomes", pops=pop, version=version, test=test - # ), - # overwrite=overwrite, - # ) - pruned_ht = hl.ld_prune(pop_mt.GT, r2=float(r2), bp_window_size=radius) - logger.info(f"Count after hl.ld_prune() : {pruned_ht.count()}") + logger.info(f"Count of pruned from hl.ld_prune() : {pruned_ht.count()}") ht = pop_mt.rows().select("pop_freq") ht = ht.filter(hl.is_defined(pruned_ht[ht.key])) - ld_path = ld_pruned_path(data_type, pop, r2, version=version, test=test) + + # Write out pruned set + ld_path = ld_pruned_path( + data_type, + pop, + r2, + version=version, + test=test, + freq=freq, + ld_contig=ld_contig, + adj=adj, + ) logger.info(f"Writing pruned ht for {pop} to {ld_path} ...") ht.write(ld_path, overwrite) @@ -207,12 +255,17 @@ def generate_ld_matrix( pop_data, data_type, radius: int = 1000000, - common_only: bool = True, + freq: float = COMMON_FREQ, adj: bool = False, overwrite: bool = False, re_call_stats: bool = False, version: str = None, test: bool = False, + ld_contig: str = None, + r2: str = "0.2", + custom_suffix: str = None, + ac_cutoff: int = 1, + call_rate_cutoff: float = 0.8, ) -> None: """ Generate and write matrix of LD correlcations as Hail BlockMatrix using hl.ld_matrix. 
Read by generate_ld_scores_from_ld_matrix() @@ -234,16 +287,36 @@ def generate_ld_matrix( for label, pops in dict(pop_data).items(): for pop in pops: - pop_mt = filter_mt_for_ld(mt, label, pop, common_only, re_call_stats) + pop_mt = filter_mt_for_ld( + mt, + label, + pop, + freq=freq, + re_call_stats=re_call_stats, + ac_cutoff=ac_cutoff, + call_rate_cutoff=call_rate_cutoff, + ld_pruned_path=ld_pruned_path( + data_type, + pop, + r2=r2, + version=version, + test=test, + freq=freq, + ld_contig=ld_contig, + adj=adj, + ), + ) pop_mt.rows().select("pop_freq").add_index().write( ld_index_path( data_type=data_type, pop=pop, - common_only=common_only, + freq=freq, adj=adj, version=version, test=test, + ld_contig=ld_contig, + custom_suffix=custom_suffix, ), overwrite, ) @@ -254,10 +327,12 @@ def generate_ld_matrix( ld_matrix_path( data_type, pop, - common_only=common_only, + freq=freq, adj=adj, version=version, test=test, + ld_contig=ld_contig, + custom_suffix=custom_suffix, ), overwrite, ) @@ -266,14 +341,15 @@ def generate_ld_matrix( def generate_ld_scores_from_ld_matrix( pop_data, data_type, - min_frequency=0.01, + freq: float = COMMON_FREQ, call_rate_cutoff=0.8, - common_only: bool = True, adj: bool = False, radius: int = 1000000, overwrite: bool = False, version: str = None, test: bool = False, + ld_contig: str = None, + custom_suffix: str = None, ) -> None: """ Generate LD scores from Hail BlockMatrix written by generate_ld_scores_from_ld_matrix(). @@ -293,11 +369,9 @@ def generate_ld_scores_from_ld_matrix( # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 # mins to compute LD scores per population (~$150 total) - freq_cutoff = COMMON_FREQ if common_only else RARE_FREQ - - logger.info( - f'Min frequency of {min_frequency} is {">=" if min_frequency >= freq_cutoff else "<"} cutoff of {freq_cutoff}...' - ) + # logger.info( + # f'Min frequency of {min_frequency} is {">=" if min_frequency >= freq_cutoff else "<"} cutoff of {freq_cutoff}...' 
+ # ) for label, pops in dict(pop_data).items(): for pop, n in pops.items(): @@ -305,15 +379,20 @@ def generate_ld_scores_from_ld_matrix( ld_index_path( data_type=data_type, pop=pop, - common_only=common_only, + freq=freq, adj=adj, version=version, test=test, + ld_contig=ld_contig, + custom_suffix=custom_suffix, ) ) + + ht = ht.annotate(callrate=(ht.pop_freq.AN / n) / 2) + ht = ht.filter( - (ht.pop_freq.AF >= min_frequency) - & (ht.pop_freq.AF <= 1 - min_frequency) + (ht.pop_freq.AF >= freq) + & (ht.pop_freq.AF <= 1 - freq) & (ht.pop_freq.AN / n >= 2 * call_rate_cutoff) ).add_index(name="new_idx") @@ -323,16 +402,28 @@ def generate_ld_scores_from_ld_matrix( ld_matrix_path( data_type=data_type, pop=pop, - common_only=common_only, + freq=freq, adj=adj, version=version, test=test, + ld_contig=ld_contig, + custom_suffix=custom_suffix, ) ) r2 = r2.filter(indices, indices) ** 2 r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0)) - out_name = ld_scores_path(data_type, pop, adj, version=version, test=test) + out_name = ld_scores_path( + data_type, + pop, + adj, + version=version, + test=test, + freq=freq, + ld_contig=ld_contig, + custom_suffix=custom_suffix, + call_rate_cutoff=call_rate_cutoff, + ) compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) @@ -365,6 +456,152 @@ def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) -> No ht.filter(hl.is_defined(ht.ld_score)).write(out_name, overwrite) +def _ld_test_mt( + mt: hl.MatrixTable, + mt_contig: bool = False, + sample_cols: bool = False, +) -> hl.MatrixTable: + if mt_contig: + logger.info("Filtering to chr22...") + mt = mt.filter_rows(mt.locus.contig == "chr22") + if sample_cols: + logger.info("Downsampling all cols by 0.1...") + mt = mt.sample_cols(0.1) + + return mt + + +def generate_ld_mt( + ld_mt_path: str, + is_hgdp: bool = False, + test: bool = False, + pop: str = "eas", + adj: bool = False, + freq: float = COMMON_FREQ, + mt_contig: str = None, + do_v2_samples: bool = 
False, + hapmap: bool = False, + overwrite=False, +) -> hl.MatrixTable: + + if is_hgdp: + logger.info("Reading in HGDP_TGP Subset...") + mt = hgdp_tgp_subset(dense=True, public=True).mt() + mt = mt.annotate_globals( + freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict + ) + mt = mt.annotate_rows(freq=mt.gnomad_freq) + mt = mt.annotate_cols( + meta=hl.struct( + population_inference=hl.struct(pop=mt.gnomad_population_inference.pop) + ) + ) + + # Filter to appropriate genetic ancestry and frequency b efore checkpointing + mt = mt.filter_cols(mt.meta.population_inference.pop == pop) + mt = mt.filter_rows(mt.freq[mt.freq_index_dict[f"gnomad_{pop}"]].AF > freq) + + if mt_contig: + mt = mt.filter_rows(mt.locus.contig == mt_contig) + + if test: + mt = _ld_test_mt(mt, filter_contig=True, sample_cols=True) + + logger.info( + f"Checkpointing HGDP_TGP Subset on {mt_contig if mt_contig else 'full'}" + ) + mt = mt.checkpoint(ld_mt_path, overwrite=overwrite) + + else: + logger.info("Reading in gnomAD release HT and VDS...") + + ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) + ht_filter = _filter_ht_for_ld(ht_release, freq=freq, label="gen_anc", pop=pop) + if mt_contig: + ht_filter = ht_filter.filter(ht_filter.locus.contig == mt_contig) + + ht_filter = ht_filter.select("freq", "filters").checkpoint(new_temp_file()) + + filter_samples_ht = meta(version="4.0", data_type="genomes").ht() + filter_samples_ht = filter_samples_ht.filter( + filter_samples_ht.population_inference.pop == pop + ) + + if do_v2_samples: + logger.info("Filtering to only v2 samples...") + filter_samples_ht = filter_samples_ht.filter( + (filter_samples_ht.project_meta.v2_pop == pop) + & (filter_samples_ht.project_meta.v2_release) + & (filter_samples_ht.project_meta.v2_high_quality) + ) + + naive_coalesce_partitions = 9800 if mt_contig is None else 1000 + + if mt_contig == "chr1": + naive_coalesce_partitions = 2500 + + if hapmap: + logger.info("Filtering to only 
hapmap variants...") + ht_hm3 = hl.read_table( + f"gs://gnomad-tmp/ld/HM3.UKBB.{pop.upper()}.qc.snplist.grch38.ht" + ) + ht_filter = ht_filter.filter(hl.is_defined(ht_hm3[ht_filter.key])) + logger.info( + f"Count after filtering to hapmap variants: {ht_filter.count()}" + ) + + entries_to_keep = ["GT"] if not adj else ["GT", "GQ", "DP", "AD"] + + vds = basics.get_gnomad_v4_genomes_vds( + test=test, + annotate_meta=True, + release_only=True, + split=True, + chrom=[mt_contig] if mt_contig else None, + filter_variant_ht=ht_filter, + naive_coalesce_partitions=naive_coalesce_partitions, + entries_to_keep=entries_to_keep, + filter_samples_ht=filter_samples_ht, + ) + + logger.info("From DENSE MT, performance beware...") + mt = hl.vds.to_dense_mt(vds) + + logger.info(f"Filtering to relevant cols - only meta - and no VDS/MT rows...") + mt = mt.select_cols(mt.meta) + mt = mt.select_rows() + + # NOTE: QUESTION IF THIS SHOULD BE RAN OR NOT + # NOTE: need to checkpoint either here or step below where noted + logger.info( + f"Checkpointing unannotated {pop} on {mt_contig if mt_contig else 'full'}" + ) + mt = mt.checkpoint(new_temp_file()) + + if adj: + logger.info("Filtering to adj sites only...") + adj_expr = get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD) + mt = mt.filter_entries(adj_expr) + mt = mt.select_entries("GT") + + # logger.info('CUSTOM READ FROM PRIOR CHECKPOINT..') + # mt = hl.read_matrix_table('gs://gnomad-tmp-4day/ld_tmp//XLmYz1uA7NYEY2jgB55Yuc') + # This part begins potential performance concerns - what is up here? + # Ideally, would checkpoint both of these beforehand. + # Is very slow if MT isn't checkpointed + # Checkpointing said MT is also very slow... 
+ logger.info("Annotating information from release HT...") + mt = mt.annotate_rows( + freq=ht_filter[mt.row_key].freq, filters=ht_filter[mt.row_key].filters + ) + mt = mt.annotate_globals(**hl.eval(ht_filter.globals)) + + logger.info("Checkpointing annotated MT...") + mt = mt.checkpoint(ld_mt_path, overwrite=overwrite) + + return mt + + def main(args): hl.init( log="/ld_assessment.log", @@ -380,10 +617,34 @@ def main(args): # Define command line args used more than once test = args.test + adj = args.adj is_hgdp = args.hgdp_subset overwrite = args.overwrite pop = args.pop - chrom = args.chrom + custom_mt = args.custom_mt + do_v2_samples = args.do_v2_samples + hapmap = args.do_hapmap + + # Chromosome filter logic + mt_contig = args.mt_contig + ld_contig = args.ld_contig + + if test: + mt_contig = "chr22" + + if mt_contig: + ld_contig = mt_contig + + # Get correct AF based on input logic... + vds_freq = args.vds_af + + ld_freq = args.ld_af + ld_ac = args.ld_ac + + if vds_freq < ld_freq: + pass # this is the desired behavior, for the vds freq to be more permisive + if ld_freq < vds_freq: + ld_freq = vds_freq if test and not args.re_call_stats: logger.info( @@ -391,179 +652,117 @@ def main(args): ) # Version is populated via ld_resources.py if None - version = None if not is_hgdp else "hgdp" + version = None + if is_hgdp: + version = "hgdp" + elif do_v2_samples: + version = "v2samples" + + if version: + version += "hapmap" + else: + version = "hapmap" - # Path for LD MT includes all pops by design ld_mt_path = ld_mt_checkpoint_path( - data_type="genomes", pops=pop, version=version, test=test + data_type="genomes", + freq=vds_freq, + pop=pop, + version=version, + test=test, + mt_contig=mt_contig, + adj=adj, ) - def _ld_test_mt( - mt: hl.MatrixTable, - filter_contig: bool = False, - sample_cols: bool = False, - ): - if filter_contig: - logger.info("Filtering to chr22...") - mt = mt.filter_rows(mt.locus.contig == "chr22") - if sample_cols: - logger.info("Downsampling all 
cols by 0.1...") - mt = mt.sample_cols(0.1) - - return mt - - def _generate_ld_mt( - version: str = version, - is_hgdp: bool = False, - test: bool = False, - ld_mt_path: str = ld_mt_path, - pop: str = "eas", - overwrite=overwrite, - ) -> hl.MatrixTable: - - if is_hgdp: - logger.info("Reading in HGDP_TGP Subset...") - mt = hgdp_tgp_subset(dense=True, public=True).mt() - mt = mt.annotate_globals( - freq_meta=mt.gnomad_freq_meta, freq_index_dict=mt.gnomad_freq_index_dict - ) - mt = mt.annotate_rows(freq=mt.gnomad_freq) - mt = mt.annotate_cols( - meta=hl.struct( - population_inference=hl.struct( - pop=mt.gnomad_population_inference.pop - ) - ) - ) - - mt = mt.filter_cols(mt.meta.population_inference.pop == "pop") - - if test: - mt = _ld_test_mt(mt, filter_contig=True, sample_cols=True) - - mt = mt.checkpoint( - ld_mt_path, - overwrite=overwrite, - ) - + if args.generate_ld_mt: + mt = generate_ld_mt( + is_hgdp=is_hgdp, + test=test, + pop=pop, + mt_contig=mt_contig, + freq=vds_freq, + ld_mt_path=ld_mt_path, + overwrite=overwrite, + do_v2_samples=do_v2_samples, + adj=adj, + hapmap=hapmap, + ) + else: + if not custom_mt: + mt = hl.read_matrix_table(ld_mt_path) else: - logger.info("Reading in gnomAD release HT and VDS...") - filter_chrom = None - if test: - filter_chrom = "chr22" - elif chrom: - filter_chrom = chrom + mt = hl.read_matrix_table(custom_mt) - ht_release = hl.read_table(release.release_ht_path(data_type="genomes")) - ht_filter = _filter_ht_for_ld(ht_release,label='gen_anc',pop=pop) - if filter_chrom: - ht_filter = ht_filter.filter(ht_filter.locus.contig==filter_chrom) - - ht_filter = ht_filter.select('freq','filters').checkpoint(new_temp_file()) - - vds = basics.get_gnomad_v4_genomes_vds( + label = "gen_anc" if not is_hgdp else "pop" + pop_data = get_pop_counters( + mt, label=label + ) # NOTE: could be irrelevant later? 
tbd if needed + + contig_list = EXPECTED_CONTIGS if not ld_contig else [ld_contig] + + for ld_contig in contig_list: + # NOTE: This FULLY serializes the code, but hopefully it works + # Would we save time without so many separate filter_mt_for_ld calls ? + # The only point of concern is hl.ld_prune() and the size of the matrix + # Need to filter the mt to only the contig we are working on + # This is because the LD matrix is too large otherwise + mt = mt.filter_rows(mt.locus.contig == ld_contig) + logger.info(f"Filtering to {ld_contig}...") + + # Previously established that THIS needs a checkpoint to have been ran to work effectively + if args.generate_ld_pruned_set: + logger.info( + "Generating LD Pruned Set of uncorrelated variants, using hl.ld_prune()..." + ) + generate_ld_pruned_set( + mt=mt, + pop_data=pop_data, + data_type=data_type, + r2=args.r2, + freq=ld_freq, + ac_cutoff=ld_ac, + radius=args.radius, + overwrite=overwrite, + re_call_stats=args.re_call_stats, + version=version, test=test, - annotate_meta=True, - release_only=True, - split=True, - chrom=filter_chrom, - filter_variant_ht=ht_filter, - entries_to_keep=["GT"], + ld_contig=ld_contig, + adj=adj, ) - if test or chrom: - logger.info( - f"Due to Test or Chrom behavior, only working on {filter_chrom} - do naive coalesce to 1,000 partitions" - ) - mt = vds.variant_data.naive_coalesce(1000) - else: - mt = vds.variant_data.naive_coalesce(9800) - + if args.generate_ld_matrix: logger.info( - f"Filtering to relevant cols - only meta - and no VDS/MT rows..." + "Generating Hail BlockMatrix of variants correlations to other variants, using hl.ld_matrix()..." 
) - mt = mt.select_cols(mt.meta) - mt = mt.select_rows() - logger.info(f"Filtering to {args.pop}...") - mt = mt.filter_cols(mt.meta.population_inference.pop == pop) - - # NOTE: QUESTION IF THIS SHOULD BE RAN OR NOT - logger.info(f"Checkpointing {pop} on {chrom if chrom else default_str}") - mt = mt.checkpoint(new_temp_file()) - - # This part begins potential performance concerns - what is up here? - # Ideally, would checkpoint both of these beforehand. - # Is very slow if MT isn't checkpointed - # Checkpointing said MT is also very slow... - mt = mt.annotate_rows( - freq=ht_filter[mt.row_key].freq, filters=ht_filter[mt.row_key].filters + generate_ld_matrix( + mt=mt, + pop_data=pop_data, + data_type=data_type, + radius=args.radius, + freq=ld_freq, + ac_cutoff=ld_ac, + adj=args.adj, + overwrite=overwrite, + re_call_stats=args.re_call_stats, + version=version, + test=test, + ld_contig=ld_contig, + custom_suffix=args.custom_suffix, ) - mt = mt.annotate_globals(**hl.eval(ht_filter.globals)) - - # Select and checkpoint to speed up calculations - - # logger.info(f"DO NOT Checkpoint MT to {ld_mt_path} ...") - # mt = mt.checkpoint(ld_mt_path, overwrite=overwrite) - - return mt - - mt = _generate_ld_mt( - version=version, is_hgdp=is_hgdp, test=test, ld_mt_path=ld_mt_path - ) - - # Something needs checkpointed before heading into generate_ld_pruned_set() - mt = mt.checkpoint(new_temp_file()) # skip if prior step is being checkpointed - - label = "gen_anc" if not is_hgdp else "pop" - pop_data = get_pop_counters(mt, label=label) - # Previously established that THIS needs a checkpoint to have been ran to work effectively - if args.generate_ld_pruned_set: - logger.info( - "Generating LD Pruned Set of uncorrelated variants, using hl.ld_prune()..." 
- ) - generate_ld_pruned_set( - mt=mt, - pop_data=pop_data, - data_type=data_type, - r2=args.r2, - radius=args.radius, - overwrite=overwrite, - re_call_stats=args.re_call_stats, - version=version, - test=test, - ) - - if args.generate_ld_matrix: - logger.info( - "Generating Hail BlockMatrix of variants correlations to other variants, using hl.ld_matrix()..." - ) - generate_ld_matrix( - mt=mt, - pop_data=pop_data, - data_type=data_type, - radius=args.radius, - common_only=args.common_only, - adj=args.adj, - overwrite=overwrite, - re_call_stats=args.re_call_stats, - version=version, - test=test, - ) - - if args.generate_ld_scores: - logger.info("Generating in LD scores and annotating onto variant HT...") - generate_ld_scores_from_ld_matrix( - pop_data=pop_data, - data_type=data_type, - min_frequency=args.min_frequency, - call_rate_cutoff=args.min_call_rate, - common_only=args.common_only, - adj=args.adj, - version=version, - overwrite=overwrite, - test=test, - ) + if args.generate_ld_scores: + logger.info("Generating in LD scores and annotating onto variant HT...") + generate_ld_scores_from_ld_matrix( + pop_data=pop_data, + data_type=data_type, + freq=ld_freq, + call_rate_cutoff=args.min_call_rate, + adj=args.adj, + version=version, + overwrite=overwrite, + test=test, + ld_contig=ld_contig, + custom_suffix=args.custom_suffix, + ) if __name__ == "__main__": @@ -575,7 +774,7 @@ def _generate_ld_mt( ) parser.add_argument( "--generate-ld-mt", - help="Generate Hail MatrixTable of variants and calls to calculate LD for. ", + help="Generate pop-specific Hail MatrixTable of variants and calls to calculate LD for. 
", action="store_true", ) parser.add_argument( @@ -587,9 +786,10 @@ def _generate_ld_mt( "--generate-ld-matrix", help="Calculates LD matrix", action="store_true" ) parser.add_argument( - "--common-only", - help="Calculates LD matrix only on common variants (above 0.5%)", - action="store_true", + "--vds-af", + help="Minimum genetic ancestry-specific allele frequency to pull from for VDS.", + type=float, + default=RARE_FREQ, ) parser.add_argument( "--adj", @@ -601,18 +801,24 @@ def _generate_ld_mt( help="Calculates LD scores from LD matrix", action="store_true", ) - parser.add_argument( - "--min-frequency", - help="Minimum allele frequency to compute LD scores (default 0.01)", - default=0.01, - type=float, - ) parser.add_argument( "--min-call-rate", help="Minimum call rate to compute LD scores (default 0.8)", default=0.8, type=float, ) + parser.add_argument( + "--ld-ac", + help="Minimum allele count to compute LD scores (default 1).", + default=1, + type=int, + ) + parser.add_argument( + "--ld-af", + help="Minimum allele frequency to compute LD scores (default 0.0005).", + default=RARE_FREQ, + type=float, + ) parser.add_argument( "--r2", help="r-squared to which to prune LD (default 0.2)", default="0.2" ) @@ -637,19 +843,48 @@ def _generate_ld_mt( ) parser.add_argument( "--pop", - help="Which individual pop to run LD on", + help="Which individual pop to run LD on.", type=str, required=True, ) parser.add_argument( - "--chrom", - help="Which individual chromosome to run LD on", + "--mt-contig", + help="Which chromosome to pull VDS from. Applies for reading and writing.", + type=str, + required=False, + ) + parser.add_argument( + "--ld-contig", + help="Which chromosome to run LD on. Applies for reading and writing.", type=str, required=False, ) parser.add_argument( "--slack-channel", help="Slack channel to post results and notifications to." 
) + parser.add_argument( + "--custom-mt", + type=str, + default=None, + help="Custom MT to run LD on", + ) + parser.add_argument( + "--do-v2-samples", + action="store_true", + help="Only use samples also in v2. Good for verification!", + ) + parser.add_argument( + "--custom-suffix", + type=str, + help="Custom string to append to output files", + required=False, + ) + parser.add_argument( + "--do-hapmap", + action="store_true", + help="Filter to HapMap3 SNPs when pulling VDS.", + ) + parser.add_argument("--overwrite", help="Overwrite data", action="store_true") args = parser.parse_args() diff --git a/gnomad_qc/v4/resources/ld_resources.py b/gnomad_qc/v4/resources/ld_resources.py index db0a880b5..290820578 100644 --- a/gnomad_qc/v4/resources/ld_resources.py +++ b/gnomad_qc/v4/resources/ld_resources.py @@ -4,62 +4,88 @@ from gnomad.resources.grch38.gnomad import CURRENT_EXOME_RELEASE, CURRENT_GENOME_RELEASE +COMMON_FREQ = 0.005 +RARE_FREQ = 0.0005 + def ld_matrix_path( data_type: str, pop: str, - common_only: bool = True, + freq: float = COMMON_FREQ, adj: bool = True, + ld_contig: str = None, version: Optional[str] = None, - test: bool = False + test: bool = False, + custom_suffix: str = None, ): if version is None: version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.bm' + return f'gs://gnomad-tmp-30day/ld/matrix/gnomad.{data_type}.{version}{f".test" if test else ""}.{"all_contigs" if not ld_contig else f"{ld_contig}"}.{pop}.{f"af{freq}"}.{"adj." 
if adj else ""}ld{custom_suffix if custom_suffix else ""}.bm' def ld_index_path( data_type: str, pop: str, - common_only: bool = True, + freq: float = COMMON_FREQ, adj: bool = True, + ld_contig: str = None, version: Optional[str] = None, - test: bool = False + test: bool = False, + custom_suffix: str = None, ): if version is None: version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.{"common." if common_only else ""}{"adj." if adj else ""}ld.variant_indices.ht' + return f'gs://gnomad-tmp-30day/ld/index/gnomad.{data_type}.{version}{f".test" if test else ""}.{"all_contigs" if not ld_contig else f"{ld_contig}"}.{pop}.{f"af{freq}"}.{"adj." if adj else ""}ld.variant_indices{custom_suffix if custom_suffix else ""}.ht' # def ld_scores_path( - data_type: str, pop: str, adj: bool = True, version: Optional[str] = None, test: bool = False + data_type: str, + pop: str, + adj: bool = True, + ld_contig: str = None, + freq: float = COMMON_FREQ, + version: Optional[str] = None, + test: bool = False, + custom_suffix: str = None, + call_rate_cutoff: float = 0.8, ): if version is None: version = ( CURRENT_EXOME_RELEASE if data_type == "exomes" else CURRENT_GENOME_RELEASE ) - return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.{"adj." if adj else ""}ld_scores.ht' + return f'gs://gnomad-tmp-30day/ld/scores/gnomad.{data_type}.{version}{f".test" if test else ""}.{"all_contigs" if not ld_contig else f"{ld_contig}"}.{pop}.{f"af{freq}"}.{f"callrate{call_rate_cutoff}"}.{"adj." 
if adj else ""}ld_scores{f"{custom_suffix}" if custom_suffix else ""}.ht' def ld_mt_checkpoint_path( data_type: str, - pops: str = None, + freq: float = COMMON_FREQ, + pop: str = None, version: str = CURRENT_GENOME_RELEASE, + mt_contig: str = None, test: bool = False, + adj: bool = False, ): if version is None: version = CURRENT_GENOME_RELEASE - return f'gs://gnomad-tmp-30day/ld/gnomad.{data_type}.{"all_pops" if not pops else f"{pops}"}.{version}{f".test" if test else ""}.mt' + return f'gs://gnomad-tmp-30day/ld/gnomad.{data_type}.{"all_pops" if not pop else f"{pop}"}.{f"af{freq}"}.{"all_contigs" if not mt_contig else f"{mt_contig}"}.{version}{f".adj" if adj else ""}{f".test" if test else ""}.mt' def ld_pruned_path( - data_type: str, pop: str, r2: str, version: str = CURRENT_GENOME_RELEASE, test: bool = False + data_type: str, + pop: str, + r2: str, + freq: float = COMMON_FREQ, + ld_contig: str = None, + version: str = CURRENT_GENOME_RELEASE, + test: bool = False, + ld_set: bool = False, + adj: bool = False, ): if version is None: version = CURRENT_GENOME_RELEASE - return f'gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.{version}{f".test" if test else ""}.{pop}.ld.pruned_set.r2_{r2}.ht' + return f'gs://gnomad-tmp-30day/ld/pruned/gnomad.{data_type}.{version}{f".test" if test else ""}.{f"af{freq}"}.{"all_contigs" if not ld_contig else f"{ld_contig}"}.{pop}.ld.{f"pruned_set" if not ld_set else "ld_set"}{f".adj" if adj else ""}.r2_{r2}.ht' From 691b531ea241783e35639421ecb9e87cc3a9a6fd Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Tue, 8 Apr 2025 13:42:49 -0400 Subject: [PATCH 12/13] VDS checkpointing & linalg --- gnomad_qc/v4/assessment/generate_ld_data.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index 6b2f9abf8..bd97a1a44 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -442,7 
+442,7 @@ def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite) -> No l2row = r2_adj.sum(axis=0).T l2col = r2_adj.sum(axis=1) - l2 = l2row + l2col + 1 + l2 = l2row + l2col - 1 l2_bm_tmp = new_temp_file() l2_tsv_tmp = new_temp_file() @@ -564,6 +564,9 @@ def generate_ld_mt( filter_samples_ht=filter_samples_ht, ) + vds.variant_data = vds.variant_data.checkpoint(new_temp_file()) + vds.reference_data = vds.reference_data.checkpoint(new_temp_file()) + logger.info("From DENSE MT, performance beware...") mt = hl.vds.to_dense_mt(vds) @@ -576,7 +579,6 @@ def generate_ld_mt( logger.info( f"Checkpointing unannotated {pop} on {mt_contig if mt_contig else 'full'}" ) - mt = mt.checkpoint(new_temp_file()) if adj: logger.info("Filtering to adj sites only...") @@ -584,6 +586,8 @@ def generate_ld_mt( mt = mt.filter_entries(adj_expr) mt = mt.select_entries("GT") + # mt = mt.checkpoint(new_temp_file()) + # logger.info('CUSTOM READ FROM PRIOR CHECKPOINT..') # mt = hl.read_matrix_table('gs://gnomad-tmp-4day/ld_tmp//XLmYz1uA7NYEY2jgB55Yuc') # This part begins potential performance concerns - what is up here? From f6b92fb7b302fc26eb8ec55b5ee500f7432a625f Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Thu, 8 May 2025 12:08:05 -0400 Subject: [PATCH 13/13] Small fixes, cleaning up --- gnomad_qc/v4/assessment/generate_ld_data.py | 48 +++++++++------------ 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/gnomad_qc/v4/assessment/generate_ld_data.py b/gnomad_qc/v4/assessment/generate_ld_data.py index bd97a1a44..fc1b692c9 100644 --- a/gnomad_qc/v4/assessment/generate_ld_data.py +++ b/gnomad_qc/v4/assessment/generate_ld_data.py @@ -1,11 +1,7 @@ -""" +""" Script to generate LD scores for gnomAD v4 Genome SNPs & Indels. 
Example usage: -hailctl dataproc submit cluster_name generate_ld_data.py --test --overwrite --hgdp-subset --pop afr --re-call-stats \ - --generate-ld-mt \ - --generate-ld-pruned-set \ - --generate-ld-matrix \ - --generate-ld-scores +hailctl dataproc submit dmld10 ~/Documents/GitHub/gnomad_qc/gnomad_qc/v4/assessment/generate_ld_data.py --generate-ld-mt --custom-suffix alleassample --pop eas --generate-ld-pruned-set --generate-ld-matrix --generate-ld-scores --adj --overwrite """ import argparse @@ -281,6 +277,7 @@ def generate_ld_matrix( :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. :param test: Filter to test versions and/or write to test paths. """ + # From gnomAD v2 run: # Takes about 4 hours on 20 n1-standard-8 nodes (with SSD - not sure if necessary) per population # Total of ~37 hours ($400) @@ -365,6 +362,7 @@ def generate_ld_scores_from_ld_matrix( :param version: Version of files, either 'hgdp' or None, which ld_pruned_path() populates with most recent gnomAD genomes version. :param test: Filter to test versions and/or write to test paths. 
""" + # From gnomAD v2 run: # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 # mins to compute LD scores per population (~$150 total) @@ -564,7 +562,14 @@ def generate_ld_mt( filter_samples_ht=filter_samples_ht, ) + logger.info("New checkpointing logic...") vds.variant_data = vds.variant_data.checkpoint(new_temp_file()) + + # From a prior run, where it failed (or was failed) after writing the Variant Data, it can be read back in as follows: + # with args: hailctl dataproc submit dmld50 generate_ld_data.py --overwrite --pop eas --ld-contig chr22 --adj --generate-ld-mt --generate-ld-pruned-set --generate-ld-matrix --generate-ld-scores --custom-suffix alleassamples + # READ VARIANT DATA FROM THIS + # vds.variant_data = hl.read_matrix_table('gs://gnomad-tmp-4day/ld_tmp/Sms092zqq8FbN9QXweJnzA') # FOR EAS RUN + vds.reference_data = vds.reference_data.checkpoint(new_temp_file()) logger.info("From DENSE MT, performance beware...") @@ -574,26 +579,12 @@ def generate_ld_mt( mt = mt.select_cols(mt.meta) mt = mt.select_rows() - # NOTE: QUESTION IF THIS SHOULD BE RAN OR NOT - # NOTE: need to checkpoint either here or step below where noted - logger.info( - f"Checkpointing unannotated {pop} on {mt_contig if mt_contig else 'full'}" - ) - if adj: logger.info("Filtering to adj sites only...") adj_expr = get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD) mt = mt.filter_entries(adj_expr) mt = mt.select_entries("GT") - # mt = mt.checkpoint(new_temp_file()) - - # logger.info('CUSTOM READ FROM PRIOR CHECKPOINT..') - # mt = hl.read_matrix_table('gs://gnomad-tmp-4day/ld_tmp//XLmYz1uA7NYEY2jgB55Yuc') - # This part begins potential performance concerns - what is up here? - # Ideally, would checkpoint both of these beforehand. - # Is very slow if MT isn't checkpointed - # Checkpointing said MT is also very slow... 
logger.info("Annotating information from release HT...") mt = mt.annotate_rows( freq=ht_filter[mt.row_key].freq, filters=ht_filter[mt.row_key].filters @@ -646,7 +637,7 @@ def main(args): ld_ac = args.ld_ac if vds_freq < ld_freq: - pass # this is the desired behavior, for the vds freq to be more permisive + pass if ld_freq < vds_freq: ld_freq = vds_freq @@ -662,10 +653,11 @@ def main(args): elif do_v2_samples: version = "v2samples" - if version: - version += "hapmap" - else: - version = "hapmap" + if hapmap == True: + if version: + version += "hapmap" + else: + version = "hapmap" ld_mt_path = ld_mt_checkpoint_path( data_type="genomes", @@ -709,8 +701,8 @@ def main(args): # The only point of concern is hl.ld_prune() and the size of the matrix # Need to filter the mt to only the contig we are working on # This is because the LD matrix is too large otherwise - mt = mt.filter_rows(mt.locus.contig == ld_contig) logger.info(f"Filtering to {ld_contig}...") + mt_ld = mt.filter_rows(mt.locus.contig == ld_contig) # Previously established that THIS needs a checkpoint to have been ran to work effectively if args.generate_ld_pruned_set: @@ -718,7 +710,7 @@ def main(args): "Generating LD Pruned Set of uncorrelated variants, using hl.ld_prune()..." ) generate_ld_pruned_set( - mt=mt, + mt=mt_ld, pop_data=pop_data, data_type=data_type, r2=args.r2, @@ -738,7 +730,7 @@ def main(args): "Generating Hail BlockMatrix of variants correlations to other variants, using hl.ld_matrix()..." ) generate_ld_matrix( - mt=mt, + mt=mt_ld, pop_data=pop_data, data_type=data_type, radius=args.radius,