Skip to content
49 changes: 48 additions & 1 deletion gnomad_qc/v5/resources/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def get_info_ht(test: bool = False, environment: str = "rwb") -> VersionedTableR
CURRENT_ANNOTATION_VERSION,
{
version: TableResource(
f"{_annotations_root(version, test=test, environment=environment)}/gnomad.genomes.v{version}.info.ht"
f"{_annotations_root(version, test=test, environment=environment)}/aou.genomes.v{version}.info.ht"
)
for version in ANNOTATION_VERSIONS
},
Expand All @@ -250,3 +250,50 @@ def get_info_ht(test: bool = False, environment: str = "rwb") -> VersionedTableR
aou_annotated_sites_only_vcf = (
f"gs://{WORKSPACE_BUCKET}/echo_full_gnomad_annotated.sites-only.vcf.gz"
)


def get_info_vcf_path(
version: str = CURRENT_ANNOTATION_VERSION,
split: bool = False,
test: bool = False,
environment: str = "batch",
) -> str:
"""
Path to sites VCF (input information for running VQSR).

:param version: Version of annotation path to return.
:param split: Whether to return the split or multi-allelic version of the resource.
:param test: Whether to use a tmp path for testing.
:param environment: Environment to use. Default is "rwb".
:return: String for the path to the info VCF.
"""
return f"{_annotations_root(version, test=test, environment=environment)}/aou.genomes.v{version}.info{'.split' if split else ''}.vcf.bgz"


def get_true_positive_vcf_path(
version: str = CURRENT_ANNOTATION_VERSION,
test: bool = False,
adj: bool = False,
true_positive_type: str = "transmitted_singleton",
environment: str = "batch",
) -> str:
"""
Provide the path to the transmitted singleton VCF used as input to VQSR.

:param version: Version of true positive VCF path to return.
:param test: Whether to use a tmp path for testing.
:param adj: Whether to use adj genotypes.
:param true_positive_type: Type of true positive VCF path to return. Should be one
of "transmitted_singleton", "sibling_singleton", or
"transmitted_singleton.sibling_singleton". Default is "transmitted_singleton".
:param environment: Environment to use. Default is "rwb".
:return: String for the path to the true positive VCF.
"""
tp_types = [
"transmitted_singleton",
"sibling_singleton",
"transmitted_singleton.sibling_singleton",
]
if true_positive_type not in tp_types:
raise ValueError(f"true_positive_type must be one of {tp_types}")
return f'{_annotations_root(version, test=test, environment=environment)}/aou.genomes.v{version}.{true_positive_type}.{"adj" if adj else "raw"}.vcf.bgz'
6 changes: 6 additions & 0 deletions gnomad_qc/v5/resources/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
ANNOTATION_VERSIONS = ["5.0"]
CURRENT_ANNOTATION_VERSION = "5.0"

VARIANT_QC_VERSIONS = ["5.0"]
CURRENT_VARIANT_QC_VERSION = "5.0"

VARIANT_QC_RESULT_VERSIONS = ["5.0"]
CURRENT_VARIANT_QC_RESULT_VERSION = ["5.0"]

# Storage constants.
WORKSPACE_BUCKET = "fc-secure-b25d1307-7763-48b8-8045-fcae9caadfa1"
GNOMAD_BUCKET = "gnomad"
Expand Down
123 changes: 123 additions & 0 deletions gnomad_qc/v5/resources/variant_qc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
"""Script containing variant QC related resources."""

from gnomad.resources.grch38 import (
na12878_giab,
na12878_giab_hc_intervals,
syndip,
syndip_hc_intervals,
)
from gnomad.resources.resource_utils import TableResource, VersionedTableResource

from gnomad_qc.v5.resources.constants import (
CURRENT_VARIANT_QC_RESULT_VERSION,
CURRENT_VARIANT_QC_VERSION,
VARIANT_QC_RESULT_VERSIONS,
)

VQSR_FEATURES = {
"genomes": {
"snv": [
"AS_QD",
"AS_MQRankSum",
"AS_ReadPosRankSum",
"AS_FS",
"AS_SOR",
"AS_MQ",
],
"indel": [
"AS_QD",
"AS_MQRankSum",
"AS_ReadPosRankSum",
"AS_FS",
"AS_SOR",
],
},
}
"""List of features used in the VQSR model."""

SYNDIP = "CHMI_CHMI3_Nex1"
"""
String representation for syndip truth sample.
"""

NA12878 = "ASC-4Set-1573S_NA12878@1075619236"
"""
String representation for NA12878 truth sample.
"""
# TODO: Is this present ? Likely not but check.
UKB_NA12878 = "Coriell_NA12878_NA12878"
"""
String representation for the UKB Regeneron generated NA12878 truth sample.
"""

TRUTH_SAMPLES = {
"syndip": {"s": SYNDIP, "truth_mt": syndip, "hc_intervals": syndip_hc_intervals},
"NA12878": {
"s": NA12878,
"truth_mt": na12878_giab,
"hc_intervals": na12878_giab_hc_intervals,
},
"UKB_NA12878": {
"s": UKB_NA12878,
"truth_mt": na12878_giab,
"hc_intervals": na12878_giab_hc_intervals,
},
}
"""
Dictionary containing necessary information for truth samples

Current truth samples available are syndip and NA12878. Available data for each are the
following:

- s: Sample name in the callset
- truth_mt: Truth sample MatrixTable resource
- hc_intervals: High confidence interval Table resource in truth sample
"""


def _variant_qc_root(
version: str = CURRENT_VARIANT_QC_VERSION,
test: bool = False,
data_type: str = "genomes",
) -> str:
"""
Return path to variant QC root folder.

:param version: Version of variant QC path to return.
:param test: Whether to use a tmp path for variant QC tests.
:param data_type: Whether to return 'exomes' or 'genomes' data. Default is genomes.
:return: Root to variant QC path.
"""
return (
f"gs://gnomad-tmp/gnomad_v{version}_testing/variant_qc/{data_type}"
if test
else f"gs://gnomad/v{version}/variant_qc/{data_type}"
)


def get_variant_qc_result(
model_id: str,
test: bool = False,
) -> VersionedTableResource:
r"""
Get the results of variant QC filtering for a given run.

:param model_id: Model ID of variant QC run to load. Must start with 'rf\_',
'vqsr\_', or 'if\_'.
:param test: Whether to use a tmp path for variant QC tests.
:return: VersionedTableResource for variant QC results.
"""
model_type = model_id.split("_")[0]
if model_type not in ["rf", "vqsr", "if"]:
raise ValueError(
f"Model ID must start with 'rf_', 'vqsr_', or 'if_', but found {model_id}"
)
return VersionedTableResource(
CURRENT_VARIANT_QC_RESULT_VERSION,
{
version: TableResource(
f"{_variant_qc_root(version, test=test)}/{model_type}/models/{model_id}/aou.genomes.v{version}.{model_type}_result.ht"
)
for version in VARIANT_QC_RESULT_VERSIONS
},
)
43 changes: 43 additions & 0 deletions gnomad_qc/v5/variant_qc/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
FROM us.gcr.io/broad-gatk/gatk:4.2.6.1

LABEL maintainer="gnomAD Team"
LABEL description="GATK 4.2.6.1 with BCFtools 1.10.2 for VQSR ApplyRecalibration"

# Fix expired Google Cloud SDK GPG key, then install dependencies for BCFtools
RUN rm -f /etc/apt/sources.list.d/google-cloud-sdk.list \
&& apt-get update && apt-get install -y \
autoconf \
automake \
make \
gcc \
perl \
zlib1g-dev \
libbz2-dev \
liblzma-dev \
libcurl4-gnutls-dev \
libssl-dev \
wget \
&& rm -rf /var/lib/apt/lists/*

# Install BCFtools 1.10.2
RUN wget https://github.com/samtools/bcftools/releases/download/1.10.2/bcftools-1.10.2.tar.bz2 \
&& tar -xjf bcftools-1.10.2.tar.bz2 \
&& cd bcftools-1.10.2 \
&& ./configure --prefix=/usr/local \
&& make \
&& make install \
&& cd .. \
&& rm -rf bcftools-1.10.2 bcftools-1.10.2.tar.bz2

# Install tabix (part of htslib, needed for indexing VCFs)
RUN wget https://github.com/samtools/htslib/releases/download/1.10.2/htslib-1.10.2.tar.bz2 \
&& tar -xjf htslib-1.10.2.tar.bz2 \
&& cd htslib-1.10.2 \
&& ./configure --prefix=/usr/local \
&& make \
&& make install \
&& cd .. \
&& rm -rf htslib-1.10.2 htslib-1.10.2.tar.bz2

# Verify installations
RUN gatk --version && bcftools --version && tabix --version
Loading
Loading