diff --git a/gnomad_qc/v5/annotations/compute_coverage.py b/gnomad_qc/v5/annotations/compute_coverage.py index 53335f642..0069c93a3 100644 --- a/gnomad_qc/v5/annotations/compute_coverage.py +++ b/gnomad_qc/v5/annotations/compute_coverage.py @@ -45,7 +45,7 @@ get_logging_path, qc_temp_prefix, ) -from gnomad_qc.v5.resources.constants import WORKSPACE_BUCKET +from gnomad_qc.v5.resources.constants import GNOMAD_TMP_BUCKET from gnomad_qc.v5.resources.meta import meta from gnomad_qc.v5.resources.release import ( release_all_sites_an_tsv_path, @@ -641,16 +641,24 @@ def join_aou_and_gnomad_qual_hists_ht( def main(args): """Compute all sites coverage, allele number, and quality histograms for v5 genomes (AoU v8 + gnomAD v4).""" project = args.project_name - environment = "rwb" if project == "aou" else "dataproc" - if environment == "rwb": + environment = "batch" if project == "aou" else "dataproc" + if environment == "batch": hl.init( - log="/home/jupyter/workspaces/gnomadproduction/compute_coverage.log", - tmp_dir=f"gs://{WORKSPACE_BUCKET}/tmp/4_day", + backend="batch", + app_name="compute_coverage", + log="compute_coverage.log", + tmp_dir=f"gs://{GNOMAD_TMP_BUCKET}/tmp/4_day", + driver_memory="highmem", + driver_cores=8, + worker_memory="highmem", + worker_cores=8, + gcs_requester_pays_configuration=args.gcp_billing_project, + regions=["us-central1"], ) else: hl.init( log="compute_coverage.log", - tmp_dir="gs://gnomad-tmp-4day", + tmp_dir=f"gs://{GNOMAD_TMP_BUCKET}/tmp/30_day", ) hl.default_reference("GRCh38") @@ -968,6 +976,12 @@ def get_script_argument_parser() -> argparse.ArgumentParser: type=str, choices=["aou", "gnomad"], ) + parser.add_argument( + "--gcp-billing-project", + type=str, + default="broad-mpg-gnomad", + help="Google Cloud billing project for reading requester pays buckets.", + ) parser.add_argument( "--overwrite", help="Overwrite existing hail Tables.", action="store_true" ) diff --git a/gnomad_qc/v5/resources/annotations.py 
b/gnomad_qc/v5/resources/annotations.py index c91801954..4d8d331a9 100644 --- a/gnomad_qc/v5/resources/annotations.py +++ b/gnomad_qc/v5/resources/annotations.py @@ -16,6 +16,7 @@ def _annotations_root( test: bool = False, data_type: str = "genomes", data_set: str = "aou", + environment: str = "batch", ) -> str: """ Get root path to the variant annotation files. @@ -25,17 +26,17 @@ def _annotations_root( full v4 VDS. :param data_type: Data type of annotation resource. e.g. "exomes" or "genomes". Default is "genomes". :param data_set: Data set of annotation resource. Default is "aou". + :param environment: Compute environment. One of 'rwb', 'batch', or 'dataproc'. Defaults to 'batch'. :return: Root path of the variant annotation files. """ path_suffix = f"sample_qc/{data_type}/{data_set}" if test: - environment = "rwb" if data_set == "aou" else "dataproc" return ( f"{qc_temp_prefix(version=version, environment=environment)}{path_suffix}" ) - base_bucket = WORKSPACE_BUCKET if data_set == "aou" else GNOMAD_BUCKET + base_bucket = WORKSPACE_BUCKET if environment == "rwb" else GNOMAD_BUCKET return f"gs://{base_bucket}/v{version}/{path_suffix}" diff --git a/gnomad_qc/v5/resources/basics.py b/gnomad_qc/v5/resources/basics.py index 5a3cc15af..f56a473df 100644 --- a/gnomad_qc/v5/resources/basics.py +++ b/gnomad_qc/v5/resources/basics.py @@ -65,17 +65,22 @@ def qc_temp_prefix( """ Return path to temporary QC bucket. + .. note:: + + Function supports three environments because AoU QC started in RWB, + then moved to Batch in November 2025. + :param version: Version of annotation path to return. - :param environment: Compute environment, either 'dataproc' or 'rwb'. Defaults to 'dataproc'. + :param environment: Compute environment. One of 'rwb', 'batch', or 'dataproc'. Defaults to 'dataproc'. :return: Path to bucket with temporary QC data. 
""" if environment == "rwb": env_bucket = f"{WORKSPACE_BUCKET}/tmp" - elif environment == "dataproc": + elif environment in ("batch", "dataproc"): env_bucket = GNOMAD_TMP_BUCKET else: raise ValueError( - f"Environment {environment} not recognized. Choose 'rwb' or 'dataproc'." + f"Environment {environment} not recognized. Choose 'rwb', 'batch', or 'dataproc'." ) return f"gs://{env_bucket}/gnomad.genomes.v{version}.qc_data/" diff --git a/gnomad_qc/v5/resources/constants.py b/gnomad_qc/v5/resources/constants.py index 691a6bf70..b8e910771 100644 --- a/gnomad_qc/v5/resources/constants.py +++ b/gnomad_qc/v5/resources/constants.py @@ -16,6 +16,8 @@ WORKSPACE_BUCKET = "fc-secure-b25d1307-7763-48b8-8045-fcae9caadfa1" GNOMAD_BUCKET = "gnomad" GNOMAD_TMP_BUCKET = "gnomad-tmp" + +# TODO: Update these constants for Batch if necessary. AOU_BUCKET = "fc-aou-datasets-controlled/v8" AOU_WGS_BUCKET = f"{AOU_BUCKET}/wgs/short_read/snpindel" AOU_WGS_AUX_BUCKET = f"{AOU_WGS_BUCKET}/aux" diff --git a/gnomad_qc/v5/resources/release.py b/gnomad_qc/v5/resources/release.py index 8c35b9f1b..4fd6f6348 100644 --- a/gnomad_qc/v5/resources/release.py +++ b/gnomad_qc/v5/resources/release.py @@ -30,7 +30,7 @@ def _release_root( test: bool = False, data_type: str = "genomes", extension: str = "ht", - environment: str = "rwb", + environment: str = "batch", ) -> str: """ Get root path to the release files. @@ -38,9 +38,9 @@ def _release_root( :param version: Version of release path to return. :param test: Whether to use a tmp path for testing. :param data_type: Data type of annotation resource. e.g. "exomes" or "genomes". - Default is "exomes". + Default is "genomes". :param extension: File extension of release file. Default is "ht". - :param environment: Environment to use. Default is "rwb". Must be "rwb" for AoU. + :param environment: Environment to use. Default is "batch". Must be "rwb" or "batch" for AoU. :return: Root path of the release files. 
""" path_suffix = f"release/{extension}/{data_type}" @@ -57,7 +57,7 @@ def release_coverage_path( public: bool = False, test: bool = False, coverage_type: str = "coverage", - environment: str = "rwb", + environment: str = "batch", ) -> str: """ Fetch filepath for v5 (AoU + gnomAD v4 genomes) all sites coverage or allele number release Table. @@ -67,7 +67,7 @@ def release_coverage_path( private (False) bucket. Default is False. :param test: Whether to use a tmp path for testing. Default is False. :param coverage_type: 'coverage' or 'allele_number'. Default is 'coverage'. - :param environment: Environment to use. Default is "rwb". + :param environment: Environment to use. Default is "batch". Must be "rwb" or "batch" for AoU. :return: File path for desired coverage Hail Table. """ assert coverage_type in [ @@ -101,14 +101,14 @@ def release_coverage_path( def release_coverage_tsv_path( release_version: str = CURRENT_COVERAGE_RELEASE["genomes"], test: bool = False, - environment: str = "rwb", + environment: str = "batch", ) -> str: """ Fetch path to coverage TSV file. :param release_version: Release version. Default is CURRENT_COVERAGE_RELEASE["genomes"]. :param test: Whether to use a tmp path for testing. Default is False. - :param environment: Environment to use. Default is "rwb". + :param environment: Environment to use. Default is "batch". Must be "rwb" or "batch" for AoU. :return: Coverage TSV path. """ return f"{_release_root(release_version, test=test, extension='tsv', environment=environment)}/gnomad.genomes.v{release_version}.coverage.tsv.bgz" @@ -117,14 +117,14 @@ def release_coverage_tsv_path( def release_all_sites_an_tsv_path( release_version: str = None, test: bool = False, - environment: str = "rwb", + environment: str = "batch", ) -> str: """ Fetch path to all sites AN TSV file. :param release_version: Release version. Default is None. :param test: Whether to use a tmp path for testing. Default is False. - :param environment: Environment to use. 
Default is "rwb". + :param environment: Environment to use. Default is "batch". Must be "rwb" or "batch" for AoU. :return: All sites AN TSV path. """ release_version = ( @@ -138,7 +138,7 @@ def release_all_sites_an_tsv_path( def release_coverage( public: bool = False, test: bool = False, - environment: str = "rwb", + environment: str = "batch", ) -> VersionedTableResource: """ Retrieve versioned resource for coverage release Table. @@ -146,7 +146,7 @@ def release_coverage( :param public: Determines whether release coverage Table is read from public (True) or private (False) bucket. Default is False. :param test: Whether to use a tmp path for testing. Default is False. - :param environment: Environment to use. Default is "rwb". + :param environment: Environment to use. Default is "batch". Must be "rwb" or "batch" for AoU. :return: Coverage release Table. """ return VersionedTableResource( @@ -168,7 +168,7 @@ def release_coverage( def release_all_sites_an( public: bool = False, test: bool = False, - environment: str = "rwb", + environment: str = "batch", ) -> VersionedTableResource: """ Retrieve versioned resource for all sites allele number release Table. @@ -176,7 +176,7 @@ def release_all_sites_an( :param public: Determines whether release allele number Table is read from public or private bucket. Default is private. :param test: Whether to use a tmp path for testing. Default is False. - :param environment: Environment to use. Default is "rwb". + :param environment: Environment to use. Default is "batch". Must be "rwb" or "batch" for AoU. :return: All sites allele number release Table. """ return VersionedTableResource(