diff --git a/gnomad_qc/federated/__init__.py b/gnomad_qc/federated/__init__.py new file mode 100644 index 000000000..2746f4ac1 --- /dev/null +++ b/gnomad_qc/federated/__init__.py @@ -0,0 +1 @@ +# noqa: D104X diff --git a/gnomad_qc/v5/configs/__init__.py b/gnomad_qc/federated/configs/__init__.py similarity index 100% rename from gnomad_qc/v5/configs/__init__.py rename to gnomad_qc/federated/configs/__init__.py diff --git a/gnomad_qc/v5/configs/validity_inputs_config.json b/gnomad_qc/federated/configs/validity_inputs_config.json similarity index 75% rename from gnomad_qc/v5/configs/validity_inputs_config.json rename to gnomad_qc/federated/configs/validity_inputs_config.json index ae54ceb5e..0d2be12cf 100644 --- a/gnomad_qc/v5/configs/validity_inputs_config.json +++ b/gnomad_qc/federated/configs/validity_inputs_config.json @@ -1,6 +1,5 @@ { - "missingness_threshold": 0.5, - "struct_annotations_for_missingness": [], + "struct_annotations_to_skip_missingness": ["vep"], "freq_fields": {"freq": "freq", "freq_meta": "freq_meta", "freq_meta_sample_count": "freq_meta_sample_count"}, "faf_fields": {}, "freq_annotations_to_sum": ["AC", "AN", "homozygote_count"], @@ -8,5 +7,6 @@ "nhomalt_metric": "homozygote_count", "subsets": [""], "variant_filter_field": "", - "check_mono_and_only_het" : true + "data_type": "genomes", + "check_mono_and_only_het": true } diff --git a/gnomad_qc/v5/configs/validity_inputs_schema.py b/gnomad_qc/federated/configs/validity_inputs_schema.py similarity index 88% rename from gnomad_qc/v5/configs/validity_inputs_schema.py rename to gnomad_qc/federated/configs/validity_inputs_schema.py index fa418b50d..88bc9e517 100644 --- a/gnomad_qc/v5/configs/validity_inputs_schema.py +++ b/gnomad_qc/federated/configs/validity_inputs_schema.py @@ -3,7 +3,10 @@ schema = { "type": "object", "properties": { - "missingness_threshold": {"type": "number"}, + "struct_annotations_to_skip_missingness": { + "type": "array", + "items": {"type": "string"}, + }, 
"struct_annotations_for_missingness": { "type": "array", "items": {"type": "string"}, @@ -46,10 +49,10 @@ "items": {"type": "string"}, }, "variant_filter_field": {"type": "string"}, + "data_type": {"type": "string", "enum": ["exomes", "genomes"]}, "check_mono_and_only_het": {"type": "boolean"}, }, "required": [ - "missingness_threshold", "freq_fields", "freq_annotations_to_sum", "sort_order", @@ -57,6 +60,7 @@ "subsets", "variant_filter_field", "check_mono_and_only_het", + "data_type", ], "additionalProperties": False, } diff --git a/gnomad_qc/v5/data_ingestion/__init__.py b/gnomad_qc/federated/data_ingestion/__init__.py similarity index 100% rename from gnomad_qc/v5/data_ingestion/__init__.py rename to gnomad_qc/federated/data_ingestion/__init__.py diff --git a/gnomad_qc/v5/data_ingestion/create_vds.py b/gnomad_qc/federated/data_ingestion/create_vds.py similarity index 100% rename from gnomad_qc/v5/data_ingestion/create_vds.py rename to gnomad_qc/federated/data_ingestion/create_vds.py diff --git a/gnomad_qc/v5/data_ingestion/field_md_to_html.py b/gnomad_qc/federated/data_ingestion/field_md_to_html.py similarity index 100% rename from gnomad_qc/v5/data_ingestion/field_md_to_html.py rename to gnomad_qc/federated/data_ingestion/field_md_to_html.py diff --git a/gnomad_qc/v5/data_ingestion/field_requirements.html b/gnomad_qc/federated/data_ingestion/field_requirements.html similarity index 56% rename from gnomad_qc/v5/data_ingestion/field_requirements.html rename to gnomad_qc/federated/data_ingestion/field_requirements.html index f188a18ed..70e4f1f72 100644 --- a/gnomad_qc/v5/data_ingestion/field_requirements.html +++ b/gnomad_qc/federated/data_ingestion/field_requirements.html @@ -14,9 +14,9 @@

Global Fields Specification

A variant is considered to fail the "adj" criteria if any of the following conditions are met: GQ < 20, DP < 10, or AB < 0.2 (heterozygous calls).

All histogram annotations must use the same bin edges as defined in their respective 'Description' columns.

-
FieldTypeDescriptionExampleField Necessity
freq_metaarray<dict<str, str>>Array of frequency metadata dictionaries containing the frequency aggregation group for each element of the ‘freq’ array row annotation. Each dictionary should have the following keys: 'gen_anc', 'group', 'sex'. A 'downsampling' key is optional. The 'adj' group should always be the first value of the array, and the 'raw' group should be the second value. Required 'group' values are 'adj' and 'raw'. Required 'sex' values are 'XX' and 'XY'. Specific values are not required for 'gen_anc'.[{'group': 'adj'},{'group': 'raw'},{'gen_anc': 'afr', 'group': 'adj'},{'gen_anc': 'amr', 'group': 'adj'},{'group': 'adj', 'sex': 'XX'},{'group': 'adj', 'sex': 'XY'}, ...]Required
freq_index_dictdict<str, int32>Dictionary keyed by specified label grouping combinations ('group': 'adj'/'raw', 'gen_anc': inferred genetic ancestry group, 'sex': sex karyotype), with values describing the corresponding index of each grouping entry in the ‘freq’ array row annotation.

If provided, keys need to be formatted in the order of 'gen_anc'_ 'sex'_'group'. If a 'downsampling' key is included, the order should be 'downsampling'_'gen_anc'_ 'sex'_'group'. The 'adj' value should always be at index 0 and the 'raw' value at index 1. A more detailed description can be found at https://gnomad.broadinstitute.org/help/v4-hts.
{"adj": 0, "raw": 1, "afr_adj": 2, "amr_adj": 3, "XX_adj": 4, "XY_adj": 5, ...}Optional
freq_meta_sample_countarray<int32>A sample count per sample grouping defined in the 'freq_meta' global annotation. Must be in the same order as 'freq'/'freq_meta'.[730947, 730947, 16740, 15001, 50000, 680947, ...]Required
faf_metaarray<dict<str, str>>Filtering allele frequency metadata. An ordered list containing the frequency aggregation group for each element of the ‘faf’ array row annotation.Not Needed
faf_index_dictdict<str, int32>Dictionary keyed by specified label grouping combinations ('group': 'adj'/'raw', 'gen_anc': inferred genetic ancestry group, 'sex': sex karyotype), with values describing the corresponding index of each grouping entry in the filtering allele frequency (‘faf’) row annotation.Not Needed
age_distributionstruct { ... }Callset-wide age histogram. Cohorts for gnomAD vary in how they report age (some report the age at diagnosis, others report the age of last visit, etc), so the ages associated with the gnomAD data can be thought of as the last known age of the individual. Information on age is not available for all gnomAD samples. This field is required with the acknowledgement that age data may not be available for all samples. If no age is available for any samples, set all bin_freq values to 0.Required
age_distribution.bin_edgesarray<float64>Bin edges for age histogram: 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0.[30.0, 35.0, 40.0, 45.0, ...]Required
age_distribution.bin_freqarray<int32>Bin frequencies for the age histogram. This is the number of records found in each bin.[101, 122, 85, 4, ...]Required
age_distribution.n_smallerint32Count of age values falling below lowest histogram bin edge.1000Required
age_distribution.n_largerint32Count of age values falling above highest histogram bin edge.30Required
downsamplingsdict<str, array<int32>>Dictionary keyed by dataset with values corresponding to available downsampled sample counts.{'gnomad': [10, 100, 500...]}Not Needed
filtering_modelstruct { ... }The variant filtering model used and its specific cutoffs.Not Needed
filtering_model.filter_namestrVariant filtering model name used in the 'filters' row annotation, indicating the variant was filtered by this model during variant QC.Not Needed
filtering_model.score_namestrName of the score used in filtering.Not Needed
filtering_model.snv_cutoffstruct { ... }SNV filtering cutoff information.Not Needed
filtering_model.snv_cutoff.binint32Filtering percentile cutoff for SNVs.Not Needed
filtering_model.snv_cutoff.min_scorefloat64Minimum score at SNV filtering percentile cutoff.Not Needed
filtering_model.indel_cutoffstruct { ... }Indel filtering cutoff information.Not Needed
filtering_model.indel_cutoff.binint32Filtering percentile cutoff for indels.Not Needed
filtering_model.indel_cutoff.min_scorefloat64Minimum score at indel filtering percentile cutoff.Not Needed
filtering_model.snv_training_variablesarray<str>Variant annotations used as features in the SNV filtering model.Not Needed
filtering_model.indel_training_variablesarray<str>Variant annotations used as features in the indel filtering model.Not Needed
inbreeding_coeff_cutofffloat64Inbreeding Coefficient threshold used to hard filter variants.Not Needed
excess_het_cutofffloat64Excess heterozygosity threshold used to hard filter variants.Not Needed
tool_versionsstruct { ... }Versions of in silico predictors used in the callset.Not Needed
tool_versions.cadd_versionstrCombined Annotation Dependent Depletion (CADD) version.Not Needed
tool_versions.revel_versionstrRare Exome Variant Ensemble Learner (REVEL) version.Not Needed
tool_versions.spliceai_versionstrSpliceAI version.Not Needed
tool_versions.pangolin_versionarray<str>Pangolin version.Not Needed
tool_versions.phylop_versionstrphyloP version.Not Needed
tool_versions.dbsnp_versionstrdbSNP version.Not Needed
tool_versions.sift_versionstrSorting Intolerant from Tolerant (SIFT) version.Not Needed
tool_versions.polyphen_versionstrPolymorphism Phenotyping v2 (Polyphen-v2) version.Not Needed
vrs_versionsstruct { ... }The Variant Representation Specification version that was used to compute IDs on the callset. Global and row VRS annotations are optional, but the global annotaions must be filled out if the row annotations are provided.Optional
vrs_versions.vrs_schema_versionstrThe version of the VRS schema that is used to represent variants and compute identifiers. Must be 2.0.1."2.0.1"Optional
vrs_versions.vrs_python_versionstrThe version of the vrs-python library that was used to compute IDs on the callset. Must be 2.1.3."2.1.3"Optional
vrs_versions.seqrepo_versionstrThe version of the SeqRepo database that was used in VRS computations. Must be 2024-12-20."2024-12-20"Optional
vep_globalsstruct { ... }Information about VEP annotations.Not Needed
vep_globals.vep_versionstrVEP version that was run on the callset.Not Needed
vep_globals.vep_helpstrOutput from vep --help.Not Needed
vep_globals.vep_configstrVEP configuration to run VEP version with Hail. File created using command within VEP init shell script in https://github.com/broadinstitute/gnomad_methods/tree/main.Not Needed
vep_globals.gencode_versionstrGENCODE version used in VEP.Not Needed
vep_globals.mane_select_versionstrMANE select version used in VEP.Not Needed
frequency_READMEstrExplanation of how to use the 'freq_index_dict' global annotation to extract frequencies from the 'freq' row annotation.Not Needed
datestrDate Hail Table was created."2025-04-09"Required
versionstrVersion of the file.Not Needed
+
FieldTypeDescriptionExampleField Necessity
freq_metaarray<dict<str, str>>Array of frequency metadata dictionaries containing the frequency aggregation group for each element of the ‘freq’ array row annotation. Each dictionary should have the following keys: 'gen_anc', 'group', 'sex'. A 'downsampling' key is optional. The 'adj' group should always be the first value of the array, and the 'raw' group should be the second value. Required 'group' values are 'adj' and 'raw'. Required 'sex' values are 'XX' and 'XY'. Specific values are not required for 'gen_anc'.[{'group': 'adj'},{'group': 'raw'},{'gen_anc': 'afr', 'group': 'adj'},{'gen_anc': 'amr', 'group': 'adj'},{'group': 'adj', 'sex': 'XX'},{'group': 'adj', 'sex': 'XY'}, ...]Required
freq_index_dictdict<str, int32>Dictionary keyed by specified label grouping combinations ('group': 'adj'/'raw', 'gen_anc': inferred genetic ancestry group, 'sex': sex karyotype), with values describing the corresponding index of each grouping entry in the ‘freq’ array row annotation.

If provided, keys need to be formatted in the order of 'gen_anc'_'sex'_'group'. If a 'downsampling' key is included, the order should be 'downsampling'_'gen_anc'_'sex'_'group'. The 'adj' value should always be at index 0 and the 'raw' value at index 1. A more detailed description can be found at https://gnomad.broadinstitute.org/help/v4-hts.
{"adj": 0, "raw": 1, "afr_adj": 2, "amr_adj": 3, "XX_adj": 4, "XY_adj": 5, ...}Optional
freq_meta_sample_countarray<int32>A sample count per sample grouping defined in the 'freq_meta' global annotation. Must be in the same order as 'freq'/'freq_meta'.[730947, 730947, 16740, 15001, 50000, 680947, ...]Required
faf_metaarray<dict<str, str>>Filtering allele frequency metadata. An ordered list containing the frequency aggregation group for each element of the ‘faf’ array row annotation.Not Needed
faf_index_dictdict<str, int32>Dictionary keyed by specified label grouping combinations ('group': 'adj'/'raw', 'gen_anc': inferred genetic ancestry group, 'sex': sex karyotype), with values describing the corresponding index of each grouping entry in the filtering allele frequency (‘faf’) row annotation.Not Needed
age_distributionstruct { ... }Callset-wide age histogram. Cohorts for gnomAD vary in how they report age (some report the age at diagnosis, others report the age of last visit, etc), so the ages associated with the gnomAD data can be thought of as the last known age of the individual. Information on age is not available for all gnomAD samples. This field is required with the acknowledgement that age data may not be available for all samples. If no age is available for any samples, set all bin_freq values to 0.Required
age_distribution.bin_edgesarray<float64>Bin edges for age histogram: 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0.[30.0, 35.0, 40.0, 45.0, ...]Required
age_distribution.bin_freqarray<int32>Bin frequencies for the age histogram. This is the number of records found in each bin.[101, 122, 85, 4, ...]Required
age_distribution.n_smallerint32Count of age values falling below lowest histogram bin edge.1000Required
age_distribution.n_largerint32Count of age values falling above highest histogram bin edge.30Required
downsamplingsdict<str, array<int32>>Dictionary keyed by dataset with values corresponding to available downsampled sample counts.{'gnomad': [10, 100, 500...]}Not Needed
filtering_modelstruct { ... }The variant filtering model used and its specific cutoffs.Not Needed
filtering_model.filter_namestrVariant filtering model name used in the 'filters' row annotation, indicating the variant was filtered by this model during variant QC.Not Needed
filtering_model.score_namestrName of the score used in filtering.Not Needed
filtering_model.snv_cutoffstruct { ... }SNV filtering cutoff information.Not Needed
filtering_model.snv_cutoff.binint32Filtering percentile cutoff for SNVs.Not Needed
filtering_model.snv_cutoff.min_scorefloat64Minimum score at SNV filtering percentile cutoff.Not Needed
filtering_model.indel_cutoffstruct { ... }Indel filtering cutoff information.Not Needed
filtering_model.indel_cutoff.binint32Filtering percentile cutoff for indels.Not Needed
filtering_model.indel_cutoff.min_scorefloat64Minimum score at indel filtering percentile cutoff.Not Needed
filtering_model.snv_training_variablesarray<str>Variant annotations used as features in the SNV filtering model.Not Needed
filtering_model.indel_training_variablesarray<str>Variant annotations used as features in the indel filtering model.Not Needed
inbreeding_coeff_cutofffloat64Inbreeding Coefficient threshold used to hard filter variants.Not Needed
excess_het_cutofffloat64Excess heterozygosity threshold used to hard filter variants.Not Needed
tool_versionsstruct { ... }Versions of in silico predictors used in the callset.Not Needed
tool_versions.cadd_versionstrCombined Annotation Dependent Depletion (CADD) version.Not Needed
tool_versions.revel_versionstrRare Exome Variant Ensemble Learner (REVEL) version.Not Needed
tool_versions.spliceai_versionstrSpliceAI version.Not Needed
tool_versions.pangolin_versionarray<str>Pangolin version.Not Needed
tool_versions.phylop_versionstrphyloP version.Not Needed
tool_versions.dbsnp_versionstrdbSNP version.Not Needed
tool_versions.sift_versionstrSorting Intolerant from Tolerant (SIFT) version.Not Needed
tool_versions.polyphen_versionstrPolymorphism Phenotyping v2 (Polyphen-v2) version.Not Needed
vrs_versionsstruct { ... }The Variant Representation Specification version that was used to compute IDs on the callset. Global and row VRS annotations are optional, but the global annotations must be filled out if the row annotations are provided.Optional
vrs_versions.vrs_schema_versionstrThe version of the VRS schema that is used to represent variants and compute identifiers. Must be 2.0.1."2.0.1"Optional
vrs_versions.vrs_python_versionstrThe version of the vrs-python library that was used to compute IDs on the callset. Must be 2.1.3."2.1.3"Optional
vrs_versions.seqrepo_versionstrThe version of the SeqRepo database that was used in VRS computations. Must be 2024-12-20."2024-12-20"Optional
vep_globalsstruct { ... }Information about VEP annotations.Not Needed
vep_globals.vep_versionstrVEP version that was run on the callset.Not Needed
vep_globals.vep_helpstrOutput from vep --help.Not Needed
vep_globals.vep_configstrVEP configuration to run VEP version with Hail. File created using command within VEP init shell script in https://github.com/broadinstitute/gnomad_methods/tree/main.Not Needed
vep_globals.gencode_versionstrGENCODE version used in VEP.Not Needed
vep_globals.mane_select_versionstrMANE select version used in VEP.Not Needed
vep115_globalsstruct { ... }Information about VEP 115 annotations.Not Needed
vep115_globals.vep_versionstrVEP version that was run on the callset (115).Not Needed
vep115_globals.vep_helpstrOutput from vep --help for VEP 115.Not Needed
vep115_globals.vep_configstrVEP 115 configuration to run VEP version with Hail.Not Needed
vep115_globals.gencode_versionstrGENCODE version used in VEP 115.Not Needed
vep115_globals.mane_select_versionstrMANE select version used in VEP 115.Not Needed
frequency_READMEstrExplanation of how to use the 'freq_index_dict' global annotation to extract frequencies from the 'freq' row annotation.Not Needed
datestrDate Hail Table was created."2025-04-09"Required
versionstrVersion of the file.Not Needed

Row Fields Specification

This table provides a view of each row field. Every leaf field (i.e., non-struct field) is listed on its own row with the full annotation. The parent portion is shown in a lighter color to indicate context. Cell colors indicate the field necessity: "Required" in red, "Optional" in blue, "Not Needed" in grey.

All histogram annotations must use the same bin edges as defined in their respective 'Description' columns.

Data must be supplied as a "split" dataset, where multiallelic variants are split so that information for each alternate allele is in a separate row.

-
FieldTypeDescriptionExampleField Necessity
locuslocus<GRCh38>Variant locus. Contains contig and position information. Must be build GRCh38.chr1:12345Required
allelesarray<str>Variant alleles (reference and alternate).["A", "G"]Required
freqarray<struct {...}>Array of allele frequency information (AC, AN, AF, homozygote count) for each frequency aggregation group corresponding to each frequency metadata group. The 'adj' value should always be the first value of the array, and the 'raw' value should be the second value.Required
freq.ACint32Alternate allele count.10Required
freq.AFfloat64Alternate allele frequency, (AC/AN).0.1Required
freq.ANint32Total number of alleles.100Required
freq.homozygote_countint32Count of homozygous alternate individuals for the alternate allele.2Required
grpmaxstruct {...}Allele frequency information (AC, AN, AF, homozygote count) for the group with maximum allele frequency.Not Needed
grpmax.ACint32Alternate allele count in the group with the maximum allele frequency.Not Needed
grpmax.AFfloat64Maximum alternate allele frequency, (AC/AN), across all groups.Not Needed
grpmax.ANint32Total number of alleles in the group with the maximum allele frequency.Not Needed
grpmax.homozygote_countint32Count of homozygous individuals in the group with the maximum allele frequency.Not Needed
grpmax.gen_ancstrGenetic ancestry corresponding to the maximum frequency group.Not Needed
fafarray<struct {...}>Filtering allele frequency.Not Needed
faf.faf95float64FAF95: Filtering allele frequency (using Poisson 95% CI).Not Needed
faf.faf99float64FAF99: Filtering allele frequency (using Poisson 99% CI) frequency.Not Needed
fafmaxstruct {...}Information about the genetic ancestry group with the maximum filtering allele frequency.Not Needed
fafmax.faf95_maxfloat64Maximum filtering allele frequency (using Poisson 95% CI).Not Needed
fafmax.faf95_max_gen_ancstrGenetic ancestry group with the maximum filtering allele frequency (95% CI).Not Needed
fafmax.faf99_maxfloat64Maximum filtering allele frequency (using Poisson 99% CI).Not Needed
fafmax.faf99_max_gen_ancstrGenetic ancestry group with the maximum filtering allele frequency (99% CI).Not Needed
a_indexint32The original index of this alternate allele in the multiallelic representation (1 is the first alternate allele or the only alternate allele in a biallelic variant).1Required
was_splitboolTrue if this variant was originally multiallelic, otherwise False.FalseRequired
rsidset<str>dbSNP reference SNP identification (rsID) numbers.Not Needed
filtersset<str>Variant filters; 'AC0': Allele count is zero after filtering out low-confidence genotypes (GQ < 20; DP < 10; or AB < 0.2 for het calls), 'AS_VQSR': Failed allele-specific VQSR filtering thresholds, 'InbreedingCoeff': GATK InbreedingCoeff < -0.3. An empty set in this field indicates that the variant passed all variant filters.{"AC0","AS_VQSR"}Optional
infostruct {...}Struct containing typical GATK allele-specific (AS) info fields and additional variant QC fields.Required
info.FSfloat64Phred-scaled p-value of Fisher's exact test for strand bias.7.30e+00Required
info.MQfloat64Root mean square of the mapping quality of reads across all samples.3.48e+01Required
info.MQRankSumfloat64Z-score from Wilcoxon rank sum test of alternate vs. reference read mapping qualities.6.70e-02Required
info.MQRankSum_cdfstruct { ...}CDF summary of overall MQRankSum values. The _raw parameter must be set to True to return an internal representation of the CDF approximation. This annotation is generated using Hail's function: hl.agg.approx_cdf(x, k=XX, _raw=True). The k value must be set to 200 in order to merge with the gnomAD dataset. This method is non-deterministic: computing approx_cdf multiple times will give slightly different results each time. It is currently not possible to seed the aggregator.Required
info.MQRankSum_cdf.levelsarray<int32>List indicating how many items are stored at each compression level.[0,3]Required
info.MQRankSum_cdf.itemsarray<float64>Ordered sample of values from the MQRankSum distribution.[-9.38e-01,-2.27e+00,-1.34e+00]Required
info.MQRankSum_cdf._compaction_countsarray<int32>Used internally to support downstream error estimation.[0]Required
info.QUALapproxint64Sum of PL[0] values; used to approximate the QUAL score.96Required
info.QDfloat32Variant call confidence normalized by depth of sample reads supporting a variant.2.74e+00Required
info.ReadPosRankSumfloat64Z-score from Wilcoxon rank sum test of alternate vs. reference read position bias.-1.07e+00Required
info.ReadPosRankSum_cdfstruct { ...}CDF summary of overall ReadPosRankSum values. The _raw parameter must be set to True to return an internal representation of the CDF approximation. This annotation is generated using Hail's function: hl.agg.approx_cdf(x, k=XX, _raw=True). The k value must be set to 200 in order to merge with the gnomAD dataset. This method is non-deterministic: computing approx_cdf multiple times will give slightly different results each time. It is currently not possible to seed the aggregator.Required
info.ReadPosRankSum_cdf.levelsarray<int32>List indicating how many items are stored at each compression level.[0,3]Required
info.ReadPosRankSum_cdf.itemsarray<float64>Ordered sample of values from the ReadPosRankSum distribution.[9.67e-01,-9.67e-01,9.67e-01]Required
info.ReadPosRankSum_cdf._compaction_countsarray<int32>Used internally to support downstream error estimation.[0]Required
info.SBarray<int32>Aggregate counts of strand depth across all non-homozygous-reference calls. The values are the depth of reference allele on forward strand, depth of the reference allele on reverse strand, depth of all alternate alleles on forward strand, depth of all alternate alleles on reverse strand.[21,6,4,4]Required
info.SORfloat64Strand bias estimated by the symmetric odds ratio test.9.60e-02Required
info.VarDPint32Depth over variant genotypes (does not include depth of reference samples).35Required
info.AS_FSfloat64Allele-specific phred-scaled p-value of Fisher's exact test for strand bias.5.10e+00Required
info.AS_MQfloat64Allele-specific root mean square of the mapping quality of reads across all samples.3.51e+01Required
info.AS_MQRankSumfloat64Allele-specific z-score from Wilcoxon rank sum test of alternate vs. reference read mapping qualities.-5.72e-01Required
info.AS_MQRankSum_cdfstruct { ...}CDF summary of allele-specific MQRankSum values. The _raw parameter must be set to True to return an internal representation of the CDF approximation. This annotation is generated using Hail's function: hl.agg.approx_cdf(x, k=XX, _raw=True). The k value must be set to 200 in order to merge with the gnomAD dataset. This method is non-deterministic: computing approx_cdf multiple times will give slightly different results each time. It is currently not possible to seed the aggregator.Required
info.AS_MQRankSum_cdf.levelsarray<int32>List indicating how many items are stored at each compression level.[0,3]Required
info.AS_MQRankSum_cdf.itemsarray<float64>Ordered sample of values from the allele-specific MQRankSum distribution.[-1.38e+00,4.31e-01,-9.67e-01]Required
info.AS_MQRankSum_cdf._compaction_countsarray<int32>Used internally to support downstream error estimation for allele-specific MQRankSum CDF.[0]Required
info.AS_pab_maxfloat64Maximum p-value over callset for binomial test of observed allele balance for a heterozygous genotype, given expectation of 0.5.6.87e-01Required
info.AS_QUALapproxint64Allele-specific sum of PL[0] values; used to approximate the QUAL score.77Required
info.AS_QDfloat32Allele-specific variant call confidence normalized by depth of sample reads supporting a variant.2.96e+00Required
info.AS_ReadPosRankSumfloat64Allele-specific z-score from Wilcoxon rank sum test of alternate vs. reference read position bias.-1.38e+00Required
info.AS_ReadPosRankSum_cdfstruct { ...}CDF summary of allele-specific ReadPosRankSum values. The _raw parameter must be set to True to return an internal representation of the CDF approximation. This annotation is generated using Hail's function: hl.agg.approx_cdf(x, k=XX, _raw=True). The k value must be set to 200 in order to merge with the gnomAD dataset. This method is non-deterministic: computing approx_cdf multiple times will give slightly different results each time. It is currently not possible to seed the aggregator.Required
info.AS_ReadPosRankSum_cdf.levelsarray<int32>List indicating how many items are stored at each compression level.[0,1]Required
info.AS_ReadPosRankSum_cdf.itemsarray<float64>Ordered sample of values from the allele-specific ReadPosRankSum distribution.[-1.78e-01]Required
info.AS_ReadPosRankSum_cdf._compaction_countsarray<int32>Used internally to support downstream error estimation for allele-specific ReadPosRankSum CDF.[0]Required
info.AS_SB_TABLEarray<int32>Allele-specific forward/reverse read counts for strand bias tests.[21,6,3,3]Required
info.AS_SORfloat64Allele-specific strand bias estimated by the symmetric odds ratio test.9.64e-02Required
info.AS_VarDPint32Allele-specific depth over variant genotypes (does not include depth of reference samples).26Required
info.singletonboolVariant is seen once in the callset.TrueOptional
info.transmitted_singletonboolVariant was a callset-wide doubleton that was transmitted within a family from a parent to a child (i.e., a singleton amongst unrelated samples in cohort).TrueOptional
info.sibling_singletonboolVariant was a callset-wide doubleton that was present only in two siblings (i.e., a singleton amongst unrelated samples in cohort).TrueOptional
info.omniboolVariant is present on the Omni 2.5 genotyping array and found in 1000 Genomes data.Not Needed
info.millsboolIndel is present in the Mills and Devine data.Not Needed
info.monoallelicboolAll samples are homozygous alternate for the variant.Not Needed
info.only_hetboolAll samples are heterozygous for the variant (no homozygous reference or alternate genotype calls).Not Needed
info.inbreeding_coefffloat64Inbreeding coefficient, the excess heterozygosity at a variant site, computed as 1 - (the number of heterozygous genotypes) / (the number of heterozygous genotypes expected under Hardy-Weinberg equilibrium).Not Needed
info.excess_hetfloat64Phred-scaled p-value for exact test of excess heterozygosity.Not Needed
info.vrsstruct { ... }Struct containing information related to the Global Alliance for Genomic Health (GA4GH) Variant Representation Specification (VRS) standard. VRS annotations must be created using the following tool versions: (vrs_schema_version=2.0.1, vrs_python_version=2.1.3, seqrepo_version=2024-12-20).

Optional
info.vrs.VRS_Allele_IDsarray<str>The computed identifiers for the GA4GH VRS Alleles corresponding to the GT indexes of the reference and alternate alleles.["ga4gh:VA.oTAtTrgYxm81O9fu6Mrhfo1t3eHsgg4L","ga4gh:VA.Y283OnlLjyi1T1IT_JzvW255rC6YJsW6"]Optional
info.vrs.VRS_Startsarray<int32>Interresidue coordinates used as the location starts for the GA4GH VRS Alleles corresponding to the GT indexes of the reference and alternate alleles.[10030,10030]Optional
info.vrs.VRS_Endsarray<int32>Interresidue coordinates used as the location ends for the GA4GH VRS Alleles corresponding to the GT indexes of the reference and alternate alleles.[10031,10031]Optional
info.vrs.VRS_Statesarray<str>The literal sequence states used for the GA4GH VRS Alleles corresponding to the GT indexes of the reference and alternate alleles.["T","C"]Optional
info.vrs.VRS_Lengthsarray<int32>The length values from ReferenceLengthExpression states for the GA4GH VRS Alleles corresponding to the GT indexes of the reference and alternate alleles.[1,NA]Optional
info.vrs.VRS_RepeatSubunitLengthsarray<int32>The repeatSubunitLength values from ReferenceLengthExpression states for the GA4GH VRS Alleles corresponding to the GT indexes of the reference and alternate alleles.[1,NA]Optional
vepstruct { ... }VEP annotations generated by the VEP tool (to be re-annotated).Not Needed
vqsr_resultsstructVQSR related variant annotations.Not Needed
vqsr_results.AS_VQSLODfloat64Allele-specific log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model.Not Needed
vqsr_results.AS_culpritstrAllele-specific worst-performing annotation in the VQSR Gaussian mixture model.Not Needed
vqsr_results.positive_train_siteboolVariant was used to build the positive training set of high-quality variants for VQSR.Not Needed
vqsr_results.negative_train_siteboolVariant was used to build the negative training set of low-quality variants for VQSR.Not Needed
region_flagsstructStruct containing flags about regions.Not Needed
region_flags.non_parboolVariant falls within a non-pseudoautosomal region.Not Needed
region_flags.lcrboolVariant falls within a low complexity region.Not Needed
region_flags.segdupboolVariant falls within a segmental duplication region.Not Needed
allele_infostructAllele information.Not Needed
allele_info.allele_typestrAllele type (one of: 'snv', 'insertion', 'deletion', or 'mixed').Not Needed
allele_info.n_alt_allelesint32Total number of alternate alleles observed at variant locus.Not Needed
allele_info.variant_typestrVariant type (one of: 'snv', 'indel', 'multi-snv', 'multi-indel', or 'mixed').Not Needed
allele_info.was_mixedboolVariant type was mixed.Not Needed
histogramsstruct { ... }Variant information histograms.Required
histograms.qual_histsstruct { ... }Genotype quality metric histograms for high quality genotypes.Required
histograms.qual_hists.gq_hist_allstruct { ... }Histogram for GQ calculated on high quality genotypes.Required
histograms.qual_hists.gq_hist_all.bin_edgesarray<float64>Bin edges for the GQ histogram calculated on high quality genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Required
histograms.qual_hists.gq_hist_all.bin_freqarray<int64>Bin frequencies for the GQ histogram calculated on high quality genotypes. The number of records found in each bin.[50, 56, 101, ...]Required
histograms.qual_hists.gq_hist_all.n_smallerint64Count of GQ values falling below the lowest histogram bin edge, calculated on high quality genotypes.0Required
histograms.qual_hists.gq_hist_all.n_largerint64Count of GQ values falling above the highest histogram bin edge, calculated on high quality genotypes.0Required
histograms.qual_hists.dp_hist_allstruct { ... }Histogram for DP calculated on high quality genotypes.Required
histograms.qual_hists.dp_hist_all.bin_edgesarray<float64>Bin edges for the DP histogram calculated on high quality genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Required
histograms.qual_hists.dp_hist_all.bin_freqarray<int64>Bin frequencies for the DP histogram calculated on high quality genotypes. The number of records found in each bin.[2, 2, 16, ...]Required
histograms.qual_hists.dp_hist_all.n_smallerint64Count of DP values falling below the lowest histogram bin edge, calculated on high quality genotypes.0Required
histograms.qual_hists.dp_hist_all.n_largerint64Count of DP values falling above the highest histogram bin edge, calculated on high quality genotypes.500Required
histograms.qual_hists.gq_hist_altstruct { ... }Histogram for GQ in non-reference individuals calculated on high quality genotypes.Required
histograms.qual_hists.gq_hist_alt.bin_edgesarray<float64>Bin edges for the histogram of GQ in non-reference individuals calculated on high quality genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Required
histograms.qual_hists.gq_hist_alt.bin_freqarray<int64>Bin frequencies for the histogram of GQ in non-reference individuals calculated on high quality genotypes. The number of records found in each bin.[0, 0 , 1, ...]Required
histograms.qual_hists.gq_hist_alt.n_smallerint64Count of GQ values in non-reference individuals falling below the lowest histogram bin edge, calculated on high quality genotypes.0Required
histograms.qual_hists.gq_hist_alt.n_largerint64Count of GQ values in non-reference individuals falling above the highest histogram bin edge, calculated on high quality genotypes.0Required
histograms.qual_hists.dp_hist_altstruct { ... }Histogram for DP in non-reference individuals calculated on high quality genotypes.Required
histograms.qual_hists.dp_hist_alt.bin_edgesarray<float64>Bin edges for the histogram of DP in non-reference individuals calculated on high quality genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Required
histograms.qual_hists.dp_hist_alt.bin_freqarray<int64>Bin frequencies for the histogram of DP in non-reference individuals calculated on high quality genotypes. The number of records found in each bin.[1, 1, 10, ...]Required
histograms.qual_hists.dp_hist_alt.n_smallerint64Count of DP values in non-reference individuals falling below the lowest histogram bin edge, calculated on high quality genotypes.0Required
histograms.qual_hists.dp_hist_alt.n_largerint64Count of DP values in non-reference individuals falling above the highest histogram bin edge, calculated on high quality genotypes.10Required
histograms.qual_hists.ab_hist_altstruct { ... }Histogram for AB in heterozygous individuals calculated on high quality genotypes.Required
histograms.qual_hists.ab_hist_alt.bin_edgesarray<float64>Bin edges for the histogram of AB in heterozygous individuals calculated on high quality genotypes are: 0.00 | 0.05 | 0.10 | 0.15 | 0.20 | 0.25 | 0.30 | 0.35 | 0.40 | 0.45 | 0.50 | 0.55 | 0.60 | 0.65 | 0.70 | 0.75 | 0.80 | 0.85 | 0.90 | 0.95 | 1.00.[0.00, 0.05, 0.10, ...]Required
histograms.qual_hists.ab_hist_alt.bin_freqarray<int64>Bin frequencies for the histogram of AB in heterozygous individuals calculated on high quality genotypes. The number of records found in each bin.[0 , 0 , 5, ...]Required
histograms.qual_hists.ab_hist_alt.n_smallerint64Count of AB values in heterozygous individuals falling below the lowest histogram bin edge, calculated on high quality genotypes.0Required
histograms.qual_hists.ab_hist_alt.n_largerint64Count of AB values in heterozygous individuals falling above the highest histogram bin edge, calculated on high quality genotypes.0Required
histograms.raw_qual_histsstruct { ... }Genotype quality metric histograms for all genotypes as opposed to high quality genotypes.Optional
histograms.raw_qual_hists.gq_hist_allstruct { ... }Histogram for GQ calculated on all genotypes.Optional
histograms.raw_qual_hists.gq_hist_all.bin_edgesarray<float64>Bin edges for the GQ histogram calculated on all genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Optional
histograms.raw_qual_hists.gq_hist_all.bin_freqarray<int64>Bin frequencies for the GQ histogram calculated on all genotypes. The number of records found in each bin.[60, 76, 130, ...]Optional
histograms.raw_qual_hists.gq_hist_all.n_smallerint64Count of GQ values falling below lowest histogram bin edge, for GQ calculated on all genotypes.0Optional
histograms.raw_qual_hists.gq_hist_all.n_largerint64Count of GQ values falling above highest histogram bin edge, for GQ calculated on all genotypes.0Optional
histograms.raw_qual_hists.dp_hist_allstruct { ... }Histogram for DP calculated on all genotypes.Optional
histograms.raw_qual_hists.dp_hist_all.bin_edgesarray<float64>Bin edges for the DP histogram calculated on all genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Optional
histograms.raw_qual_hists.dp_hist_all.bin_freqarray<int64>Bin frequencies for the DP histogram calculated on all genotypes. The number of records found in each bin.[4, 7, 20, ...]Optional
histograms.raw_qual_hists.dp_hist_all.n_smallerint64Count of DP values falling below lowest histogram bin edge, for DP calculated on all genotypes.0Optional
histograms.raw_qual_hists.dp_hist_all.n_largerint64Count of DP values falling above highest histogram bin edge, for DP calculated on all genotypes.600Optional
histograms.raw_qual_hists.gq_hist_altstruct { ... }Histogram for GQ in non-reference individuals calculated on all genotypes.Optional
histograms.raw_qual_hists.gq_hist_alt.bin_edgesarray<float64>Bin edges for the histogram of GQ in non-reference individuals calculated on all genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Optional
histograms.raw_qual_hists.gq_hist_alt.bin_freqarray<int64>Bin frequencies for the histogram of GQ in non-reference individuals calculated on all genotypes. The number of records found in each bin.[3, 4, 10, ...]Optional
histograms.raw_qual_hists.gq_hist_alt.n_smallerint64Count of GQ values in non-reference individuals falling below lowest histogram bin edge, calculated on all genotypes.0Optional
histograms.raw_qual_hists.gq_hist_alt.n_largerint64Count of GQ values in non-reference individuals falling above highest histogram bin edge, calculated on all genotypes.0Optional
histograms.raw_qual_hists.dp_hist_altstruct { ... }Histogram for DP in non-reference individuals calculated on all genotypes.Optional
histograms.raw_qual_hists.dp_hist_alt.bin_edgesarray<float64>Bin edges for the histogram of DP in non-reference individuals calculated on all genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Optional
histograms.raw_qual_hists.dp_hist_alt.bin_freqarray<int64>Bin frequencies for the histogram of DP in non-reference individuals calculated on all genotypes. The number of records found in each bin.[2, 2, 6, ...]Optional
histograms.raw_qual_hists.dp_hist_alt.n_smallerint64Count of DP values in non-reference individuals falling below lowest histogram bin edge, calculated on all genotypes.0Optional
histograms.raw_qual_hists.dp_hist_alt.n_largerint64Count of DP values in non-reference individuals falling above highest histogram bin edge, calculated on all genotypes.11Optional
histograms.raw_qual_hists.ab_hist_altstruct { ... }Histogram for AB in heterozygous individuals calculated on all genotypes.Optional
histograms.raw_qual_hists.ab_hist_alt.bin_edgesarray<float64>Bin edges for the histogram of AB in heterozygous individuals calculated on all genotypes are: 0.00 | 0.05 | 0.10 | 0.15 | 0.20 | 0.25 | 0.30 | 0.35 | 0.40 | 0.45 | 0.50 | 0.55 | 0.60 | 0.65 | 0.70 | 0.75 | 0.80 | 0.85 | 0.90 | 0.95 | 1.00.[0.00, 0.05, 0.10, ...]Optional
histograms.raw_qual_hists.ab_hist_alt.bin_freqarray<int64>Bin frequencies for the histogram of AB in heterozygous individuals calculated on all genotypes. The number of records found in each bin.[0 , 0 , 6, ...]Optional
histograms.raw_qual_hists.ab_hist_alt.n_smallerint64Count of AB values in heterozygous individuals falling below lowest histogram bin edge, calculated on all genotypes.0Optional
histograms.raw_qual_hists.ab_hist_alt.n_largerint64Count of AB values in heterozygous individuals falling above highest histogram bin edge, calculated on all genotypes.0Optional
histograms.age_histsstruct { ... }Histograms containing age information for high quality genotypes. Cohorts for gnomAD vary in how they report age (some report the age at diagnosis, others report the age of last visit, etc), so the ages associated with the gnomAD data can be thought of as the last known age of the individual. Information on age is not available for all gnomAD samples. This field is required with the acknowledgement that age data may not be available for all samples. If no age is available for any samples, set all bin_freq values to 0.Required
histograms.age_hists.age_hist_hetstruct { ... }Histogram for age in all heterozygous samples calculated on high quality genotypes.Required
histograms.age_hists.age_hist_het.bin_edgesarray<float64>Bin edges for the age histogram: 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0.[30.0, 35.0, 40.0, ...]Required
histograms.age_hists.age_hist_het.bin_freqarray<int64>Bin frequencies for the age histogram. This is the number of records found in each bin.[0, 3, 4, ...]Required
histograms.age_hists.age_hist_het.n_smallerint64Count of age values falling below lowest histogram bin edge.1Required
histograms.age_hists.age_hist_het.n_largerint64Count of age values falling above highest histogram bin edge.0Required
histograms.age_hists.age_hist_homstruct { ... }Histogram for age in all homozygous samples calculated on high quality genotypes. If variant is in the pseudoautosomal regions of chrX or chrY, this histogram also includes age counts of hemizygous samples.Required
histograms.age_hists.age_hist_hom.bin_edgesarray<float64>Bin edges for the age histogram: 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0.[30.0, 35.0, 40.0, ...]Required
histograms.age_hists.age_hist_hom.bin_freqarray<int64>Bin frequencies for the age histogram. This is the number of records found in each bin.[0, 2, 2, ...]Required
histograms.age_hists.age_hist_hom.n_smallerint64Count of age values falling below lowest histogram bin edge.0Required
histograms.age_hists.age_hist_hom.n_largerint64Count of age values falling above highest histogram bin edge.0Required
in_silico_predictorsstruct { ... }Variant prediction annotations.Not Needed
in_silico_predictors.caddstruct { ... }Score used to predict deleteriousness of SNVs and indels.Not Needed
in_silico_predictors.cadd.phredfloat32CADD Phred-like scaled C-scores ranging from 1 to 99 based on the rank of each variant relative to all possible 8.6 billion substitutions in the human reference genome. Larger values indicate increased predicted deleteriousness.Not Needed
in_silico_predictors.cadd.raw_scorefloat32Unscaled CADD scores indicating whether a variant is likely to be "observed" (negative values) vs "simulated" (positive values). Larger values indicate increased predicted deleteriousness.Not Needed
in_silico_predictors.revel_maxfloat64An ensemble score for predicting the pathogenicity of missense variants (based on 13 other variant predictors).Not Needed
in_silico_predictors.spliceai_ds_maxfloat32Maximum delta score across 4 splicing consequences, which reflects the probability of the variant being splice-altering.Not Needed
in_silico_predictors.pangolin_largest_dsfloat64Largest delta score across 2 splicing consequences, which reflects the probability of the variant being splice-altering.Not Needed
in_silico_predictors.phylopfloat64Base-wise conservation score across the 241 placental mammals in the Zoonomia project. Score ranges from -20 to 9.28, and reflects acceleration (faster evolution than expected under neutral drift, assigned negative scores) as well as conservation (slower than expected evolution, assigned positive scores).Not Needed
in_silico_predictors.sift_maxfloat64Score reflecting the scaled probability of the amino acid substitution being tolerated, ranging from 0 to 1.Not Needed
in_silico_predictors.polyphen_maxfloat64Score that predicts the possible impact of an amino acid substitution on the structure and function of a human protein, ranging from 0.0 (tolerated) to 1.0 (deleterious).Not Needed
+
FieldTypeDescriptionExampleField Necessity
locuslocus<GRCh38>Variant locus. Contains contig and position information. Must be build GRCh38.chr1:12345Required
allelesarray<str>Variant alleles (reference and alternate).["A", "G"]Required
freqarray<struct {...}>Array of allele frequency information (AC, AN, AF, homozygote count) for each frequency aggregation group corresponding to each frequency metadata group. The 'adj' value should always be the first value of the array, and the 'raw' value should be the second value.Required
freq.ACint32Alternate allele count.10Required
freq.AFfloat64Alternate allele frequency, (AC/AN).0.1Required
freq.ANint32Total number of alleles.100Required
freq.homozygote_countint32Count of homozygous alternate individuals for the alternate allele.2Required
grpmaxstruct {...}Allele frequency information (AC, AN, AF, homozygote count) for the group with maximum allele frequency.Not Needed
grpmax.ACint32Alternate allele count in the group with the maximum allele frequency.Not Needed
grpmax.AFfloat64Maximum alternate allele frequency, (AC/AN), across all groups.Not Needed
grpmax.ANint32Total number of alleles in the group with the maximum allele frequency.Not Needed
grpmax.homozygote_countint32Count of homozygous individuals in the group with the maximum allele frequency.Not Needed
grpmax.gen_ancstrGenetic ancestry corresponding to the maximum frequency group.Not Needed
fafarray<struct {...}>Filtering allele frequency.Not Needed
faf.faf95float64FAF95: Filtering allele frequency (using Poisson 95% CI).Not Needed
faf.faf99float64FAF99: Filtering allele frequency (using Poisson 99% CI).Not Needed
fafmaxstruct {...}Information about the genetic ancestry group with the maximum filtering allele frequency.Not Needed
fafmax.faf95_maxfloat64Maximum filtering allele frequency (using Poisson 95% CI).Not Needed
fafmax.faf95_max_gen_ancstrGenetic ancestry group with the maximum filtering allele frequency (95% CI).Not Needed
fafmax.faf99_maxfloat64Maximum filtering allele frequency (using Poisson 99% CI).Not Needed
fafmax.faf99_max_gen_ancstrGenetic ancestry group with the maximum filtering allele frequency (99% CI).Not Needed
a_indexint32The original index of this alternate allele in the multiallelic representation (1 is the first alternate allele or the only alternate allele in a biallelic variant).1Required
was_splitboolTrue if this variant was originally multiallelic, otherwise False.FalseRequired
rsidset<str>dbSNP reference SNP identification (rsID) numbers.Not Needed
filtersset<str>Variant filters; 'AC0': Allele count is zero after filtering out low-confidence genotypes (GQ < 20; DP < 10; or AB < 0.2 for het calls), 'AS_VQSR': Failed allele-specific VQSR filtering thresholds, 'InbreedingCoeff': GATK InbreedingCoeff < -0.3. An empty set in this field indicates that the variant passed all variant filters.{"AC0","AS_VQSR"}Optional
infostruct {...}Struct containing typical GATK allele-specific (AS) info fields and additional variant QC fields.Required
info.FSfloat64Phred-scaled p-value of Fisher's exact test for strand bias.7.30e+00Required
info.MQfloat64Root mean square of the mapping quality of reads across all samples.3.48e+01Required
info.MQRankSumfloat64Z-score from Wilcoxon rank sum test of alternate vs. reference read mapping qualities.6.70e-02Required
info.MQRankSum_cdfstruct { ...}CDF summary of overall MQRankSum values. The _raw parameter must be set to True to return an internal representation of the CDF approximation. This annotation is generated using Hail's function: hl.agg.approx_cdf(x, k=XX, _raw=True). The k value must be set to 200 in order to merge with the gnomAD dataset. This method is non-deterministic: computing approx_cdf multiple times will give slightly different results each time. It is currently not possible to seed the aggregator.Required
info.MQRankSum_cdf.levelsarray<int32>List indicating how many items are stored at each compression level.[0,3]Required
info.MQRankSum_cdf.itemsarray<float64>Ordered sample of values from the MQRankSum distribution.[-9.38e-01,-2.27e+00,-1.34e+00]Required
info.MQRankSum_cdf._compaction_countsarray<int32>Used internally to support downstream error estimation.[0]Required
info.QUALapproxint64Sum of PL[0] values; used to approximate the QUAL score.96Required
info.QDfloat32Variant call confidence normalized by depth of sample reads supporting a variant.2.74e+00Required
info.ReadPosRankSumfloat64Z-score from Wilcoxon rank sum test of alternate vs. reference read position bias.-1.07e+00Required
info.ReadPosRankSum_cdfstruct { ...}CDF summary of overall ReadPosRankSum values. The _raw parameter must be set to True to return an internal representation of the CDF approximation. This annotation is generated using Hail's function: hl.agg.approx_cdf(x, k=XX, _raw=True). The k value must be set to 200 in order to merge with the gnomAD dataset. This method is non-deterministic: computing approx_cdf multiple times will give slightly different results each time. It is currently not possible to seed the aggregator.Required
info.ReadPosRankSum_cdf.levelsarray<int32>List indicating how many items are stored at each compression level.[0,3]Required
info.ReadPosRankSum_cdf.itemsarray<float64>Ordered sample of values from the ReadPosRankSum distribution.[9.67e-01,-9.67e-01,9.67e-01]Required
info.ReadPosRankSum_cdf._compaction_countsarray<int32>Used internally to support downstream error estimation.[0]Required
info.SBarray<int32>Aggregate counts of strand depth across all non-homozygous-reference calls. The values are the depth of reference allele on forward strand, depth of the reference allele on reverse strand, depth of all alternate alleles on forward strand, depth of all alternate alleles on reverse strand.[21,6,4,4]Required
info.SORfloat64Strand bias estimated by the symmetric odds ratio test.9.60e-02Required
info.VarDPint32Depth over variant genotypes (does not include depth of reference samples).35Required
info.AS_FSfloat64Allele-specific phred-scaled p-value of Fisher's exact test for strand bias.5.10e+00Required
info.AS_MQfloat64Allele-specific root mean square of the mapping quality of reads across all samples.3.51e+01Required
info.AS_MQRankSumfloat64Allele-specific z-score from Wilcoxon rank sum test of alternate vs. reference read mapping qualities.-5.72e-01Required
info.AS_MQRankSum_cdfstruct { ...}CDF summary of allele-specific MQRankSum values. The _raw parameter must be set to True to return an internal representation of the CDF approximation. This annotation is generated using Hail's function: hl.agg.approx_cdf(x, k=XX, _raw=True). The k value must be set to 200 in order to merge with the gnomAD dataset. This method is non-deterministic: computing approx_cdf multiple times will give slightly different results each time. It is currently not possible to seed the aggregator.Required
info.AS_MQRankSum_cdf.levelsarray<int32>List indicating how many items are stored at each compression level.[0,3]Required
info.AS_MQRankSum_cdf.itemsarray<float64>Ordered sample of values from the allele-specific MQRankSum distribution.[-1.38e+00,4.31e-01,-9.67e-01]Required
info.AS_MQRankSum_cdf._compaction_countsarray<int32>Used internally to support downstream error estimation for allele-specific MQRankSum CDF.[0]Required
info.AS_pab_maxfloat64Maximum p-value over callset for binomial test of observed allele balance for a heterozygous genotype, given expectation of 0.5.6.87e-01Required
info.AS_QUALapproxint64Allele-specific sum of PL[0] values; used to approximate the QUAL score.77Required
info.AS_QDfloat32Allele-specific variant call confidence normalized by depth of sample reads supporting a variant.2.96e+00Required
info.AS_ReadPosRankSumfloat64Allele-specific z-score from Wilcoxon rank sum test of alternate vs. reference read position bias.-1.38e+00Required
info.AS_ReadPosRankSum_cdfstruct { ...}CDF summary of allele-specific ReadPosRankSum values. The _raw parameter must be set to True to return an internal representation of the CDF approximation. This annotation is generated using Hail's function: hl.agg.approx_cdf(x, k=XX, _raw=True). The k value must be set to 200 in order to merge with the gnomAD dataset. This method is non-deterministic: computing approx_cdf multiple times will give slightly different results each time. It is currently not possible to seed the aggregator.Required
info.AS_ReadPosRankSum_cdf.levelsarray<int32>List indicating how many items are stored at each compression level.[0,1]Required
info.AS_ReadPosRankSum_cdf.itemsarray<float64>Ordered sample of values from the allele-specific ReadPosRankSum distribution.[-1.78e-01]Required
info.AS_ReadPosRankSum_cdf._compaction_countsarray<int32>Used internally to support downstream error estimation for allele-specific ReadPosRankSum CDF.[0]Required
info.AS_SB_TABLEarray<int32>Allele-specific forward/reverse read counts for strand bias tests.[21,6,3,3]Required
info.AS_SORfloat64Allele-specific strand bias estimated by the symmetric odds ratio test.9.64e-02Required
info.AS_VarDPint32Allele-specific depth over variant genotypes (does not include depth of reference samples).26Required
info.singletonboolVariant is seen once in the callset.TrueOptional
info.transmitted_singletonboolVariant was a callset-wide doubleton that was transmitted within a family from a parent to a child (i.e., a singleton amongst unrelated samples in cohort).TrueOptional
info.sibling_singletonboolVariant was a callset-wide doubleton that was present only in two siblings (i.e., a singleton amongst unrelated samples in cohort).TrueOptional
info.omniboolVariant is present on the Omni 2.5 genotyping array and found in 1000 Genomes data.Not Needed
info.millsboolIndel is present in the Mills and Devine data.Not Needed
info.monoallelicboolAll samples are homozygous alternate for the variant.Not Needed
info.only_hetboolAll samples are heterozygous for the variant (no homozygous reference or alternate genotype calls).Not Needed
info.inbreeding_coefffloat64Inbreeding coefficient, the excess heterozygosity at a variant site, computed as 1 - (the number of heterozygous genotypes) / (the number of heterozygous genotypes expected under Hardy-Weinberg equilibrium).Not Needed
info.excess_hetfloat64Phred-scaled p-value for exact test of excess heterozygosity.Not Needed
info.vrsstruct { ... }Struct containing information related to the Global Alliance for Genomics and Health (GA4GH) Variation Representation Specification (VRS) standard. VRS annotations must be created using the following tool versions: (vrs_schema_version=2.0.1, vrs_python_version=2.1.3, seqrepo_version=2024-12-20).

Optional
info.vrs.VRS_Allele_IDsarray<str>The computed identifiers for the GA4GH VRS Alleles corresponding to the GT indexes of the reference and alternate alleles.["ga4gh:VA.oTAtTrgYxm81O9fu6Mrhfo1t3eHsgg4L","ga4gh:VA.Y283OnlLjyi1T1IT_JzvW255rC6YJsW6"]Optional
info.vrs.VRS_Startsarray<int32>Interresidue coordinates used as the location starts for the GA4GH VRS Alleles corresponding to the GT indexes of the reference and alternate alleles.[10030,10030]Optional
info.vrs.VRS_Endsarray<int32>Interresidue coordinates used as the location ends for the GA4GH VRS Alleles corresponding to the GT indexes of the reference and alternate alleles.[10031,10031]Optional
info.vrs.VRS_Statesarray<str>The literal sequence states used for the GA4GH VRS Alleles corresponding to the GT indexes of the reference and alternate alleles.["T","C"]Optional
info.vrs.VRS_Lengthsarray<int32>The length values from ReferenceLengthExpression states for the GA4GH VRS Alleles corresponding to the GT indexes of the reference and alternate alleles.[1,NA]Optional
info.vrs.VRS_RepeatSubunitLengthsarray<int32>The repeatSubunitLength values from ReferenceLengthExpression states for the GA4GH VRS Alleles corresponding to the GT indexes of the reference and alternate alleles.[1,NA]Optional
vepstruct { ... }VEP annotations generated by the VEP tool (to be re-annotated).Not Needed
vqsr_resultsstructVQSR related variant annotations.Not Needed
vqsr_results.AS_VQSLODfloat64Allele-specific log-odds ratio of being a true variant versus being a false positive under the trained VQSR Gaussian mixture model.Not Needed
vqsr_results.AS_culpritstrAllele-specific worst-performing annotation in the VQSR Gaussian mixture model.Not Needed
vqsr_results.positive_train_siteboolVariant was used to build the positive training set of high-quality variants for VQSR.Not Needed
vqsr_results.negative_train_siteboolVariant was used to build the negative training set of low-quality variants for VQSR.Not Needed
region_flagsstructStruct containing flags about regions.Not Needed
region_flags.non_parboolVariant falls within a non-pseudoautosomal region.Not Needed
region_flags.lcrboolVariant falls within a low complexity region.Not Needed
region_flags.segdupboolVariant falls within a segmental duplication region.Not Needed
allele_infostructAllele information.Not Needed
allele_info.allele_typestrAllele type (one of: 'snv', 'insertion', 'deletion', or 'mixed').Not Needed
allele_info.n_alt_allelesint32Total number of alternate alleles observed at variant locus.Not Needed
allele_info.variant_typestrVariant type (one of: 'snv', 'indel', 'multi-snv', 'multi-indel', or 'mixed').Not Needed
allele_info.was_mixedboolVariant type was mixed.Not Needed
histogramsstruct { ... }Variant information histograms.Required
histograms.qual_histsstruct { ... }Genotype quality metric histograms for high quality genotypes.Required
histograms.qual_hists.gq_hist_allstruct { ... }Histogram for GQ calculated on high quality genotypes.Required
histograms.qual_hists.gq_hist_all.bin_edgesarray<float64>Bin edges for the GQ histogram calculated on high quality genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Required
histograms.qual_hists.gq_hist_all.bin_freqarray<int64>Bin frequencies for the GQ histogram calculated on high quality genotypes. The number of records found in each bin.[50, 56, 101, ...]Required
histograms.qual_hists.gq_hist_all.n_smallerint64Count of GQ values falling below the lowest histogram bin edge, calculated on high quality genotypes.0Required
histograms.qual_hists.gq_hist_all.n_largerint64Count of GQ values falling above the highest histogram bin edge, calculated on high quality genotypes.0Required
histograms.qual_hists.dp_hist_allstruct { ... }Histogram for DP calculated on high quality genotypes.Required
histograms.qual_hists.dp_hist_all.bin_edgesarray<float64>Bin edges for the DP histogram calculated on high quality genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Required
histograms.qual_hists.dp_hist_all.bin_freqarray<int64>Bin frequencies for the DP histogram calculated on high quality genotypes. The number of records found in each bin.[2, 2, 16, ...]Required
histograms.qual_hists.dp_hist_all.n_smallerint64Count of DP values falling below the lowest histogram bin edge, calculated on high quality genotypes.0Required
histograms.qual_hists.dp_hist_all.n_largerint64Count of DP values falling above the highest histogram bin edge, calculated on high quality genotypes.500Required
histograms.qual_hists.gq_hist_altstruct { ... }Histogram for GQ in non-reference individuals calculated on high quality genotypes.Required
histograms.qual_hists.gq_hist_alt.bin_edgesarray<float64>Bin edges for the histogram of GQ in non-reference individuals calculated on high quality genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Required
histograms.qual_hists.gq_hist_alt.bin_freqarray<int64>Bin frequencies for the histogram of GQ in non-reference individuals calculated on high quality genotypes. The number of records found in each bin.[0, 0 , 1, ...]Required
histograms.qual_hists.gq_hist_alt.n_smallerint64Count of GQ values in non-reference individuals falling below the lowest histogram bin edge, calculated on high quality genotypes.0Required
histograms.qual_hists.gq_hist_alt.n_largerint64Count of GQ values in non-reference individuals falling above the highest histogram bin edge, calculated on high quality genotypes.0Required
histograms.qual_hists.dp_hist_altstruct { ... }Histogram for DP in non-reference individuals calculated on high quality genotypes.Required
histograms.qual_hists.dp_hist_alt.bin_edgesarray<float64>Bin edges for the histogram of DP in non-reference individuals calculated on high quality genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Required
histograms.qual_hists.dp_hist_alt.bin_freqarray<int64>Bin frequencies for the histogram of DP in non-reference individuals calculated on high quality genotypes. The number of records found in each bin.[1, 1, 10, ...]Required
histograms.qual_hists.dp_hist_alt.n_smallerint64Count of DP values in non-reference individuals falling below the lowest histogram bin edge, calculated on high quality genotypes.0Required
histograms.qual_hists.dp_hist_alt.n_largerint64Count of DP values in non-reference individuals falling above the highest histogram bin edge, calculated on high quality genotypes.10Required
histograms.qual_hists.ab_hist_altstruct { ... }Histogram for AB in heterozygous individuals calculated on high quality genotypes.Required
histograms.qual_hists.ab_hist_alt.bin_edgesarray<float64>Bin edges for the histogram of AB in heterozygous individuals calculated on high quality genotypes are: 0.00 | 0.05 | 0.10 | 0.15 | 0.20 | 0.25 | 0.30 | 0.35 | 0.40 | 0.45 | 0.50 | 0.55 | 0.60 | 0.65 | 0.70 | 0.75 | 0.80 | 0.85 | 0.90 | 0.95 | 1.00.[0.00, 0.05, 0.10, ...]Required
histograms.qual_hists.ab_hist_alt.bin_freqarray<int64>Bin frequencies for the histogram of AB in heterozygous individuals calculated on high quality genotypes. The number of records found in each bin.[0, 0, 5, ...]Required
histograms.qual_hists.ab_hist_alt.n_smallerint64Count of AB values in heterozygous individuals falling below the lowest histogram bin edge, calculated on high quality genotypes.0Required
histograms.qual_hists.ab_hist_alt.n_largerint64Count of AB values in heterozygous individuals falling above the highest histogram bin edge, calculated on high quality genotypes.0Required
histograms.raw_qual_histsstruct { ... }Genotype quality metric histograms for all genotypes as opposed to high quality genotypes.Optional
histograms.raw_qual_hists.gq_hist_allstruct { ... }Histogram for GQ calculated on all genotypes.Optional
histograms.raw_qual_hists.gq_hist_all.bin_edgesarray<float64>Bin edges for the GQ histogram calculated on all genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Optional
histograms.raw_qual_hists.gq_hist_all.bin_freqarray<int64>Bin frequencies for the GQ histogram calculated on all genotypes. The number of records found in each bin.[60, 76, 130, ...]Optional
histograms.raw_qual_hists.gq_hist_all.n_smallerint64Count of GQ values falling below lowest histogram bin edge, for GQ calculated on all genotypes.0Optional
histograms.raw_qual_hists.gq_hist_all.n_largerint64Count of GQ values falling above highest histogram bin edge, for GQ calculated on all genotypes.0Optional
histograms.raw_qual_hists.dp_hist_allstruct { ... }Histogram for DP calculated on all genotypes.Optional
histograms.raw_qual_hists.dp_hist_all.bin_edgesarray<float64>Bin edges for the DP histogram calculated on all genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Optional
histograms.raw_qual_hists.dp_hist_all.bin_freqarray<int64>Bin frequencies for the DP histogram calculated on all genotypes. The number of records found in each bin.[4, 7, 20, ...]Optional
histograms.raw_qual_hists.dp_hist_all.n_smallerint64Count of DP values falling below lowest histogram bin edge, for DP calculated on all genotypes.0Optional
histograms.raw_qual_hists.dp_hist_all.n_largerint64Count of DP values falling above highest histogram bin edge, for DP calculated on all genotypes.600Optional
histograms.raw_qual_hists.gq_hist_altstruct { ... }Histogram for GQ in non-reference individuals calculated on all genotypes.Optional
histograms.raw_qual_hists.gq_hist_alt.bin_edgesarray<float64>Bin edges for the histogram of GQ in non-reference individuals calculated on all genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Optional
histograms.raw_qual_hists.gq_hist_alt.bin_freqarray<int64>Bin frequencies for the histogram of GQ in non-reference individuals calculated on all genotypes. The number of records found in each bin.[3, 4, 10, ...]Optional
histograms.raw_qual_hists.gq_hist_alt.n_smallerint64Count of GQ values in non-reference individuals falling below lowest histogram bin edge, calculated on all genotypes.0Optional
histograms.raw_qual_hists.gq_hist_alt.n_largerint64Count of GQ values in non-reference individuals falling above highest histogram bin edge, calculated on all genotypes.0Optional
histograms.raw_qual_hists.dp_hist_altstruct { ... }Histogram for DP in non-reference individuals calculated on all genotypes.Optional
histograms.raw_qual_hists.dp_hist_alt.bin_edgesarray<float64>Bin edges for the histogram of DP in non-reference individuals calculated on all genotypes are: 0.0 | 5.0 | 10.0 | 15.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0.[0.0, 5.0, 10.0, ...]Optional
histograms.raw_qual_hists.dp_hist_alt.bin_freqarray<int64>Bin frequencies for the histogram of DP in non-reference individuals calculated on all genotypes. The number of records found in each bin.[2, 2, 6, ...]Optional
histograms.raw_qual_hists.dp_hist_alt.n_smallerint64Count of DP values in non-reference individuals falling below lowest histogram bin edge, calculated on all genotypes.0Optional
histograms.raw_qual_hists.dp_hist_alt.n_largerint64Count of DP values in non-reference individuals falling above highest histogram bin edge, calculated on all genotypes.11Optional
histograms.raw_qual_hists.ab_hist_altstruct { ... }Histogram for AB in heterozygous individuals calculated on all genotypes.Optional
histograms.raw_qual_hists.ab_hist_alt.bin_edgesarray<float64>Bin edges for the histogram of AB in heterozygous individuals calculated on all genotypes are: 0.00 | 0.05 | 0.10 | 0.15 | 0.20 | 0.25 | 0.30 | 0.35 | 0.40 | 0.45 | 0.50 | 0.55 | 0.60 | 0.65 | 0.70 | 0.75 | 0.80 | 0.85 | 0.90 | 0.95 | 1.00.[0.00, 0.05, 0.10, ...]Optional
histograms.raw_qual_hists.ab_hist_alt.bin_freqarray<int64>Bin frequencies for the histogram of AB in heterozygous individuals calculated on all genotypes. The number of records found in each bin.[0, 0, 6, ...]Optional
histograms.raw_qual_hists.ab_hist_alt.n_smallerint64Count of AB values in heterozygous individuals falling below lowest histogram bin edge, calculated on all genotypes.0Optional
histograms.raw_qual_hists.ab_hist_alt.n_largerint64Count of AB values in heterozygous individuals falling above highest histogram bin edge, calculated on all genotypes.0Optional
histograms.age_histsstruct { ... }Histograms containing age information for high quality genotypes. Cohorts for gnomAD vary in how they report age (some report the age at diagnosis, others report the age of last visit, etc.), so the ages associated with the gnomAD data can be thought of as the last known age of the individual. Information on age is not available for all gnomAD samples. This field is required with the acknowledgement that age data may not be available for all samples. If no age is available for any samples, set all bin_freq values to 0.Required
histograms.age_hists.age_hist_hetstruct { ... }Histogram for age in all heterozygous samples calculated on high quality genotypes.Required
histograms.age_hists.age_hist_het.bin_edgesarray<float64>Bin edges for the age histogram: 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0.[30.0, 35.0, 40.0, ...]Required
histograms.age_hists.age_hist_het.bin_freqarray<int64>Bin frequencies for the age histogram. This is the number of records found in each bin.[0, 3, 4, ...]Required
histograms.age_hists.age_hist_het.n_smallerint64Count of age values falling below lowest histogram bin edge.1Required
histograms.age_hists.age_hist_het.n_largerint64Count of age values falling above highest histogram bin edge.0Required
histograms.age_hists.age_hist_homstruct { ... }Histogram for age in all homozygous samples calculated on high quality genotypes. If variant is in the pseudoautosomal regions of chrX or chrY, this histogram also includes age counts of hemizygous samples.Required
histograms.age_hists.age_hist_hom.bin_edgesarray<float64>Bin edges for the age histogram: 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0.[30.0, 35.0, 40.0, ...]Required
histograms.age_hists.age_hist_hom.bin_freqarray<int64>Bin frequencies for the age histogram. This is the number of records found in each bin.[0, 2, 2, ...]Required
histograms.age_hists.age_hist_hom.n_smallerint64Count of age values falling below lowest histogram bin edge.0Required
histograms.age_hists.age_hist_hom.n_largerint64Count of age values falling above highest histogram bin edge.0Required
in_silico_predictorsstruct { ... }Variant prediction annotations.Not Needed
in_silico_predictors.caddstruct { ... }Score used to predict deleteriousness of SNVs and indels.Not Needed
in_silico_predictors.cadd.phredfloat32CADD Phred-like scaled C-scores ranging from 1 to 99 based on the rank of each variant relative to all possible 8.6 billion substitutions in the human reference genome. Larger values indicate increased predicted deleteriousness.Not Needed
in_silico_predictors.cadd.raw_scorefloat32Unscaled CADD scores indicating whether a variant is likely to be "observed" (negative values) vs "simulated" (positive values). Larger values indicate increased predicted deleteriousness.Not Needed
in_silico_predictors.revel_maxfloat64An ensemble score for predicting the pathogenicity of missense variants (based on 13 other variant predictors).Not Needed
in_silico_predictors.spliceai_ds_maxfloat32Maximum delta score across 4 splicing consequences, which reflects the probability of the variant being splice-altering.Not Needed
in_silico_predictors.pangolin_largest_dsfloat64Largest delta score across 2 splicing consequences, which reflects the probability of the variant being splice-altering.Not Needed
in_silico_predictors.phylopfloat64Base-wise conservation score across the 241 placental mammals in the Zoonomia project. Score ranges from -20 to 9.28, and reflects acceleration (faster evolution than expected under neutral drift, assigned negative scores) as well as conservation (slower than expected evolution, assigned positive scores).Not Needed
in_silico_predictors.sift_maxfloat64Score reflecting the scaled probability of the amino acid substitution being tolerated, ranging from 0 to 1.Not Needed
in_silico_predictors.polyphen_maxfloat64Score that predicts the possible impact of an amino acid substitution on the structure and function of a human protein, ranging from 0.0 (tolerated) to 1.0 (deleterious).Not Needed
vep115struct { ... }VEP 115 annotations generated by the VEP tool (to be re-annotated).Not Needed
\ No newline at end of file diff --git a/gnomad_qc/v5/data_ingestion/field_requirements.md b/gnomad_qc/federated/data_ingestion/field_requirements.md similarity index 92% rename from gnomad_qc/v5/data_ingestion/field_requirements.md rename to gnomad_qc/federated/data_ingestion/field_requirements.md index 1cc4745bc..2b15cc7ec 100644 --- a/gnomad_qc/v5/data_ingestion/field_requirements.md +++ b/gnomad_qc/federated/data_ingestion/field_requirements.md @@ -17,7 +17,7 @@ All histograms annotations must use the same bin edges as defined in their respe | **faf_meta** | `array>` | Filtering allele frequency metadata. An ordered list containing the frequency aggregation group for each element of the ‘faf’ array row annotation. | — | Not Needed | | **faf_index_dict** | `dict` | Dictionary keyed by specified label grouping combinations ('group': 'adj'/'raw', 'gen_anc': inferred genetic ancestry group, 'sex': sex karyotype), with values describing the corresponding index of each grouping entry in the filtering allele frequency (‘faf’) row annotation. | — | Not Needed | | **age_distribution** | `struct { ... }` | Callset-wide age histogram. Cohorts for gnomAD vary in how they report age (some report the age at diagnosis, others report the age of last visit, etc), so the ages associated with the gnomAD data can be thought of as the last known age of the individual. Information on age is not available for all gnomAD samples. This field is required with the acknowledgement that age data may not be available for all samples. If no age is available for any samples, set all `bin_freq` values to `0`. | — | Required | -| age_distribution.bin_edges | `array` | Bin edges for age histogram: `30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0`. | `[30.0, 35.0, 40.0, 45.0, ...]` | Required | +| age_distribution.bin_edges | `array` | Bin edges for age histogram: 30.0 \| 35.0 \| 40.0 \| 45.0 \| 50.0 \| 55.0 \| 60.0 \| 65.0 \| 70.0 \| 75.0 \| 80.0. 
| `[30.0, 35.0, 40.0, 45.0, ...]` | Required | | age_distribution.bin_freq | `array` | Bin frequencies for the age histogram. This is the number of records found in each bin. | `[101, 122, 85, 4, ...]` | Required | | age_distribution.n_smaller | `int32` | Count of age values falling below lowest histogram bin edge. | `1000` | Required | | age_distribution.n_larger | `int32` | Count of age values falling above highest histogram bin edge. | `30` | Required | @@ -54,6 +54,12 @@ All histograms annotations must use the same bin edges as defined in their respe | vep_globals.vep_config | `str` | VEP configuration to run VEP version with Hail. File created using command within VEP init shell script in https://github.com/broadinstitute/gnomad_methods/tree/main. | — | Not Needed | | vep_globals.gencode_version | `str` | GENCODE version used in VEP. | — | Not Needed | | vep_globals.mane_select_version | `str` | MANE select version used in VEP. | — | Not Needed | +| **vep115_globals** | `struct { ... }` | Information about VEP 115 annotations. | — | Not Needed | +| vep115_globals.vep_version | `str` | VEP version that was run on the callset (115). | — | Not Needed | +| vep115_globals.vep_help | `str` | Output from vep --help for VEP 115. | — | Not Needed | +| vep115_globals.vep_config | `str` | VEP 115 configuration to run VEP version with Hail. | — | Not Needed | +| vep115_globals.gencode_version | `str` | GENCODE version used in VEP 115. | — | Not Needed | +| vep115_globals.mane_select_version | `str` | MANE select version used in VEP 115. | — | Not Needed | | **frequency_README** | `str` | Explanation of how to use the 'freq_index_dict' global annotation to extract frequencies from the 'freq' row annotation. | — | Not Needed | | **date** | `str` | Date Hail Table was created. | `"2025-04-09"` | Required | | **version** | `str` | Version of the file. 
| — | Not Needed | @@ -164,64 +170,64 @@ Data must be supplied as a "split" dataset, where multiallelic variants are spli | **histograms** | `struct { ... }` | Variant information histograms. | — | Required | | histograms.qual_hists | `struct { ... }` | Genotype quality metric histograms for high quality genotypes. | — | Required | | histograms.qual_hists.gq_hist_all | `struct { ... }` | Histogram for GQ calculated on high quality genotypes. | — | Required | -| histograms.qual_hists.gq_hist_all.bin_edges | `array` | Bin edges for the GQ histogram calculated on high quality genotypes are: `0.0 | 5.0 | 10.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0`. | `[0.0, 5.0, 10.0, ...]` | Required | +| histograms.qual_hists.gq_hist_all.bin_edges | `array` | Bin edges for the GQ histogram calculated on high quality genotypes are: 0.0 \| 5.0 \| 10.0 \| 20.0 \| 25.0 \| 30.0 \| 35.0 \| 40.0 \| 45.0 \| 50.0 \| 55.0 \| 60.0 \| 65.0 \| 70.0 \| 75.0 \| 80.0 \| 85.0 \| 90.0 \| 95.0 \| 100.0. | `[0.0, 5.0, 10.0, ...]` | Required | | histograms.qual_hists.gq_hist_all.bin_freq | `array` | Bin frequencies for the GQ histogram calculated on high quality genotypes. The number of records found in each bin. | `[50, 56, 101, ...]` | Required | | histograms.qual_hists.gq_hist_all.n_smaller | `int64` | Count of GQ values falling below the lowest histogram bin edge, calculated on high quality genotypes. | `0` | Required | | histograms.qual_hists.gq_hist_all.n_larger | `int64` | Count of GQ values falling above the highest histogram bin edge, calculated on high quality genotypes. | `0` | Required | | histograms.qual_hists.dp_hist_all | `struct { ... }` | Histogram for DP calculated on high quality genotypes. 
| — | Required | -| histograms.qual_hists.dp_hist_all.bin_edges | `array` | Bin edges for the DP histogram calculated on high quality genotypes are: `0.0 | 5.0 | 10.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0`. | `[0.0, 5.0, 10.0, ...]` | Required | +| histograms.qual_hists.dp_hist_all.bin_edges | `array` | Bin edges for the DP histogram calculated on high quality genotypes are: 0.0 \| 5.0 \| 10.0 \| 20.0 \| 25.0 \| 30.0 \| 35.0 \| 40.0 \| 45.0 \| 50.0 \| 55.0 \| 60.0 \| 65.0 \| 70.0 \| 75.0 \| 80.0 \| 85.0 \| 90.0 \| 95.0 \| 100.0. | `[0.0, 5.0, 10.0, ...]` | Required | | histograms.qual_hists.dp_hist_all.bin_freq | `array` | Bin frequencies for the DP histogram calculated on high quality genotypes. The number of records found in each bin. | `[2, 2, 16, ...]` | Required | | histograms.qual_hists.dp_hist_all.n_smaller | `int64` | Count of DP values falling below the lowest histogram bin edge, calculated on high quality genotypes. | `0` | Required | | histograms.qual_hists.dp_hist_all.n_larger | `int64` | Count of DP values falling above the highest histogram bin edge, calculated on high quality genotypes. | `500` | Required | | histograms.qual_hists.gq_hist_alt | `struct { ... }` | Histogram for GQ in non-reference individuals calculated on high quality genotypes. | — | Required | -| histograms.qual_hists.gq_hist_alt.bin_edges | `array` | Bin edges for the histogram of GQ in non-reference individuals calculated on high quality genotypes are: `0.0 | 5.0 | 10.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0`. 
| `[0.0, 5.0, 10.0, ...]` | Required | +| histograms.qual_hists.gq_hist_alt.bin_edges | `array` | Bin edges for the histogram of GQ in non-reference individuals calculated on high quality genotypes are: 0.0 \| 5.0 \| 10.0 \| 20.0 \| 25.0 \| 30.0 \| 35.0 \| 40.0 \| 45.0 \| 50.0 \| 55.0 \| 60.0 \| 65.0 \| 70.0 \| 75.0 \| 80.0 \| 85.0 \| 90.0 \| 95.0 \| 100.0. | `[0.0, 5.0, 10.0, ...]` | Required | | histograms.qual_hists.gq_hist_alt.bin_freq | `array` | Bin frequencies for the histogram of GQ in non-reference individuals calculated on high quality genotypes. The number of records found in each bin. | `[0, 0 , 1, ...]` | Required | | histograms.qual_hists.gq_hist_alt.n_smaller | `int64` | Count of GQ values in non-reference individuals falling below the lowest histogram bin edge, calculated on high quality genotypes. | `0` | Required | | histograms.qual_hists.gq_hist_alt.n_larger | `int64` | Count of GQ values in non-reference individuals falling above the highest histogram bin edge, calculated on high quality genotypes. | `0` | Required | | histograms.qual_hists.dp_hist_alt | `struct { ... }` | Histogram for DP in non-reference individuals calculated on high quality genotypes. | — | Required | -| histograms.qual_hists.dp_hist_alt.bin_edges | `array` | Bin edges for the histogram of DP in non-reference individuals calculated on high quality genotypes are: `0.0 | 5.0 | 10.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0`. | `[0.0, 5.0, 10.0, ...]` | Required | +| histograms.qual_hists.dp_hist_alt.bin_edges | `array` | Bin edges for the histogram of DP in non-reference individuals calculated on high quality genotypes are: 0.0 \| 5.0 \| 10.0 \| 20.0 \| 25.0 \| 30.0 \| 35.0 \| 40.0 \| 45.0 \| 50.0 \| 55.0 \| 60.0 \| 65.0 \| 70.0 \| 75.0 \| 80.0 \| 85.0 \| 90.0 \| 95.0 \| 100.0. 
| `[0.0, 5.0, 10.0, ...]` | Required | | histograms.qual_hists.dp_hist_alt.bin_freq | `array` | Bin frequencies for the histogram of DP in non-reference individuals calculated on high quality genotypes. The number of records found in each bin. | `[1, 1, 10, ...]` | Required | | histograms.qual_hists.dp_hist_alt.n_smaller | `int64` | Count of DP values in non-reference individuals falling below the lowest histogram bin edge, calculated on high quality genotypes. | `0` | Required | | histograms.qual_hists.dp_hist_alt.n_larger | `int64` | Count of DP values in non-reference individuals falling above highest histogram bin edge, calculated on high quality genotypes. | `10` | Required | | histograms.qual_hists.ab_hist_alt | `struct { ... }` | Histogram for AB in heterozygous individuals calculated on high quality genotypes. | — | Required | -| histograms.qual_hists.ab_hist_alt.bin_edges | `array` | Bin edges for the histogram of AB in heterozygous individuals calculated on high quality genotypes are: `0.00 | 0.05 | 0.10 | 0.20 | 0.25 | 0.30 | 0.35 | 0.40 | 0.45 | 0.50 | 0.55 | 0.60 | 0.65 | 0.70 | 0.75 | 0.80 | 0.85 | 0.90 | 0.95 | 1.00`. | `[0.00 , 0.05 , 0.10, ...]` | Required | +| histograms.qual_hists.ab_hist_alt.bin_edges | `array` | Bin edges for the histogram of AB in heterozygous individuals calculated on high quality genotypes are: 0.00 \| 0.05 \| 0.10 \| 0.20 \| 0.25 \| 0.30 \| 0.35 \| 0.40 \| 0.45 \| 0.50 \| 0.55 \| 0.60 \| 0.65 \| 0.70 \| 0.75 \| 0.80 \| 0.85 \| 0.90 \| 0.95 \| 1.00. | `[0.00 , 0.05 , 0.10, ...]` | Required | | histograms.qual_hists.ab_hist_alt.bin_freq | `array` | Bin frequencies for the histogram of AB in heterozygous individuals calculated on high quality genotypes. The number of records found in each bin. | `[0 , 0 , 5, ...]` | Required | | histograms.qual_hists.ab_hist_alt.n_smaller | `int64` | Count of AB values in heterozygous individuals falling below the lowest histogram bin edge, calculated on high quality genotypes. 
| `0` | Required | | histograms.qual_hists.ab_hist_alt.n_larger | `int64` | Count of AB values in heterozygous individuals falling above the highest histogram bin edge, calculated on high quality genotypes. | `0` | Required | | histograms.raw_qual_hists | `struct { ... }` | Genotype quality metric histograms for all genotypes as opposed to high quality genotypes. | — | Optional | | histograms.raw_qual_hists.gq_hist_all | `struct { ... }` | Histogram for GQ calculated on all genotypes. | — | Optional | -| histograms.raw_qual_hists.gq_hist_all.bin_edges | `array` | Bin edges for the GQ histogram calculated on all genotypes are: `0.0 | 5.0 | 10.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0`. | `[0.0, 5.0, 10.0, ...]` | Optional | +| histograms.raw_qual_hists.gq_hist_all.bin_edges | `array` | Bin edges for the GQ histogram calculated on all genotypes are: 0.0 \| 5.0 \| 10.0 \| 20.0 \| 25.0 \| 30.0 \| 35.0 \| 40.0 \| 45.0 \| 50.0 \| 55.0 \| 60.0 \| 65.0 \| 70.0 \| 75.0 \| 80.0 \| 85.0 \| 90.0 \| 95.0 \| 100.0. | `[0.0, 5.0, 10.0, ...]` | Optional | | histograms.raw_qual_hists.gq_hist_all.bin_freq | `array` | Bin frequencies for the GQ histogram calculated on all genotypes. The number of records found in each bin. | `[60, 76, 130, ...]` | Optional | | histograms.raw_qual_hists.gq_hist_all.n_smaller | `int64` | Count of GQ values falling below lowest histogram bin edge, for GQ calculated on all genotypes. | `0` | Optional | | histograms.raw_qual_hists.gq_hist_all.n_larger | `int64` | Count of GQ values falling above highest histogram bin edge, for GQ calculated on all genotypes. | `0` | Optional | | histograms.raw_qual_hists.dp_hist_all | `struct { ... }` | Histogram for DP calculated on all genotypes. 
| — | Optional | -| histograms.raw_qual_hists.dp_hist_all.bin_edges | `array` | Bin edges for the DP histogram calculated on all genotypes are: `0.0 | 5.0 | 10.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0`. | `[0.0, 5.0, 10.0, ...]` | Optional | +| histograms.raw_qual_hists.dp_hist_all.bin_edges | `array` | Bin edges for the DP histogram calculated on all genotypes are: 0.0 \| 5.0 \| 10.0 \| 20.0 \| 25.0 \| 30.0 \| 35.0 \| 40.0 \| 45.0 \| 50.0 \| 55.0 \| 60.0 \| 65.0 \| 70.0 \| 75.0 \| 80.0 \| 85.0 \| 90.0 \| 95.0 \| 100.0. | `[0.0, 5.0, 10.0, ...]` | Optional | | histograms.raw_qual_hists.dp_hist_all.bin_freq | `array` | Bin frequencies for the DP histogram calculated on all genotypes. The number of records found in each bin. | `[4, 7, 20, ...]` | Optional | | histograms.raw_qual_hists.dp_hist_all.n_smaller | `int64` | Count of DP values falling below lowest histogram bin edge, for DP calculated on all genotypes. | `0` | Optional | | histograms.raw_qual_hists.dp_hist_all.n_larger | `int64` | Count of DP values falling above highest histogram bin edge, for DP calculated on all genotypes. | `600` | Optional | | histograms.raw_qual_hists.gq_hist_alt | `struct { ... }` | Histogram for GQ in non-reference individuals calculated on all genotypes. | — | Optional | -| histograms.raw_qual_hists.gq_hist_alt.bin_edges | `array` | Bin edges for the histogram of GQ in non-reference individuals calculated on all genotypes are: `0.0 | 5.0 | 10.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0`. 
| `[0.0, 5.0, 10.0, ...]` | Optional | +| histograms.raw_qual_hists.gq_hist_alt.bin_edges | `array` | Bin edges for the histogram of GQ in non-reference individuals calculated on all genotypes are: 0.0 \| 5.0 \| 10.0 \| 20.0 \| 25.0 \| 30.0 \| 35.0 \| 40.0 \| 45.0 \| 50.0 \| 55.0 \| 60.0 \| 65.0 \| 70.0 \| 75.0 \| 80.0 \| 85.0 \| 90.0 \| 95.0 \| 100.0. | `[0.0, 5.0, 10.0, ...]` | Optional | | histograms.raw_qual_hists.gq_hist_alt.bin_freq | `array` | Bin frequencies for the histogram of GQ in non-reference individuals calculated on all genotypes. The number of records found in each bin. | `[3, 4, 10, ...]` | Optional | | histograms.raw_qual_hists.gq_hist_alt.n_smaller | `int64` | Count of GQ values in non-reference individuals falling below lowest histogram bin edge, calculated on all genotypes. | `0` | Optional | | histograms.raw_qual_hists.gq_hist_alt.n_larger | `int64` | Count of GQ values in non-reference individuals falling above highest histogram bin edge, calculated on all genotypes. | `0` | Optional | | histograms.raw_qual_hists.dp_hist_alt | `struct { ... }` | Histogram for DP in non-reference individuals calculated on all genotypes. | — | Optional | -| histograms.raw_qual_hists.dp_hist_alt.bin_edges | `array` | Bin edges for the histogram of DP in non-reference individuals calculated on all genotypes are: `0.0 | 5.0 | 10.0 | 20.0 | 25.0 | 30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0 | 85.0 | 90.0 | 95.0 | 100.0`. | `[0.0, 5.0, 10.0, ...]` | Optional | +| histograms.raw_qual_hists.dp_hist_alt.bin_edges | `array` | Bin edges for the histogram of DP in non-reference individuals calculated on all genotypes are: 0.0 \| 5.0 \| 10.0 \| 20.0 \| 25.0 \| 30.0 \| 35.0 \| 40.0 \| 45.0 \| 50.0 \| 55.0 \| 60.0 \| 65.0 \| 70.0 \| 75.0 \| 80.0 \| 85.0 \| 90.0 \| 95.0 \| 100.0. 
| `[0.0, 5.0, 10.0, ...]` | Optional | | histograms.raw_qual_hists.dp_hist_alt.bin_freq | `array` | Bin frequencies for the histogram of DP in non-reference individuals calculated on all genotypes. The number of records found in each bin. | `[2, 2, 6, ...]` | Optional | | histograms.raw_qual_hists.dp_hist_alt.n_smaller | `int64` | Count of DP values in non-reference individuals falling below lowest histogram bin edge, calculated on all genotypes. | `0` | Optional | | histograms.raw_qual_hists.dp_hist_alt.n_larger | `int64` | Count of DP values in non-reference individuals falling above highest histogram bin edge, calculated on all genotypes. | `11` | Optional | | histograms.raw_qual_hists.ab_hist_alt | `struct { ... }` | Histogram for AB in heterozygous individuals calculated on all genotypes. | — | Optional | -| histograms.raw_qual_hists.ab_hist_alt.bin_edges | `array` | Bin edges for the histogram of AB in heterozygous individuals calculated on all genotypes are: `0.00 | 0.05 | 0.10 | 0.20 | 0.25 | 0.30 | 0.35 | 0.40 | 0.45 | 0.50 | 0.55 | 0.60 | 0.65 | 0.70 | 0.75 | 0.80 | 0.85 | 0.90 | 0.95 | 1.00`. | `[0.00 , 0.05 , 0.10, ...]` | Optional | +| histograms.raw_qual_hists.ab_hist_alt.bin_edges | `array` | Bin edges for the histogram of AB in heterozygous individuals calculated on all genotypes are: 0.00 \| 0.05 \| 0.10 \| 0.20 \| 0.25 \| 0.30 \| 0.35 \| 0.40 \| 0.45 \| 0.50 \| 0.55 \| 0.60 \| 0.65 \| 0.70 \| 0.75 \| 0.80 \| 0.85 \| 0.90 \| 0.95 \| 1.00. | `[0.00 , 0.05 , 0.10, ...]` | Optional | | histograms.raw_qual_hists.ab_hist_alt.bin_freq | `array` | Bin frequencies for the histogram of AB in heterozygous individuals calculated on all genotypes. The number of records found in each bin. | `[0 , 0 , 6, ...]` | Optional | | histograms.raw_qual_hists.ab_hist_alt.n_smaller | `int64` | Count of AB values in heterozygous individuals falling below lowest histogram bin edge, calculated on all genotypes. 
| `0` | Optional | | histograms.raw_qual_hists.ab_hist_alt.n_larger | `int64` | Count of AB values in heterozygous individuals falling above highest histogram bin edge, calculated on all genotypes. | `0` | Optional | | histograms.age_hists | `struct { ... }` | Histograms containing age information for high quality genotypes. Cohorts for gnomAD vary in how they report age (some report the age at diagnosis, others report the age of last visit, etc), so the ages associated with the gnomAD data can be thought of as the last known age of the individual. Information on age is not available for all gnomAD samples. This field is required with the acknowledgement that age data may not be available for all samples. If no age is available for any samples, set all `bin_freq` values to `0`. | — | Required | | histograms.age_hists.age_hist_het | `struct { ... }` | Histogram for age in all heterozygous samples calculated on high quality genotypes. | — | Required | -| histograms.age_hists.age_hist_het.bin_edges | `array` | Bin edges for the age histogram: `30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0`. | `[30.0, 35.0, 40.0, ...]` | Required | +| histograms.age_hists.age_hist_het.bin_edges | `array` | Bin edges for the age histogram: 30.0 \| 35.0 \| 40.0 \| 45.0 \| 50.0 \| 55.0 \| 60.0 \| 65.0 \| 70.0 \| 75.0 \| 80.0. | `[30.0, 35.0, 40.0, ...]` | Required | | histograms.age_hists.age_hist_het.bin_freq | `array` | Bin frequencies for the age histogram. This is the number of records found in each bin. | `[0, 3, 4, ...]` | Required | | histograms.age_hists.age_hist_het.n_smaller | `int64` | Count of age values falling below lowest histogram bin edge. | `1` | Required | | histograms.age_hists.age_hist_het.n_larger | `int64` | Count of age values falling above highest histogram bin edge. | `0` | Required | | histograms.age_hists.age_hist_hom | `struct { ... }` | Histogram for age in all homozygous samples calculated on high quality genotypes. 
If variant is in the pseudoautosomal regions of chrX or chrY, this histogram also includes age counts of hemizygous samples. | — | Required | -| histograms.age_hists.age_hist_hom.bin_edges | `array` | Bin edges for the age histogram: `30.0 | 35.0 | 40.0 | 45.0 | 50.0 | 55.0 | 60.0 | 65.0 | 70.0 | 75.0 | 80.0`. | `[30.0, 35.0, 40.0, ...]`| Required | +| histograms.age_hists.age_hist_hom.bin_edges | `array` | Bin edges for the age histogram: 30.0 \| 35.0 \| 40.0 \| 45.0 \| 50.0 \| 55.0 \| 60.0 \| 65.0 \| 70.0 \| 75.0 \| 80.0. | `[30.0, 35.0, 40.0, ...]`| Required | | histograms.age_hists.age_hist_hom.bin_freq | `array` | Bin frequencies for the age histogram. This is the number of records found in each bin. | `[0, 2, 2, ...]` | Required | | histograms.age_hists.age_hist_hom.n_smaller | `int64` | Count of age values falling below lowest histogram bin edge. | `0` | Required | | histograms.age_hists.age_hist_hom.n_larger | `int64` | Count of age values falling above highest histogram bin edge. | `0` | Required | @@ -235,3 +241,4 @@ Data must be supplied as a "split" dataset, where multiallelic variants are spli | in_silico_predictors.phylop | `float64` | Base-wise conservation score across the 241 placental mammals in the Zoonomia project. Score ranges from -20 to 9.28, and reflects acceleration (faster evolution than expected under neutral drift, assigned negative scores) as well as conservation (slower than expected evolution, assigned positive scores). | — | Not Needed | | in_silico_predictors.sift_max | `float64` | Score reflecting the scaled probability of the amino acid substitution being tolerated, ranging from 0 to 1. | — | Not Needed | | in_silico_predictors.polyphen_max | `float64` | Score that predicts the possible impact of an amino acid substitution on the structure and function of a human protein, ranging from 0.0 (tolerated) to 1.0 (deleterious). | — | Not Needed | +| **vep115** | `struct { ... 
}` | VEP 115 annotations generated by the VEP tool (to be re-annotated). | — | Not Needed | diff --git a/gnomad_qc/v5/data_ingestion/federated_validity_checks.py b/gnomad_qc/federated/federated_validity_checks.py similarity index 66% rename from gnomad_qc/v5/data_ingestion/federated_validity_checks.py rename to gnomad_qc/federated/federated_validity_checks.py index 5c3fd44f6..5c5a9b86e 100644 --- a/gnomad_qc/v5/data_ingestion/federated_validity_checks.py +++ b/gnomad_qc/federated/federated_validity_checks.py @@ -1,12 +1,16 @@ """Script to perform validity checks on input federated data or final release files.""" import argparse +import importlib +import inspect import json import logging import re from collections import defaultdict +from copy import deepcopy from io import StringIO -from typing import Any, Dict, List, Tuple +from pprint import pformat +from typing import Any, Dict, List, Optional, Tuple import hail as hl from bs4 import BeautifulSoup @@ -18,23 +22,20 @@ check_raw_and_adj_callstats, check_sex_chr_metrics, compare_subset_freqs, - compute_missingness, flatten_missingness_struct, sum_group_callstats, summarize_variant_filters, summarize_variants, unfurl_array_annotations, ) -from gnomad.resources.grch38.gnomad import public_release +from gnomad.resources.resource_utils import VersionedTableResource +from gnomad.utils.filtering import remove_fields_from_constant from gnomad.utils.reference_genome import get_reference_genome +from gnomad.utils.vcf import ALLELE_TYPE_FIELDS, REGION_FLAG_FIELDS from jsonschema import validate from jsonschema.exceptions import ValidationError -from gnomad_qc.v4.create_release.validate_and_export_vcf import ( - ALLELE_TYPE_FIELDS, - REGION_FLAG_FIELDS, -) -from gnomad_qc.v5.configs.validity_inputs_schema import schema +from gnomad_qc.federated.configs.validity_inputs_schema import schema from gnomad_qc.v5.resources.basics import get_logging_path for handler in logging.root.handlers[:]: @@ -61,12 +62,38 @@ 
memory_handler.setFormatter(formatter) logger.addHandler(memory_handler) -ALLELE_TYPE_FIELDS = ALLELE_TYPE_FIELDS["genomes"] -REGION_FLAG_FIELDS = REGION_FLAG_FIELDS["genomes"] +# Remove original alleles for containing non-releasable alleles. +ALLELE_TYPE_FIELDS = deepcopy(ALLELE_TYPE_FIELDS) +ALLELE_TYPE_FIELDS = remove_fields_from_constant( + ALLELE_TYPE_FIELDS, ["original_alleles"] +) + +ALLELE_TYPE_FIELDS = { + "exomes": ALLELE_TYPE_FIELDS, + "genomes": remove_fields_from_constant(ALLELE_TYPE_FIELDS, ["has_star"]), +} + +# Drop decoy, still doesn't exist on 38. +REGION_FLAG_FIELDS = deepcopy(REGION_FLAG_FIELDS) +REGION_FLAG_FIELDS = remove_fields_from_constant( + REGION_FLAG_FIELDS, ["decoy", "nonpar"] +) +REGION_FLAG_FIELDS = { + "exomes": ( + REGION_FLAG_FIELDS + + [ + "fail_interval_qc", + "outside_ukb_capture_region", + "outside_broad_capture_region", + ] + ), + "genomes": REGION_FLAG_FIELDS, +} def get_table_kind(lines, header_index) -> str: - """Determine whether a markdown table corresponds to "global" or "row" fields by scanning upward from the table header line. + """ + Determine whether a markdown table corresponds to "global" or "row" fields by scanning upward from the table header line. :param lines: The full list of lines from the markdown document. :param header_index: The index of the table header line (the line with column names). @@ -343,12 +370,16 @@ def validate_config_fields_in_ht(ht: hl.Table, config: Dict[str, Any]) -> None: missing_fields["globals"] = missing_global_fields # Check that specified row annotations are present. - row_fields = array_struct_annotations + config["struct_annotations_for_missingness"] + structs_to_skip_missingness = config.get( + "struct_annotations_to_skip_missingness", [] + ) + + row_fields = array_struct_annotations + structs_to_skip_missingness missing_row_fields = [i for i in row_fields if i not in ht.row] missing_fields["rows"] = missing_row_fields - # Check that specified info annotations are present. 
+ # Check that specified info annotations are present when configured. if config.get("check_mono_and_only_het"): info_annotations = ["monoallelic", "only_het"] info_fields = list(ht.info.dtype) @@ -509,65 +540,257 @@ def _check_field_exists_and_type( return field_issues, type_issues, fields_validated, types_validated +def check_fields_not_in_requirements( + ht: hl.Table, field_types: Dict[str, Dict[str, Any]] +) -> None: + """ + Warn about fields in HT missing from requirements. + + :param ht: Hail Table. + :param field_types: Nested dictionary of both global and row fields and their expected types. There should be two keys: "global_field_types" and "row_field_types". + :return: None. + """ + + def _flatten_dtype(dtype: hl.expr.types.HailType, prefix: str = "") -> List[str]: + """Recursively extract nested names from a Hail DataType.""" + names = [] + + # Handle structs. + if isinstance(dtype, hl.tstruct): + for field, field_dtype in dtype.items(): + name = f"{prefix}.{field}" if prefix else field + # Check if this field itself is a struct or container + names.extend(_flatten_dtype(field_dtype, name)) + # Handle arrays and sets. + elif isinstance(dtype, (hl.tarray, hl.tset)): + names.extend(_flatten_dtype(dtype.element_type, prefix)) + # Handle dicts. + elif isinstance(dtype, hl.tdict): + names.extend(_flatten_dtype(dtype.value_type, prefix)) + else: + if prefix: + names.append(prefix) + + return names + + # Define the mapping between HT components and the requirements dict. 
+ tasks = [ + ("Global", ht.globals.dtype, "global_field_types"), + ("Row", ht.row.dtype, "row_field_types"), + ] + + for label, dtype, req_key in tasks: + table_fields = set(_flatten_dtype(dtype)) + required_fields = set(field_types.get(req_key, {}).keys()) + + unexpected = table_fields - required_fields + + if unexpected: + logger.warning( + "%s fields present in Table but missing from requirements: %s", + label, + ", ".join(sorted(unexpected)), + ) + + +def filter_to_test_partitions( + ht: hl.Table, + test_n_partitions: int = 2, +) -> hl.Table: + """ + Filter the Table to a specified number of partitions on autosomes and sex chromosomes for testing purposes. + + :param ht: Input Table. + :param test_n_partitions: Number of partitions to filter to. Default is 2. + :return: Filtered Table with only the specified number of partitions. + """ + test_ht = ht._filter_partitions(range(test_n_partitions)) + x_ht = hl.filter_intervals( + ht, [hl.parse_locus_interval("chrX")] + )._filter_partitions(range(test_n_partitions)) + + y_ht = hl.filter_intervals( + ht, [hl.parse_locus_interval("chrY")] + )._filter_partitions(range(test_n_partitions)) + + ht = test_ht.union(x_ht, y_ht) + + return ht + + def check_missingness( ht: hl.Table, missingness_threshold: float = 0.5, - struct_annotations: List[str] = ["grpmax", "fafmax", "histograms"], + structs_to_not_traverse: Optional[Tuple[str]] = ("vep",), ) -> None: """ - Check for and report the fraction of missing data in the Table. + Check for and report the fraction of missing data in row annotations. + + For struct annotations, missingness is checked recursively unless the + annotation name is included in `structs_to_not_traverse`, in which case + only top-level missingness of the struct itself is checked. :param ht: Input Table. - :param missingness_threshold: Upper cutoff for allowed amount of missingness. Default is 0.50. - :param struct_annotations: List of struct annotations to check for missingness. 
Default is ['grpmax', 'fafmax', 'histograms']. + :param missingness_threshold: Upper cutoff for allowed amount of + missingness. Default is 0.50. + :param structs_to_not_traverse: Optional tuple of top-level struct row + annotations that should be treated as a single field rather than + recursively traversed. Default is ("vep",). :return: None """ - logger.info("Checking for missingness within struct annotations...") - logger.info("Struct annotations being checked: %s.", struct_annotations) - # Determine missingness of each struct annotation. + n_sites = ht.count() + structs_to_not_traverse = tuple(structs_to_not_traverse or ()) + + logger.info( + "Missingness threshold (upper cutoff for allowed missingness): %.2f", + missingness_threshold, + ) + metric_missingness = {} - for metric in struct_annotations: - metric_missingness.update(check_missingness_of_struct(ht[metric], metric)) + struct_annotations_checked = [] + non_struct_annotations_checked = [] + non_traversed_struct_annotations = [] + + for field, dtype in ht.row.dtype.items(): + field_expr = ht[field] + + if isinstance(dtype, hl.tstruct): + if field in structs_to_not_traverse: + non_traversed_struct_annotations.append(field) + metric_missingness[field] = hl.agg.sum(hl.is_missing(field_expr)) + else: + struct_annotations_checked.append(field) + metric_missingness.update( + check_missingness_of_struct(field_expr, field) + ) + else: + non_struct_annotations_checked.append(field) + metric_missingness[field] = hl.agg.sum(hl.is_missing(field_expr)) + + logger.info( + "Struct annotations being recursively checked: %s.", + struct_annotations_checked, + ) + logger.info( + "Struct annotations checked only at the top level: %s.", + non_traversed_struct_annotations, + ) + logger.info( + "Non-struct annotations being checked: %s.", + non_struct_annotations_checked, + ) + logger.info( + "Checking missingness for %d annotations.", + len(metric_missingness), + ) - missingness_struct = 
ht.aggregate(hl.struct(**metric_missingness)) - missingness_dict = flatten_missingness_struct(missingness_struct) + output = flatten_missingness_struct(ht.aggregate(hl.struct(**metric_missingness))) - # Report whether or not each metric pass or fails the missingness check - # based on the missingness_threshold. - for field, missingness in missingness_dict.items(): - if missingness > missingness_threshold: + n_fail = 0 + for field, n_missing in output.items(): + frac_missing = n_missing / n_sites + + if frac_missing > missingness_threshold: logger.info( - "FAILED missingness check for %s: %.2f%% missing", + "FAILED missingness check for %s: %d sites or %.2f%% missing", field, - 100 * missingness, + n_missing, + 100 * frac_missing, ) + n_fail += 1 else: logger.info( - "Passed missingness check for %s: %.2f%% missing", + "Passed missingness check for %s: %d sites or %.2f%% missing", field, - 100 * missingness, + n_missing, + 100 * frac_missing, ) - logger.info("Checking for missingness of info and non-info fields...") - # Gather info and non-info metrics (or if doesn't exist, set to an empty list) - # and substract missingness dict. - info_metrics = ( - set(ht.row.info) - missingness_dict.keys() if "info" in ht.row else set() - ) - non_info_metrics = set(ht.row) - {"info"} - missingness_dict.keys() - n_sites = ht.count() - logger.info("Info metrics are %s", info_metrics) - logger.info("Non-info metrics are %s", non_info_metrics) - compute_missingness( - ht, info_metrics, non_info_metrics, n_sites, missingness_threshold + logger.warning("%d missingness checks failed.", n_fail) + + +def run_row_to_globals_length_check( + ht: hl.Table, + config: Dict[str, Any], + check_all_rows: bool = True, +) -> None: + """ + Build the row_to_globals_check mapping from config and run check_global_and_row_annot_lengths. + + :param ht: Hail table to check. + :param config: Configuration dictionary containing freq_fields and optional faf_fields. 
+ :param check_all_rows: Whether to check all rows. If False, only checks first rows. Default is True. + :return: None + """ + row_to_globals_check = { + config["freq_fields"]["freq"]: [ + config["freq_fields"]["freq_meta"], + config["freq_fields"]["freq_meta_sample_count"], + ] + } + if config["freq_fields"].get("freq_index_dict"): + row_to_globals_check[config["freq_fields"]["freq"]].append( + config["freq_fields"]["freq_index_dict"] + ) + if config.get("faf_fields"): + row_to_globals_check[config["faf_fields"]["faf"]] = [ + config["faf_fields"]["faf_meta"], + ] + if config["faf_fields"].get("faf_index_dict"): + row_to_globals_check[config["faf_fields"]["faf"]].append( + config["faf_fields"]["faf_index_dict"] + ) + + check_global_and_row_annot_lengths( + t=ht, row_to_globals_check=row_to_globals_check, check_all_rows=check_all_rows ) +def add_info_annotations( + ht: hl.Table, region_flag_fields: List[str], allele_type_fields: List[str] +) -> hl.Table: + """ + Add select annotations to `info` if present in the Table. + + :param ht: Table to annotate. + :param region_flag_fields: List of region flag fields to check for and add to info if present in the Table. + :param allele_type_fields: List of allele type fields to check for and add to info if present in the Table. + :return: Annotated Table with new `info` field. 
+ """ + info_dict = {} + missing_region_flags = [] + + if "region_flags" in ht.row: + for field in region_flag_fields: + if field in ht["region_flags"]: + info_dict[field] = ht["region_flags"][field] + else: + missing_region_flags.append(field) + + if missing_region_flags: + logger.warning("Missing region_flag fields: %s", missing_region_flags) + + missing_allele_info = [] + if "allele_info" in ht.row: + for field in allele_type_fields: + if field in ht["allele_info"]: + info_dict[field] = ht["allele_info"][field] + else: + missing_allele_info.append(field) + + if missing_allele_info: + logger.warning("Missing allele type fields: %s", missing_allele_info) + + ht = ht.annotate(info=ht.info.annotate(**info_dict)) + + return ht + + def validate_federated_data( ht: hl.Table, freq_meta_expr: hl.expr.ArrayExpression, missingness_threshold: float = 0.50, - struct_annotations_for_missingness: List[str] = ["grpmax", "fafmax", "histograms"], + struct_annotations_to_skip_missingness: Optional[List[str]] = None, freq_annotations_to_sum: List[str] = ["AC", "AN", "homozygote_count"], sort_order: List[str] = ["subset", "downsampling", "gen_anc", "sex", "group"], nhomalt_metric: str = "nhomalt", @@ -584,6 +807,9 @@ def validate_federated_data( :param freq_meta_expr: Metadata expression that contains the values of the elements in `meta_indexed_expr`. The most often used expression is `freq_meta` to index into a 'freq' array (example: ht.freq_meta). + :param missingness_threshold: Upper cutoff for allowed amount of + missingness. Default is 0.50. + :param struct_annotations_to_skip_missingness: Optional list of top-level struct row annotations that should be treated as a single field rather than recursively traversed when checking missingness. Default is None. :param freq_annotations_to_sum: List of annotation fields within `meta_expr` to sum. Default is ['AC', 'AN', 'homozygote_count']. :param sort_order: Order in which groupings are unfurled into flattened annotations. 
Default is ["subset", "downsampling", gen_anc", "sex", "group"]. :param nhomalt_metric: Name of metric denoting homozygous alternate count. Default is "nhomalt". @@ -613,7 +839,7 @@ def validate_federated_data( check_missingness( ht, missingness_threshold, - struct_annotations=struct_annotations_for_missingness, + structs_to_not_traverse=struct_annotations_to_skip_missingness, ) # Check that subset totals sum to expected totals. @@ -681,6 +907,16 @@ def validate_federated_data( metrics=freq_annotations_to_sum, ) + logger.info("Printing schema of annotations and globals...") + + full_description = ht._type.pretty() + logger.info("Table describe:\n%s", full_description) + global_eval = {g: hl.eval(ht[g]) for g in ht.globals} + logger.info( + "Globals eval:\n%s", + pformat(global_eval, sort_dicts=False, compact=True, width=160), + ) + def create_logtest_ht(exclude_xnonpar_y: bool = False) -> hl.Table: """ @@ -885,6 +1121,7 @@ def create_logtest_ht(exclude_xnonpar_y: bool = False) -> hl.Table: faf_meta=faf_meta, freq_meta_sample_count=freq_meta_sample_count, faf_meta_sample_count=faf_meta_sample_count, + extra_global_field="extra_global_field", ) # Add in retired terms to globals. @@ -939,12 +1176,111 @@ def create_logtest_ht(exclude_xnonpar_y: bool = False) -> hl.Table: ht = ht.annotate(grpmax=grpmax, fafmax=fafmax) # Add monoallelic and only_het annotations. - ht = ht.annotate(monoallelic=hl.rand_bool(0.50), only_het=hl.rand_bool(0.10)) + ht = ht.annotate( + info=ht.info.annotate( + monoallelic=hl.rand_bool(0.50), only_het=hl.rand_bool(0.10) + ) + ) ht = ht.key_by("locus", "alleles") return ht +def load_gnomad_data( + gnomad_input_file: str, + version: str, + data_type: str = "genomes", + test: bool = False, + sample_set: Optional[str] = None, + public_release: Optional[bool] = None, + environment: Optional[str] = None, +) -> hl.Table: + """ + Load gnomAD data based on specified input file and parameters. 
+ + :param gnomad_input_file: Name of resource to load, either "freq" or "release_sites". + :param version: Version to load. For example "4.0", "4.1", "5.0". Default is "5.0". + :param data_type: Type of gnomAD data to load, either "exomes" or "genomes". + :param test: If True, load test version of the data. Default is False. + :param sample_set: Sample set of annotation resource. One of "aou", "gnomad", or "merged". If None, uses the default defined by the underlying resource function. Default is None. + :param public_release: Whether or not to use the public version of the release. If None, uses the default defined by the underlying resource function.Default is None. + :param environment: Environment to use. Must be one of "rwb", "batch", or + "dataproc". If None, uses the default defined by the underlying resource function. Default is None. + :return: Hail Table of the specified gnomAD data. + """ + major_v = version.split(".")[0] + + # Define module mapping based on major version. + module_mapping = { + "4": { + "freq": ("gnomad_qc.v4.resources.annotations", "get_freq"), + "release_sites": ("gnomad_qc.v4.resources.release", "release_sites"), + }, + "5": { + "freq": ("gnomad_qc.v5.resources.annotations", "get_freq"), + "release_sites": ("gnomad_qc.v5.resources.release", "release_sites"), + }, + } + + if major_v not in module_mapping: + raise ValueError(f"Major version {major_v} not supported.") + + if gnomad_input_file not in module_mapping[major_v]: + raise ValueError(f"Input '{gnomad_input_file}' not found for v{major_v}") + + module_path, function_name = module_mapping[major_v][gnomad_input_file] + + # Import the module and get the function to call. + module = importlib.import_module(module_path) + resource_func = getattr(module, function_name) + + logger.info("Loading %s version %s (%s)...", gnomad_input_file, major_v, data_type) + + # Collect all possible params for the function. 
+ all_params = { + "data_type": data_type, + "test": test, + "version": version, + "sample_set": sample_set, + "public": public_release, + "environment": environment, + } + + # Filter to only the parameter that function can accept. + sig_params = inspect.signature(resource_func).parameters + valid_args = { + k: v for k, v in all_params.items() if k in sig_params and v is not None + } + + logger.info("Using valid parameters %s for function %s", valid_args, function_name) + + # Log which file and params are being used. + arg_preview = ", ".join([f"{k}={v}" for k, v in valid_args.items()]) + logger.info(f"Calling {module_path}.{function_name}({arg_preview})") + + resource = resource_func(**valid_args) + + # Some resources (e.g. v4 release_sites) return a VersionedTableResource and do + # not accept a version argument in their function signature. Select the requested + # version explicitly instead of relying on the resource default. + if isinstance(resource, VersionedTableResource): + if version not in resource.versions: + available_versions = ", ".join(sorted(resource.versions.keys())) + raise ValueError( + f"Requested version '{version}' is not available for " + f"{gnomad_input_file}. Available versions: {available_versions}" + ) + + logger.info( + "Using resource version '%s' for %s.", + version, + gnomad_input_file, + ) + return resource.versions[version].ht() + + return resource.ht() + + def main(args): """Perform validity checks for federated data.""" hl.init( @@ -955,12 +1291,21 @@ def main(args): test_n_partitions = args.test_n_partitions config_path = args.config_path verbose = args.verbose + output_base = args.output_base if args.exclude_xnonpar_y_in_logtest and not args.use_logtest_ht: raise ValueError( "exclude_xnonpar_y_in_logtest can only be used with use_logtest_ht." 
) + if not args.use_logtest_ht and ( + args.gnomad_input_file is None or args.gnomad_version is None + ): + raise ValueError( + "When --use-logtest-ht is not set, both --gnomad-input-file and " + "--gnomad-version are required." + ) + try: # Read in config file and validate. with hl.hadoop_open(config_path, "r") as f: @@ -968,6 +1313,10 @@ def main(args): validate_config(config, schema) + data_type = config["data_type"] + allele_type_fields = ALLELE_TYPE_FIELDS[data_type] + region_flag_fields = REGION_FLAG_FIELDS[data_type] + # Read in field necessity markdown file. # When submitting hail dataproc job, include "--files field_requirements.md". try: @@ -985,10 +1334,20 @@ def main(args): if args.use_logtest_ht: logger.info("Using logtest ht...") ht = create_logtest_ht(args.exclude_xnonpar_y_in_logtest) + validate_config_fields_in_ht(ht=ht, config=config) else: - # TODO: Add resources to intake federated data once obtained. - ht = public_release(data_type="genomes").ht() + # Load data from the specified gnomAD resource function. + ht = load_gnomad_data( + gnomad_input_file=args.gnomad_input_file, + version=args.gnomad_version, + data_type=data_type, + test=args.gnomad_test, + sample_set=args.gnomad_sample_set, + public_release=args.gnomad_public_release, + environment=args.gnomad_environment, + ) + output_base = f"{output_base}/{data_type}/{args.gnomad_input_file}" # Check that fields specified in the config are present in the Table. validate_config_fields_in_ht(ht=ht, config=config) @@ -998,48 +1357,17 @@ def main(args): if build != "GRCh38": raise ValueError(f"Reference genome is {build}, not GRCh38!") - # Filter to test partitions if specified. 
if test_n_partitions: logger.info( "Filtering to %d partitions and sex chromosomes...", test_n_partitions, ) - test_ht = ht._filter_partitions(range(test_n_partitions)) - - x_ht = hl.filter_intervals( - ht, [hl.parse_locus_interval("chrX")] - )._filter_partitions(range(test_n_partitions)) - - y_ht = hl.filter_intervals( - ht, [hl.parse_locus_interval("chrY")] - )._filter_partitions(range(test_n_partitions)) - - ht = test_ht.union(x_ht, y_ht) - - row_to_globals_check = { - config["freq_fields"]["freq"]: [ - config["freq_fields"]["freq_meta"], - config["freq_fields"]["freq_meta_sample_count"], - ] - } - if config["freq_fields"].get("freq_index_dict"): - row_to_globals_check[config["freq_fields"]["freq"]].append( - config["freq_fields"]["freq_index_dict"] - ) - - if config.get("faf_fields"): - row_to_globals_check[config["faf_fields"]["faf"]] = [ - config["faf_fields"]["faf_meta"], - ] - if config["faf_fields"].get("faf_index_dict"): - row_to_globals_check[config["faf_fields"]["faf"]].append( - config["faf_fields"]["faf_index_dict"] - ) + ht = filter_to_test_partitions(ht, test_n_partitions) logger.info("Check that row and global annotations lengths match...") - check_global_and_row_annot_lengths( - t=ht, - row_to_globals_check=row_to_globals_check, + run_row_to_globals_length_check( + ht=ht, + config=config, check_all_rows=not args.check_only_first_rows_to_globals, ) check_globals_for_retired_terms(ht) @@ -1069,6 +1397,8 @@ def main(args): field_issues, fields_validated, type_issues, types_validated ) + check_fields_not_in_requirements(ht, field_types) + # TODO: Add in lof per person check. logger.info("Unfurl array annotations...") annotations = unfurl_array_annotations( @@ -1078,38 +1408,8 @@ def main(args): ) ht = ht.annotate(info=ht.info.annotate(**annotations)) - info_dict = {} - - # Add region_flag fields if present. 
- missing_region_flags = [] - if "region_flags" in ht.row: - for field in REGION_FLAG_FIELDS: - if field in ht["region_flags"]: - info_dict[field] = ht["region_flags"][field] - else: - missing_region_flags.append(field) - region_flags = [f for f in REGION_FLAG_FIELDS if f not in missing_region_flags] - if missing_region_flags: - logger.warning("Missing region_flag fields: %s", missing_region_flags) - - # Add allele_info fields if present. - missing_allele_info = [] - if "allele_info" in ht.row: - for field in ALLELE_TYPE_FIELDS: - if field in ht["allele_info"]: - info_dict[field] = ht["allele_info"][field] - else: - missing_allele_info.append(field) - if missing_allele_info: - logger.warning("Missing allele type fields: %s", missing_allele_info) - - # Add monoallelic and only_het fields to info dict. - if "monoallelic" in ht.row: - info_dict["monoallelic"] = ht["monoallelic"] - if "only_het" in ht.row: - info_dict["only_het"] = ht["only_het"] - - ht = ht.annotate(info=ht.info.annotate(**info_dict)) + logger.info("Creating info annotations...") + ht = add_info_annotations(ht, region_flag_fields, allele_type_fields) # If config specifies to check for monoallelic and only heterozygous sites, # create the site_gt_check_expr to pass to validate_federated_data. 
@@ -1121,12 +1421,14 @@ def main(args): else: site_gt_check_expr = None + region_flags = [f for f in region_flag_fields if f in ht.info] + validate_federated_data( ht=ht, - missingness_threshold=config["missingness_threshold"], - struct_annotations_for_missingness=config[ - "struct_annotations_for_missingness" - ], + missingness_threshold=args.missingness_threshold, + struct_annotations_to_skip_missingness=config.get( + "struct_annotations_to_skip_missingness" + ), freq_meta_expr=ht[config["freq_fields"]["freq_meta"]], freq_annotations_to_sum=config["freq_annotations_to_sum"], sort_order=config["sort_order"], @@ -1138,18 +1440,19 @@ def main(args): site_gt_check_expr=site_gt_check_expr, ) - handler.flush() + memory_handler.flush() log_output = log_stream.getvalue() # TODO: Create resource functions when know organization of federated data. - log_file = args.output_base + ".log" - output_file = args.output_base + ".html" + log_file = output_base + ".log" + output_file = output_base + ".html" # Write parsed log to html file. with hl.hadoop_open(log_file, "w") as f: f.write(log_output) parsed_logs = parse_log_file(log_file) + logger.info("Writing html file to %s...", output_file) generate_html_report(parsed_logs, output_file) finally: @@ -1158,16 +1461,16 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) - # Create a mutually exclusive group for --test-n-partitions and --use-test-ht. + # Create a mutually exclusive group for --test-n-partitions and --use-logtest-ht. test_group = parser.add_mutually_exclusive_group() test_group.add_argument( "--test-n-partitions", help=( "Use only N partitions of the input (as well as sex chromosomes) for testing purposes. Defaults" - "to 2 if passed without a value. Cannot be used if --use-logtest-ht is set." + " to 2 if passed without a value. Cannot be used if --use-logtest-ht is set." 
), nargs="?", const=2, @@ -1191,21 +1494,31 @@ def main(args): parser.add_argument( "--config-path", help=( - "Path to JSON config file for defining parameters. Paramters to define are as follows:" - "missingness_threshold: Float defining upper cutoff for allowed amount of missingness. Missingness above this value will be flagged as 'FAILED'." - "struct_annotations_for_missingness: List of struct annotations to check for missingness." - "freq_fields: Dictionary containing the names of frequency-related fields ('freq': Name of annotation containing the array of frequency metric objects " - "corresponding to each frequency metadata group; 'freq_meta': Name of annotation containing allele frequency metadata, an ordered list containing the frequency aggregation group for " - "each element of the 'freq' array row annotation, with at least the following groups: ('group': adj/raw, 'gen_anc': inferred genetic ancestry group, 'sex': sex karyotype).; 'freq_meta_sample_count': Name of " - "annotation containing sample count per sample grouping defined in the 'freq_meta' global annotation." - "faf_fields: Dictionary containing the names of filtering allele frequency (FAF) related fields ('faf': Name of annotation containing structs of FAF information; 'faf_meta': Name of annotation " - "for FAF metadata, an ordered list containing the frequency aggregation group for each element of the 'faf' arrays, with at least the following groups: ('group': adj/raw, 'gen_anc': inferred genetic ancestry group, 'sex': sex karyotype). " - "freq_annotations_to_sum: List of annotation fields within `freq_meta` to sum. Example: ['AC', 'AN', 'homozygote_count']." - "sort_order: Order in which groupings are unfurled into flattened annotations. Default is ['gen_anc', 'sex', 'group']." - "nhomalt_metric: Name of metric denoting homozygous alternate count." - "subsets: List of sample subsets to include for the subset validity check." 
- "variant_filter_field: String of variant filtration used in the filters annotation of the Hail Table (e.g. 'RF', 'VQSR', 'AS_VQSR')." - "check_mono_and_only_het: Boolean indicating whether to check for monoallelic and 100 percent heterozygous sites in the Table ('monoallelic' and 'only_het' annotations must be present)." + "Path to JSON config file for defining parameters. Parameters to define are as follows:\n" + " - struct_annotations_to_skip_missingness: Optional list of top-level struct annotations to skip during missingness checks.\n" + " - freq_fields: Dictionary containing the names of frequency-related fields:\n" + " * freq: Name of annotation containing the array of frequency metric objects\n" + " corresponding to each frequency metadata group.\n" + " * freq_meta: Name of annotation containing allele frequency metadata, an\n" + " ordered list containing the frequency aggregation group for each element\n" + " of the freq array row annotation, with at least the following groups:\n" + " group (adj/raw), gen_anc (inferred genetic ancestry group), and sex\n" + " (sex karyotype).\n" + " * freq_meta_sample_count: Name of annotation containing sample count per\n" + " sample grouping defined in the freq_meta global annotation.\n" + " - faf_fields: Dictionary containing the names of filtering allele frequency (FAF) related fields:\n" + " * faf: Name of annotation containing structs of FAF information.\n" + " * faf_meta: Name of annotation for FAF metadata, an ordered list\n" + " containing the frequency aggregation group for each element of the faf\n" + " arrays, with at least the following groups: group (adj/raw), gen_anc\n" + " (inferred genetic ancestry group), and sex (sex karyotype).\n" + " - freq_annotations_to_sum: List of annotation fields within `freq_meta` to sum. Example: ['AC', 'AN', 'homozygote_count'].\n" + " - sort_order: Order in which groupings are unfurled into flattened annotations. 
Default is ['gen_anc', 'sex', 'group'].\n" + " - nhomalt_metric: Name of metric denoting homozygous alternate count.\n" + " - subsets: List of sample subsets to include for the subset validity check.\n" + " - variant_filter_field: String of variant filtration used in the filters annotation of the Hail Table (e.g. 'RF', 'VQSR', 'AS_VQSR').\n" + " - data_type: Data type to run checks on. One of 'exomes' or 'genomes'.\n" + " - check_mono_and_only_het: Whether to run the check for monoallelic and 100 percent heterozygous sites in the Table('monoallelic' and 'only_het' annotations must be present)." ), type=str, ) @@ -1227,6 +1540,53 @@ def main(args): type=str, default="gs://gnomad-tmp/federated_validity_checks/federated_validity_checks", ) - + parser.add_argument( + "--missingness-threshold", + help="Float defining upper cutoff for allowed amount of missingness. Missingness above this value will be flagged as 'FAILED'.", + type=float, + default=0.50, + ) + # Create a group for gnomAD input arguments. + gnomad_group = parser.add_argument_group("gnomad", "gnomAD input options") + gnomad_group.add_argument( + "--gnomad-input-file", + help="Source to load gnomAD data from. 'freq' loads from get_freq and 'release_sites' loads from release_sites. Default is None.", + choices=["freq", "release_sites"], + type=str, + default=None, + ) + gnomad_group.add_argument( + "--gnomad-version", + help="Version of gnomAD resources to use. Default is None.", + choices=["4.0", "4.1", "4.1.1", "5.0"], + default=None, + type=str, + ) + gnomad_group.add_argument( + "--gnomad-test", + help="Load test dataset (smaller subset for testing).", + action="store_true", + ) + gnomad_group.add_argument( + "--gnomad-sample-set", + help="Sample set of annotation resource to load, if applicable. One of 'aou', 'gnomad', or 'merged'. 
Default is None.", + choices=["aou", "gnomad", "merged"], + type=str, + default=None, + ) + gnomad_group.add_argument( + "--gnomad-public-release", + help="Whether or not to use the public version of the release when loading data. Only applicable when loading 'release_sites'.", + action="store_true", + ) + gnomad_group.add_argument( + "--gnomad-environment", + help=( + "Environment to use when loading gnomAD data. Must be one of 'rwb', 'batch', or 'dataproc'. Default is None." + ), + choices=["rwb", "batch", "dataproc"], + type=str, + default=None, + ) args = parser.parse_args() main(args)