Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
28a663f
add check for fields not in requirements
klaricch Mar 3, 2026
db03f85
add vep 115 to requirements md
klaricch Mar 3, 2026
cd8af11
add to logtest
klaricch Mar 4, 2026
cdbbaa0
add array to check_fields_not_in_requirements
klaricch Mar 5, 2026
c9e9b96
Apply suggestions from code review
klaricch Mar 12, 2026
0b08139
add data type and move missingness
klaricch Mar 13, 2026
40c10a7
add load_gnomad_data
klaricch Mar 13, 2026
76b8ac1
edit output_base
klaricch Mar 13, 2026
808a3fd
make pipe compatible both md and html
klaricch Mar 16, 2026
4db09be
nest faf_index_dict check
klaricch Mar 16, 2026
ee3db47
isort
klaricch Mar 16, 2026
9bddcc0
update config and schema
klaricch Mar 16, 2026
5f90965
small edit
klaricch Mar 16, 2026
3167dc4
Apply suggestions from code review
klaricch Mar 17, 2026
5d23469
fix help text
klaricch Mar 30, 2026
83e0d8a
PR suggestions
klaricch Mar 30, 2026
1cd91cf
copilot PR suggestions
klaricch Mar 30, 2026
1d5c5c0
Apply suggestions from code review
klaricch Apr 3, 2026
95d9ce1
Update gnomad_qc/v5/data_ingestion/federated_validity_checks.py
klaricch Apr 3, 2026
9a1c925
PR suggestions
klaricch Apr 3, 2026
e910866
isort
klaricch Apr 3, 2026
eeedb0b
small edit
klaricch Apr 3, 2026
c3e8a1d
update args
klaricch Apr 8, 2026
8f711f6
update args
klaricch Apr 8, 2026
d7bba40
rewrite check_missingness
klaricch Apr 9, 2026
8166274
edits to check missingness
klaricch Apr 9, 2026
5945669
edit missingness
klaricch Apr 9, 2026
c29266a
add describe and global eval
klaricch Apr 15, 2026
28b168b
Update gnomad_qc/v5/data_ingestion/federated_validity_checks.py
klaricch Apr 21, 2026
4869089
Update gnomad_qc/v5/data_ingestion/federated_validity_checks.py
klaricch Apr 21, 2026
7e8a323
Update gnomad_qc/v5/data_ingestion/federated_validity_checks.py
klaricch Apr 21, 2026
6c4bb99
PR suggestions
klaricch Apr 21, 2026
05fa16d
PR suggestions
klaricch Apr 21, 2026
e798383
change file structure
klaricch Apr 21, 2026
55561c9
move configs
klaricch Apr 21, 2026
c0741a0
add moved files
klaricch Apr 21, 2026
bbb3170
fix import
klaricch Apr 21, 2026
ed4c062
remove info line
klaricch Apr 21, 2026
9d3b53b
add init
klaricch Apr 21, 2026
e20cb03
Update gnomad_qc/federated/federated_validity_checks.py
klaricch Apr 23, 2026
5ee37b8
PR suggestions
klaricch Apr 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
245 changes: 176 additions & 69 deletions gnomad_qc/v5/data_ingestion/federated_validity_checks.py
Comment thread
klaricch marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,83 @@ def _check_field_exists_and_type(
return field_issues, type_issues, fields_validated, types_validated


def check_fields_not_in_requirements(
    ht: hl.Table, field_types: Dict[str, Dict[str, Any]]
) -> None:
    """
    Log a warning for Table fields that are absent from the requirements.

    The global and row schemas of `ht` are each flattened to dotted leaf names
    (for example `outer.inner`) and compared against the keys of the matching
    entry in `field_types`. Any name found in the Table but not in the
    requirements is reported through `logger.warning`; nothing is raised.

    :param ht: Hail Table whose global and row schemas are inspected.
    :param field_types: Nested dictionary of expected field types. Two keys are
        read: "global_field_types" and "row_field_types". A missing key is
        treated as an empty set of requirements.
    :return: None.
    """

    def _iter_leaf_names(dtype: hl.expr.types.HailType, path: str = ""):
        """Yield the dotted name of every non-container leaf under `dtype`."""
        if isinstance(dtype, hl.tstruct):
            # Descend into each struct field, extending the dotted path.
            for child, child_dtype in dtype.items():
                child_path = f"{path}.{child}" if path else child
                yield from _iter_leaf_names(child_dtype, child_path)
        elif isinstance(dtype, (hl.tarray, hl.tset)):
            # Arrays and sets are transparent: their elements keep the
            # container's own path.
            yield from _iter_leaf_names(dtype.element_type, path)
        elif isinstance(dtype, hl.tdict):
            # Only the value type of a dict is inspected; the path is unchanged.
            yield from _iter_leaf_names(dtype.value_type, path)
        elif path:
            # A leaf is only reported when it has a name.
            yield path

    # Pair each Table component with the requirements key that describes it.
    components = (
        ("Global", ht.globals.dtype, "global_field_types"),
        ("Row", ht.row.dtype, "row_field_types"),
    )

    for label, dtype, req_key in components:
        expected = field_types.get(req_key, {}).keys()
        extra = set(_iter_leaf_names(dtype)).difference(expected)
        if not extra:
            continue

        logger.warning(
            "%s fields present in Table but missing from requirements: %s",
            label,
            ", ".join(sorted(extra)),
        )


def filter_to_test_partitions(
    ht: hl.Table,
    test_n_partitions: int = 2,
) -> hl.Table:
    """
    Build a small test Table from leading partitions of the input, chrX, and chrY.

    Three subsets are taken and unioned: the first `test_n_partitions`
    partitions of `ht` itself, and the first `test_n_partitions` partitions of
    `ht` after restricting it to the "chrX" and to the "chrY" interval.

    .. note::

        The contig names are hard-coded with a "chr" prefix and are parsed
        without an explicit reference genome, so this presumably relies on the
        session default being a "chr"-style build (e.g. GRCh38) -- confirm
        against the caller.

    :param ht: Input Table.
    :param test_n_partitions: Number of partitions to keep from each of the
        three subsets. Default is 2.
    :return: Union of the three partition-filtered Tables.
    """
    leading = range(test_n_partitions)

    def _leading_partitions_of(contig: str) -> hl.Table:
        """Restrict `ht` to `contig`, then keep only its leading partitions."""
        contig_ht = hl.filter_intervals(ht, [hl.parse_locus_interval(contig)])
        return contig_ht._filter_partitions(leading)

    return ht._filter_partitions(leading).union(
        _leading_partitions_of("chrX"),
        _leading_partitions_of("chrY"),
    )


def check_missingness(
ht: hl.Table,
missingness_threshold: float = 0.5,
Expand Down Expand Up @@ -563,6 +640,92 @@ def check_missingness(
)


def run_row_to_globals_length_check(
    ht: hl.Table,
    config: dict,
    check_all_rows: bool = True,
) -> None:
    """
    Assemble the row-to-globals mapping from `config` and run the length check.

    The mapping sent to `check_global_and_row_annot_lengths` always pairs the
    configured `freq` row field with the `freq_meta` and
    `freq_meta_sample_count` globals, plus `freq_index_dict` when configured.
    When `faf_fields` is present and non-empty, the `faf` row field is paired
    with `faf_meta`, plus `faf_index_dict` when configured.

    :param ht: Hail Table to check.
    :param config: Configuration dictionary containing `freq_fields` and,
        optionally, `faf_fields`.
    :param check_all_rows: Forwarded unchanged to
        `check_global_and_row_annot_lengths`. Whether to check all rows; if
        False, only the first rows are checked. Default is True.
    :return: None.
    """
    freq_cfg = config["freq_fields"]

    # Required frequency entries; the optional index dict is added only when set.
    freq_row_field = freq_cfg["freq"]
    freq_globals = [
        freq_cfg["freq_meta"],
        freq_cfg["freq_meta_sample_count"],
    ]
    if freq_cfg.get("freq_index_dict"):
        freq_globals.append(freq_cfg["freq_index_dict"])

    row_to_globals_check = {freq_row_field: freq_globals}

    # Filtering allele frequency fields are optional as a whole.
    faf_cfg = config.get("faf_fields")
    if faf_cfg:
        faf_globals = [faf_cfg["faf_meta"]]
        faf_row_field = faf_cfg["faf"]
        if faf_cfg.get("faf_index_dict"):
            faf_globals.append(faf_cfg["faf_index_dict"])
        row_to_globals_check[faf_row_field] = faf_globals

    check_global_and_row_annot_lengths(
        t=ht,
        row_to_globals_check=row_to_globals_check,
        check_all_rows=check_all_rows,
    )


def add_info_annotations(
    ht: hl.Table,
) -> hl.Table:
    """
    Copy selected row annotations into the Table's `info` struct.

    The following are added to `info` when they exist on the Table:

        - Each field of `REGION_FLAG_FIELDS` found in the `region_flags` struct.
        - Each field of `ALLELE_TYPE_FIELDS` found in the `allele_info` struct.
        - The top-level `monoallelic` and `only_het` row fields.

    If `region_flags` or `allele_info` is present but lacks some of the
    expected fields, the missing names are logged as a warning. If the struct
    is absent altogether, nothing is logged for it.

    .. note::

        The Table must already have an `info` row field; it is extended, not
        created.

    :param ht: Table to annotate.
    :return: Table with the additional fields nested under `info`.
    """
    info_dict = {}

    def _collect_from_struct(struct_name, wanted_fields, warning_msg) -> None:
        """Copy `wanted_fields` out of row struct `struct_name`; warn on gaps."""
        if struct_name not in ht.row:
            return

        struct_expr = ht[struct_name]
        missing = []
        for field in wanted_fields:
            if field in struct_expr:
                info_dict[field] = struct_expr[field]
            else:
                missing.append(field)

        if missing:
            logger.warning(warning_msg, missing)

    # Region flags and allele info are handled identically; only the source
    # struct, the expected field list, and the warning text differ.
    _collect_from_struct(
        "region_flags", REGION_FLAG_FIELDS, "Missing region_flag fields: %s"
    )
    _collect_from_struct(
        "allele_info", ALLELE_TYPE_FIELDS, "Missing allele type fields: %s"
    )

    # Top-level site flags are copied as-is when present.
    for field in ("monoallelic", "only_het"):
        if field in ht.row:
            info_dict[field] = ht[field]

    return ht.annotate(info=ht.info.annotate(**info_dict))


def validate_federated_data(
ht: hl.Table,
freq_meta_expr: hl.expr.ArrayExpression,
Expand Down Expand Up @@ -885,6 +1048,7 @@ def create_logtest_ht(exclude_xnonpar_y: bool = False) -> hl.Table:
faf_meta=faf_meta,
freq_meta_sample_count=freq_meta_sample_count,
faf_meta_sample_count=faf_meta_sample_count,
extra_global_field="extra_global_field",
Comment thread
mike-w-wilson marked this conversation as resolved.
)

# Add in retired terms to globals.
Expand Down Expand Up @@ -988,7 +1152,7 @@ def main(args):

else:
# TODO: Add resources to intake federated data once obtained.
ht = public_release(data_type="genomes").ht()
ht = public_release(data_type="exomes").ht()
Comment thread
klaricch marked this conversation as resolved.
Outdated

# Check that fields specified in the config are present in the Table.
validate_config_fields_in_ht(ht=ht, config=config)
Expand All @@ -998,48 +1162,17 @@ def main(args):
if build != "GRCh38":
raise ValueError(f"Reference genome is {build}, not GRCh38!")

# Filter to test partitions if specified.
if test_n_partitions:
logger.info(
"Filtering to %d partitions and sex chromosomes...",
test_n_partitions,
)
test_ht = ht._filter_partitions(range(test_n_partitions))

x_ht = hl.filter_intervals(
ht, [hl.parse_locus_interval("chrX")]
)._filter_partitions(range(test_n_partitions))

y_ht = hl.filter_intervals(
ht, [hl.parse_locus_interval("chrY")]
)._filter_partitions(range(test_n_partitions))

ht = test_ht.union(x_ht, y_ht)

row_to_globals_check = {
config["freq_fields"]["freq"]: [
config["freq_fields"]["freq_meta"],
config["freq_fields"]["freq_meta_sample_count"],
]
}
if config["freq_fields"].get("freq_index_dict"):
row_to_globals_check[config["freq_fields"]["freq"]].append(
config["freq_fields"]["freq_index_dict"]
)

if config.get("faf_fields"):
row_to_globals_check[config["faf_fields"]["faf"]] = [
config["faf_fields"]["faf_meta"],
]
if config["faf_fields"].get("faf_index_dict"):
row_to_globals_check[config["faf_fields"]["faf"]].append(
config["faf_fields"]["faf_index_dict"]
)
ht = filter_to_test_partitions(ht, test_n_partitions)

logger.info("Check that row and global annotations lengths match...")
check_global_and_row_annot_lengths(
t=ht,
row_to_globals_check=row_to_globals_check,
run_row_to_globals_length_check(
ht=ht,
config=config,
check_all_rows=not args.check_only_first_rows_to_globals,
)
check_globals_for_retired_terms(ht)
Expand Down Expand Up @@ -1069,6 +1202,8 @@ def main(args):
field_issues, fields_validated, type_issues, types_validated
)

check_fields_not_in_requirements(ht, field_types)

# TODO: Add in lof per person check.
logger.info("Unfurl array annotations...")
annotations = unfurl_array_annotations(
Expand All @@ -1078,38 +1213,8 @@ def main(args):
)
ht = ht.annotate(info=ht.info.annotate(**annotations))

info_dict = {}

# Add region_flag fields if present.
missing_region_flags = []
if "region_flags" in ht.row:
for field in REGION_FLAG_FIELDS:
if field in ht["region_flags"]:
info_dict[field] = ht["region_flags"][field]
else:
missing_region_flags.append(field)
region_flags = [f for f in REGION_FLAG_FIELDS if f not in missing_region_flags]
if missing_region_flags:
logger.warning("Missing region_flag fields: %s", missing_region_flags)

# Add allele_info fields if present.
missing_allele_info = []
if "allele_info" in ht.row:
for field in ALLELE_TYPE_FIELDS:
if field in ht["allele_info"]:
info_dict[field] = ht["allele_info"][field]
else:
missing_allele_info.append(field)
if missing_allele_info:
logger.warning("Missing allele type fields: %s", missing_allele_info)

# Add monoallelic and only_het fields to info dict.
if "monoallelic" in ht.row:
info_dict["monoallelic"] = ht["monoallelic"]
if "only_het" in ht.row:
info_dict["only_het"] = ht["only_het"]

ht = ht.annotate(info=ht.info.annotate(**info_dict))
logger.info("Creating info annotations...")
ht = add_info_annotations(ht)

# If config specifies to check for monoallelic and only heterozygous sites,
# create the site_gt_check_expr to pass to validate_federated_data.
Expand All @@ -1121,6 +1226,8 @@ def main(args):
else:
site_gt_check_expr = None

region_flags = [f for f in REGION_FLAG_FIELDS if f in ht.info]

validate_federated_data(
ht=ht,
missingness_threshold=config["missingness_threshold"],
Expand Down Expand Up @@ -1225,7 +1332,7 @@ def main(args):
"Base path for output files. Will be used to create a log file and an html file."
),
type=str,
default="gs://gnomad-tmp/federated_validity_checks/federated_validity_checks",
default="gs://gnomad-tmp/federated_validity_checks/federated_validity_checks/exomes",
Comment thread
klaricch marked this conversation as resolved.
Outdated
)

args = parser.parse_args()
Expand Down
Loading