diff --git a/requirements.txt b/requirements.txt index 1f8a8f9..b54a747 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ bs4 coverage openpyxl pooch +pydantic pytest pyyaml requests diff --git a/scripts/unharmonised/DR_Opportunity_template.yaml b/scripts/unharmonised/DR_Opportunity_template.yaml new file mode 100644 index 0000000..db5364a --- /dev/null +++ b/scripts/unharmonised/DR_Opportunity_template.yaml @@ -0,0 +1,40 @@ +# Minimal template for a data request Opportunity, for use by community MIPs. + +Title: Short descriptive title of the Opportunity + +MIP: Name of MIP + +Description: Statement of the general purpose of this Opportunity's data request. + +Expected Impacts: (Optional) Explanation of why this combination of variables and experiments is important. + +Justification of Resources: (Optional) Explanation of how the requested variables map onto the impacts, and estimate of the resources required. + +Experiment Groups: +# An Experiment Group specifies a list of experiments for which requested variables should be produced. +- example_experiment_group # new Experiment Group, defined below +- deck # existing Experiment Group + +Variable Groups: +# Each Variable Group defines a set of requested variables and its priority. +- example_variable_group # new Variable Group, defined below +- baseline_monthly # existing Variable Group + +New Experiment Groups: + + example_experiment_group: + Title: Short descriptive title of the Experiment Group + Experiments: + - amip # existing experiment + - example_new_experiment # new experiment, must be registered in CVs + +New Variable Groups: + + example_variable_group: + Title: Short descriptive title of the Variable Group + Priority Level: High # High, Medium, or Low (not case sensitive) + Justification: (Optional) Explanation of why these variables are important. + Notes: (Optional) Any additional comments about the variable group. 
+ Variables: # list of requested variable names + - land.gpp.tavg-u-hxy-lnd.mon.glb + - atmos.fco2nat.tavg-u-hxy-u.mon.glb diff --git a/scripts/unharmonised/README_unharmonised_workflow.md b/scripts/unharmonised/README_unharmonised_workflow.md new file mode 100644 index 0000000..503de59 --- /dev/null +++ b/scripts/unharmonised/README_unharmonised_workflow.md @@ -0,0 +1,30 @@ + +## MIP workflow for Unharmonised Data Request + +⚠️ *Everything in this document is a proposal, under development, and likely to change* + +### Opportunity template + +This allows MIPs to create a `json` file representation of a DR "Opportunity" with minimal effort. + +A DR Opportunity lists variables that are requested from a specified set of experiments. +It includes a description of the scientific purpose of the request. +This can be very brief, but including detailed information is also possible. +A template Opportunity is provided in `yaml` format, which a MIP can edit. + +First, copy the template: +```bash +cp DR_Opportunity_template.yaml new_MIP_data_request.yaml +``` +Edit the new file, which in this example is named `new_MIP_data_request.yaml`, to specify the requested variables and the experiments from which they're requested. +Variables are grouped into Variable Groups, which have a priority level (High, Medium, Low) attached. +Experiments are grouped into Experiment Groups. +If a MIP simply has one list of variables that are all requested from the same list of experiments, then one Variable Group and one Experiment Group are sufficient. + +Then validate the new request against existing DR content: +```bash +./ingest.py new_MIP_data_request.yaml new_MIP_data_request.json v1.2.2.3 +``` +This should be run in an env where the DR python API is installed ([see here](https://github.com/CMIP-Data-Request/CMIP7_DReq_Software#installation) for installation guidance). 
+This performs some sanity checks, including checking that variable and experiment names are valid (i.e., they are defined in existing DR content and CMIP7 CVs). +If the checks pass, the output file, which here is `new_MIP_data_request.json`, represents the new request's information in a format that can be used in the DR python API. diff --git a/scripts/unharmonised/example_validated_opportunity.json b/scripts/unharmonised/example_validated_opportunity.json new file mode 100644 index 0000000..d612881 --- /dev/null +++ b/scripts/unharmonised/example_validated_opportunity.json @@ -0,0 +1,42 @@ +{ + "Header": { + "Provenance": "Validated Opportunity from input file DR_Opportunity_template.yaml", + "Data Request version used for validation": "v1.2.2.3" + }, + "Opportunity": { + "title": "Short descriptive title of the Opportunity", + "mip": "Name of MIP", + "description": "Statement of the general purpose of this Opportunity's data request.", + "expected_impacts": "(Optional) Explanation of why this combination of variables and experiments is important.", + "justification_of_resources": "(Optional) Explanation of how the requested variables map onto the impacts, and estimate of the resources required.", + "experiment_groups": [ + "example_experiment_group", + "deck" + ], + "variable_groups": [ + "example_variable_group", + "baseline_monthly" + ] + }, + "New Experiment Groups": { + "example_experiment_group": { + "title": "Short descriptive title of the Experiment Group", + "experiments": [ + "amip", + "example_new_experiment" + ] + } + }, + "New Variable Groups": { + "example_variable_group": { + "title": "Short descriptive title of the Variable Group", + "priority_level": "High", + "justification": "(Optional) Explanation of why these variables are important.", + "notes": "(Optional) Any additional comments about the variable group.", + "variables": [ + "land.gpp.tavg-u-hxy-lnd.mon.glb", + "atmos.fco2nat.tavg-u-hxy-u.mon.glb" + ] + } + } +} \ No newline at end of 
#!/usr/bin/env python
'''
Ingest a yaml file that specifies a data request Opportunity.

The input is validated with pydantic models and against the content of a
chosen Data Request (DR) version. If every check passes, the validated
Opportunity is written to a json file.
'''

import argparse
import json
import yaml

from collections import OrderedDict
from pydantic import BaseModel

import data_request_api.content.dreq_content as dc
import data_request_api.query.dreq_query as dq
from data_request_api.query.dreq_classes import (
    PRIORITY_LEVELS, format_attribute_name)


class ExperimentGroup(BaseModel):
    '''Specification of a new Experiment Group given in the input file.'''
    title: str
    experiments: list[str]  # names of the experiments in the group


class VariableGroup(BaseModel):
    '''Specification of a new Variable Group given in the input file.'''
    title: str
    priority_level: str  # checked (case-insensitively) against PRIORITY_LEVELS
    justification: str = ''  # optional
    notes: str = ''  # optional
    variables: list[str]  # names of the requested variables


class Opportunity(BaseModel):
    '''Top-level specification of the Opportunity given in the input file.'''
    title: str
    mip: str
    description: str
    expected_impacts: str = ''  # optional
    justification_of_resources: str = ''  # optional
    experiment_groups: list[str]  # names of new or existing Experiment Groups
    variable_groups: list[str]  # names of new or existing Variable Groups


def parse_args():
    ''' Parse command line arguments'''
    parser = argparse.ArgumentParser(
        description="Validate data request Opportunity specified by input yaml file")

    # Mandatory arguments
    parser.add_argument('input',
                        help="Opportunity specifications (yaml file)")
    parser.add_argument('output',
                        help="Validated Opportunity specifications (json file)")
    parser.add_argument('dreq_version', choices=dc.get_versions(),
                        help="Data Request version used to validate input")

    return parser.parse_args()


def _format_keys(info):
    '''
    Return a copy of dict `info` with each key passed through
    format_attribute_name(), so that keys as written in the yaml file
    (e.g. "Priority Level") match the pydantic model field names.
    '''
    return {format_attribute_name(k): v for k, v in info.items()}


def _pop_new_groups(opp, section, model):
    '''
    Remove `section` from the input dict `opp` and return its entries as
    validated pydantic `model` instances, keyed by group name.

    A missing or empty section means that no new groups are defined, which is
    valid when the Opportunity only refers to groups that already exist in the
    DR. In that case an empty dict is returned.

    Raises ValueError if the section, or any entry in it, is not a mapping
    (pydantic validation errors from `model` propagate unchanged).
    '''
    raw = opp.pop(section, None) or {}
    if not isinstance(raw, dict):
        raise ValueError(f'Section "{section}" must map group names to their specifications')
    groups = {}
    for name, info in raw.items():
        if not isinstance(info, dict):
            raise ValueError(f'Entry {name} in section "{section}" must be a mapping of attributes')
        groups[name] = model(**_format_keys(info))
    return groups


def _check_no_name_conflicts(new_names, existing_names, group_type, dreq_version):
    '''Raise ValueError if any name in `new_names` is already in `existing_names`.'''
    for name in new_names:
        if name in existing_names:
            raise ValueError(f'{group_type} already exists in DR {dreq_version}: {name}')


def _check_variable_names(new_var_groups, valid_names):
    '''
    Raise ValueError for the first new Variable Group that contains variable
    names not found in `valid_names`, listing all invalid names in that group.
    '''
    for vg_name, vg in new_var_groups.items():
        # TODO: should user be forced to say whether using CMIP6 or CMIP7 variable names?
        # TODO: if new variables are defined (beyond those in AFT DR) then need to add these here as valid names
        invalid_variables = [var_name for var_name in vg.variables if var_name not in valid_names]
        if invalid_variables:
            msg = f'Found {len(invalid_variables)} invalid variables in Variable Group {vg_name}:\n' \
                + '\n'.join(invalid_variables)
            raise ValueError(msg)


def _check_groups_defined(requested_names, known_names, group_type, dreq_version):
    '''Raise ValueError if a requested group name is neither new nor already in the DR.'''
    for name in requested_names:
        if name not in known_names:
            raise ValueError(
                f'{group_type} {name} has not been newly defined and does not already exist in DR {dreq_version}')


def main():
    '''Validate the Opportunity in the input yaml file and write it out as json.'''
    args = parse_args()
    input_file = args.input
    output_file = args.output
    dreq_version = args.dreq_version

    # Read setup file for new Opportunity
    with open(input_file, 'r', encoding='utf-8') as f:
        opp = yaml.safe_load(f)
    if not isinstance(opp, dict):
        # An empty file loads as None; anything other than a mapping cannot be an Opportunity.
        raise ValueError(f'Input file {input_file} does not contain a yaml mapping')

    # Retrieve specs for any new variable or experiment groups so that they can be validated
    # against existing DR content, below.
    # The ExperimentGroup & VariableGroup pydantic models perform validation of the input.
    # The sections are removed from opp, leaving only the Opportunity attributes.
    new_expt_groups = _pop_new_groups(opp, 'New Experiment Groups', ExperimentGroup)
    new_var_groups = _pop_new_groups(opp, 'New Variable Groups', VariableGroup)

    # Check priority levels in new Variable Groups are valid
    for vg_name, vg in new_var_groups.items():
        if vg.priority_level.lower() not in PRIORITY_LEVELS:
            raise ValueError(f'Unknown Priority Level for Variable Group {vg_name}: {vg.priority_level}')

    # Get DR content to use in further validating the input
    dreq_content = dc.load(dreq_version)
    base = dq._get_base_dreq_tables(dreq_content, dreq_version, purpose='request')
    dreq_var_info = dq.get_variables_metadata(base, dreq_version)
    # Either CMIP7 or CMIP6 compound names are accepted as variable names
    valid_var_names = {var_info['cmip7_compound_name'] for var_info in dreq_var_info.values()}
    valid_var_names |= {var_info['cmip6_compound_name'] for var_info in dreq_var_info.values()}
    dreq_expt_group_names = {rec.name for rec in base['Experiment Group'].records.values()}
    dreq_var_group_names = {rec.name for rec in base['Variable Group'].records.values()}

    # Check new Variable Group names don't conflict with any already in the DR
    _check_no_name_conflicts(new_var_groups, dreq_var_group_names, 'Variable Group', dreq_version)

    # Check that the variable names in new Variable Groups are valid
    _check_variable_names(new_var_groups, valid_var_names)

    # Check new Experiment Group names don't conflict with any already in the DR
    _check_no_name_conflicts(new_expt_groups, dreq_expt_group_names, 'Experiment Group', dreq_version)

    # Validate experiments against CVs
    # TODO: get valid CMIP7 experiments using esgvoc
    # (cannot rely on AFT DR list since community MIPs will define new experiments)

    # Use Opportunity pydantic model to validate the input
    opp = Opportunity(**_format_keys(opp))

    # Check full Variable Group and Experiment Group lists are either (1) defined as new,
    # or (2) exist already in the DR.
    all_expt_group_names = dreq_expt_group_names.union(new_expt_groups.keys())
    all_var_group_names = dreq_var_group_names.union(new_var_groups.keys())
    _check_groups_defined(opp.experiment_groups, all_expt_group_names, 'Experiment Group', dreq_version)
    _check_groups_defined(opp.variable_groups, all_var_group_names, 'Variable Group', dreq_version)

    # Write output file
    out = OrderedDict({
        'Header': OrderedDict({
            'Provenance': f'Validated Opportunity from input file {input_file}',
            'Data Request version used for validation': dreq_version,
        }),
        'Opportunity': OrderedDict(opp),
        'New Experiment Groups': OrderedDict(
            {name: OrderedDict(info) for name, info in new_expt_groups.items()}),
        'New Variable Groups': OrderedDict(
            {name: OrderedDict(info) for name, info in new_var_groups.items()}),
    })
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(out, f, indent=4)
    print('Wrote ' + output_file)


if __name__ == '__main__':
    main()