diff --git a/README.md b/README.md index fd222a63..5c870498 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,6 @@ Or used for project initialization: * ``input_schema_name``: If using [Models in Unity Catalog](https://docs.databricks.com/en/mlflow/models-in-uc.html#models-in-unity-catalog), specify the name of the schema under which the models should be registered, but we recommend keeping the name the same as the project name. We default to using the same `schema_name` across catalogs, thus this schema must exist in each catalog used. For example, the training pipeline when executed in the staging environment will register the model to `staging.<schema_name>.<model_name>`, whereas the same pipeline executed in the prod environment will register the model to `prod.<schema_name>.<model_name>`. Also, be sure that the service principals in each respective environment have the right permissions to access this schema, which would be `USE_CATALOG`, `USE_SCHEMA`, `MODIFY`, `CREATE_MODEL`, and `CREATE_TABLE`. * ``input_unity_catalog_read_user_group``: If using [Models in Unity Catalog](https://docs.databricks.com/en/mlflow/models-in-uc.html#models-in-unity-catalog), define the name of the user group to grant `EXECUTE` (read & use model) privileges for the registered model. Defaults to "account users". * ``input_include_feature_store``: If selected, will provide [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) stack components including: project structure and sample feature Python modules, feature engineering notebooks, ML resource configs to provision and manage Feature Store jobs, and automated integration tests covering feature engineering and training. - * ``input_include_mlflow_recipes``: If selected, will provide [MLflow Recipes](https://mlflow.org/docs/latest/recipes.html) stack components, dividing the training pipeline into configurable steps and profiles. See the generated ``README.md`` for next steps! 
diff --git a/cookiecutter.json b/cookiecutter.json deleted file mode 100644 index 8df2587f..00000000 --- a/cookiecutter.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "__please_use_databricks_cli_for_project_creation": "Please refer to README.md" -} diff --git a/databricks_template_schema.json b/databricks_template_schema.json index 69fc4797..9f3a665f 100644 --- a/databricks_template_schema.json +++ b/databricks_template_schema.json @@ -285,38 +285,6 @@ } } }, - "input_include_mlflow_recipes": { - "order": 19, - "type": "string", - "description": "\nWhether to include MLflow Recipes", - "default": "no", - "enum": ["no", "yes"], - "skip_prompt_if": { - "anyOf":[ - { - "properties": { - "input_include_models_in_unity_catalog": { - "const": "yes" - } - } - }, - { - "properties": { - "input_include_feature_store": { - "const": "yes" - } - } - }, - { - "properties": { - "input_setup_cicd_and_project": { - "const": "CICD_Only" - } - } - } - ] - } - }, "input_docker_image": { "order": 20, "type": "string", diff --git a/template/update_layout.tmpl b/template/update_layout.tmpl index a666e23c..9b606b14 100644 --- a/template/update_layout.tmpl +++ b/template/update_layout.tmpl @@ -32,51 +32,13 @@ {{ skip (printf `%s/%s` $root_dir `_params_testing_only.txt`) }} {{ end }} -# Remove Delta and Feature Store code in cases of MLflow Recipes. 
-{{ if (eq .input_include_mlflow_recipes `yes`) }} - # delta_paths - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/notebooks/Train.py`) }} - # feature_store_paths - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `feature_engineering`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/feature_engineering`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/notebooks/TrainWithFeatureStore.py`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/feature-engineering-workflow-resource.yml`) }} -# Remove Delta and MLflow Recipes code in cases of Feature Store. -{{ else if (eq .input_include_feature_store `yes`) }} - # delta_paths - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/notebooks/Train.py`) }} - # recipe_paths - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/profiles`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/steps`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/data`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/__init__.py`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/notebooks/TrainWithMLflowRecipes.py`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/recipe.yaml`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/README.md`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/training/ingest_test.py`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/training/split_test.py`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore 
`tests/training/train_test.py`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/training/test_sample.parquet`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/training/transform_test.py`) }} -# Remove MLflow Recipes and Feature Store code in cases of Delta Table. +# Remove Feature Store code if not selected; remove Delta Train notebook if Feature Store is selected +{{ if (eq .input_include_feature_store `yes`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/Train.py`) }} {{ else }} - # recipe_paths - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/profiles`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/steps`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/data`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/__init__.py`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/notebooks/TrainWithMLflowRecipes.py`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/recipe.yaml`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/README.md`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/training/ingest_test.py`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/training/split_test.py`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/training/train_test.py`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/training/test_sample.parquet`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/training/transform_test.py`) }} - # feature_store_paths {{ skip (printf `%s/%s/%s` $root_dir 
$project_name_alphanumeric_underscore `feature_engineering`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/feature_engineering`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/notebooks/TrainWithFeatureStore.py`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/TrainWithFeatureStore.py`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/feature-engineering-workflow-resource.yml`) }} {{ end }} diff --git a/template/{{.input_root_dir}}/README.md.tmpl b/template/{{.input_root_dir}}/README.md.tmpl index 9894e69f..a95a4303 100644 --- a/template/{{.input_root_dir}}/README.md.tmpl +++ b/template/{{.input_root_dir}}/README.md.tmpl @@ -37,7 +37,7 @@ contained in the following files: │ │ │ ├── databricks.yml <- databricks.yml is the root bundle file for the ML project that can be loaded by databricks CLI bundles. It defines the bundle name, workspace URL and resource config component to be included. │ │ -{{- if and (eq .input_include_feature_store `no`) (eq .input_include_mlflow_recipes `no`) }} +{{- if (eq .input_include_feature_store `no`) }} │ ├── training <- Training folder contains Notebook that trains and registers the model. │ │ │ ├── validation <- Optional model validation step before deploying a model. @@ -93,45 +93,6 @@ contained in the following files: │ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment │ │ │ ├── monitoring-resource.yml <- ML resource config definition for quality monitoring workflow -{{- else }} -│ ├── training <- Folder for model development via MLflow recipes. -│ │ │ -│ │ ├── steps <- MLflow recipe steps (Python modules) implementing ML pipeline logic, e.g. model training and evaluation. Most -│ │ │ development work happens here. 
See https://mlflow.org/docs/latest/recipes.html for details -│ │ │ -│ │ ├── notebooks <- Databricks notebook that runs the MLflow recipe, i.e. run the logic in `steps`. Used to -│ │ │ drive code execution on Databricks for CI/CD. In most cases, you do not need to modify -│ │ │ the notebook. -│ │ │ -│ │ ├── recipe.yaml <- The main recipe configuration file that declaratively defines the attributes and behavior -│ │ │ of each recipe step, such as the input dataset to use for training a model or the -│ │ │ performance criteria for promoting a model to production. -│ │ │ -│ │ ├── profiles <- Environment-specific (e.g. dev vs test vs prod) configurations for MLflow recipes execution. -│ │ -│ │ -│ ├── validation <- Optional model validation step before deploying a model. -│ │ -│ ├── monitoring <- Model monitoring, feature monitoring, etc. -│ │ -│ ├── deployment <- Model deployment and endpoint deployment. -│ │ │ -│ │ ├── batch_inference <- Batch inference code that will run as part of scheduled workflow. -│ │ │ -│ │ ├── model_deployment <- As part of CD workflow, promote model to Production stage in model registry. -│ │ -│ ├── tests <- Unit tests for the ML project, including modules under `steps`. -│ │ -│ ├── resources <- ML resource (ML jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. 
-│ │ -│ ├── model-workflow-resource.yml <- ML resource config definition for model training, validation, deployment workflow -│ │ -│ ├── batch-inference-workflow-resource.yml <- ML resource config definition for batch inference workflow -│ │ -│ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment -│ │ -│ ├── monitoring-resource.yml <- ML resource config definition for quality monitoring workflow -{{- end }} {{- end }} │ {{- if or (eq .input_cicd_platform `github_actions`) (eq .input_cicd_platform `github_actions_for_github_enterprise_servers`) }} diff --git a/template/{{.input_root_dir}}/_params_testing_only.txt.tmpl b/template/{{.input_root_dir}}/_params_testing_only.txt.tmpl index 465f32a1..8c88d689 100644 --- a/template/{{.input_root_dir}}/_params_testing_only.txt.tmpl +++ b/template/{{.input_root_dir}}/_params_testing_only.txt.tmpl @@ -8,7 +8,6 @@ input_default_branch={{.input_default_branch}} input_release_branch={{.input_release_branch}} input_read_user_group={{.input_read_user_group}} input_include_feature_store={{.input_include_feature_store}} -input_include_mlflow_recipes={{.input_include_mlflow_recipes}} input_include_models_in_unity_catalog={{.input_include_models_in_unity_catalog}} input_schema_name={{.input_schema_name}} input_unity_catalog_read_user_group={{.input_unity_catalog_read_user_group}} diff --git a/template/{{.input_root_dir}}/docs/mlops-setup.md.tmpl b/template/{{.input_root_dir}}/docs/mlops-setup.md.tmpl index 2e26ef0b..abbc989a 100644 --- a/template/{{.input_root_dir}}/docs/mlops-setup.md.tmpl +++ b/template/{{.input_root_dir}}/docs/mlops-setup.md.tmpl @@ -6,8 +6,6 @@ * [Create a hosted Git repo](#create-a-hosted-git-repo) * [Configure CI/CD]({{ if (eq .input_cicd_platform `github_actions`) }}#configure-cicd---github-actions{{ else if (eq .input_cicd_platform `azure_devops`) }}#configure-cicd---azure-devops{{ else if (eq .input_cicd_platform `gitlab`) }}#configure-cicd---gitlab{{ end }}) {{- if (eq 
.input_setup_cicd_and_project `CICD_and_Project`)}} -{{- if (eq .input_include_mlflow_recipes `yes`) }} -* [Configure profiles for tests, staging, and prod](#configure-profiles-for-tests-staging-and-prod){{ end }} * [Merge PR with initial ML code](#merge-a-pr-with-your-initial-ml-code) {{- end }} {{ if not (eq .input_release_branch .input_default_branch) -}} @@ -355,17 +353,6 @@ add the value `.gitlab/pipelines/{{.input_project_name}}-triggers-cicd.yml` whic {{ end }} {{- if (eq .input_setup_cicd_and_project `CICD_and_Project`)}} -{{- if (eq .input_include_mlflow_recipes `yes`) }} -## Configure profiles for tests, staging, and prod -Address the TODOs in the following files: -* [databricks-dev.yaml](../{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-dev.yaml): specify recipe configs to use in dev workspace -* [databricks-staging.yaml](../{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-staging.yaml): specify recipe configs to use in recurring model training and batch inference - jobs that run in the staging workspace -* [databricks-prod.yaml](../{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-prod.yaml) specify recipe configs to use in recurring model training and batch inference - jobs that run in the prod workspace -* [databricks-test.yaml](../{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-test.yaml): specify recipe configs to use in integration tests(CI) -{{- end }} - ## Merge a PR with your initial ML code Create and push a PR branch adding the ML code to the repository. 
diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl index cdb7f98e..6368bca6 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl @@ -37,7 +37,7 @@ contained in the following files: │ │ │ ├── databricks.yml <- databricks.yml is the root bundle file for the ML project that can be loaded by databricks CLI bundles. It defines the bundle name, workspace URL and resource config component to be included. │ │ -{{- if and (eq .input_include_feature_store `no`) (eq .input_include_mlflow_recipes `no`) }} +{{- if (eq .input_include_feature_store `no`) }} │ ├── training <- Training folder contains Notebook that trains and registers the model. │ │ │ ├── validation <- Optional model validation step before deploying a model. @@ -93,44 +93,6 @@ contained in the following files: │ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment │ │ │ ├── monitoring-resource.yml <- ML resource config definition for quality monitoring workflow -{{- else }} -│ ├── training <- Folder for model development via MLflow recipes. -│ │ │ -│ │ ├── steps <- MLflow recipe steps (Python modules) implementing ML pipeline logic, e.g. model training and evaluation. Most -│ │ │ development work happens here. See https://mlflow.org/docs/latest/recipes.html for details -│ │ │ -│ │ ├── notebooks <- Databricks notebook that runs the MLflow recipe, i.e. run the logic in `steps`. Used to -│ │ │ drive code execution on Databricks for CI/CD. In most cases, you do not need to modify -│ │ │ the notebook. 
-│ │ │ -│ │ ├── recipe.yaml <- The main recipe configuration file that declaratively defines the attributes and behavior -│ │ │ of each recipe step, such as the input dataset to use for training a model or the -│ │ │ performance criteria for promoting a model to production. -│ │ │ -│ │ ├── profiles <- Environment-specific (e.g. dev vs test vs prod) configurations for MLflow recipes execution. -│ │ -│ │ -│ ├── validation <- Optional model validation step before deploying a model. -│ │ -│ ├── monitoring <- Model monitoring, feature monitoring, etc. -│ │ -│ ├── deployment <- Model deployment and endpoint deployment. -│ │ │ -│ │ ├── batch_inference <- Batch inference code that will run as part of scheduled workflow. -│ │ │ -│ │ ├── model_deployment <- As part of CD workflow, promote model to Production stage in model registry. -│ │ -│ ├── tests <- Unit tests for the ML project, including modules under `steps`. -│ │ -│ ├── resources <- ML resource (ML jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. -│ │ -│ ├── model-workflow-resource.yml <- ML resource config definition for model training, validation, deployment workflow -│ │ -│ ├── batch-inference-workflow-resource.yml <- ML resource config definition for batch inference workflow -│ │ -│ ├── ml-artifacts-resource.yml <- ML resource config definition for model and experiment -│ │ -│ ├── monitoring-resource.yml <- ML resource config definition for quality monitoring workflow {{- end }} ``` @@ -146,7 +108,7 @@ In each module, there is `compute_features_fn` method that you need to implement The output dataframe will be persisted in a [time-series Feature Store table]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "machine-learning/feature-store/time-series.html")) }}). See the example modules' documentation for more information. * Python unit tests for feature computation modules in `tests/feature_engineering` folder. 
-* Feature engineering notebook, `feature_engineering/notebooks/GenerateAndWriteFeatures.py`, that reads input dataframes, dynamically loads feature computation modules, executes their `compute_features_fn` method and writes the outputs to a Feature Store table (creating it if missing). +* Feature engineering notebook, `feature_engineering/GenerateAndWriteFeatures.py`, that reads input dataframes, dynamically loads feature computation modules, executes their `compute_features_fn` method and writes the outputs to a Feature Store table (creating it if missing). * Training notebook that [trains]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "machine-learning/feature-store/train-models-with-feature-store.html")) }} ) a regression model by creating a training dataset using the Feature Store client. * Model deployment and batch inference notebooks that deploy and use the trained model. * An automated integration test is provided (in `.github/workflows/{{ .input_project_name }}-run-tests.yml`) that executes a multi task run on Databricks involving the feature engineering and model training notebooks. @@ -200,7 +162,7 @@ Otherwise, e.g. if iterating on ML code for a new project, follow the steps belo You can iterate on the feature transform modules locally in your favorite IDE before running them on Databricks. #### Running code on Databricks -You can iterate on ML code by running the provided `feature_engineering/notebooks/GenerateAndWriteFeatures.py` notebook on Databricks using +You can iterate on ML code by running the provided `feature_engineering/GenerateAndWriteFeatures.py` notebook on Databricks using [Repos]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "repos/index.html")) }}). This notebook drives execution of the feature transforms code defined under ``features``. 
You can use multiple browser tabs to edit logic in `features` and run the feature engineering pipeline in the `GenerateAndWriteFeatures.py` notebook. diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks/BatchInference.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/BatchInference.py.tmpl similarity index 85% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks/BatchInference.py.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/BatchInference.py.tmpl index 99bff2b9..8f893a6b 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks/BatchInference.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/BatchInference.py.tmpl @@ -39,14 +39,7 @@ dbutils.widgets.text( # COMMAND ---------- -import os - -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path - -# COMMAND ---------- - -# MAGIC %pip install -r ../../../requirements.txt +# MAGIC %pip install -r ../../requirements.txt # COMMAND ---------- @@ -54,18 +47,7 @@ dbutils.library.restartPython() # COMMAND ---------- -import sys -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path -%cd .. 
-sys.path.append("../..") - -# COMMAND ---------- - # DBTITLE 1,Define input and output variables -{{- if (eq .input_include_models_in_unity_catalog "no") }} -from utils import get_deployed_model_stage_for_env{{end}} env = dbutils.widgets.get("env") input_table_name = dbutils.widgets.get("input_table_name") @@ -75,7 +57,7 @@ assert input_table_name != "", "input_table_name notebook parameter must be spec assert output_table_name != "", "output_table_name notebook parameter must be specified" assert model_name != "", "model_name notebook parameter must be specified" {{- if (eq .input_include_models_in_unity_catalog "no") }} -stage = get_deployed_model_stage_for_env(env) +stage = {"dev": "Staging", "staging": "Staging", "prod": "Production", "test": "Production"}[env] model_uri = f"models:/{model_name}/{stage}"{{else}} alias = "champion" model_uri = f"models:/{model_name}@{alias}"{{end}} diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/notebooks/ModelDeployment.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/ModelDeployment.py.tmpl similarity index 90% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/notebooks/ModelDeployment.py.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/ModelDeployment.py.tmpl index 64f526ce..951ba945 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/notebooks/ModelDeployment.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/ModelDeployment.py.tmpl @@ -29,15 +29,6 @@ dbutils.widgets.dropdown("env", "None", ["None", "staging", "prod"], "Environmen # COMMAND ---------- -import os -import sys -notebook_path 
= '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path -%cd .. -sys.path.append("../..") - -# COMMAND ---------- - from deploy import deploy model_uri = dbutils.jobs.taskValues.get("Train", "model_uri", debugValue="") diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/deploy.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/deploy.py.tmpl index f60c9617..c2677996 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/deploy.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/deploy.py.tmpl @@ -1,8 +1,4 @@ import sys -import pathlib - -sys.path.append(str(pathlib.Path(__file__).parent.parent.parent.resolve())) -{{if (eq .input_include_models_in_unity_catalog "no")}}from utils import get_deployed_model_stage_for_env{{end}} from mlflow.tracking import MlflowClient {{ if (eq .input_include_models_in_unity_catalog "no") }} @@ -19,7 +15,7 @@ def deploy(model_uri, env): _, model_name, version = model_uri.split("/") client = MlflowClient() mv = client.get_model_version(model_name, version) - target_stage = get_deployed_model_stage_for_env(env) + target_stage = {"dev": "Staging", "staging": "Staging", "prod": "Production", "test": "Production"}[env] if mv.current_stage != target_stage: client.transition_model_version_stage( name=model_name, diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/notebooks/GenerateAndWriteFeatures.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/GenerateAndWriteFeatures.py.tmpl similarity index 96% rename from template/{{.input_root_dir}}/{{template 
`project_name_alphanumeric_underscore` .}}/feature_engineering/notebooks/GenerateAndWriteFeatures.py.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/GenerateAndWriteFeatures.py.tmpl index c8ac612c..262dae93 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/notebooks/GenerateAndWriteFeatures.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/GenerateAndWriteFeatures.py.tmpl @@ -63,13 +63,6 @@ dbutils.widgets.text( # COMMAND ---------- -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path -%cd ../features - -# COMMAND ---------- - # DBTITLE 1,Define input and output variables input_table_path = dbutils.widgets.get("input_table_path") output_table_name = dbutils.widgets.get("output_table_name") diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/notebooks/MonitoredMetricViolationCheck.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/MonitoredMetricViolationCheck.py.tmpl similarity index 91% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/notebooks/MonitoredMetricViolationCheck.py.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/MonitoredMetricViolationCheck.py.tmpl index bfec9085..7db01bf1 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/notebooks/MonitoredMetricViolationCheck.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/MonitoredMetricViolationCheck.py.tmpl @@ -40,15 +40,6 @@ dbutils.widgets.text( # COMMAND 
---------- -import os -import sys -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path -%cd .. -sys.path.append("../..") - -# COMMAND ---------- - from metric_violation_check_query import sql_query table_name_under_monitor = dbutils.widgets.get("table_name_under_monitor") diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/metric_violation_check_query.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/metric_violation_check_query.py.tmpl index 1dc40225..7be5eb29 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/metric_violation_check_query.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/metric_violation_check_query.py.tmpl @@ -1,10 +1,5 @@ # This file is used for the main SQL query that checks the last {num_evaluation_windows} metric violations and whether at least {num_violation_windows} of those runs violate the condition. -import sys -import pathlib - -sys.path.append(str(pathlib.Path(__file__).parent.parent.parent.resolve())) - """The SQL query is divided into three main parts. The first part selects the top {num_evaluation_windows} values of the metric to be monitored, ordered by the time window, and saves as recent_metrics. 
```sql diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README.md.tmpl index a76b3ec0..231b1b44 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README.md.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README.md.tmpl @@ -163,7 +163,7 @@ Its central purpose is to evaluate a registered model and validate its quality b Model validation contains three components: * [model-workflow-resource.yml](./model-workflow-resource.yml) contains the resource config and input parameters for model validation. * [validation.py](../validation/validation.py) defines custom metrics and validation thresholds that are referenced by the above resource config files. -* [notebooks/ModelValidation](../validation/notebooks/ModelValidation.py) contains the validation job implementation. In most cases you don't need to modify this file. +* [ModelValidation](../validation/ModelValidation.py) contains the validation job implementation. In most cases you don't need to modify this file. To set up and enable model validation, update [validation.py](../validation/validation.py) to return desired custom metrics and validation thresholds, then resolve the `TODOs` in the ModelValidation task of [model-workflow-resource.yml](./model-workflow-resource.yml). @@ -177,9 +177,9 @@ Its central purpose is to track production model performances, feature distribut Monitoring contains four components: * [metric_violation_check_query.py](../monitoring/metric_violation_check_query.py) defines a query that checks for violation of the monitored metric. 
-* [notebooks/MonitoredMetricViolationCheck](../monitoring/notebooks/MonitoredMetricViolationCheck.py) acts as an entry point, executing the violation check query against the monitored inference table. +* [MonitoredMetricViolationCheck](../monitoring/MonitoredMetricViolationCheck.py) acts as an entry point, executing the violation check query against the monitored inference table. It emits a boolean value based on the query result. -* [monitoring-resource.yml](./monitoring-resource.yml) contains the resource config, inputs parameters for monitoring, and orchestrates model retraining based on monitoring. It first runs the [notebooks/MonitoredMetricViolationCheck](../monitoring/notebooks/MonitoredMetricViolationCheck.py) +* [monitoring-resource.yml](./monitoring-resource.yml) contains the resource config, input parameters for monitoring, and orchestrates model retraining based on monitoring. It first runs the [MonitoredMetricViolationCheck](../monitoring/MonitoredMetricViolationCheck.py) entry point then decides whether to execute the model retraining workflow. To set up and enable monitoring: @@ -226,7 +226,7 @@ resources: - task_key: batch_inference_job <<: *new_cluster notebook_task: - notebook_path: ../deployment/batch_inference/notebooks/BatchInference.py + notebook_path: ../deployment/batch_inference/BatchInference.py base_parameters: env: ${bundle.target} input_table_name: batch_inference_input_table_name @@ -234,12 +234,12 @@ resources: ``` The example above defines a Databricks job with name `${bundle.target}-{{ .input_project_name }}-batch-inference-job` -that runs the notebook under `{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks/BatchInference.py` to regularly apply your ML model for batch inference. +that runs the notebook under `{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/BatchInference.py` to regularly apply your ML model for batch inference. 
At the start of the resource definition, we declared an anchor `new_cluster` that will be referenced and used later. For more information about anchors in yaml schema, please refer to the [yaml documentation](https://yaml.org/spec/1.2.2/#3222-anchors-and-aliases). We specify a `batch_inference_job` under `resources/jobs` to define a databricks workflow with internal key `batch_inference_job` and job name `{bundle.target}-{{ .input_project_name }}-batch-inference-job`. -The workflow contains a single task with task key `batch_inference_job`. The task runs notebook `../deployment/batch_inference/notebooks/BatchInference.py` with provided parameters `env` and `input_table_name` passing to the notebook. +The workflow contains a single task with task key `batch_inference_job`. The task runs notebook `../deployment/batch_inference/BatchInference.py` with provided parameters `env` and `input_table_name` passed to the notebook. After setting up databricks CLI, you can run command `databricks bundle schema` to learn more about databricks CLI bundles schema. The notebook_path is the relative path starting from the resource yaml file. 
@@ -281,7 +281,7 @@ resources: - task_key: batch_inference_job <<: *new_cluster notebook_task: - notebook_path: ../deployment/batch_inference/notebooks/BatchInference.py + notebook_path: ../deployment/batch_inference/BatchInference.py base_parameters: env: ${bundle.target} input_table_name: ${var.batch_inference_input_table} diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml.tmpl index e11b6306..74b9fdaa 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml.tmpl @@ -20,7 +20,7 @@ resources: - task_key: batch_inference_job <<: *new_cluster notebook_task: - notebook_path: ../deployment/batch_inference/notebooks/BatchInference.py + notebook_path: ../deployment/batch_inference/BatchInference.py base_parameters: env: ${bundle.target} {{ if (eq .input_include_feature_store `yes`) }}{{ if (eq .input_include_models_in_unity_catalog `yes`) }}input_table_name: ${bundle.target}.{{ .input_schema_name }}.feature_store_inference_input # TODO: create input table for inference diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/feature-engineering-workflow-resource.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/feature-engineering-workflow-resource.yml.tmpl index 8818181e..fc9d8df4 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/feature-engineering-workflow-resource.yml.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` 
.}}/resources/feature-engineering-workflow-resource.yml.tmpl @@ -23,7 +23,7 @@ resources: - task_key: PickupFeatures job_cluster_key: write_feature_table_job_cluster notebook_task: - notebook_path: ../feature_engineering/notebooks/GenerateAndWriteFeatures.py + notebook_path: ../feature_engineering/GenerateAndWriteFeatures.py base_parameters: # TODO modify these arguments to reflect your setup. input_table_path: /databricks-datasets/nyctaxi-with-zipcodes/subsampled @@ -40,7 +40,7 @@ resources: - task_key: DropoffFeatures job_cluster_key: write_feature_table_job_cluster notebook_task: - notebook_path: ../feature_engineering/notebooks/GenerateAndWriteFeatures.py + notebook_path: ../feature_engineering/GenerateAndWriteFeatures.py base_parameters: # TODO: modify these arguments to reflect your setup. input_table_path: /databricks-datasets/nyctaxi-with-zipcodes/subsampled diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml.tmpl index 3c063b75..681e131c 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml.tmpl @@ -22,8 +22,8 @@ resources: tasks: - task_key: Train job_cluster_key: model_training_job_cluster - {{ if and (eq .input_include_feature_store `no`) (eq .input_include_mlflow_recipes `no`) }}notebook_task: - notebook_path: ../training/notebooks/Train.py + {{ if (eq .input_include_feature_store `no`) }}notebook_task: + notebook_path: ../training/Train.py base_parameters: env: ${bundle.target} # TODO: Update training_data_path @@ -34,7 +34,7 @@ resources: # git source information of current ML resource deployment. 
It will be persisted as part of the workflow run git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} {{ else if (eq .input_include_feature_store `yes`) }}notebook_task: - notebook_path: ../training/notebooks/TrainWithFeatureStore.py + notebook_path: ../training/TrainWithFeatureStore.py base_parameters: env: ${bundle.target} # TODO: Update training_data_path @@ -48,21 +48,14 @@ resources: {{- else -}}dropoff_features_table: ${var.catalog_name}.{{ .input_schema_name }}.trip_dropoff_features{{ end }} # git source information of current ML resource deployment. It will be persisted as part of the workflow run git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} - {{- else -}}notebook_task: - notebook_path: ../training/notebooks/TrainWithMLflowRecipes.py - base_parameters: - env: ${bundle.target} - # git source information of current ML resource deployment. It will be persisted as part of the workflow run - git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit}{{ end }} + {{ end }} - task_key: ModelValidation job_cluster_key: model_training_job_cluster depends_on: - task_key: Train notebook_task: - notebook_path: ../validation/notebooks/ModelValidation.py + notebook_path: ../validation/ModelValidation.py base_parameters: - {{- if (eq .input_include_mlflow_recipes `yes`) }} - env: ${bundle.target}{{ end }} experiment_name: ${var.experiment_name} # The `run_mode` defines whether model validation is enabled or not. 
# It can be one of the three values: @@ -80,7 +73,6 @@ resources: # Please refer to data parameter in mlflow.evaluate documentation https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate # TODO: update validation_input validation_input: SELECT * FROM delta.`dbfs:/databricks-datasets/nyctaxi-with-zipcodes/subsampled` - {{- if (eq .input_include_mlflow_recipes `no`) }} # A string describing the model type. The model type can be either "regressor" and "classifier". # Please refer to model_type parameter in mlflow.evaluate documentation https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate # TODO: update model_type @@ -88,7 +80,7 @@ resources: # The string name of a column from data that contains evaluation labels. # Please refer to targets parameter in mlflow.evaluate documentation https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate # TODO: targets - targets: fare_amount{{ end }} + targets: fare_amount # Specifies the name of the function in {{ .input_project_name }}/training_validation_deployment/validation/validation.py that returns custom metrics. # TODO(optional): custom_metrics_loader_function custom_metrics_loader_function: custom_metrics @@ -105,7 +97,7 @@ resources: depends_on: - task_key: ModelValidation notebook_task: - notebook_path: ../deployment/model_deployment/notebooks/ModelDeployment.py + notebook_path: ../deployment/model_deployment/ModelDeployment.py base_parameters: env: ${bundle.target} # git source information of current ML resource deployment. 
It will be persisted as part of the workflow run diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/monitoring-resource.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/monitoring-resource.yml.tmpl index c68d64c7..de14e471 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/monitoring-resource.yml.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/monitoring-resource.yml.tmpl @@ -41,7 +41,7 @@ resources: - task_key: monitored_metric_violation_check <<: *new_cluster notebook_task: - notebook_path: ../monitoring/notebooks/MonitoredMetricViolationCheck.py + notebook_path: ../monitoring/MonitoredMetricViolationCheck.py base_parameters: env: ${bundle.target} table_name_under_monitor: {{ .input_inference_table_name }} diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/training/ingest_test.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/training/ingest_test.py.tmpl deleted file mode 100644 index c74e8b02..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/training/ingest_test.py.tmpl +++ /dev/null @@ -1,22 +0,0 @@ -import pytest -import os -import tempfile -import pandas as pd -from pandas import DataFrame -from {{template `project_name_alphanumeric_underscore` .}}.training.steps.ingest import load_file_as_dataframe - - -@pytest.fixture -def sample_data(): - return pd.read_parquet( - os.path.join(os.path.dirname(__file__), "test_sample.parquet") - ) - - -def test_ingest_function_reads_csv_correctly(sample_data): - tempdir = tempfile.mkdtemp() - csv_path = os.path.join(tempdir, "test_sample.csv") - sample_data.to_csv(csv_path) - - ingested = load_file_as_dataframe(csv_path, "csv") - assert 
isinstance(ingested, DataFrame) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/training/split_test.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/training/split_test.py.tmpl deleted file mode 100644 index ffe5e34e..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/training/split_test.py.tmpl +++ /dev/null @@ -1,36 +0,0 @@ -import pytest -import os -import pandas as pd -from pandas import DataFrame -from {{template `project_name_alphanumeric_underscore` .}}.training.steps.split import process_splits - - -@pytest.fixture -def sample_data(): - return pd.read_parquet( - os.path.join(os.path.dirname(__file__), "test_sample.parquet") - ) - - -def test_post_split_fn_returns_datasets_with_correct_spec(sample_data): - train = sample_data[0:3] - validation = sample_data[4:7] - test = sample_data[7:10] - (train_processed, validation_processed, test_processed) = process_splits( - train, validation, test - ) - assert isinstance(train_processed, DataFrame) - assert isinstance(validation_processed, DataFrame) - assert isinstance(test_processed, DataFrame) - - -def test_post_split_fn_returns_non_empty_datasets(sample_data): - train = sample_data[0:3] - validation = sample_data[4:7] - test = sample_data[7:10] - (train_processed, validation_processed, test_processed) = process_splits( - train, validation, test - ) - assert not train_processed.empty - assert not validation_processed.empty - assert not test_processed.empty diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/training/test_sample.parquet b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/training/test_sample.parquet deleted file mode 100644 index 692734d1..00000000 Binary files a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` 
.}}/tests/training/test_sample.parquet and /dev/null differ diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/training/train_test.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/training/train_test.py.tmpl deleted file mode 100644 index 7764b1a3..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/training/train_test.py.tmpl +++ /dev/null @@ -1,13 +0,0 @@ -from {{template `project_name_alphanumeric_underscore` .}}.training.steps.train import estimator_fn -from sklearn.utils.estimator_checks import check_estimator - - -def test_train_fn_returns_object_with_correct_spec(): - regressor = estimator_fn() - assert callable(getattr(regressor, "fit", None)) - assert callable(getattr(regressor, "predict", None)) - - -def test_train_fn_passes_check_estimator(): - regressor = estimator_fn() - check_estimator(regressor) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/training/transform_test.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/training/transform_test.py.tmpl deleted file mode 100644 index ea119eea..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/tests/training/transform_test.py.tmpl +++ /dev/null @@ -1,7 +0,0 @@ -from {{template `project_name_alphanumeric_underscore` .}}.training.steps.transform import transformer_fn - - -def test_tranform_fn_returns_object_with_correct_spec(): - transformer = transformer_fn() - assert callable(getattr(transformer, "fit", None)) - assert callable(getattr(transformer, "transform", None)) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/README.md.tmpl deleted file mode 
100644 index efc2e449..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/README.md.tmpl +++ /dev/null @@ -1,132 +0,0 @@ -# ML Developer Guide using MLflow Recipes - -[(back to project README)](../README.md) - -## Table of contents -* [Initial setup](#initial-setup): adapting the provided example code to your ML problem -* [Iterating on ML code](#iterating-on-ml-code): making and testing ML code changes on Databricks or your local machine. -* [Next steps](#next-steps) - -## Initial setup -This folder contains example ML code to train a regression model to predict NYC taxi fares using -[MLflow recipes](https://mlflow.org/docs/latest/recipes.html). - -**Note**: MLflow Recipes currently supports regression and classification problems. Usage of MLflow Recipes is encouraged but not required: you can still use the provided -CI/CD and ML resource configs to build production ML pipelines, as long as you provide ML notebooks under `notebooks` -directory of the corresponding component, for example, model training notebooks in `{{template `project_name_alphanumeric_underscore` .}}/training/notebooks`, -batch inference notebook in `{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks`. -See code comments in files under `notebooks` for the expected interface & behavior of these notebooks. - -If you're not using MLflow Recipes, you can still follow the docs below to develop your ML code, skipping sections -that are targeted at MLflow Recipes users. Then, when you're ready -to productionize your ML project, ask your ops team to set up CI/CD and deploy -production jobs{{ if (eq .input_setup_cicd_and_project `CICD_and_Project`) }} per the [MLOps setup guide](../../docs/mlops-setup.md){{end}}. - -### Configure your ML pipeline -**This section assumes use of MLflow Recipes**. 
- -Address TODOs in the recipe configs under `{{template `project_name_alphanumeric_underscore` .}}/training/recipe.yaml`, `{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-dev.yaml`, -and `{{template `project_name_alphanumeric_underscore` .}}/training/profiles/local.yaml`, specifying configs such as the training dataset path(s) to use when developing -locally or on Databricks. - -For details on the meaning of recipe configurations, see the comments in [this example recipe.yaml](https://github.com/mlflow/recipes-regression-template/blob/main/recipe.yaml). -The purpose and behavior of the individual recipe steps (`ingest`, `train`, etc) being configured are also -described in detail in -the [Recipe overview](https://mlflow.org/docs/latest/recipes.html) -and [API documentation](https://mlflow.org/docs/latest/python_api/mlflow.recipes.html). - -After configuring your recipe, you can iterate on and test ML code under ``{{template `project_name_alphanumeric_underscore` .}}/training/steps``. -We expect most development to take place in the abovementioned YAML config files and -`{{template `project_name_alphanumeric_underscore` .}}/training/steps/train.py` (model training logic). - -## Iterating on ML code - -### Deploy ML code and resources to dev workspace using Bundles - -Refer to [Local development and dev workspace](../resources/README.md#local-development-and-dev-workspace) -to use databricks CLI bundles to deploy ML code together with ML resource configs to dev workspace. 
- -### Develop on Databricks using Databricks Repos - -#### Prerequisites -You'll need: -* Access to run commands on a cluster running Databricks Runtime ML version 11.0 or above in your dev Databricks workspace -* To set up [Databricks Repos]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "repos/index.html")) }}): see instructions below - -#### Configuring Databricks Repos -To use Repos, [set up git integration]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "repos/repos-setup.html")) }}) in your dev workspace. - -If the current project has already been pushed to a hosted Git repo, follow the -[UI workflow]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "repos/git-operations-with-repos#add-a-repo-and-connect-remotely-later")) }}) -to clone it into your dev workspace and iterate. - -Otherwise, e.g. if iterating on ML code for a new project, follow the steps below: -* Follow the [UI workflow]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "repos/git-operations-with-repos#add-a-repo-and-connect-remotely-later")) }}) - for creating a repo, but uncheck the "Create repo by cloning a Git repository" checkbox. -* Install the `dbx` CLI via `pip install --upgrade dbx` -* Run `databricks configure --profile {{ .input_project_name }}-dev --token --host `, passing the URL of your dev workspace. 
- This should prompt you to enter an API token -* [Create a personal access token]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/auth/pat.html")) }}) - in your dev workspace and paste it into the prompt from the previous step -* From within the root directory of the current project, use the [dbx sync](https://dbx.readthedocs.io/en/latest/guides/python/devloop/mixed/#using-dbx-sync-repo-for-local-to-repo-synchronization) tool to copy code files from your local machine into the Repo by running - `dbx sync repo --profile {{ .input_project_name }}-dev --source . --dest-repo your-repo-name`, where `your-repo-name` should be the last segment of the full repo name (`/Repos/username/your-repo-name`) - -#### Running code on Databricks -You can iterate on ML code by running the provided `{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py` notebook on Databricks using -[Repos]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "repos/index.html")) }}). This notebook drives execution of -the ML code defined under ``{{template `project_name_alphanumeric_underscore` .}}/training/steps``. You can use multiple browser tabs to edit -logic in `steps` and run the training recipe in the `TrainWithMLflowRecipes.py` notebook. - - -### Develop locally -**Note: this section assumes use of MLflow Recipes**. - -You can also iterate on ML code locally. - -#### Prerequisites -* Python 3.8+ -* Install model training and test dependencies via `pip install -I -r {{template `project_name_alphanumeric_underscore` .}}/requirements.txt -r test-requirements.txt` from project root directory. - -#### Trigger model training -Run `mlp run --profile local` to trigger training locally. See the -[MLflow recipes CLI docs](https://mlflow.org/docs/latest/recipes.html#key-concepts) for details. 
- -#### Inspect results in the UI -To facilitate saving and sharing results from local iteration with collaborators, we recommend configuring your -environment to log to a Databricks MLflow tracking server, as described in [this guide]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "mlflow/access-hosted-tracking-server.html")) }}). -Then, update `profiles/local.yaml` to use a Databricks tracking URI, -e.g. `databricks://` instead of a local `sqlite://` URI. You can then easily view model training results in the Databricks UI. - -If you prefer to log results locally (the default), you can view model training results by running the MLflow UI: - -```sh -mlflow ui \ - --backend-store-uri sqlite:///mlruns.db \ - --default-artifact-root ./mlruns \ - --host localhost -``` - -Then, open a browser tab pointing to [http://127.0.0.1:5000](http://127.0.0.1:5000) - -#### Run unit tests -You can run unit tests for your ML code via `pytest tests`. - -## Next Steps -If you're iterating on ML code for an existing, already-deployed ML project, -{{- if (eq .input_setup_cicd_and_project `CICD_and_Project`) }} -follow [Submitting a Pull Request](../../docs/ml-pull-request.md) -{{- else }} -ask your ops team to set up CI/CD by initializing MLOps Stacks with the `CICD_Only` parameter -{{- end }} -to submit your code for testing and production deployment. - -Otherwise, if exploring a new ML problem and satisfied with the results (e.g. you were able to train -a model with reasonable performance on your dataset), you may be ready to productionize your pipeline. -{{- if (eq .input_setup_cicd_and_project `CICD_and_Project`) }} -To do this, follow the [MLOps Setup Guide](../../docs/mlops-setup.md) to set up CI/CD and deploy -production training/inference pipelines. -{{- else }} -To do this, ask your ops team to set up CI/CD by initializing MLOps Stacks with the `CICD_Only` parameter. 
-{{- end }} - -[(back to project README)](../README.md) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/Train.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/Train.py.tmpl similarity index 95% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/Train.py.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/Train.py.tmpl index eb909157..3286e277 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/Train.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/Train.py.tmpl @@ -26,13 +26,7 @@ # COMMAND ---------- -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path - -# COMMAND ---------- - -# MAGIC %pip install -r ../../requirements.txt +# MAGIC %pip install -r ../requirements.txt # COMMAND ---------- diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithFeatureStore.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/TrainWithFeatureStore.py.tmpl similarity index 98% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithFeatureStore.py.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/TrainWithFeatureStore.py.tmpl index aa2e35f3..d19d3584 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithFeatureStore.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` 
.}}/training/TrainWithFeatureStore.py.tmpl @@ -26,13 +26,7 @@ # COMMAND ---------- -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path - -# COMMAND ---------- - -# MAGIC %pip install -r ../../requirements.txt +# MAGIC %pip install -r ../requirements.txt # COMMAND ---------- diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/__init__.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/__init__.py.tmpl deleted file mode 100644 index e69de29b..00000000 diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/data/sample.parquet b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/data/sample.parquet deleted file mode 100644 index 4efd1712..00000000 Binary files a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/data/sample.parquet and /dev/null differ diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py.tmpl deleted file mode 100644 index c06cc6c4..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py.tmpl +++ /dev/null @@ -1,140 +0,0 @@ -# Databricks notebook source -################################################################################## -# Model Training Notebook -## -# This notebook runs the MLflow Regression Recipe to train and registers an MLflow model in the model registry. 
-# It is configured and can be executed as the "Train" task in the model_training_job workflow defined under -# ``{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml`` -# -# NOTE: In general, we recommend that you do not modify this notebook directly, and instead update data-loading -# and model training logic in Python modules under the `steps` directory. -# Modifying this notebook can break model training CI/CD. -# -# However, if you do need to make changes (e.g. to remove the use of MLflow Recipes APIs), -# be sure to preserve the following interface expected by CI and the production model training job: -# -# Parameters: -# -# * env (optional): Name of the environment the notebook is run in (test, staging, or prod). -# You can add environment-specific logic to this notebook based on the value of this parameter, -# e.g. read training data from different tables or data sources across environments. -# The `databricks-dev` profile will be used if users manually run the notebook. -# The `databricks-test` profile will be used during CI test runs. -# This separates the potentially many runs/models logged during integration tests from -# runs/models produced by staging/production model training jobs. You can use the value of this -# parameter to further customize the behavior of this notebook based on whether it's running as -# a test or for recurring model training in staging/production. -# -# -# Return values: -# * model_uri: The notebook must return the URI of the registered model as notebook output specified through -# dbutils.notebook.exit() AND as a task value with key "model_uri" specified through -# dbutils.jobs.taskValues(...), for use by downstream notebooks. 
-# -# For details on MLflow Recipes and the individual split, transform, train etc steps below, including usage examples, -# see the Regression Recipe overview documentation: https://mlflow.org/docs/latest/recipes.html -# and Regression Recipes API documentation: https://mlflow.org/docs/latest/python_api/mlflow.recipes.html -################################################################################## - -# COMMAND ---------- -# MAGIC %load_ext autoreload -# MAGIC %autoreload 2 - -# COMMAND ---------- - -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path - -# COMMAND ---------- - -# MAGIC %pip install -r ../../requirements.txt - -# COMMAND ---------- - -dbutils.library.restartPython() - -# COMMAND ---------- - -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path -%cd ../ - -# COMMAND ---------- - -from mlflow.recipes import Recipe - -try: - # Profile from databricks workflow input will be used. - env = dbutils.widgets.get("env") - profile = f"databricks-{env}" -except: - # When the users manually run the notebook, we use the "databricks-dev" profile instead of staging, prod or test. - profile = "databricks-dev" - -# COMMAND ---------- - -from mlflow.recipes.utils import ( - get_recipe_config, - get_recipe_root_path, -) - -root_path = get_recipe_root_path() -config = get_recipe_config(root_path, profile) -if config['experiment']['name'].startswith(f"/{env}-{{template `experiment_base_name` .}}"): - print("WARNING: The experiment name may not have been set correctly. 
Please confirm that the experiment name in the profile YAML file matches the experiment_name variable in {{template `project_name_alphanumeric_underscore` .}}/databricks.yml.") - -# COMMAND ---------- - -r = Recipe(profile=profile) - -# COMMAND ---------- - -r.clean() - -# COMMAND ---------- - -r.inspect() - -# COMMAND ---------- - -r.run("ingest") - -# COMMAND ---------- - -r.run("split") - -# COMMAND ---------- - -r.run("transform") - -# COMMAND ---------- - -r.run("train") - -# COMMAND ---------- - -r.run("evaluate") - -# COMMAND ---------- - -r.run("register") - -# COMMAND ---------- - -r.inspect("train") - -# COMMAND ---------- - -test_data = r.get_artifact("test_data") -test_data.describe() - -# COMMAND ---------- - -model_version = r.get_artifact("registered_model_version") -model_uri = f"models:/{model_version.name}/{model_version.version}" -dbutils.jobs.taskValues.set("model_uri", model_uri) -dbutils.jobs.taskValues.set("model_name", model_version.name) -dbutils.jobs.taskValues.set("model_version", model_version.version) -dbutils.notebook.exit(model_uri) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-dev.yaml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-dev.yaml.tmpl deleted file mode 100644 index b8225b3e..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-dev.yaml.tmpl +++ /dev/null @@ -1,41 +0,0 @@ -# This profile contains config overrides for model development on Databricks -experiment: - # The name of the experiment to use during training or model validation in dev deployment target. - # TODO: This value must be the same as experiment_name defined in - # {{template `project_name_alphanumeric_underscore` .}}/databricks.yml for dev deployment target. 
- name: /dev-{{template `experiment_base_name` .}} - -# Set the registry server URI. This property is especially useful if you have a registry -# server that’s different from the tracking server. -model_registry: - # Specify the MLflow registered model name under which to register model versions - # This value must be the same as model_name defined in - # {{template `project_name_alphanumeric_underscore` .}}/databricks.yml for dev deployment target. - model_name: dev-{{template `model_name` .}} - -# Override the default train / validation / test dataset split ratios -SPLIT_RATIOS: [0.75, 0.125, 0.125] - -INGEST_CONFIG: - # For different options please read: https://github.com/mlflow/recipes-regression-template#ingest-step - # TODO: Specify the format of the dataset - using: spark_sql - # TODO: update this field to point to the path to your model training dataset on the Databricks workspace - # you use for development - sql: SELECT * FROM delta.`dbfs:/databricks-datasets/nyctaxi-with-zipcodes/subsampled` - loader_method: load_file_as_dataframe - -INGEST_SCORING_CONFIG: - # For different options please read: https://github.com/mlflow/recipes-regression-template#batch-scoring - # TODO: Specify the format of the dataset - using: spark_sql - # TODO: Specify the name/path of the input table for batch inference in your dev workspace here - sql: SELECT * FROM delta.`dbfs:/databricks-datasets/nyctaxi-with-zipcodes/subsampled` - loader_method: load_file_as_dataframe - -PREDICT_OUTPUT_CONFIG: - # For different options please read: https://github.com/mlflow/recipes-regression-template#predict-step - # Specify the output format of the batch scoring predict step - using: table - # TODO: Specify the name of the output table for batch inference in your dev workspace here - location: "{{ .input_project_name }}_batch_scoring" diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-prod.yaml.tmpl 
b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-prod.yaml.tmpl deleted file mode 100644 index bab5e6f1..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-prod.yaml.tmpl +++ /dev/null @@ -1,39 +0,0 @@ -experiment: - # The name of the experiment to use during training or model validation in prod deployment target. - # TODO: This value must be the same as experiment_name defined in - # {{template `project_name_alphanumeric_underscore` .}}/databricks.yml for prod deployment target. - name: /prod-{{template `experiment_base_name` .}} - -# Set the registry server URI. This property is especially useful if you have a registry -# server that’s different from the tracking server. -model_registry: - # The MLflow registered model name under which to register model versions - # This value must be the same as model_name defined in - # {{template `project_name_alphanumeric_underscore` .}}/databricks.yml for prod deployment target. 
- model_name: prod-{{template `model_name` .}} - -# Override the default train / validation / test dataset split ratios -SPLIT_RATIOS: [0.75, 0.125, 0.125] - -INGEST_CONFIG: - # For different options please read: https://github.com/mlflow/recipes-regression-template#ingest-step - # TODO: Specify the format of the dataset - using: spark_sql - # TODO: update this field to point to the path to your model training dataset on your production Databricks workspace - sql: SELECT * FROM delta.`dbfs:/databricks-datasets/nyctaxi-with-zipcodes/subsampled` - loader_method: load_file_as_dataframe - -INGEST_SCORING_CONFIG: - # For different options please read: https://github.com/mlflow/recipes-regression-template#batch-scoring - # TODO: Specify the format of the dataset - using: spark_sql - # TODO: Specify the name/path of the input table for batch inference in your prod workspace here - sql: SELECT * FROM delta.`dbfs:/databricks-datasets/nyctaxi-with-zipcodes/subsampled` - loader_method: load_file_as_dataframe - -PREDICT_OUTPUT_CONFIG: - # For different options please read: https://github.com/mlflow/recipes-regression-template#predict-step - # Specify the output format of the batch scoring predict step - using: table - # TODO: Specify the name of the output table for batch inference in your prod workspace here - location: "{{ .input_project_name }}_batch_scoring" diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-staging.yaml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-staging.yaml.tmpl deleted file mode 100644 index 0102f233..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-staging.yaml.tmpl +++ /dev/null @@ -1,41 +0,0 @@ -experiment: - # The name of the experiment to use during training or model validation in staging deployment target. 
- # TODO: This value must be the same as experiment_name defined in - # {{template `project_name_alphanumeric_underscore` .}}/databricks.yml for staging deployment target. - name: /staging-{{template `experiment_base_name` .}} - -# Set the registry server URI. This property is especially useful if you have a registry -# server that’s different from the tracking server. -model_registry: - # The MLflow registered model name under which to register model versions - # This value must be the same as model_name defined in - # {{template `project_name_alphanumeric_underscore` .}}/databricks.yml for staging deployment target. - model_name: staging-{{template `model_name` .}} - -# Override the default train / validation / test dataset split ratios -SPLIT_RATIOS: [0.75, 0.125, 0.125] - -INGEST_CONFIG: - # For different options please read: https://github.com/mlflow/recipes-regression-template#ingest-step - # TODO: Specify the format of the dataset - using: spark_sql - # TODO: update this field to point to the path to your model training dataset on your staging Databricks workspace, - # to be used in the staging instance of your ML jobs. For example, you may want to create a downsampled version of your - # full production dataset and specify its path here. 
- sql: SELECT * FROM delta.`dbfs:/databricks-datasets/nyctaxi-with-zipcodes/subsampled` - loader_method: load_file_as_dataframe - -INGEST_SCORING_CONFIG: - # For different options please read: https://github.com/mlflow/recipes-regression-template#batch-scoring - # TODO: Specify the format of the dataset - using: spark_sql - # TODO: Specify the name/path of the input table for batch inference in your staging workspace here - sql: SELECT * FROM delta.`dbfs:/databricks-datasets/nyctaxi-with-zipcodes/subsampled` - loader_method: load_file_as_dataframe - -PREDICT_OUTPUT_CONFIG: - # For different options please read: https://github.com/mlflow/recipes-regression-template#predict-step - # Specify the output format of the batch scoring predict step - using: table - # TODO: Specify the name of the output table for batch inference in your prod workspace here - location: "{{ .input_project_name }}_batch_scoring" diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-test.yaml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-test.yaml.tmpl deleted file mode 100644 index 4b5171cd..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/profiles/databricks-test.yaml.tmpl +++ /dev/null @@ -1,42 +0,0 @@ -experiment: - # The name of the experiment to use during training or model validation in test deployment target. - # TODO: This value must be the same as experiment_name defined in - # {{template `project_name_alphanumeric_underscore` .}}/databricks.yml for test deployment target. - name: /test-{{template `experiment_base_name` .}} - -# Set the registry server URI. This property is especially useful if you have a registry -# server that’s different from the tracking server. 
-model_registry: - # Specifies the name of the Registered Model to use when registering a trained model to - # the MLflow Model Registry - # This value must be the same as model_name defined in - # {{template `project_name_alphanumeric_underscore` .}}/databricks.yml for test deployment target. - model_name: test-{{template `model_name` .}} - -# Override the default train / validation / test dataset split ratios -SPLIT_RATIOS: [0.75, 0.125, 0.125] - -INGEST_CONFIG: - # For different options please read: https://github.com/mlflow/recipes-regression-template#ingest-step - # TODO: Specify the format of the dataset - using: spark_sql - # TODO: update this field to point to the path to your model training dataset on your staging Databricks workspace, - # to be used in tests. For example, you may want to create a down-sampled version of your full production dataset - # and specify its path here. - sql: SELECT * FROM delta.`dbfs:/databricks-datasets/nyctaxi-with-zipcodes/subsampled` - loader_method: load_file_as_dataframe - -INGEST_SCORING_CONFIG: - # For different options please read: https://github.com/mlflow/recipes-regression-template#batch-scoring - # TODO: Specify the format of the dataset - using: spark_sql - # TODO: Specify the name/path of the input table for batch inference tests here - sql: SELECT * FROM delta.`dbfs:/databricks-datasets/nyctaxi-with-zipcodes/subsampled` - loader_method: load_file_as_dataframe - -PREDICT_OUTPUT_CONFIG: - # For different options please read: https://github.com/mlflow/recipes-regression-template#predict-step - # Specify the output format of the batch scoring predict step - using: table - # TODO: Specify the name of the output table for batch inference in tests here - location: "{{ .input_project_name }}_batch_scoring_test" diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/profiles/local.yaml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` 
.}}/training/profiles/local.yaml.tmpl deleted file mode 100644 index 55c886bc..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/profiles/local.yaml.tmpl +++ /dev/null @@ -1,38 +0,0 @@ -experiment: - name: "/Shared/{{ .input_project_name }}" - tracking_uri: "sqlite:///mlruns.db" - artifact_location: "./mlruns" - -model_registry: - # Specifies the name of the Registered Model to use when registering a trained model to - # the MLflow Model Registry - model_name: {{`{{`}}MODEL_NAME|default('{{ .input_project_name }}_model'){{`}}`}} - -# Override the default train / validation / test dataset split ratios -SPLIT_RATIOS: [0.80, 0.10, 0.10] - -INGEST_CONFIG: - # For different options please read: https://github.com/mlflow/recipes-regression-template#ingest-step - # TODO: Specify the format of the dataset - using: parquet - # TODO: update this field to point to a local filesystem path containing e.g. a sample of your model training - # dataset for local development. - location: "../data/sample.parquet" - loader_method: load_file_as_dataframe - -INGEST_SCORING_CONFIG: - # For different options please read: https://github.com/mlflow/recipes-regression-template#batch-scoring - # Use a larger section of the TLC Trip Record Dataset for the batch scoring feature - # Specify the format of the dataset - using: parquet - # TODO: update this field to point to a local filesystem path containing e.g. 
a sample of your input dataset - # for batch inference - location: "../data/sample.parquet" - loader_method: load_file_as_dataframe - -PREDICT_OUTPUT_CONFIG: - # For different options please read: https://github.com/mlflow/recipes-regression-template#predict-step - # Specify the output format of the batch scoring predict step - using: parquet - # Specify the output location of the batch scoring predict step - location: "./data/sample_output.parquet" diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/recipe.yaml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/recipe.yaml.tmpl deleted file mode 100644 index c1194e9b..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/recipe.yaml.tmpl +++ /dev/null @@ -1,58 +0,0 @@ -# `recipe.yaml` is the main configuration file for an MLflow Recipe. -# Required recipe parameters should be defined in this file with either concrete values or -# variables such as {{`{{ INGEST_DATA_LOCATION }}.`}} - -# Variables must be dereferenced in a profile YAML file, located under `profiles/`. -# See `profiles/local.yaml` for example usage. One may switch among profiles quickly by -# providing a profile name such as `local` in the Recipe object constructor: -# `r = Recipe(profile="local")` -# -# NOTE: YAML does not support tabs for indentation. Please use spaces and ensure that all YAML -# files are properly formatted. - -recipe: "regression/v1" -# Specifies the name of the column containing targets / labels for model training and evaluation -target_col: "fare_amount" -# Sets the primary metric to use to evaluate model performance. 
This primary metric is used -# to sort MLflow Runs corresponding to the recipe in the MLflow Tracking UI -primary_metric: "root_mean_squared_error" -steps: - ingest: {{`{{INGEST_CONFIG}}`}} - split: - # Train/validation/test split ratios - split_ratios: {{`{{SPLIT_RATIOS|default([0.75, 0.125, 0.125])}}`}} - # Specifies the method to use to perform additional processing and cleaning on split datasets - post_split_method: process_splits - transform: - # Specifies the method that defines the data transformations to apply during model inference - transformer_method: transformer_fn - train: - # Specifies the method that defines the estimator type and parameters to use for model training - using: custom - estimator_method: estimator_fn - evaluate: - # Sets performance thresholds that a trained model must meet in order to be eligible for - # registration to the MLflow Model Registry - # TODO: specify pre-deployment validation criteria to apply to fitted models here - validation_criteria: - - metric: root_mean_squared_error - threshold: 10 - - metric: mean_absolute_error - threshold: 50 - - metric: weighted_mean_squared_error - threshold: 20 - register: - # Indicates whether or not a model that fails to meet performance thresholds should still - # be registered to the MLflow Model Registry - allow_non_validated_model: false - ingest_scoring: {{`{{INGEST_SCORING_CONFIG}}`}} - predict: - output: {{`{{PREDICT_OUTPUT_CONFIG}}`}} - # model_uri: "models/model.pkl" -# Defines custom performance metrics to compute during model training and evaluation -# TODO: specify custom metrics for model training here, or remove them if not applicable -custom_metrics: - - name: weighted_mean_squared_error - # Specifies the name of the function in `steps/custom_metrics.py` to use to compute the metric - function: weighted_mean_squared_error - greater_is_better: False diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/__init__.py.tmpl 
b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/__init__.py.tmpl deleted file mode 100644 index e69de29b..00000000 diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/custom_metrics.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/custom_metrics.py.tmpl deleted file mode 100644 index 23d5eb25..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/custom_metrics.py.tmpl +++ /dev/null @@ -1,47 +0,0 @@ -""" -This module defines custom metric functions that are invoked during the 'train' and 'evaluate' -steps to provide model performance insights. Custom metric functions defined in this module are -referenced in the ``metrics`` section of ``recipe.yaml``, for example: - -.. code-block:: yaml - :caption: Example custom metrics definition in ``recipe.yaml`` - - metrics: - custom: - - name: weighted_mean_squared_error - function: weighted_mean_squared_error - greater_is_better: False -""" - -from typing import Dict - -from pandas import DataFrame -from sklearn.metrics import mean_squared_error - - -def weighted_mean_squared_error( - eval_df: DataFrame, - builtin_metrics: Dict[str, int], # pylint: disable=unused-argument -) -> int: - """ - Computes the weighted mean squared error (MSE) metric. - - :param eval_df: A Pandas DataFrame containing the following columns: - - - ``"prediction"``: Predictions produced by submitting input data to the model. - - ``"target"``: Ground truth values corresponding to the input data. - - :param builtin_metrics: A dictionary containing the built-in metrics that are calculated - automatically during model evaluation. The keys are the names of the - metrics and the values are the scalar values of the metrics. For more - information, see - https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate. 
- :return: A single-entry dictionary containing the MSE metric. The key is the metric name and - the value is the scalar metric value. Note that custom metric functions can return - dictionaries with multiple metric entries as well. - """ - return mean_squared_error( - eval_df["prediction"], - eval_df["target"], - sample_weight=1 / eval_df["prediction"].values, - ) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/ingest.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/ingest.py.tmpl deleted file mode 100644 index 7dfa89fc..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/ingest.py.tmpl +++ /dev/null @@ -1,40 +0,0 @@ -""" -This module defines the following routines used by the 'ingest' step of the regression recipe: - -- ``load_file_as_dataframe``: Defines customizable logic for parsing dataset formats that are not - natively parsed by MLflow Recipes (i.e. formats other than Parquet, Delta, and Spark SQL). -""" - -import logging - -from pandas import DataFrame - -_logger = logging.getLogger(__name__) - - -def load_file_as_dataframe(file_path: str, file_format: str) -> DataFrame: - """ - Load content from the specified dataset file as a Pandas DataFrame. - - This method is used to load dataset types that are not natively managed by MLflow Recipes - (datasets that are not in Parquet, Delta Table, or Spark SQL Table format). This method is - called once for each file in the dataset, and MLflow Recipes automatically combines the - resulting DataFrames together. - - :param file_path: The path to the dataset file. - :param file_format: The file format string, such as "csv". - :return: A Pandas DataFrame representing the content of the specified file. 
- """ - - if file_format == "csv": - import pandas - - _logger.warning( - "Loading dataset CSV using `pandas.read_csv()` with default arguments and assumed index" - " column 0 which may not produce the desired schema. If the schema is not correct, you" - " can adjust it by modifying the `load_file_as_dataframe()` function in" - " `steps/ingest.py`" - ) - return pandas.read_csv(file_path, index_col=0) - else: - raise NotImplementedError diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/split.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/split.py.tmpl deleted file mode 100644 index 5fa7f926..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/split.py.tmpl +++ /dev/null @@ -1,37 +0,0 @@ -""" -This module defines the following routines used by the 'split' step of the regression recipe: - -- ``process_splits``: Defines customizable logic for processing & cleaning the training, validation, - and test datasets produced by the data splitting procedure. -""" - -from pandas import DataFrame - - -def process_splits( - train_df: DataFrame, validation_df: DataFrame, test_df: DataFrame -) -> (DataFrame, DataFrame, DataFrame): - """ - Perform additional processing on the split datasets. - - :param train_df: The training dataset produced by the data splitting procedure. - :param validation_df: The validation dataset produced by the data splitting procedure. - :param test_df: The test dataset produced by the data splitting procedure. - :return: A tuple containing, in order: the processed training dataset, the processed - validation dataset, and the processed test dataset. 
- """ - - def process(df: DataFrame): - # Drop invalid data points - cleaned = df.dropna() - # Filter out invalid fare amounts and trip distance - cleaned = cleaned[ - (cleaned["fare_amount"] > 0) - & (cleaned["trip_distance"] < 400) - & (cleaned["trip_distance"] > 0) - & (cleaned["fare_amount"] < 1000) - ] - - return cleaned - - return process(train_df), process(validation_df), process(test_df) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/train.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/train.py.tmpl deleted file mode 100644 index c61b2bd8..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/train.py.tmpl +++ /dev/null @@ -1,17 +0,0 @@ -""" -This module defines the following routines used by the 'train' step of the regression recipe: - -- ``estimator_fn``: Defines the customizable estimator type and parameters that are used - during training to produce a model pipeline. -""" - - -def estimator_fn(): - """ - Returns an *unfitted* estimator that defines ``fit()`` and ``predict()`` methods. - The estimator's input and output signatures should be compatible with scikit-learn - estimators. 
- """ - from sklearn.linear_model import SGDRegressor - - return SGDRegressor(random_state=42) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/transform.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/transform.py.tmpl deleted file mode 100644 index 5e186987..00000000 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/steps/transform.py.tmpl +++ /dev/null @@ -1,62 +0,0 @@ -""" -This module defines the following routines used by the 'transform' step of the regression recipe: - -- ``transformer_fn``: Defines customizable logic for transforming input data before it is passed - to the estimator during model inference. -""" - -from pandas import DataFrame -from sklearn.compose import ColumnTransformer -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer - - -def calculate_features(df: DataFrame): - """ - Extend the input dataframe with pickup day of week and hour, and trip duration. - Drop the now-unneeded pickup datetime and dropoff datetime columns. - """ - df["pickup_dow"] = df["tpep_pickup_datetime"].dt.dayofweek - df["pickup_hour"] = df["tpep_pickup_datetime"].dt.hour - trip_duration = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"] - df["trip_duration"] = trip_duration.map(lambda x: x.total_seconds() / 60) - df.drop(columns=["tpep_pickup_datetime", "tpep_dropoff_datetime"], inplace=True) - return df - - -def transformer_fn(): - """ - Returns an *unfitted* transformer that defines ``fit()`` and ``transform()`` methods. - The transformer's input and output signatures should be compatible with scikit-learn - transformers. 
- """ - return Pipeline( - steps=[ - ( - "calculate_time_and_duration_features", - FunctionTransformer(calculate_features, feature_names_out="one-to-one"), - ), - ( - "encoder", - ColumnTransformer( - transformers=[ - ( - "hour_encoder", - OneHotEncoder(categories="auto", sparse_output=False), - ["pickup_hour"], - ), - ( - "day_encoder", - OneHotEncoder(categories="auto", sparse_output=False), - ["pickup_dow"], - ), - ( - "std_scaler", - StandardScaler(), - ["trip_distance", "trip_duration"], - ), - ] - ), - ), - ] - ) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/notebooks/ModelValidation.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/ModelValidation.py.tmpl similarity index 86% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/notebooks/ModelValidation.py.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/ModelValidation.py.tmpl index c2852616..d0aee26e 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/notebooks/ModelValidation.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/ModelValidation.py.tmpl @@ -24,10 +24,10 @@ {{end -}} # Baseline model is a requirement for relative change and absolute change validation thresholds. # * validation_input - Validation input. Please refer to data parameter in mlflow.evaluate documentation https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate -{{ if (eq .input_include_mlflow_recipes `no`) }}# * model_type - A string describing the model type. The model type can be either "regressor" and "classifier". +# * model_type - A string describing the model type. The model type can be either "regressor" and "classifier". 
# Please refer to model_type parameter in mlflow.evaluate documentation https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate # * targets - The string name of a column from data that contains evaluation labels. -# Please refer to targets parameter in mlflow.evaluate documentation https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate{{ end }} +# Please refer to targets parameter in mlflow.evaluate documentation https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate # * custom_metrics_loader_function - Specifies the name of the function in {{ .input_project_name }}/validation/validation.py that returns custom metrics. # * validation_thresholds_loader_function - Specifies the name of the function in {{ .input_project_name }}/validation/validation.py that returns model validation thresholds. # @@ -43,13 +43,7 @@ # COMMAND ---------- -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path - -# COMMAND ---------- - -# MAGIC %pip install -r ../../requirements.txt +# MAGIC %pip install -r ../requirements.txt # COMMAND ---------- @@ -57,16 +51,6 @@ dbutils.library.restartPython() # COMMAND ---------- -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path -%cd ../ - -# COMMAND ---------- - -{{ if (eq .input_include_mlflow_recipes `yes`) }}dbutils.widgets.dropdown( - "env", "prod", ["staging", "prod"], "Environment(for input data)" -){{ end -}} dbutils.widgets.text( "experiment_name", "/dev-{{template `experiment_base_name` .}}", @@ -75,9 +59,8 @@ dbutils.widgets.text( dbutils.widgets.dropdown("run_mode", "disabled", ["disabled", "dry_run", "enabled"], "Run Mode") dbutils.widgets.dropdown("enable_baseline_comparison", "false", ["true", "false"], "Enable Baseline Comparison") 
dbutils.widgets.text("validation_input", "SELECT * FROM delta.`dbfs:/databricks-datasets/nyctaxi-with-zipcodes/subsampled`", "Validation Input") -{{ if (eq .input_include_mlflow_recipes `no`) }} dbutils.widgets.text("model_type", "regressor", "Model Type") -dbutils.widgets.text("targets", "fare_amount", "Targets"){{ end }} +dbutils.widgets.text("targets", "fare_amount", "Targets") dbutils.widgets.text("custom_metrics_loader_function", "custom_metrics", "Custom Metrics Loader Function") dbutils.widgets.text("validation_thresholds_loader_function", "validation_thresholds", "Validation Thresholds Loader Function") dbutils.widgets.text("evaluator_config_loader_function", "evaluator_config", "Evaluator Config Loader Function") @@ -115,11 +98,6 @@ import mlflow import os import tempfile import traceback -{{ if (eq .input_include_mlflow_recipes `yes`) }}from mlflow.recipes.utils import ( - get_recipe_config, - get_recipe_name, - get_recipe_root_path, -){{ end }} from mlflow.tracking.client import MlflowClient {{ if (eq .input_include_models_in_unity_catalog "no") }} client = MlflowClient() @@ -130,30 +108,6 @@ mlflow.set_registry_uri('databricks-uc') # set experiment experiment_name = dbutils.widgets.get("experiment_name") mlflow.set_experiment(experiment_name) -{{ if (eq .input_include_mlflow_recipes `yes`) }}env = dbutils.widgets.get("env") -assert env, "env notebook parameter must be specified" - -def get_model_type_from_recipe(): - try: - recipe_config = get_recipe_config("../training", f"databricks-{env}") - problem_type = recipe_config.get("recipe").split("/")[0] - if problem_type.lower() == "regression": - return "regressor" - elif problem_type.lower() == "classification": - return "classifier" - else: - raise Exception(f"Unsupported recipe {recipe_config}") - except Exception as ex: - print(f"Not able to get model type from mlflow recipe databricks-{env}.") - raise ex - -def get_targets_from_recipe(): - try: - recipe_config = get_recipe_config("../training", 
f"databricks-{env}") - return recipe_config.get("target_col") - except Exception as ex: - print(f"Not able to get targets from mlflow recipe databricks-{env}.") - raise ex{{ end }} # set model evaluation parameters that can be inferred from the job model_uri = dbutils.jobs.taskValues.get("Train", "model_uri", debugValue="") model_name = dbutils.jobs.taskValues.get("Train", "model_name", debugValue="") @@ -192,10 +146,9 @@ validation_input = dbutils.widgets.get("validation_input") assert validation_input data = spark.sql(validation_input) -{{ if (eq .input_include_mlflow_recipes `no`) }}model_type = dbutils.widgets.get("model_type") +model_type = dbutils.widgets.get("model_type") targets = dbutils.widgets.get("targets") -{{ else }}model_type = get_model_type_from_recipe() -targets = get_targets_from_recipe(){{ end }} + assert model_type assert targets diff --git a/tests/example-project-configs/aws/aws-github.json b/tests/example-project-configs/aws/aws-github.json index 30b6e596..53c2ef3a 100644 --- a/tests/example-project-configs/aws/aws-github.json +++ b/tests/example-project-configs/aws/aws-github.json @@ -10,7 +10,6 @@ "input_release_branch": "release", "input_read_user_group": "users", "input_include_feature_store": "no", - "input_include_mlflow_recipes": "no", "input_include_models_in_unity_catalog": "yes", "input_schema_name": "test_project_schema_name", "input_unity_catalog_read_user_group": "account users", diff --git a/tests/example-project-configs/azure/azure-devops.json b/tests/example-project-configs/azure/azure-devops.json index c38784b6..bc5df0bf 100644 --- a/tests/example-project-configs/azure/azure-devops.json +++ b/tests/example-project-configs/azure/azure-devops.json @@ -10,7 +10,6 @@ "input_release_branch": "release", "input_read_user_group": "users", "input_include_feature_store": "no", - "input_include_mlflow_recipes": "no", "input_include_models_in_unity_catalog": "yes", "input_schema_name": "test_project_schema_name", 
"input_unity_catalog_read_user_group": "account users", diff --git a/tests/example-project-configs/azure/azure-github.json b/tests/example-project-configs/azure/azure-github.json index 1e405b3a..9960817d 100644 --- a/tests/example-project-configs/azure/azure-github.json +++ b/tests/example-project-configs/azure/azure-github.json @@ -10,7 +10,6 @@ "input_release_branch": "release", "input_read_user_group": "users", "input_include_feature_store": "no", - "input_include_mlflow_recipes": "no", "input_include_models_in_unity_catalog": "yes", "input_schema_name": "test_project_schema_name", "input_unity_catalog_read_user_group": "account users", diff --git a/tests/example-project-configs/azure/azure-gitlab.json b/tests/example-project-configs/azure/azure-gitlab.json index 547ca50e..14301bdf 100644 --- a/tests/example-project-configs/azure/azure-gitlab.json +++ b/tests/example-project-configs/azure/azure-gitlab.json @@ -10,7 +10,6 @@ "input_release_branch": "release", "input_read_user_group": "users", "input_include_feature_store": "no", - "input_include_mlflow_recipes": "no", "input_include_models_in_unity_catalog": "yes", "input_schema_name": "test_project_schema_name", "input_unity_catalog_read_user_group": "account users", diff --git a/tests/example-project-configs/gcp/gcp-github.json b/tests/example-project-configs/gcp/gcp-github.json index 45e39adb..4e739433 100644 --- a/tests/example-project-configs/gcp/gcp-github.json +++ b/tests/example-project-configs/gcp/gcp-github.json @@ -9,7 +9,6 @@ "input_release_branch": "release", "input_read_user_group": "users", "input_include_feature_store": "no", - "input_include_mlflow_recipes": "no", "input_include_models_in_unity_catalog": "yes", "input_schema_name": "test_project_schema_name", "input_unity_catalog_read_user_group": "account users", diff --git a/tests/test_create_project.py b/tests/test_create_project.py index deb981dd..b83e1099 100644 --- a/tests/test_create_project.py +++ b/tests/test_create_project.py @@ -24,7 
+24,6 @@ "input_release_branch": "release", "input_read_user_group": "users", "input_include_feature_store": "no", - "input_include_mlflow_recipes": "no", "input_include_models_in_unity_catalog": "no", "input_schema_name": "schema_name", "input_unity_catalog_read_user_group": "account users", @@ -176,7 +175,6 @@ def test_generate_project_with_default_values( cicd_platform, setup_cicd_and_project, include_feature_store, - include_mlflow_recipes, include_models_in_unity_catalog, ): """ @@ -218,7 +216,6 @@ def prepareContext( cicd_platform, setup_cicd_and_project, include_feature_store, - include_mlflow_recipes, include_models_in_unity_catalog, ): context = { @@ -230,8 +227,6 @@ def prepareContext( } if include_feature_store != "": context["input_include_feature_store"] = include_feature_store - if include_mlflow_recipes != "": - context["input_include_mlflow_recipes"] = include_mlflow_recipes if include_models_in_unity_catalog != "": context["input_include_models_in_unity_catalog"] = ( include_models_in_unity_catalog @@ -247,7 +242,6 @@ def test_generate_project_check_delta_output( cicd_platform, setup_cicd_and_project, include_feature_store, - include_mlflow_recipes, include_models_in_unity_catalog, ): """ @@ -261,7 +255,6 @@ def test_generate_project_check_delta_output( cicd_platform, setup_cicd_and_project, include_feature_store, - include_mlflow_recipes, include_models_in_unity_catalog, ) generate(tmpdir, databricks_cli, context=context) @@ -275,7 +268,7 @@ def test_generate_project_check_delta_output( ) if ( setup_cicd_and_project != "CICD_Only" - and include_mlflow_recipes == "no" + and include_feature_store == "no" ): assert os.path.isfile(delta_notebook_path) @@ -291,7 +284,6 @@ def test_generate_project_check_feature_store_output( cicd_platform, setup_cicd_and_project, include_feature_store, - include_mlflow_recipes, include_models_in_unity_catalog, ): """ @@ -305,7 +297,6 @@ def test_generate_project_check_feature_store_output( cicd_platform, 
setup_cicd_and_project, include_feature_store, - include_mlflow_recipes, include_models_in_unity_catalog, ) generate(tmpdir, databricks_cli, context=context) @@ -323,46 +314,6 @@ def test_generate_project_check_feature_store_output( assert not os.path.isfile(fs_notebook_path) -@parametrize_by_project_generation_params -def test_generate_project_check_recipe_output( - tmpdir, - databricks_cli, - cloud, - cicd_platform, - setup_cicd_and_project, - include_feature_store, - include_mlflow_recipes, - include_models_in_unity_catalog, -): - """ - Asserts the behavior of MLflow Recipes-related artifacts when generating MLOps Stacks. - """ - if cloud == "gcp" and include_models_in_unity_catalog == "yes": - # Skip test for GCP with Unity Catalog - return - context = prepareContext( - cloud, - cicd_platform, - setup_cicd_and_project, - include_feature_store, - include_mlflow_recipes, - include_models_in_unity_catalog, - ) - generate(tmpdir, databricks_cli, context=context) - recipe_notebook_path = ( - tmpdir - / TEST_PROJECT_NAME - / TEST_PROJECT_DIRECTORY - / "training" - / "notebooks" - / "TrainWithMLflowRecipes.py" - ) - if setup_cicd_and_project != "CICD_Only" and include_mlflow_recipes == "yes": - assert os.path.isfile(recipe_notebook_path) - else: - assert not os.path.isfile(recipe_notebook_path) - - @pytest.mark.parametrize( "workspace_url_suffix", [ diff --git a/tests/test_github_actions.py b/tests/test_github_actions.py index 151f95e8..62c8c18c 100644 --- a/tests/test_github_actions.py +++ b/tests/test_github_actions.py @@ -12,14 +12,14 @@ "cicd_platform", ["github_actions", "github_actions_for_github_enterprise_servers"] ) @pytest.mark.parametrize( - "setup_cicd_and_project,include_feature_store,include_mlflow_recipes,include_models_in_unity_catalog", + "setup_cicd_and_project,include_feature_store,include_models_in_unity_catalog", [ - ("CICD_and_Project", "no", "no", "no"), - ("CICD_and_Project", "no", "no", "yes"), - ("CICD_and_Project", "no", "yes", "no"), - 
("CICD_and_Project", "yes", "no", "no"), - ("CICD_and_Project", "yes", "no", "yes"), - ("CICD_Only", "no", "no", "no"), + ("CICD_and_Project", "no", "no"), + ("CICD_and_Project", "no", "yes"), + ("CICD_and_Project", "no", "no"), + ("CICD_and_Project", "yes", "no"), + ("CICD_and_Project", "yes", "yes"), + ("CICD_Only", "no", "no"), ], ) @parametrize_by_cloud @@ -49,12 +49,12 @@ def test_generated_yaml_format( "cicd_platform", ["github_actions", "github_actions_for_github_enterprise_servers"] ) @pytest.mark.parametrize( - "setup_cicd_and_project,include_feature_store,include_mlflow_recipes,include_models_in_unity_catalog", + "setup_cicd_and_project,include_feature_store,include_models_in_unity_catalog", [ - ("CICD_and_Project", "no", "no", "no"), - ("CICD_and_Project", "no", "no", "yes"), - ("CICD_and_Project", "yes", "no", "no"), - ("CICD_and_Project", "yes", "no", "yes"), + ("CICD_and_Project", "no", "no"), + ("CICD_and_Project", "no", "yes"), + ("CICD_and_Project", "yes", "no"), + ("CICD_and_Project", "yes", "yes"), ], ) @parametrize_by_cloud diff --git a/tests/test_gitlab.py b/tests/test_gitlab.py index 724283a8..731d599a 100644 --- a/tests/test_gitlab.py +++ b/tests/test_gitlab.py @@ -10,14 +10,14 @@ @pytest.mark.parametrize("cicd_platform", ["gitlab"]) @pytest.mark.parametrize( - "setup_cicd_and_project,include_feature_store,include_mlflow_recipes,include_models_in_unity_catalog", + "setup_cicd_and_project,include_feature_store,include_models_in_unity_catalog", [ - ("CICD_and_Project", "no", "no", "no"), - ("CICD_and_Project", "no", "no", "yes"), - ("CICD_and_Project", "no", "yes", "no"), - ("CICD_and_Project", "yes", "no", "no"), - ("CICD_and_Project", "yes", "no", "yes"), - ("CICD_Only", "no", "no", "no"), + ("CICD_and_Project", "no", "no"), + ("CICD_and_Project", "no", "yes"), + ("CICD_and_Project", "no", "no"), + ("CICD_and_Project", "yes", "no"), + ("CICD_and_Project", "yes", "yes"), + ("CICD_Only", "no", "no"), ], ) @parametrize_by_cloud diff --git 
a/tests/test_mlp.py b/tests/test_mlp.py deleted file mode 100644 index ae219e0a..00000000 --- a/tests/test_mlp.py +++ /dev/null @@ -1,28 +0,0 @@ -from utils import ( - databricks_cli, - generated_project_dir, - parametrize_by_project_generation_params, -) -import pytest -import os -from mlflow.recipes import Recipe - - -@pytest.mark.parametrize( - "profile", - [ - "databricks-prod", - "databricks-staging", - "databricks-test", - "databricks-dev", - "local", - ], -) -@parametrize_by_project_generation_params -def test_mlp_yaml_valid(generated_project_dir, profile, include_mlflow_recipes): - # There's no MLP YAML configs generated so skip test in that case. - if include_mlflow_recipes == "no": - return - project_dir = generated_project_dir / "my-mlops-project" - os.chdir(project_dir / "my_mlops_project" / "training" / "notebooks") - Recipe(profile) diff --git a/tests/utils.py b/tests/utils.py index 42eaa8ac..dab873ee 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -19,7 +19,6 @@ "input_release_branch": "release", "input_read_user_group": "users", "input_include_feature_store": "no", - "input_include_mlflow_recipes": "no", "input_include_models_in_unity_catalog": "no", "input_schema_name": "schema_name", "input_unity_catalog_read_user_group": "account users", @@ -61,19 +60,17 @@ def parametrize_by_project_generation_params(fn): ], ) @pytest.mark.parametrize( - "setup_cicd_and_project,include_feature_store,include_mlflow_recipes,include_models_in_unity_catalog", + "setup_cicd_and_project,include_feature_store,include_models_in_unity_catalog", [ - ("CICD_and_Project", "no", "no", "no"), - ("CICD_and_Project", "no", "no", "yes"), - ("CICD_and_Project", "no", "yes", "no"), - ("CICD_and_Project", "yes", "no", "no"), - ("CICD_and_Project", "yes", "no", "yes"), - ("Project_Only", "no", "no", "no"), - ("Project_Only", "no", "no", "yes"), - ("Project_Only", "no", "yes", "no"), - ("Project_Only", "yes", "no", "no"), - ("Project_Only", "yes", "no", "yes"), - ("CICD_Only", 
"no", "no", "no"), + ("CICD_and_Project", "no", "no"), + ("CICD_and_Project", "no", "yes"), + ("CICD_and_Project", "yes", "no"), + ("CICD_and_Project", "yes", "yes"), + ("Project_Only", "no", "no"), + ("Project_Only", "no", "yes"), + ("Project_Only", "yes", "no"), + ("Project_Only", "yes", "yes"), + ("CICD_Only", "no", "no"), ], ) @wraps(fn) @@ -91,7 +88,6 @@ def generated_project_dir( cicd_platform, setup_cicd_and_project, include_feature_store, - include_mlflow_recipes, include_models_in_unity_catalog, ): params = { @@ -114,7 +110,6 @@ def generated_project_dir( { "input_project_name": "my-mlops-project", "input_include_feature_store": include_feature_store, - "input_include_mlflow_recipes": include_mlflow_recipes, "input_read_user_group": "users", "input_include_models_in_unity_catalog": include_models_in_unity_catalog, "input_schema_name": "schema_name",