diff --git a/cookiecutter.json b/cookiecutter.json deleted file mode 100644 index 8df2587f..00000000 --- a/cookiecutter.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "__please_use_databricks_cli_for_project_creation": "Please refer to README.md" -} diff --git a/template/update_layout.tmpl b/template/update_layout.tmpl index a666e23c..006aec52 100644 --- a/template/update_layout.tmpl +++ b/template/update_layout.tmpl @@ -35,22 +35,22 @@ # Remove Delta and Feature Store code in cases of MLflow Recipes. {{ if (eq .input_include_mlflow_recipes `yes`) }} # delta_paths - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/notebooks/Train.py`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/Train.py`) }} # feature_store_paths {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `feature_engineering`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/feature_engineering`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/notebooks/TrainWithFeatureStore.py`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/TrainWithFeatureStore.py`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/feature-engineering-workflow-resource.yml`) }} # Remove Delta and MLflow Recipes code in cases of Feature Store. {{ else if (eq .input_include_feature_store `yes`) }} # delta_paths - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/notebooks/Train.py`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/Train.py`) }} # recipe_paths {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/profiles`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/steps`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/data`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/__init__.py`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/notebooks/TrainWithMLflowRecipes.py`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/TrainWithMLflowRecipes.py`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/recipe.yaml`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/README.md`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/training/ingest_test.py`) }} @@ -65,7 +65,7 @@ {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/steps`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/data`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/__init__.py`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/notebooks/TrainWithMLflowRecipes.py`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/TrainWithMLflowRecipes.py`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/recipe.yaml`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/README.md`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/training/ingest_test.py`) }} @@ -76,7 +76,7 @@ # feature_store_paths {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `feature_engineering`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/feature_engineering`) }} - {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/notebooks/TrainWithFeatureStore.py`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training/TrainWithFeatureStore.py`) }} {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/feature-engineering-workflow-resource.yml`) }} {{ end }} diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl index cdb7f98e..b84e027a 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl @@ -146,7 +146,7 @@ In each module, there is `compute_features_fn` method that you need to implement The output dataframe will be persisted in a [time-series Feature Store table]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "machine-learning/feature-store/time-series.html")) }}). See the example modules' documentation for more information. * Python unit tests for feature computation modules in `tests/feature_engineering` folder. -* Feature engineering notebook, `feature_engineering/notebooks/GenerateAndWriteFeatures.py`, that reads input dataframes, dynamically loads feature computation modules, executes their `compute_features_fn` method and writes the outputs to a Feature Store table (creating it if missing). +* Feature engineering notebook, `feature_engineering/GenerateAndWriteFeatures.py`, that reads input dataframes, dynamically loads feature computation modules, executes their `compute_features_fn` method and writes the outputs to a Feature Store table (creating it if missing). * Training notebook that [trains]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "machine-learning/feature-store/train-models-with-feature-store.html")) }} ) a regression model by creating a training dataset using the Feature Store client. * Model deployment and batch inference notebooks that deploy and use the trained model. * An automated integration test is provided (in `.github/workflows/{{ .input_project_name }}-run-tests.yml`) that executes a multi task run on Databricks involving the feature engineering and model training notebooks. @@ -200,7 +200,7 @@ Otherwise, e.g. if iterating on ML code for a new project, follow the steps belo You can iterate on the feature transform modules locally in your favorite IDE before running them on Databricks. #### Running code on Databricks -You can iterate on ML code by running the provided `feature_engineering/notebooks/GenerateAndWriteFeatures.py` notebook on Databricks using +You can iterate on ML code by running the provided `feature_engineering/GenerateAndWriteFeatures.py` notebook on Databricks using [Repos]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "repos/index.html")) }}). This notebook drives execution of the feature transforms code defined under ``features``. You can use multiple browser tabs to edit logic in `features` and run the feature engineering pipeline in the `GenerateAndWriteFeatures.py` notebook. diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks/BatchInference.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/BatchInference.py.tmpl similarity index 85% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks/BatchInference.py.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/BatchInference.py.tmpl index 99bff2b9..8f893a6b 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks/BatchInference.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/BatchInference.py.tmpl @@ -39,14 +39,7 @@ dbutils.widgets.text( # COMMAND ---------- -import os - -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path - -# COMMAND ---------- - -# MAGIC %pip install -r ../../../requirements.txt +# MAGIC %pip install -r ../../requirements.txt # COMMAND ---------- @@ -54,18 +47,7 @@ dbutils.library.restartPython() # COMMAND ---------- -import sys -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path -%cd .. -sys.path.append("../..") - -# COMMAND ---------- - # DBTITLE 1,Define input and output variables -{{- if (eq .input_include_models_in_unity_catalog "no") }} -from utils import get_deployed_model_stage_for_env{{end}} env = dbutils.widgets.get("env") input_table_name = dbutils.widgets.get("input_table_name") @@ -75,7 +57,7 @@ assert input_table_name != "", "input_table_name notebook parameter must be spec assert output_table_name != "", "output_table_name notebook parameter must be specified" assert model_name != "", "model_name notebook parameter must be specified" {{- if (eq .input_include_models_in_unity_catalog "no") }} -stage = get_deployed_model_stage_for_env(env) +stage = {"dev": "Staging", "staging": "Staging", "prod": "Production", "test": "Production"}[env] model_uri = f"models:/{model_name}/{stage}"{{else}} alias = "champion" model_uri = f"models:/{model_name}@{alias}"{{end}} diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/predict.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/predict.py.tmpl index 1e3c6283..f77c6cb9 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/predict.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/predict.py.tmpl @@ -13,39 +13,31 @@ def predict_batch( mlflow.set_registry_uri("databricks-uc") {{ end }} table = spark_session.table(input_table_name) - {{ if (eq .input_include_feature_store `yes`) }} - {{ if (eq .input_include_models_in_unity_catalog `no`) }} - from databricks.feature_store import FeatureStoreClient - - fs_client = FeatureStoreClient() - - prediction_df = fs_client.score_batch(model_uri, table) - {{ else }} - from databricks.feature_engineering import FeatureEngineeringClient - + {{ if (eq .input_include_feature_store `yes`) }} + from databricks.feature_engineering import FeatureEngineeringClient + fe_client = FeatureEngineeringClient() - - prediction_df = fe_client.score_batch(model_uri = model_uri, df = table) - {{ end }} + + prediction_df = fe_client.score_batch(model_uri=model_uri, df=table) output_df = ( prediction_df.withColumn("prediction", prediction_df["prediction"]) .withColumn("model_id", lit(model_version)) .withColumn("timestamp", to_timestamp(lit(ts))) ) - {{ else }} + {{ else }} predict = mlflow.pyfunc.spark_udf( spark_session, model_uri, result_type="double", env_manager="virtualenv" - ) - + ) + output_df = ( table.withColumn("prediction", predict(struct(*table.columns))) .withColumn("model_id", lit(model_version)) .withColumn("timestamp", to_timestamp(lit(ts))) ) {{ end -}} - + output_df.display() # Model predictions are written to the Delta table provided as input. # Delta is the default format in Databricks Runtime 8.0 and above. - output_df.write.format("delta").mode("overwrite").saveAsTable(output_table_name) \ No newline at end of file + output_df.write.format("delta").mode("overwrite").saveAsTable(output_table_name) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/notebooks/ModelDeployment.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/ModelDeployment.py.tmpl similarity index 90% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/notebooks/ModelDeployment.py.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/ModelDeployment.py.tmpl index 64f526ce..951ba945 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/notebooks/ModelDeployment.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/ModelDeployment.py.tmpl @@ -29,15 +29,6 @@ dbutils.widgets.dropdown("env", "None", ["None", "staging", "prod"], "Environmen # COMMAND ---------- -import os -import sys -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path -%cd .. -sys.path.append("../..") - -# COMMAND ---------- - from deploy import deploy model_uri = dbutils.jobs.taskValues.get("Train", "model_uri", debugValue="") diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/deploy.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/deploy.py.tmpl index f60c9617..c2677996 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/deploy.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/deployment/model_deployment/deploy.py.tmpl @@ -1,8 +1,4 @@ import sys -import pathlib - -sys.path.append(str(pathlib.Path(__file__).parent.parent.parent.resolve())) -{{if (eq .input_include_models_in_unity_catalog "no")}}from utils import get_deployed_model_stage_for_env{{end}} from mlflow.tracking import MlflowClient {{ if (eq .input_include_models_in_unity_catalog "no") }} @@ -19,7 +15,7 @@ def deploy(model_uri, env): _, model_name, version = model_uri.split("/") client = MlflowClient() mv = client.get_model_version(model_name, version) - target_stage = get_deployed_model_stage_for_env(env) + target_stage = {"dev": "Staging", "staging": "Staging", "prod": "Production", "test": "Production"}[env] if mv.current_stage != target_stage: client.transition_model_version_stage( name=model_name, diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/notebooks/GenerateAndWriteFeatures.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/GenerateAndWriteFeatures.py.tmpl similarity index 84% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/notebooks/GenerateAndWriteFeatures.py.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/GenerateAndWriteFeatures.py.tmpl index c8ac612c..1a718769 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/notebooks/GenerateAndWriteFeatures.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/feature_engineering/GenerateAndWriteFeatures.py.tmpl @@ -63,13 +63,6 @@ dbutils.widgets.text( # COMMAND ---------- -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path -%cd ../features - -# COMMAND ---------- - # DBTITLE 1,Define input and output variables input_table_path = dbutils.widgets.get("input_table_path") output_table_name = dbutils.widgets.get("output_table_name") @@ -101,7 +94,7 @@ raw_data = spark.read.format("delta").load(input_table_path) # Compute the features. This is done by dynamically loading the features module. from importlib import import_module -mod = import_module(features_module) +mod = import_module(f"features.{features_module}") compute_features_fn = getattr(mod, "compute_features_fn") features_df = compute_features_fn( @@ -114,26 +107,6 @@ features_df = compute_features_fn( # COMMAND ---------- # DBTITLE 1, Write computed features. -{{- if (eq .input_include_models_in_unity_catalog `no`) }} -from databricks import feature_store - -fs = feature_store.FeatureStoreClient() - -# Create the feature table if it does not exist first. -# Note that this is a no-op if a table with the same name and schema already exists. -fs.create_table( - name=output_table_name, - primary_keys=[x.strip() for x in pk_columns.split(",")], - timestamp_keys=[ts_column], - df=features_df, -) - -# Write the computed features dataframe. -fs.write_table( - name=output_table_name, - df=features_df, - mode="merge", -){{ else }} from databricks.feature_engineering import FeatureEngineeringClient fe = FeatureEngineeringClient() @@ -141,8 +114,8 @@ fe = FeatureEngineeringClient() # Create the feature table if it does not exist first. # Note that this is a no-op if a table with the same name and schema already exists. fe.create_table( - name=output_table_name, - primary_keys=[x.strip() for x in pk_columns.split(",")] + [ts_column], # Include timeseries column in primary_keys + name=output_table_name, + primary_keys=[x.strip() for x in pk_columns.split(",")] + [ts_column], timestamp_keys=[ts_column], df=features_df, ) @@ -152,7 +125,7 @@ fe.write_table( name=output_table_name, df=features_df, mode="merge", -){{ end }} +) # COMMAND ---------- diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/notebooks/MonitoredMetricViolationCheck.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/MonitoredMetricViolationCheck.py.tmpl similarity index 91% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/notebooks/MonitoredMetricViolationCheck.py.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/MonitoredMetricViolationCheck.py.tmpl index bfec9085..7db01bf1 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/notebooks/MonitoredMetricViolationCheck.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/MonitoredMetricViolationCheck.py.tmpl @@ -40,15 +40,6 @@ dbutils.widgets.text( # COMMAND ---------- -import os -import sys -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path -%cd .. -sys.path.append("../..") - -# COMMAND ---------- - from metric_violation_check_query import sql_query table_name_under_monitor = dbutils.widgets.get("table_name_under_monitor") diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/metric_violation_check_query.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/metric_violation_check_query.py.tmpl index 1dc40225..7be5eb29 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/metric_violation_check_query.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/monitoring/metric_violation_check_query.py.tmpl @@ -1,10 +1,5 @@ # This file is used for the main SQL query that checks the last {num_evaluation_windows} metric violations and whether at least {num_violation_windows} of those runs violate the condition. -import sys -import pathlib - -sys.path.append(str(pathlib.Path(__file__).parent.parent.parent.resolve())) - """The SQL query is divided into three main parts. The first part selects the top {num_evaluation_windows} values of the metric to be monitored, ordered by the time window, and saves as recent_metrics. ```sql diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README.md.tmpl index a76b3ec0..231b1b44 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README.md.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/README.md.tmpl @@ -163,7 +163,7 @@ Its central purpose is to evaluate a registered model and validate its quality b Model validation contains three components: * [model-workflow-resource.yml](./model-workflow-resource.yml) contains the resource config and input parameters for model validation. * [validation.py](../validation/validation.py) defines custom metrics and validation thresholds that are referenced by the above resource config files. -* [notebooks/ModelValidation](../validation/notebooks/ModelValidation.py) contains the validation job implementation. In most cases you don't need to modify this file. +* [ModelValidation](../validation/ModelValidation.py) contains the validation job implementation. In most cases you don't need to modify this file. To set up and enable model validation, update [validation.py](../validation/validation.py) to return desired custom metrics and validation thresholds, then resolve the `TODOs` in the ModelValidation task of [model-workflow-resource.yml](./model-workflow-resource.yml). @@ -177,9 +177,9 @@ Its central purpose is to track production model performances, feature distribut Monitoring contains four components: * [metric_violation_check_query.py](../monitoring/metric_violation_check_query.py) defines a query that checks for violation of the monitored metric. -* [notebooks/MonitoredMetricViolationCheck](../monitoring/notebooks/MonitoredMetricViolationCheck.py) acts as an entry point, executing the violation check query against the monitored inference table. +* [MonitoredMetricViolationCheck](../monitoring/MonitoredMetricViolationCheck.py) acts as an entry point, executing the violation check query against the monitored inference table. It emits a boolean value based on the query result. -* [monitoring-resource.yml](./monitoring-resource.yml) contains the resource config, inputs parameters for monitoring, and orchestrates model retraining based on monitoring. It first runs the [notebooks/MonitoredMetricViolationCheck](../monitoring/notebooks/MonitoredMetricViolationCheck.py) +* [monitoring-resource.yml](./monitoring-resource.yml) contains the resource config, inputs parameters for monitoring, and orchestrates model retraining based on monitoring. It first runs the [MonitoredMetricViolationCheck](../monitoring/MonitoredMetricViolationCheck.py) entry point then decides whether to execute the model retraining workflow. To set up and enable monitoring: @@ -226,7 +226,7 @@ resources: - task_key: batch_inference_job <<: *new_cluster notebook_task: - notebook_path: ../deployment/batch_inference/notebooks/BatchInference.py + notebook_path: ../deployment/batch_inference/BatchInference.py base_parameters: env: ${bundle.target} input_table_name: batch_inference_input_table_name @@ -234,12 +234,12 @@ resources: ``` The example above defines a Databricks job with name `${bundle.target}-{{ .input_project_name }}-batch-inference-job` -that runs the notebook under `{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/notebooks/BatchInference.py` to regularly apply your ML model for batch inference. +that runs the notebook under `{{template `project_name_alphanumeric_underscore` .}}/deployment/batch_inference/BatchInference.py` to regularly apply your ML model for batch inference. At the start of the resource definition, we declared an anchor `new_cluster` that will be referenced and used later. For more information about anchors in yaml schema, please refer to the [yaml documentation](https://yaml.org/spec/1.2.2/#3222-anchors-and-aliases). We specify a `batch_inference_job` under `resources/jobs` to define a databricks workflow with internal key `batch_inference_job` and job name `{bundle.target}-{{ .input_project_name }}-batch-inference-job`. -The workflow contains a single task with task key `batch_inference_job`. The task runs notebook `../deployment/batch_inference/notebooks/BatchInference.py` with provided parameters `env` and `input_table_name` passing to the notebook. +The workflow contains a single task with task key `batch_inference_job`. The task runs notebook `../deployment/batch_inference/BatchInference.py` with provided parameters `env` and `input_table_name` passing to the notebook. After setting up databricks CLI, you can run command `databricks bundle schema` to learn more about databricks CLI bundles schema. The notebook_path is the relative path starting from the resource yaml file. @@ -281,7 +281,7 @@ resources: - task_key: batch_inference_job <<: *new_cluster notebook_task: - notebook_path: ../deployment/batch_inference/notebooks/BatchInference.py + notebook_path: ../deployment/batch_inference/BatchInference.py base_parameters: env: ${bundle.target} input_table_name: ${var.batch_inference_input_table} diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml.tmpl index e11b6306..74b9fdaa 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/batch-inference-workflow-resource.yml.tmpl @@ -20,7 +20,7 @@ resources: - task_key: batch_inference_job <<: *new_cluster notebook_task: - notebook_path: ../deployment/batch_inference/notebooks/BatchInference.py + notebook_path: ../deployment/batch_inference/BatchInference.py base_parameters: env: ${bundle.target} {{ if (eq .input_include_feature_store `yes`) }}{{ if (eq .input_include_models_in_unity_catalog `yes`) }}input_table_name: ${bundle.target}.{{ .input_schema_name }}.feature_store_inference_input # TODO: create input table for inference diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/feature-engineering-workflow-resource.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/feature-engineering-workflow-resource.yml.tmpl index 8818181e..fc9d8df4 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/feature-engineering-workflow-resource.yml.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/feature-engineering-workflow-resource.yml.tmpl @@ -23,7 +23,7 @@ resources: - task_key: PickupFeatures job_cluster_key: write_feature_table_job_cluster notebook_task: - notebook_path: ../feature_engineering/notebooks/GenerateAndWriteFeatures.py + notebook_path: ../feature_engineering/GenerateAndWriteFeatures.py base_parameters: # TODO modify these arguments to reflect your setup. input_table_path: /databricks-datasets/nyctaxi-with-zipcodes/subsampled @@ -40,7 +40,7 @@ resources: - task_key: DropoffFeatures job_cluster_key: write_feature_table_job_cluster notebook_task: - notebook_path: ../feature_engineering/notebooks/GenerateAndWriteFeatures.py + notebook_path: ../feature_engineering/GenerateAndWriteFeatures.py base_parameters: # TODO: modify these arguments to reflect your setup. input_table_path: /databricks-datasets/nyctaxi-with-zipcodes/subsampled diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml.tmpl index 3c063b75..84deaebd 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/model-workflow-resource.yml.tmpl @@ -23,7 +23,7 @@ resources: - task_key: Train job_cluster_key: model_training_job_cluster {{ if and (eq .input_include_feature_store `no`) (eq .input_include_mlflow_recipes `no`) }}notebook_task: - notebook_path: ../training/notebooks/Train.py + notebook_path: ../training/Train.py base_parameters: env: ${bundle.target} # TODO: Update training_data_path @@ -34,7 +34,7 @@ resources: # git source information of current ML resource deployment. It will be persisted as part of the workflow run git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} {{ else if (eq .input_include_feature_store `yes`) }}notebook_task: - notebook_path: ../training/notebooks/TrainWithFeatureStore.py + notebook_path: ../training/TrainWithFeatureStore.py base_parameters: env: ${bundle.target} # TODO: Update training_data_path @@ -49,7 +49,7 @@ resources: # git source information of current ML resource deployment. It will be persisted as part of the workflow run git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} {{- else -}}notebook_task: - notebook_path: ../training/notebooks/TrainWithMLflowRecipes.py + notebook_path: ../training/TrainWithMLflowRecipes.py base_parameters: env: ${bundle.target} # git source information of current ML resource deployment. It will be persisted as part of the workflow run @@ -59,7 +59,7 @@ resources: depends_on: - task_key: Train notebook_task: - notebook_path: ../validation/notebooks/ModelValidation.py + notebook_path: ../validation/ModelValidation.py base_parameters: {{- if (eq .input_include_mlflow_recipes `yes`) }} env: ${bundle.target}{{ end }} @@ -105,7 +105,7 @@ resources: depends_on: - task_key: ModelValidation notebook_task: - notebook_path: ../deployment/model_deployment/notebooks/ModelDeployment.py + notebook_path: ../deployment/model_deployment/ModelDeployment.py base_parameters: env: ${bundle.target} # git source information of current ML resource deployment. It will be persisted as part of the workflow run diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/monitoring-resource.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/monitoring-resource.yml.tmpl index c68d64c7..de14e471 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/monitoring-resource.yml.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/monitoring-resource.yml.tmpl @@ -41,7 +41,7 @@ resources: - task_key: monitored_metric_violation_check <<: *new_cluster notebook_task: - notebook_path: ../monitoring/notebooks/MonitoredMetricViolationCheck.py + notebook_path: ../monitoring/MonitoredMetricViolationCheck.py base_parameters: env: ${bundle.target} table_name_under_monitor: {{ .input_inference_table_name }} diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/README.md.tmpl index efc2e449..ed004fec 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/README.md.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/README.md.tmpl @@ -72,7 +72,7 @@ Otherwise, e.g. if iterating on ML code for a new project, follow the steps belo `dbx sync repo --profile {{ .input_project_name }}-dev --source . --dest-repo your-repo-name`, where `your-repo-name` should be the last segment of the full repo name (`/Repos/username/your-repo-name`) #### Running code on Databricks -You can iterate on ML code by running the provided `{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py` notebook on Databricks using +You can iterate on ML code by running the provided `{{template `project_name_alphanumeric_underscore` .}}/training/TrainWithMLflowRecipes.py` notebook on Databricks using [Repos]({{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "repos/index.html")) }}). This notebook drives execution of the ML code defined under ``{{template `project_name_alphanumeric_underscore` .}}/training/steps``. You can use multiple browser tabs to edit logic in `steps` and run the training recipe in the `TrainWithMLflowRecipes.py` notebook. diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/Train.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/Train.py.tmpl similarity index 95% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/Train.py.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/Train.py.tmpl index eb909157..3286e277 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/Train.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/Train.py.tmpl @@ -26,13 +26,7 @@ # COMMAND ---------- -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path - -# COMMAND ---------- - -# MAGIC %pip install -r ../../requirements.txt +# MAGIC %pip install -r ../requirements.txt # COMMAND ---------- diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithFeatureStore.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/TrainWithFeatureStore.py.tmpl similarity index 77% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithFeatureStore.py.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/TrainWithFeatureStore.py.tmpl index aa2e35f3..923cad9f 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithFeatureStore.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/TrainWithFeatureStore.py.tmpl @@ -26,13 +26,7 @@ # COMMAND ---------- -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path - -# COMMAND ---------- - -# MAGIC %pip install -r ../../requirements.txt +# MAGIC %pip install -r ../requirements.txt # COMMAND ---------- @@ -177,33 +171,6 @@ taxi_data.display() # COMMAND ---------- # DBTITLE 1, Create FeatureLookups -{{- if (eq .input_include_models_in_unity_catalog `no`) }} -from databricks.feature_store import FeatureLookup -import mlflow - -pickup_features_table = dbutils.widgets.get("pickup_features_table") -dropoff_features_table = dbutils.widgets.get("dropoff_features_table") - -pickup_feature_lookups = [ - FeatureLookup( - table_name=pickup_features_table, - feature_names=[ - "mean_fare_window_1h_pickup_zip", - "count_trips_window_1h_pickup_zip", - ], - lookup_key=["pickup_zip"], - timestamp_lookup_key=["rounded_pickup_datetime"], - ), -] - -dropoff_feature_lookups = [ - FeatureLookup( - table_name=dropoff_features_table, - feature_names=["count_trips_window_30m_dropoff_zip", "dropoff_is_weekend"], - lookup_key=["dropoff_zip"], - timestamp_lookup_key=["rounded_dropoff_datetime"], - ), -]{{ else }} from databricks.feature_engineering import FeatureLookup import mlflow @@ -229,34 +196,11 @@ dropoff_feature_lookups = [ lookup_key=["dropoff_zip"], timestamp_lookup_key=["rounded_dropoff_datetime"], ), -]{{ end }} +] # COMMAND ---------- # DBTITLE 1, Create Training Dataset -{{ if (eq .input_include_models_in_unity_catalog `no`) }} -from databricks import feature_store - -# End any existing runs (in the case this notebook is being run for a second time) -mlflow.end_run() - -# Start an mlflow run, which is needed for the feature store to log the model -mlflow.start_run() - -# Since the rounded timestamp columns would likely cause the model to overfit the data -# unless additional feature engineering was performed, exclude them to avoid training on them. -exclude_columns = ["rounded_pickup_datetime", "rounded_dropoff_datetime"] - -fs = feature_store.FeatureStoreClient() - -# Create the training set that includes the raw input data merged with corresponding features from both feature tables -training_set = fs.create_training_set( - taxi_data, - feature_lookups=pickup_feature_lookups + dropoff_feature_lookups, - label="fare_amount", - exclude_columns=exclude_columns, -) -{{ else }} from databricks.feature_engineering import FeatureEngineeringClient # End any existing runs (in the case this notebook is being run for a second time) @@ -273,13 +217,11 @@ fe = FeatureEngineeringClient() # Create the training set that includes the raw input data merged with corresponding features from both feature tables training_set = fe.create_training_set( - df=taxi_data, # specify the df - feature_lookups=pickup_feature_lookups + dropoff_feature_lookups, - # both features need to be available; defined in GenerateAndWriteFeatures &/or feature-engineering-workflow-resource.yml + df=taxi_data, + feature_lookups=pickup_feature_lookups + dropoff_feature_lookups, label="fare_amount", exclude_columns=exclude_columns, ) -{{ end }} # Load the TrainingSet into a dataframe which can be passed into sklearn for training a model training_df = training_set.load_df() @@ -292,7 +234,7 @@ training_df.display() # COMMAND ---------- # MAGIC %md -# MAGIC Train a LightGBM model on the data returned by `TrainingSet.to_df`, then log the model with `FeatureStoreClient.log_model`. The model will be packaged with feature metadata. +# MAGIC Train a LightGBM model on the data returned by `TrainingSet.to_df`, then log the model with `FeatureEngineeringClient.log_model`. The model will be packaged with feature metadata. # COMMAND ---------- @@ -327,24 +269,15 @@ model = lgb.train(param, train_lgb_dataset, num_rounds) # COMMAND ---------- # DBTITLE 1, Log model and return output. -{{- if (eq .input_include_models_in_unity_catalog `no`) }} -# Log the trained model with MLflow and package it with feature lookup information. -fs.log_model( - model=model, #specify model - artifact_path="model_packaged", - flavor=mlflow.lightgbm, - training_set=training_set, - registered_model_name=model_name, -){{ else }} + # Log the trained model with MLflow and package it with feature lookup information. fe.log_model( - model=model, #specify model + model=model, artifact_path="model_packaged", flavor=mlflow.lightgbm, training_set=training_set, registered_model_name=model_name, ) -{{ end }} # The returned model URI is needed by the model deployment notebook. model_version = get_latest_model_version(model_name) diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/TrainWithMLflowRecipes.py.tmpl similarity index 91% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/TrainWithMLflowRecipes.py.tmpl index c06cc6c4..b2a9069f 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/TrainWithMLflowRecipes.py.tmpl @@ -42,13 +42,7 @@ # COMMAND ---------- -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path - -# COMMAND ---------- - -# MAGIC %pip install -r ../../requirements.txt +# MAGIC %pip install -r ../requirements.txt # COMMAND ---------- @@ -56,13 +50,6 @@ dbutils.library.restartPython() # COMMAND ---------- -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path -%cd ../ - -# COMMAND ---------- - from mlflow.recipes import Recipe try: diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/notebooks/ModelValidation.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/ModelValidation.py.tmpl similarity index 96% rename from template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/notebooks/ModelValidation.py.tmpl rename to template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/ModelValidation.py.tmpl index c2852616..876461ca 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/notebooks/ModelValidation.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/validation/ModelValidation.py.tmpl @@ -43,13 +43,7 @@ # COMMAND ---------- -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path - -# COMMAND ---------- - -# MAGIC %pip install -r ../../requirements.txt +# MAGIC %pip install -r ../requirements.txt # COMMAND ---------- @@ -57,13 +51,6 @@ dbutils.library.restartPython() # COMMAND ---------- -import os -notebook_path = '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) -%cd $notebook_path -%cd ../ - -# COMMAND ---------- - {{ if (eq .input_include_mlflow_recipes `yes`) }}dbutils.widgets.dropdown( "env", "prod", ["staging", "prod"], "Environment(for input data)" ){{ end -}} @@ -316,12 +303,12 @@ data = rounded_taxi_data(data) # MLflow evaluate can take a lambda function instead of a model uri for a model # but id does not work for the baseline model as it requires a model_uri (baseline comparison is set to false) -from databricks.feature_store import FeatureStoreClient +from databricks.feature_engineering import FeatureEngineeringClient def get_fs_model(df): - fs_client = FeatureStoreClient() + fe_client = FeatureEngineeringClient() return ( - fs_client.score_batch(model_uri, spark.createDataFrame(df)) + fe_client.score_batch(model_uri=model_uri, df=spark.createDataFrame(df)) .select("prediction") .toPandas() )