-
Notifications
You must be signed in to change notification settings - Fork 39
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
# Summary This adds a data engineering template. * ETL pipelines have their own folder structure as seen under https://github.com/databricks/bundle-examples/tree/data-engineering/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/%7B%7B.pipeline_name%7D%7D * uv is used to manage packages and scripts * `uv run add-asset` can be used to add a pipeline to a project * `uv run test` can be used to run all tests on serverless compute * `uv run pytest` can be used to run all tests on any type of compute as configured in the current IDE or .databrickscfg settings # How to try this You can give the template a try using ``` databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering ``` Note that each pipeline has a separate template. New pipelines can be added by using the short-hand `uv run add-asset` or by manually instantiating the pipeline template with ``` databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/etl-pipeline ``` --------- Co-authored-by: Pieter Noordhuis <[email protected]>
- Loading branch information
1 parent
c9d4b1b
commit f86b32c
Showing
26 changed files
with
584 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# data-engineering template | ||
|
||
This template introduces a new structure for organizing data-engineering | ||
assets in DABs. | ||
|
||
Install it using | ||
|
||
``` | ||
databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering | ||
``` | ||
|
||
Note that by default this template doesn't come with any assets such as jobs or pipelines. | ||
Follow the instructions in the template setup and README to add them. |
46 changes: 46 additions & 0 deletions
46
contrib/templates/data-engineering/assets/etl-pipeline/databricks_template_schema.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
{ | ||
"welcome_message": "\nWelcome to the data-engineering pipeline template!", | ||
"properties": { | ||
"pipeline_name": { | ||
"type": "string", | ||
"description": "\nPlease provide the name of the pipeline to generate.\npipeline_name", | ||
"default": "etl_pipeline", | ||
"order": 1 | ||
}, | ||
"format": { | ||
"type": "string", | ||
"description": "\nPlease select the format to use to define this pipeline.\nformat", | ||
"order": 2, | ||
"enum": [ | ||
"python files", | ||
"sql files", | ||
"notebooks" | ||
], | ||
"default": "python files" | ||
}, | ||
"only_python_files_supported": { | ||
"skip_prompt_if": { | ||
"properties": { | ||
"format": { | ||
"pattern": "python files" | ||
} | ||
} | ||
}, | ||
"default": "ignored", | ||
"type": "string", | ||
"description": "{{fail \"Only Python files are supported in this template at this time.\"}}", | ||
"order": 3 | ||
}, | ||
"include_job": { | ||
"type": "string", | ||
"description": "\nWould you like to include a job that automatically triggers this pipeline?\nThis trigger will only be enabled for production deployments.\ninclude_job", | ||
"order": 4, | ||
"enum": [ | ||
"yes", | ||
"no" | ||
], | ||
"default": "yes" | ||
} | ||
}, | ||
"success_message": "\n\n🪠 New pipeline definition generated under 'assets/{{.pipeline_name}}'!" | ||
} |
5 changes: 5 additions & 0 deletions
5
...lates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# This is the entry point for the {{.pipeline_name}} pipeline. | ||
# It makes sure all transformations in the transformations directory are included. | ||
import transformations | ||
|
||
__all__ = ["transformations"] |
4 changes: 4 additions & 0 deletions
4
...g/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# explorations | ||
|
||
This folder is reserved for personal, exploratory notebooks. | ||
By default these are not committed to Git, as 'explorations' is listed in .gitignore. |
52 changes: 52 additions & 0 deletions
52
...ssets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"application/vnd.databricks.v1+cell": { | ||
"cellMetadata": { | ||
"byteLimit": 2048000, | ||
"rowLimit": 10000 | ||
}, | ||
"inputWidgets": {}, | ||
"nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", | ||
"showTitle": false, | ||
"title": "" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import sys\n", | ||
"sys.path.append('..')\n", | ||
"from {{.pipeline_name}}.transformations import taxi_stats\n", | ||
"\n", | ||
"\n", | ||
"spark = SparkSession.builder.getOrCreate()\n", | ||
"spark.sql('SELECT * FROM taxi_stats').show()" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"application/vnd.databricks.v1+notebook": { | ||
"dashboards": [], | ||
"language": "python", | ||
"notebookMetadata": { | ||
"pythonIndentUnit": 2 | ||
}, | ||
"notebookName": "ipynb-notebook", | ||
"widgets": {} | ||
}, | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"name": "python", | ||
"version": "3.11.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 0 | ||
} |
8 changes: 8 additions & 0 deletions
8
...a-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/dev/taxis.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
import dlt | ||
from pyspark.sql import DataFrame | ||
from databricks.sdk.runtime import spark | ||
|
||
|
||
@dlt.view(comment="Small set of taxis for development (uses LIMIT 10)") | ||
def taxis() -> DataFrame: | ||
return spark.sql("SELECT * FROM samples.nyctaxi.trips LIMIT 10") |
8 changes: 8 additions & 0 deletions
8
...-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/prod/taxis.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
import dlt | ||
from pyspark.sql import DataFrame | ||
from databricks.sdk.runtime import spark | ||
|
||
|
||
@dlt.view | ||
def taxis() -> DataFrame: | ||
return spark.sql("SELECT * FROM samples.nyctaxi.trips") |
7 changes: 7 additions & 0 deletions
7
...gineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/tests/taxi_stats_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from ..sources.dev.taxis import taxis | ||
from ..transformations import taxi_stats | ||
|
||
|
||
def test_taxi_stats(): | ||
result = taxi_stats.filter_taxis(taxis()) | ||
assert len(result.collect()) > 5 |
9 changes: 9 additions & 0 deletions
9
...eering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# __init__.py defines the 'transformations' Python package | ||
import importlib | ||
import pkgutil | ||
|
||
|
||
# Import all modules in the package except those starting with '_', like '__init__.py' | ||
for _, module_name, _ in pkgutil.iter_modules(__path__): | ||
if not module_name.startswith("_"): | ||
importlib.import_module(f"{__name__}.{module_name}") |
20 changes: 20 additions & 0 deletions
20
...ring/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/taxi_stats.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import dlt | ||
from pyspark.sql.functions import to_date, count | ||
from pyspark.sql import DataFrame | ||
|
||
|
||
@dlt.table(comment="Daily statistics of NYC Taxi trips") | ||
def taxi_stats() -> DataFrame: | ||
"""Read from the 'taxis' view from etl_pipeline/sources.""" | ||
taxis = dlt.read("taxis") | ||
|
||
return filter_taxis(taxis) | ||
|
||
|
||
def filter_taxis(taxis: DataFrame) -> DataFrame: | ||
"""Group by date and calculate the number of trips.""" | ||
return ( | ||
taxis.withColumn("pickup_date", to_date("tpep_pickup_datetime")) | ||
.groupBy("pickup_date") | ||
.agg(count("*").alias("number_of_trips")) | ||
) |
23 changes: 23 additions & 0 deletions
23
...ng/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.job.yml.tmpl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# The job that triggers {{.pipeline_name}}. | ||
resources: | ||
jobs: | ||
{{.pipeline_name}}_job: | ||
name: {{.pipeline_name}}_job | ||
|
||
trigger: | ||
# Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger | ||
periodic: | ||
interval: 1 | ||
unit: DAYS | ||
|
||
{{- if not is_service_principal}} | ||
|
||
email_notifications: | ||
on_failure: ${var.notifications} | ||
|
||
{{- end}} | ||
|
||
tasks: | ||
- task_key: refresh_pipeline | ||
pipeline_task: | ||
pipeline_id: ${resources.pipelines.{{.pipeline_name}}.id} |
17 changes: 17 additions & 0 deletions
17
...sets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
resources: | ||
pipelines: | ||
{{.pipeline_name}}: | ||
name: {{.pipeline_name}} | ||
serverless: true | ||
{{- if or (eq default_catalog "") (eq default_catalog "hive_metastore")}} | ||
## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog: | ||
# catalog: ${var.catalog} | ||
{{- else}} | ||
catalog: ${var.catalog} | ||
{{- end}} | ||
target: ${var.schema} | ||
libraries: | ||
- file: | ||
path: sources/${bundle.target}/*.py | ||
- file: | ||
path: __init__.py |
10 changes: 10 additions & 0 deletions
10
contrib/templates/data-engineering/assets/ingest-pipeline/databricks_template_schema.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
{ | ||
"welcome_message": "\nWelcome to the data-engineering ingest-pipeline template!", | ||
"properties": { | ||
"pipeline_name": { | ||
"type": "string", | ||
"description": "\n{{fail \"The ingest-pipeline template is not yet implemented.\"}}", | ||
"order": 3 | ||
} | ||
} | ||
} |
10 changes: 10 additions & 0 deletions
10
contrib/templates/data-engineering/assets/job/databricks_template_schema.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
{ | ||
"welcome_message": "\nWelcome to the data-engineering job resource template!", | ||
"properties": { | ||
"pipeline_name": { | ||
"type": "string", | ||
"description": "\n{{fail \"The job template is not yet implemented.\"}}", | ||
"order": 3 | ||
} | ||
} | ||
} |
46 changes: 46 additions & 0 deletions
46
contrib/templates/data-engineering/databricks_template_schema.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
{ | ||
"welcome_message": "\nWelcome to the data-engineering template for Databricks Asset Bundles!", | ||
"properties": { | ||
"project_name": { | ||
"type": "string", | ||
"default": "my_data_project", | ||
"description": "Please provide the following details to tailor the template to your preferences.\n\nUnique name for this project\nproject_name", | ||
"order": 1, | ||
"pattern": "^[A-Za-z0-9_]+$", | ||
"pattern_match_failure_message": "Name must consist of letters, numbers, and underscores." | ||
}, | ||
"default_catalog": { | ||
"type": "string", | ||
"default": "{{default_catalog}}", | ||
"pattern": "^\\w*$", | ||
"pattern_match_failure_message": "Invalid catalog name.", | ||
"description": "\nPlease provide an initial catalog{{if eq (default_catalog) \"\"}} (leave blank when not using Unity Catalog){{end}}.\ndefault_catalog", | ||
"order": 2 | ||
}, | ||
"personal_schemas": { | ||
"type": "string", | ||
"description": "\nWould you like to use a personal schema for each user working on this project? (e.g., 'catalog.{{short_name}}')\npersonal_schemas", | ||
"enum": [ | ||
"yes, use a schema based on the current user name during development", | ||
"no, use a shared schema during development" | ||
], | ||
"order": 3 | ||
}, | ||
"shared_schema": { | ||
"skip_prompt_if": { | ||
"properties": { | ||
"personal_schemas": { | ||
"const": "yes, use a schema based on the current user name during development" | ||
} | ||
} | ||
}, | ||
"type": "string", | ||
"default": "default", | ||
"pattern": "^\\w+$", | ||
"pattern_match_failure_message": "Invalid schema name.", | ||
"description": "\nPlease provide an initial schema during development.\ndefault_schema", | ||
"order": 4 | ||
} | ||
}, | ||
"success_message": "\n\nYour new project has been created in the '{{.project_name}}' directory!\n\nTo add an example asset to your project, use\n\n $ cd {{.project_name}}\n $ databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/etl-pipeline\n\nRefer to the README.md file for full \"getting started\" instructions!" | ||
} |
8 changes: 8 additions & 0 deletions
8
contrib/templates/data-engineering/template/{{.project_name}}/.gitignore
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
.databricks/ | ||
build/ | ||
dist/ | ||
__pycache__/ | ||
*.egg-info | ||
.venv/ | ||
**/explorations/** | ||
**/!explorations/README.md |
3 changes: 3 additions & 0 deletions
3
contrib/templates/data-engineering/template/{{.project_name}}/.vscode/__builtins__.pyi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Typings for Pylance in Visual Studio Code | ||
# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md | ||
from databricks.sdk.runtime import * |
7 changes: 7 additions & 0 deletions
7
contrib/templates/data-engineering/template/{{.project_name}}/.vscode/extensions.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
{ | ||
"recommendations": [ | ||
"databricks.databricks", | ||
"ms-python.vscode-pylance", | ||
"redhat.vscode-yaml" | ||
] | ||
} |
22 changes: 22 additions & 0 deletions
22
contrib/templates/data-engineering/template/{{.project_name}}/.vscode/settings.json.tmpl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
{ | ||
"python.analysis.stubPath": ".vscode", | ||
"databricks.python.envFile": "${workspaceFolder}/.env", | ||
"jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\<codecell\\>|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", | ||
"jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", | ||
"python.testing.pytestArgs": [ | ||
"." | ||
], | ||
"python.testing.unittestEnabled": false, | ||
"python.testing.pytestEnabled": true, | ||
{{- /* Unfortunately extraPaths doesn't support globs!! See: https://github.com/microsoft/pylance-release/issues/973 */}} | ||
"python.analysis.extraPaths": ["assets/etl_pipeline"], | ||
"files.exclude": { | ||
"**/*.egg-info": true, | ||
"**/__pycache__": true, | ||
".pytest_cache": true, | ||
}, | ||
"[python]": { | ||
"editor.defaultFormatter": "ms-python.black-formatter", | ||
"editor.formatOnSave": true, | ||
}, | ||
} |
Oops, something went wrong.