diff --git a/contrib/templates/data-engineering/README.md b/contrib/templates/data-engineering/README.md
new file mode 100644
index 0000000..c1ed1c6
--- /dev/null
+++ b/contrib/templates/data-engineering/README.md
@@ -0,0 +1,13 @@
+# data-engineering template
+
+This template introduces a new structure for organizing data-engineering
+assets in DABs.
+
+Install it using
+
+```
+databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering
+```
+
+Note that by default this template doesn't come with any assets such as jobs or pipelines.
+Follow the instructions in the template setup and README to add them.
\ No newline at end of file
diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/databricks_template_schema.json b/contrib/templates/data-engineering/assets/etl-pipeline/databricks_template_schema.json
new file mode 100644
index 0000000..a49171a
--- /dev/null
+++ b/contrib/templates/data-engineering/assets/etl-pipeline/databricks_template_schema.json
@@ -0,0 +1,46 @@
+{
+  "welcome_message": "\nWelcome to the data-engineering pipeline template!",
+  "properties": {
+    "pipeline_name": {
+      "type": "string",
+      "description": "\nPlease provide the name of the pipeline to generate.\npipeline_name",
+      "default": "etl_pipeline",
+      "order": 1
+    },
+    "format": {
+      "type": "string",
+      "description": "\nPlease select the format to use to define this pipeline.\nformat",
+      "order": 2,
+      "enum": [
+        "python files",
+        "sql files",
+        "notebooks"
+      ],
+      "default": "python files"
+    },
+    "only_python_files_supported": {
+      "skip_prompt_if": {
+        "properties": {
+          "format": {
+            "pattern": "python files"
+          }
+        }
+      },
+      "default": "ignored",
+      "type": "string",
+      "description": "{{fail \"Only Python files are supported in this template at this time.\"}}",
+      "order": 3
+    },
+    "include_job": {
+      "type": "string",
+      "description": "\nWould you like to include a job that automatically triggers this pipeline?\nThis trigger will only be enabled for production deployments.\ninclude_job",
+      "order": 4,
+      "enum": [
+        "yes",
+        "no"
+      ],
+      "default": "yes"
+    }
+  },
+  "success_message": "\n\n🪠 New pipeline definition generated under 'assets/{{.pipeline_name}}'!"
+}
\ No newline at end of file
diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/__init__.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/__init__.py
new file mode 100644
index 0000000..67f4c4c
--- /dev/null
+++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/__init__.py
@@ -0,0 +1,5 @@
+# This is the entry point for the {{.pipeline_name}} pipeline.
+# It makes sure all transformations in the transformations directory are included.
+import transformations
+
+__all__ = ["transformations"]
diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/README.md b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/README.md
new file mode 100644
index 0000000..7292d7f
--- /dev/null
+++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/README.md
@@ -0,0 +1,4 @@
+# explorations
+
+This folder is reserved for personal, exploratory notebooks.
+By default these are not committed to Git, as 'explorations' is listed in .gitignore.
diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl
new file mode 100644
index 0000000..ef1f017
--- /dev/null
+++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl
@@ -0,0 +1,53 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.append('..')\n",
+    "from {{.pipeline_name}}.transformations import taxi_stats\n",
+    "from pyspark.sql import SparkSession\n",
+    "\n",
+    "\n",
+    "spark = SparkSession.builder.getOrCreate()\n",
+    "spark.sql('SELECT * FROM taxi_stats').show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "application/vnd.databricks.v1+notebook": {
+   "dashboards": [],
+   "language": "python",
+   "notebookMetadata": {
+    "pythonIndentUnit": 2
+   },
+   "notebookName": "ipynb-notebook",
+   "widgets": {}
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/dev/taxis.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/dev/taxis.py
new file mode 100644
index 0000000..1fba2e2
--- /dev/null
+++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/dev/taxis.py
@@ -0,0 +1,8 @@
+import dlt
+from pyspark.sql import DataFrame
+from databricks.sdk.runtime import spark
+
+
+@dlt.view(comment="Small set of taxis for development (uses LIMIT 10)")
+def taxis() -> DataFrame:
+    return spark.sql("SELECT * FROM samples.nyctaxi.trips LIMIT 10")
diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/prod/taxis.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/prod/taxis.py
new file mode 100644
index 0000000..15ce56a
--- /dev/null
+++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/prod/taxis.py
@@ -0,0 +1,8 @@
+import dlt
+from pyspark.sql import DataFrame
+from databricks.sdk.runtime import spark
+
+
+@dlt.view
+def taxis() -> DataFrame:
+    return spark.sql("SELECT * FROM samples.nyctaxi.trips")
diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/tests/taxi_stats_test.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/tests/taxi_stats_test.py
new file mode 100644
index 0000000..b0c4449
--- /dev/null
+++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/tests/taxi_stats_test.py
@@ -0,0 +1,7 @@
+from ..sources.dev.taxis import taxis
+from ..transformations import taxi_stats
+
+
+def test_taxi_stats():
+    result = taxi_stats.filter_taxis(taxis())
+    assert len(result.collect()) > 5
diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/__init__.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/__init__.py
new file mode 100644
index 0000000..80577db
--- /dev/null
+++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/__init__.py
@@ -0,0 +1,9 @@
+# __init__.py defines the 'transformations' Python package
+import importlib
+import pkgutil
+
+
+# Import all modules in the package except those starting with '_', like '__init__.py'
+for _, module_name, _ in pkgutil.iter_modules(__path__):
+    if not module_name.startswith("_"):
+        importlib.import_module(f"{__name__}.{module_name}")
diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/taxi_stats.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/taxi_stats.py
new file mode 100644
index 0000000..5c5dcd9
--- /dev/null
+++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/taxi_stats.py
@@ -0,0 +1,20 @@
+import dlt
+from pyspark.sql.functions import to_date, count
+from pyspark.sql import DataFrame
+
+
+@dlt.table(comment="Daily statistics of NYC Taxi trips")
+def taxi_stats() -> DataFrame:
+    """Read from the 'taxis' view defined in the sources directory."""
+    taxis = dlt.read("taxis")
+
+    return filter_taxis(taxis)
+
+
+def filter_taxis(taxis: DataFrame) -> DataFrame:
+    """Group by date and calculate the number of trips."""
+    return (
+        taxis.withColumn("pickup_date", to_date("tpep_pickup_datetime"))
+        .groupBy("pickup_date")
+        .agg(count("*").alias("number_of_trips"))
+    )
diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.job.yml.tmpl b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.job.yml.tmpl
new file mode 100644
index 0000000..a75b746
--- /dev/null
+++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.job.yml.tmpl
@@ -0,0 +1,24 @@
+# The job that triggers {{.pipeline_name}}.
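+# Note: this trigger is paused for 'dev' deployments, since 'mode: development' in databricks.yml pauses all schedules and triggers; it only runs automatically in production deployments.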
+resources:
+  jobs:
+    {{.pipeline_name}}_job:
+      name: {{.pipeline_name}}_job
+
+      trigger:
+        # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger
+        periodic:
+          interval: 1
+          unit: DAYS
+
+      {{- if not is_service_principal}}
+
+      email_notifications:
+        on_failure: ${var.notifications}
+
+      {{- end}}
+
+      tasks:
+        - task_key: refresh_pipeline
+          pipeline_task:
+            pipeline_id: ${resources.pipelines.{{.pipeline_name}}.id}
\ No newline at end of file
diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl
new file mode 100644
index 0000000..86890fd
--- /dev/null
+++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl
@@ -0,0 +1,17 @@
+resources:
+  pipelines:
+    {{.pipeline_name}}:
+      name: {{.pipeline_name}}
+      serverless: true
+      {{- if or (eq default_catalog "") (eq default_catalog "hive_metastore")}}
+      ## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog:
+      # catalog: ${var.catalog}
+      {{- else}}
+      catalog: ${var.catalog}
+      {{- end}}
+      target: ${var.schema}
+      libraries:
+        - file:
+            path: sources/${bundle.target}/*.py
+        - file:
+            path: __init__.py
diff --git a/contrib/templates/data-engineering/assets/ingest-pipeline/databricks_template_schema.json b/contrib/templates/data-engineering/assets/ingest-pipeline/databricks_template_schema.json
new file mode 100644
index 0000000..0f7dddd
--- /dev/null
+++ b/contrib/templates/data-engineering/assets/ingest-pipeline/databricks_template_schema.json
@@ -0,0 +1,10 @@
+{
+  "welcome_message": "\nWelcome to the data-engineering ingest-pipeline template!",
+  "properties": {
+    "pipeline_name": {
+      "type": "string",
+      "description": "\n{{fail \"The ingest-pipeline template is not yet implemented.\"}}",
+      "order": 3
+    }
+  }
+}
\ No newline at end of file
diff --git a/contrib/templates/data-engineering/assets/job/databricks_template_schema.json b/contrib/templates/data-engineering/assets/job/databricks_template_schema.json
new file mode 100644
index 0000000..5e0d4b9
--- /dev/null
+++ b/contrib/templates/data-engineering/assets/job/databricks_template_schema.json
@@ -0,0 +1,10 @@
+{
+  "welcome_message": "\nWelcome to the data-engineering job resource template!",
+  "properties": {
+    "pipeline_name": {
+      "type": "string",
+      "description": "\n{{fail \"The job template is not yet implemented.\"}}",
+      "order": 3
+    }
+  }
+}
\ No newline at end of file
diff --git a/contrib/templates/data-engineering/databricks_template_schema.json b/contrib/templates/data-engineering/databricks_template_schema.json
new file mode 100644
index 0000000..575488f
--- /dev/null
+++ b/contrib/templates/data-engineering/databricks_template_schema.json
@@ -0,0 +1,46 @@
+{
+  "welcome_message": "\nWelcome to the data-engineering template for Databricks Asset Bundles!",
+  "properties": {
+    "project_name": {
+      "type": "string",
+      "default": "my_data_project",
+      "description": "Please provide the following details to tailor the template to your preferences.\n\nUnique name for this project\nproject_name",
+      "order": 1,
+      "pattern": "^[A-Za-z0-9_]+$",
+      "pattern_match_failure_message": "Name must consist of letters, numbers, and underscores."
+    },
+    "default_catalog": {
+      "type": "string",
+      "default": "{{default_catalog}}",
+      "pattern": "^\\w*$",
+      "pattern_match_failure_message": "Invalid catalog name.",
+      "description": "\nPlease provide an initial catalog{{if eq (default_catalog) \"\"}} (leave blank when not using Unity Catalog){{end}}.\ndefault_catalog",
+      "order": 2
+    },
+    "personal_schemas": {
+      "type": "string",
+      "description": "\nWould you like to use a personal schema for each user working on this project? (e.g., 'catalog.{{short_name}}')\npersonal_schemas",
+      "enum": [
+        "yes, use a schema based on the current user name during development",
+        "no, use a shared schema during development"
+      ],
+      "order": 3
+    },
+    "shared_schema": {
+      "skip_prompt_if": {
+        "properties": {
+          "personal_schemas": {
+            "const": "yes, use a schema based on the current user name during development"
+          }
+        }
+      },
+      "type": "string",
+      "default": "default",
+      "pattern": "^\\w+$",
+      "pattern_match_failure_message": "Invalid schema name.",
+      "description": "\nPlease provide an initial schema during development.\ndefault_schema",
+      "order": 4
+    }
+  },
+  "success_message": "\n\nYour new project has been created in the '{{.project_name}}' directory!\n\nTo add an example asset to your project, use\n\n  $ cd {{.project_name}}\n  $ databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/etl-pipeline\n\nRefer to the README.md file for full \"getting started\" instructions!"
+}
\ No newline at end of file
diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/.gitignore b/contrib/templates/data-engineering/template/{{.project_name}}/.gitignore
new file mode 100644
index 0000000..f6a3b5f
--- /dev/null
+++ b/contrib/templates/data-engineering/template/{{.project_name}}/.gitignore
@@ -0,0 +1,8 @@
+.databricks/
+build/
+dist/
+__pycache__/
+*.egg-info
+.venv/
+**/explorations/**
+!**/explorations/README.md
diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/.vscode/__builtins__.pyi b/contrib/templates/data-engineering/template/{{.project_name}}/.vscode/__builtins__.pyi
new file mode 100644
index 0000000..0edd518
--- /dev/null
+++ b/contrib/templates/data-engineering/template/{{.project_name}}/.vscode/__builtins__.pyi
@@ -0,0 +1,3 @@
+# Typings for Pylance in Visual Studio Code
+# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md
+from databricks.sdk.runtime import *
diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/.vscode/extensions.json b/contrib/templates/data-engineering/template/{{.project_name}}/.vscode/extensions.json
new file mode 100644
index 0000000..5d15eba
--- /dev/null
+++ b/contrib/templates/data-engineering/template/{{.project_name}}/.vscode/extensions.json
@@ -0,0 +1,7 @@
+{
+  "recommendations": [
+    "databricks.databricks",
+    "ms-python.vscode-pylance",
+    "redhat.vscode-yaml"
+  ]
+}
diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/.vscode/settings.json.tmpl b/contrib/templates/data-engineering/template/{{.project_name}}/.vscode/settings.json.tmpl
new file mode 100644
index 0000000..2f753e8
--- /dev/null
+++ b/contrib/templates/data-engineering/template/{{.project_name}}/.vscode/settings.json.tmpl
@@ -0,0 +1,22 @@
+{
+  "python.analysis.stubPath": ".vscode",
+  "databricks.python.envFile": "${workspaceFolder}/.env",
+  "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\<codecell\\>|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])",
+  "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------",
+  "python.testing.pytestArgs": [
+    "."
+  ],
+  "python.testing.unittestEnabled": false,
+  "python.testing.pytestEnabled": true,
+  {{- /* Unfortunately extraPaths doesn't support globs!! See: https://github.com/microsoft/pylance-release/issues/973 */}}
+  "python.analysis.extraPaths": ["assets/etl_pipeline"],
+  "files.exclude": {
+    "**/*.egg-info": true,
+    "**/__pycache__": true,
+    ".pytest_cache": true,
+  },
+  "[python]": {
+    "editor.defaultFormatter": "ms-python.black-formatter",
+    "editor.formatOnSave": true,
+  },
+}
diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/README.md.tmpl b/contrib/templates/data-engineering/template/{{.project_name}}/README.md.tmpl
new file mode 100644
index 0000000..cadd25c
--- /dev/null
+++ b/contrib/templates/data-engineering/template/{{.project_name}}/README.md.tmpl
@@ -0,0 +1,86 @@
+# {{.project_name}}
+
+The '{{.project_name}}' project was generated using the data-engineering template.
+
+## Setup
+
+1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html
+
+2. Authenticate to your Databricks workspace, if you have not done so already:
+   ```
+   $ databricks auth login
+   ```
+
+3. We recommend using the UV package manager to install project dependencies. It's a drop-in replacement for `pip`.
+   See https://docs.astral.sh/uv/getting-started/installation/ for full installation instructions,
+   or run:
+   ```
+   $ pip install uv
+   ```
+
+4. Install all project dependencies:
+   ```
+   $ uv sync
+   ```
+
+   See the "Running unit tests" section below for more on testing.
+
+5. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
+   https://docs.databricks.com/dev-tools/vscode-ext.html, or the PyCharm plugin from
+   https://www.databricks.com/blog/announcing-pycharm-integration-databricks.
+
+## Adding assets such as pipelines and jobs
+
+By default, the data-engineering template does not include any assets.
+
+1. To add an asset, run the `add-asset` script:
+   ```
+   $ uv run add-asset
+   ```
+
+   or, if you don't use UV, run:
+
+   ```
+   $ export TYPE=etl-pipeline
+   $ databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/$TYPE
+   ```
+
+2. Optionally, run all tests on serverless compute after adding an asset:
+   ```
+   $ uv run test
+   ```
+
+## Deploying assets
+
+1. To deploy a development copy of this project, type:
+   ```
+   $ databricks bundle deploy --target dev
+   ```
+   (Note that "dev" is the default target, so the `--target` parameter
+   is optional here.)
+
+2. Similarly, to deploy a production copy, type:
+   ```
+   $ databricks bundle deploy --target prod
+   ```
+
+3. Use the "summary" command to review everything that was deployed:
+   ```
+   $ databricks bundle summary
+   ```
+
+4. To run a job or pipeline, use the "run" command:
+   ```
+   $ databricks bundle run
+   ```
+
+## Running unit tests
+
+1. Run tests on a serverless environment using:
+   ```
+   $ uv run test
+   ```
+
+2. Optionally, to run unit tests in a different environment, such as on a cluster,
+   please refer to the Databricks Connect documentation at
+   https://docs.databricks.com/en/dev-tools/databricks-connect/python/install.html
diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/assets/README.md b/contrib/templates/data-engineering/template/{{.project_name}}/assets/README.md
new file mode 100644
index 0000000..f6c8907
--- /dev/null
+++ b/contrib/templates/data-engineering/template/{{.project_name}}/assets/README.md
@@ -0,0 +1,4 @@
+This folder is reserved for Databricks Asset Bundle definitions.
+
+New jobs and pipelines should follow the conventions from the 'data-engineering' template.
+See https://github.com/databricks/bundle-examples/blob/main/contrib/templates/data-engineering/README.md.
diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/conftest.py b/contrib/templates/data-engineering/template/{{.project_name}}/conftest.py
new file mode 100644
index 0000000..2b7f5db
--- /dev/null
+++ b/contrib/templates/data-engineering/template/{{.project_name}}/conftest.py
@@ -0,0 +1,40 @@
+# conftest.py is used to configure pytest.
+# This file is in the root since it affects all tests throughout this bundle.
+# It makes sure all 'assets/*' directories are added to `sys.path` so that
+# tests can import them.
+import os
+import sys
+import dlt
+import pathlib
+import pytest
+import warnings
+from pyspark.sql import SparkSession
+from databricks.connect import DatabricksSession
+
+# Dynamically find and add all `assets/*` directories to `sys.path`
+for path in pathlib.Path(pathlib.Path(__file__).parent / "assets").glob("*"):
+    resolved_path = str(path.resolve())
+    if resolved_path not in sys.path:
+        sys.path.append(resolved_path)
+
+# For older databricks-connect, work around issues importing SparkSession
+# and errors when SPARK_REMOTE is set.
+SparkSession.builder = DatabricksSession.builder
+os.environ.pop("SPARK_REMOTE", None)
+
+# Make dlt.views in 'sources/dev' available for tests
+warnings.filterwarnings(
+    "ignore",
+    message="This is a stub that only contains the interfaces to Delta Live Tables.*",
+    category=UserWarning,
+)
+dlt.enable_local_execution()
+dlt.view = lambda func=None, *args, **kwargs: func or (lambda f: f)
+
+
+# Provide a 'spark' fixture for tests and make sure the session is eagerly initialized
+@pytest.fixture(scope="session", autouse=True)
+def spark() -> SparkSession:
+    if hasattr(DatabricksSession.builder, "validateSession"):
+        return DatabricksSession.builder.validateSession().getOrCreate()
+    return DatabricksSession.builder.getOrCreate()
diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/databricks.yml.tmpl b/contrib/templates/data-engineering/template/{{.project_name}}/databricks.yml.tmpl
new file mode 100644
index 0000000..d988fcc
--- /dev/null
+++ b/contrib/templates/data-engineering/template/{{.project_name}}/databricks.yml.tmpl
@@ -0,0 +1,53 @@
+# This is a Databricks asset bundle definition for {{.project_name}}.
+# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
+bundle:
+  name: {{.project_name}}
+
+include:
+  - assets/*.yml
+  - assets/*/*.yml
+
+variables:
+  catalog:
+    description: The catalog to use
+  schema:
+    description: The schema to use
+  notifications:
+    description: The email addresses to use for failure notifications
+
+{{- $dev_schema := .shared_schema }}
+{{- $prod_schema := .shared_schema }}
+{{- if (regexp "^yes").MatchString .personal_schemas}}
+  {{- $dev_schema = "${workspace.current_user.short_name}"}}
+  {{- $prod_schema = "default"}}
+{{- end}}
+
+targets:
+  dev:
+    # The default target uses 'mode: development' to create a development copy.
+    # - Deployed resources get prefixed with '[dev my_user_name]'
+    # - Any job schedules and triggers are paused by default.
+    # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html.
+    mode: development
+    default: true
+    workspace:
+      host: {{workspace_host}}
+    variables:
+      catalog: {{.default_catalog}}
+      schema: {{$dev_schema}}
+      notifications: []
+  prod:
+    mode: production
+    workspace:
+      host: {{workspace_host}}
+      # We explicitly specify /Workspace/Users/{{user_name}} to make sure we only have a single copy.
+      root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target}
+    permissions:
+      - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}}
+        level: CAN_MANAGE
+    run_as:
+      {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}}
+    variables:
+      catalog: {{.default_catalog}}
+      schema: {{$prod_schema}}
+      notifications: [{{user_name}}]
\ No newline at end of file
diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml b/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml
new file mode 100644
index 0000000..966b1ab
--- /dev/null
+++ b/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml
@@ -0,0 +1,20 @@
+[project]
+name = "my_data_project"
+version = "0.1.0"
+description = "Databricks ETL pipeline project"
+requires-python = "==3.10.*"
+dependencies = [
+    "databricks-dlt",
+    "pytest",
+    "databricks-connect==15.1.*",
+]
+
+[project.scripts]
+add-asset = "scripts.add_asset:main"
+test = "scripts.test:main"
+
+[tool.uv]
+package = true
+
+[tool.setuptools.packages.find]
+include = ["scripts"]
\ No newline at end of file
diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/scripts/add_asset.py b/contrib/templates/data-engineering/template/{{.project_name}}/scripts/add_asset.py
new file mode 100644
index 0000000..931db61
--- /dev/null
+++ b/contrib/templates/data-engineering/template/{{.project_name}}/scripts/add_asset.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+#
+# add_asset.py is used to initialize a new asset from the data-engineering template.
+#
+import sys
+import subprocess
+from typing import Literal
+
+VALID_ASSETS = ["etl-pipeline", "job", "ingest-pipeline"]
+AssetType = Literal["etl-pipeline", "job", "ingest-pipeline"]
+
+
+def init_bundle(asset_type: AssetType) -> None:
+    cmd = f"databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/{asset_type}"
+    subprocess.run(cmd, shell=True)
+
+
+def show_menu() -> AssetType:
+    print("\nSelect asset type to initialize:")
+    for i, asset in enumerate(VALID_ASSETS, 1):
+        print(f"{i}. {asset}")
+
+    while True:
+        try:
+            choice = int(input("\nEnter number (1-3): "))
+            if 1 <= choice <= len(VALID_ASSETS):
+                return VALID_ASSETS[choice - 1]
+            print("Invalid choice. Please try again.")
+        except ValueError:
+            print("Please enter a number.")
+
+
+def main():
+    if len(sys.argv) > 1:
+        asset_type = sys.argv[1]
+        if asset_type not in VALID_ASSETS:
+            print(f"Error: Asset type must be one of {VALID_ASSETS}")
+            sys.exit(1)
+    else:
+        asset_type = show_menu()
+
+    init_bundle(asset_type)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/scripts/test.py b/contrib/templates/data-engineering/template/{{.project_name}}/scripts/test.py
new file mode 100644
index 0000000..4748c81
--- /dev/null
+++ b/contrib/templates/data-engineering/template/{{.project_name}}/scripts/test.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+#
+# test.py runs the unit tests for this project using pytest and serverless compute.
+# To use a different form of compute, instead use 'uv run pytest' or
+# use your IDE's testing panel. When using VS Code, consider using the Databricks extension.
+#
+import os
+import subprocess
+
+
+def main():
+    os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto"
+    subprocess.run(["pytest"], check=True)
+
+
+if __name__ == "__main__":
+    main()