DBT DuckDB example, and fixes to DBT SQLite example (#327)
thundergolfer authored Jul 28, 2023
1 parent 2aef4fa commit abea30b
Showing 27 changed files with 813 additions and 9 deletions.
193 changes: 193 additions & 0 deletions 10_integrations/dbt/dbt_duckdb.py
@@ -0,0 +1,193 @@
# ---
# cmd: ["modal", "run", "dbt_duckdb.py::run", "--command", "run"]
# ---
#
# This example contains a minimal but capable cloud data warehouse.
# It's composed of the following:
#
# - [DuckDB](https://duckdb.org) as the warehouse's OLAP database engine
# - AWS S3 as the data storage provider
# - [DBT](https://docs.getdbt.com/docs/introduction) as the data transformation tool
#
# Meet your new cloud data warehouse.

from pathlib import Path

import modal

# ## Bucket name configuration
#
# The only thing in the source code that you must update is the S3 bucket name.
# AWS S3 bucket names are globally unique, and the name used in this source file
# is already taken by Modal, so you'll need your own.
#
# Update the `BUCKET_NAME` variable below, along with any references to the
# original value within `sample_proj_duckdb_s3/models/`. The AWS IAM policy
# below also includes the bucket name, and it must be updated as well.

BUCKET_NAME = "modal-example-dbt-duckdb-s3"
LOCAL_DBT_PROJECT = Path(__file__).parent / "sample_proj_duckdb_s3"
PROJ_PATH = "/root/dbt"
PROFILES_PATH = "/root/dbt_profile"
TARGET_PATH = "/root/target"
dbt_image = (
    modal.Image.debian_slim()
    .pip_install(
        "boto3",
        "dbt-duckdb>=1.5.1",
        "pandas",
        "pyarrow",
    )
    .env(
        {
            "DBT_PROJECT_DIR": PROJ_PATH,
            "DBT_PROFILES_DIR": PROFILES_PATH,
            "DBT_TARGET_PATH": TARGET_PATH,
        }
    )
)
stub = modal.Stub(name="example-dbt-duckdb-s3", image=dbt_image)

# ## DBT Configuration
#
# Most of the DBT code and configuration is taken directly from the
# https://github.com/dbt-labs/jaffle_shop demo and modified to support
# using dbt-duckdb with an S3 bucket.
#
# The DBT profiles.yml configuration is taken from
# https://github.com/jwills/dbt-duckdb#configuring-your-profile.
#
# Here we mount all this local code and configuration into the Modal function
# so that it will be available when we run DBT in the Modal cloud.

dbt_project = modal.Mount.from_local_dir(
    LOCAL_DBT_PROJECT, remote_path=PROJ_PATH
)
dbt_profiles = modal.Mount.from_local_file(
    local_path=LOCAL_DBT_PROJECT / "profiles.yml",
    remote_path=Path(PROFILES_PATH, "profiles.yml"),
)
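# We also persist the `target/` directory, where DBT writes compiled SQL and
# artifacts like `manifest.json` and `run_results.json`, in a Modal
# NetworkFileSystem so that these artifacts are retained across runs.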
dbt_target = modal.NetworkFileSystem.persisted("dbt-target")
# Create this secret using the "AWS" template at https://modal.com/secrets/create.
# Be sure that the AWS user whose credentials you provide has permission to
# create S3 buckets and to read and write data in them.
#
# The policy required for this example is the following.
# Note that you *must* update the bucket name listed in the policy to your
# own bucket name.
#
# ```json
# {
#     "Statement": [
#         {
#             "Action": "s3:*",
#             "Effect": "Allow",
#             "Resource": [
#                 "arn:aws:s3:::modal-example-dbt-duckdb-s3/*",
#                 "arn:aws:s3:::modal-example-dbt-duckdb-s3"
#             ],
#             "Sid": "duckdbs3access"
#         }
#     ],
#     "Version": "2012-10-17"
# }
# ```
#
# Below we will use this user in a Modal function to create an S3 bucket and
# populate it with .parquet data.
s3_secret = modal.Secret.from_name("modal-examples-aws-user")
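
# If you want to sanity-check the secret before running anything else, one
# option is to look up the caller identity using the credentials Modal injects.
# This is a minimal sketch (`check_credentials` is a hypothetical helper, not
# part of the example):
#
# ```python
# @stub.function(secrets=[s3_secret])
# def check_credentials():
#     import boto3
#
#     # Prints the ARN of the IAM user that the secret authenticates as.
#     print(boto3.client("sts").get_caller_identity()["Arn"])
# ```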

# ## Seed data
#
# To provide source data for DBT to ingest and transform, the `create_source_data`
# function below creates an AWS S3 bucket and populates it with .parquet files
# based on the CSV data in the seeds/ directory.
#
# This is not the typical way that seeds/ data is used, but it works fine for this
# demonstration. See https://docs.getdbt.com/docs/build/seeds for more info.


@stub.function(
    mounts=[dbt_project],
    secrets=[s3_secret],
)
def create_source_data():
    import boto3
    import pandas as pd

    s3_client = boto3.client("s3")
    s3_client.create_bucket(Bucket=BUCKET_NAME)

    # Convert each seed CSV into a .parquet file and upload it under the
    # bucket's sources/ prefix, where the DBT models expect to find it.
    for seed_csv_path in Path(PROJ_PATH, "seeds").glob("*.csv"):
        print(f"found seed file {seed_csv_path}")
        name = seed_csv_path.stem
        df = pd.read_csv(seed_csv_path)
        parquet_filename = f"{name}.parquet"
        df.to_parquet(parquet_filename)

        object_key = f"sources/{parquet_filename}"
        print(f"uploading {object_key=} to S3 bucket '{BUCKET_NAME}'")
        s3_client.upload_file(parquet_filename, BUCKET_NAME, object_key)
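
# You can run this seed-data step on its own with:
#
# ```
# modal run dbt_duckdb.py::create_source_data
# ```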


# ## Scheduled daily build
#
# This `daily_build` function runs on a schedule to keep the DuckDB data warehouse
# up-to-date. Currently, the source data for this warehouse is static, so a
# scheduled run doesn't change anything; it just rebuilds the same models. But this
# example could be extended to have sources that continually provide new data over time.


@stub.function(
    schedule=modal.Period(days=1),
    secrets=[s3_secret],
    mounts=[dbt_project, dbt_profiles],
    network_file_systems={TARGET_PATH: dbt_target},
)
def daily_build() -> None:
    run("build")

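
# If you'd rather run the build at a fixed time of day, Modal schedules also
# support cron syntax, e.g. `schedule=modal.Cron("0 7 * * *")` for a daily
# build at 7am UTC.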

# ## Running DBT commands
#
# `modal run dbt_duckdb.py::run --command run`
#
# A successful run will log something like the following:
#
# ```
# 03:41:04 Running with dbt=1.5.0
# 03:41:05 Found 5 models, 8 tests, 0 snapshots, 0 analyses, 313 macros, 0 operations, 3 seed files, 3 sources, 0 exposures, 0 metrics, 0 groups
# 03:41:05
# 03:41:06 Concurrency: 1 threads (target='modal')
# 03:41:06
# 03:41:06 1 of 5 START sql table model main.stg_customers ................................ [RUN]
# 03:41:06 1 of 5 OK created sql table model main.stg_customers ........................... [OK in 0.45s]
# 03:41:06 2 of 5 START sql table model main.stg_orders ................................... [RUN]
# 03:41:06 2 of 5 OK created sql table model main.stg_orders .............................. [OK in 0.34s]
# 03:41:06 3 of 5 START sql table model main.stg_payments ................................. [RUN]
# 03:41:07 3 of 5 OK created sql table model main.stg_payments ............................ [OK in 0.36s]
# 03:41:07 4 of 5 START sql external model main.customers ................................. [RUN]
# 03:41:07 4 of 5 OK created sql external model main.customers ............................ [OK in 0.72s]
# 03:41:07 5 of 5 START sql table model main.orders ....................................... [RUN]
# 03:41:08 5 of 5 OK created sql table model main.orders .................................. [OK in 0.22s]
# 03:41:08
# 03:41:08 Finished running 4 table models, 1 external model in 0 hours 0 minutes and 3.15 seconds (3.15s).
# 03:41:08 Completed successfully
# 03:41:08
# 03:41:08 Done. PASS=5 WARN=0 ERROR=0 SKIP=0 TOTAL=5
# ```
#


@stub.function(
    secrets=[s3_secret],
    mounts=[dbt_project, dbt_profiles],
    network_file_systems={TARGET_PATH: dbt_target},
)
def run(command: str) -> None:
    from dbt.cli.main import dbtRunner

    res = dbtRunner().invoke([command])
    if res.exception:
        print(res.exception)
    if not res.success:
        # Fail loudly so that a failed run is visible, e.g. in the
        # scheduled `daily_build` above.
        raise RuntimeError(f"dbt invocation failed: dbt {command}")
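

# Because `dbtRunner` takes an argv-style list, you can pass standard dbt CLI
# flags through this interface as well. A sketch (these are ordinary dbt flags,
# though this exact call isn't part of the example):
#
# ```python
# res = dbtRunner().invoke(["run", "--select", "customers", "--threads", "2"])
# ```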


# Look for the `materialized='external'` DBT config in the SQL templates
# to see how `dbt-duckdb` is able to write the transformed data back to AWS S3!
#
# After running the 'run' command and seeing it succeed, check what's contained
# under the bucket's `out/` key prefix. You'll see that DBT has run the transformations
# defined in `sample_proj_duckdb_s3/models/` and produced output .parquet files.
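#
# A quick way to inspect those outputs from your local machine is with DuckDB
# itself. This is a minimal sketch; it assumes the `duckdb` Python package is
# installed locally and that you substitute real AWS credentials:
#
# ```python
# import duckdb
#
# con = duckdb.connect()
# con.sql("INSTALL httpfs; LOAD httpfs;")  # extension for reading from S3
# con.sql("SET s3_region='us-east-1'; SET s3_access_key_id='...'; SET s3_secret_access_key='...';")
# df = con.sql(
#     "SELECT * FROM 's3://modal-example-dbt-duckdb-s3/out/customers.parquet'"
# ).df()
# print(df.head())
# ```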
10_integrations/dbt/dbt_sqlite.py
@@ -1,19 +1,27 @@
# ---
# lambda-test: false
-# cmd: ["modal", "run", "10_integrations.dbt.modal_dbt::run"]
+# cmd: ["modal", "run", "10_integrations.dbt.dbt_sqlite::run"]
# ---

-"""This is a simple demonstration of how to run a dbt-core project on Modal
+"""
+This is a simple demonstration of how to run a dbt-core project on Modal
using the dbt-sqlite adapter.

The underlying DBT data and models are from https://docs.getdbt.com/docs/get-started/getting-started-dbt-core

To run this example, first run the meltano example in 10_integrations/meltano to load the required data
into sqlite.

-To run this example:
-`modal run modal_dbt.py::stub.run`
+**Run this example:**
+```
+modal run dbt_sqlite.py::stub.run
+```

-To launch an interactive sqlite3 shell on the output database:
-`modal run modal_dbt.py::stub.explore`
+**Launch an interactive sqlite3 shell on the output database:**
+```
+modal run dbt_sqlite.py::stub.explore
+```
"""

import os
@@ -23,7 +31,7 @@

import modal

-LOCAL_DBT_PROJECT = Path(__file__).parent / "sample_proj"
+LOCAL_DBT_PROJECT = Path(__file__).parent / "sample_proj_sqlite"
REMOTE_DBT_PROJECT = "/sample_proj"
RAW_SCHEMAS = "/raw"
OUTPUT_SCHEMAS = "/db"
@@ -39,7 +47,7 @@

image = (
    modal.Image.debian_slim()
-    .pip_install("dbt-sqlite")
+    .pip_install("dbt-core~=1.3.0", "dbt-sqlite~=1.3.0")
    .run_commands("apt-get install -y git")
)

@@ -59,7 +67,9 @@
)
def dbt_cli(subcommand: typing.List):
    os.chdir(REMOTE_DBT_PROJECT)
-    subprocess.check_call(["dbt"] + subcommand)
+    cmd = ["dbt"] + subcommand
+    print(f"Running {' '.join(cmd)} against {REMOTE_DBT_PROJECT}")
+    subprocess.check_call(cmd)


@stub.local_entrypoint()
File renamed without changes.
70 changes: 70 additions & 0 deletions 10_integrations/dbt/sample_proj_duckdb_s3/models/customers.sql
@@ -0,0 +1,70 @@
with customers as (

    select * from {{ ref('stg_customers') }}

),

orders as (

    select * from {{ ref('stg_orders') }}

),

payments as (

    select * from {{ ref('stg_payments') }}

),

customer_orders as (

    select
        customer_id,

        min(order_date) as first_order,
        max(order_date) as most_recent_order,
        count(order_id) as number_of_orders
    from orders

    group by customer_id

),

customer_payments as (

    select
        orders.customer_id,
        sum(amount) as total_amount

    from payments

    left join orders on
        payments.order_id = orders.order_id

    group by orders.customer_id

),

final as (

    select
        customers.customer_id,
        customers.first_name,
        customers.last_name,
        customer_orders.first_order,
        customer_orders.most_recent_order,
        customer_orders.number_of_orders,
        customer_payments.total_amount as customer_lifetime_value

    from customers

    left join customer_orders
        on customers.customer_id = customer_orders.customer_id

    left join customer_payments
        on customers.customer_id = customer_payments.customer_id

)

{{ config(materialized='external', format='parquet', location='s3://modal-example-dbt-duckdb-s3/out/customers.parquet') }}
select * from final
57 changes: 57 additions & 0 deletions 10_integrations/dbt/sample_proj_duckdb_s3/models/orders.sql
@@ -0,0 +1,57 @@
{% set payment_methods = ['credit_card', 'coupon', 'bank_transfer', 'gift_card'] %}

with orders as (

    select * from {{ ref('stg_orders') }}

),

payments as (

    select * from {{ ref('stg_payments') }}

),

order_payments as (

    select
        order_id,

        {% for payment_method in payment_methods -%}
        sum(case when payment_method = '{{ payment_method }}' then amount else 0 end) as {{ payment_method }}_amount,
        {% endfor -%}

        sum(amount) as total_amount

    from payments

    group by order_id

),

final as (

    select
        orders.order_id,
        orders.customer_id,
        orders.order_date,
        orders.status,

        {% for payment_method in payment_methods -%}

        order_payments.{{ payment_method }}_amount,

        {% endfor -%}

        order_payments.total_amount as amount

    from orders

    left join order_payments
        on orders.order_id = order_payments.order_id

)

{{ config(materialized='external', format='parquet', location='s3://modal-example-dbt-duckdb-s3/out/orders.parquet') }}
select * from final
11 changes: 11 additions & 0 deletions 10_integrations/dbt/sample_proj_duckdb_s3/models/sources.yml
@@ -0,0 +1,11 @@
version: 2

sources:
  - name: external_source
    meta:
      external_location: "s3://modal-example-dbt-duckdb-s3/sources/{name}.parquet"
    tables:
      - name: raw_customers
      - name: raw_orders
      - name: raw_payments
