DBT DuckDB example, and fixes to DBT SQLite example (#327)
thundergolfer authored Jul 28, 2023
1 parent 2aef4fa commit abea30b
Showing 27 changed files with 813 additions and 9 deletions.
193 changes: 193 additions & 0 deletions 10_integrations/dbt/dbt_duckdb.py
@@ -0,0 +1,193 @@
# ---
# cmd: ["modal", "run", "dbt_duckdb.py::run", "--command", "run"]
# ---
#
# This example contains a minimal but capable cloud data warehouse.
# It's composed of the following:
#
# - [DuckDB](https://duckdb.org) as the warehouse's OLAP database engine
# - AWS S3 as the data storage provider
# - [DBT](https://docs.getdbt.com/docs/introduction) as the data transformation tool
#
# Meet your new cloud data warehouse.

from pathlib import Path

import modal

# ## Bucket name configuration
#
# The only thing in the source code that you must update is the S3 bucket name.
# AWS S3 bucket names are globally unique, and the name used in this source file
# is already taken by Modal, so you'll need your own.
#
# Update the `BUCKET_NAME` variable below, along with any references to the
# original value within `sample_proj_duckdb_s3/models/`. The AWS IAM policy
# below also includes the bucket name, and it must be updated as well.

BUCKET_NAME = "modal-example-dbt-duckdb-s3"
LOCAL_DBT_PROJECT = Path(__file__).parent / "sample_proj_duckdb_s3"
PROJ_PATH = "/root/dbt"
PROFILES_PATH = "/root/dbt_profile"
TARGET_PATH = "/root/target"
dbt_image = (
    modal.Image.debian_slim()
    .pip_install(
        "boto3",
        "dbt-duckdb>=1.5.1",
        "pandas",
        "pyarrow",
    )
    .env(
        {
            "DBT_PROJECT_DIR": PROJ_PATH,
            "DBT_PROFILES_DIR": PROFILES_PATH,
            "DBT_TARGET_PATH": TARGET_PATH,
        }
    )
)
stub = modal.Stub(name="example-dbt-duckdb-s3", image=dbt_image)

# ## DBT Configuration
#
# Most of the DBT code and configuration is taken directly from the
# https://github.com/dbt-labs/jaffle_shop demo and modified to support
# using dbt-duckdb with an S3 bucket.
#
# The DBT profiles.yml configuration is taken from
# https://github.com/jwills/dbt-duckdb#configuring-your-profile.
#
# Here we mount all this local code and configuration into the Modal function
# so that it will be available when we run DBT in the Modal cloud.

dbt_project = modal.Mount.from_local_dir(
    LOCAL_DBT_PROJECT, remote_path=PROJ_PATH
)
dbt_profiles = modal.Mount.from_local_file(
    local_path=LOCAL_DBT_PROJECT / "profiles.yml",
    remote_path=Path(PROFILES_PATH, "profiles.yml"),
)
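# We also persist the `target/` directory, where DBT writes compiled SQL and
# artifacts like `manifest.json` and `run_results.json`, in a Modal
# NetworkFileSystem so that these artifacts are retained across runs.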
dbt_target = modal.NetworkFileSystem.persisted("dbt-target")
# Create this secret using the "AWS" template at https://modal.com/secrets/create.
# Be sure that the AWS user whose credentials you provide has permission to
# create S3 buckets and to read and write data in them.
#
# The policy required for this example is the following.
# Note that you *must* update the bucket name listed in the policy to your
# own bucket name.
#
# ```json
# {
#     "Statement": [
#         {
#             "Action": "s3:*",
#             "Effect": "Allow",
#             "Resource": [
#                 "arn:aws:s3:::modal-example-dbt-duckdb-s3/*",
#                 "arn:aws:s3:::modal-example-dbt-duckdb-s3"
#             ],
#             "Sid": "duckdbs3access"
#         }
#     ],
#     "Version": "2012-10-17"
# }
# ```
#
# Below we will use this user in a Modal function to create an S3 bucket and
# populate it with .parquet data.
s3_secret = modal.Secret.from_name("modal-examples-aws-user")
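
# If you want to sanity-check the secret before running anything else, one
# option is to look up the caller identity using the credentials Modal injects.
# This is a minimal sketch (`check_credentials` is a hypothetical helper, not
# part of the example):
#
# ```python
# @stub.function(secrets=[s3_secret])
# def check_credentials():
#     import boto3
#
#     # Prints the ARN of the IAM user that the secret authenticates as.
#     print(boto3.client("sts").get_caller_identity()["Arn"])
# ```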

# ## Seed data
#
# To provide source data for DBT to ingest and transform, the `create_source_data`
# function below creates an AWS S3 bucket and populates it with .parquet files
# based on the CSV data in the seeds/ directory.
#
# This is not the typical way that seeds/ data is used, but it works fine for this
# demonstration. See https://docs.getdbt.com/docs/build/seeds for more info.


@stub.function(
    mounts=[dbt_project],
    secrets=[s3_secret],
)
def create_source_data():
    import boto3
    import pandas as pd

    s3_client = boto3.client("s3")
    s3_client.create_bucket(Bucket=BUCKET_NAME)

    # Convert each seed CSV into a .parquet file and upload it under the
    # bucket's sources/ prefix, where the DBT models expect to find it.
    for seed_csv_path in Path(PROJ_PATH, "seeds").glob("*.csv"):
        print(f"found seed file {seed_csv_path}")
        name = seed_csv_path.stem
        df = pd.read_csv(seed_csv_path)
        parquet_filename = f"{name}.parquet"
        df.to_parquet(parquet_filename)

        object_key = f"sources/{parquet_filename}"
        print(f"uploading {object_key=} to S3 bucket '{BUCKET_NAME}'")
        s3_client.upload_file(parquet_filename, BUCKET_NAME, object_key)
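
# You can run this seed-data step on its own with:
#
# ```
# modal run dbt_duckdb.py::create_source_data
# ```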


# ## Scheduled daily build
#
# This `daily_build` function runs on a schedule to keep the DuckDB data warehouse
# up-to-date. Currently, the source data for this warehouse is static, so a
# scheduled run doesn't change anything; it just rebuilds the same models. But this
# example could be extended to have sources that continually provide new data over time.


@stub.function(
    schedule=modal.Period(days=1),
    secrets=[s3_secret],
    mounts=[dbt_project, dbt_profiles],
    network_file_systems={TARGET_PATH: dbt_target},
)
def daily_build() -> None:
    run("build")

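
# If you'd rather run the build at a fixed time of day, Modal schedules also
# support cron syntax, e.g. `schedule=modal.Cron("0 7 * * *")` for a daily
# build at 7am UTC.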

# ## Running DBT commands
#
# `modal run dbt_duckdb.py::run --command run`
#
# A successful run will log something like the following:
#
# ```
# 03:41:04 Running with dbt=1.5.0
# 03:41:05 Found 5 models, 8 tests, 0 snapshots, 0 analyses, 313 macros, 0 operations, 3 seed files, 3 sources, 0 exposures, 0 metrics, 0 groups
# 03:41:05
# 03:41:06 Concurrency: 1 threads (target='modal')
# 03:41:06
# 03:41:06 1 of 5 START sql table model main.stg_customers ................................ [RUN]
# 03:41:06 1 of 5 OK created sql table model main.stg_customers ........................... [OK in 0.45s]
# 03:41:06 2 of 5 START sql table model main.stg_orders ................................... [RUN]
# 03:41:06 2 of 5 OK created sql table model main.stg_orders .............................. [OK in 0.34s]
# 03:41:06 3 of 5 START sql table model main.stg_payments ................................. [RUN]
# 03:41:07 3 of 5 OK created sql table model main.stg_payments ............................ [OK in 0.36s]
# 03:41:07 4 of 5 START sql external model main.customers ................................. [RUN]
# 03:41:07 4 of 5 OK created sql external model main.customers ............................ [OK in 0.72s]
# 03:41:07 5 of 5 START sql table model main.orders ....................................... [RUN]
# 03:41:08 5 of 5 OK created sql table model main.orders .................................. [OK in 0.22s]
# 03:41:08
# 03:41:08 Finished running 4 table models, 1 external model in 0 hours 0 minutes and 3.15 seconds (3.15s).
# 03:41:08 Completed successfully
# 03:41:08
# 03:41:08 Done. PASS=5 WARN=0 ERROR=0 SKIP=0 TOTAL=5
# ```
#


@stub.function(
    secrets=[s3_secret],
    mounts=[dbt_project, dbt_profiles],
    network_file_systems={TARGET_PATH: dbt_target},
)
def run(command: str) -> None:
    from dbt.cli.main import dbtRunner

    res = dbtRunner().invoke([command])
    if res.exception:
        print(res.exception)
    if not res.success:
        # Fail loudly so that a failed run is visible, e.g. in the
        # scheduled `daily_build` above.
        raise RuntimeError(f"dbt invocation failed: dbt {command}")
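

# Because `dbtRunner` takes an argv-style list, you can pass standard dbt CLI
# flags through this interface as well. A sketch (these are ordinary dbt flags,
# though this exact call isn't part of the example):
#
# ```python
# res = dbtRunner().invoke(["run", "--select", "customers", "--threads", "2"])
# ```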


# Look for the `materialized='external'` DBT config in the SQL templates
# to see how `dbt-duckdb` is able to write the transformed data back to AWS S3!
#
# After running the 'run' command and seeing it succeed, check what's contained
# under the bucket's `out/` key prefix. You'll see that DBT has run the transformations
# defined in `sample_proj_duckdb_s3/models/` and produced output .parquet files.
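#
# A quick way to inspect those outputs from your local machine is with DuckDB
# itself. This is a minimal sketch; it assumes the `duckdb` Python package is
# installed locally and that you substitute real AWS credentials:
#
# ```python
# import duckdb
#
# con = duckdb.connect()
# con.sql("INSTALL httpfs; LOAD httpfs;")  # extension for reading from S3
# con.sql("SET s3_region='us-east-1'; SET s3_access_key_id='...'; SET s3_secret_access_key='...';")
# df = con.sql(
#     "SELECT * FROM 's3://modal-example-dbt-duckdb-s3/out/customers.parquet'"
# ).df()
# print(df.head())
# ```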
10_integrations/dbt/dbt_sqlite.py
@@ -1,19 +1,27 @@
# ---
# lambda-test: false
-# cmd: ["modal", "run", "10_integrations.dbt.modal_dbt::run"]
+# cmd: ["modal", "run", "10_integrations.dbt.dbt_sqlite::run"]
# ---

-"""This is a simple demonstration of how to run a dbt-core project on Modal
+"""
+This is a simple demonstration of how to run a dbt-core project on Modal
using the dbt-sqlite adapter.

The underlying DBT data and models are from https://docs.getdbt.com/docs/get-started/getting-started-dbt-core

To run this example, first run the meltano example in 10_integrations/meltano to load the required data
into sqlite.

-To run this example:
-`modal run modal_dbt.py::stub.run`
+**Run this example:**
+```
+modal run dbt_sqlite.py::stub.run
+```

-To launch an interactive sqlite3 shell on the output database:
-`modal run modal_dbt.py::stub.explore`
+**Launch an interactive sqlite3 shell on the output database:**
+```
+modal run dbt_sqlite.py::stub.explore
+```
"""

import os
@@ -23,7 +31,7 @@

import modal

-LOCAL_DBT_PROJECT = Path(__file__).parent / "sample_proj"
+LOCAL_DBT_PROJECT = Path(__file__).parent / "sample_proj_sqlite"
REMOTE_DBT_PROJECT = "/sample_proj"
RAW_SCHEMAS = "/raw"
OUTPUT_SCHEMAS = "/db"
@@ -39,7 +47,7 @@

image = (
    modal.Image.debian_slim()
-    .pip_install("dbt-sqlite")
+    .pip_install("dbt-core~=1.3.0", "dbt-sqlite~=1.3.0")
    .run_commands("apt-get install -y git")
)

@@ -59,7 +67,9 @@
)
def dbt_cli(subcommand: typing.List):
    os.chdir(REMOTE_DBT_PROJECT)
-    subprocess.check_call(["dbt"] + subcommand)
+    cmd = ["dbt"] + subcommand
+    print(f"Running {' '.join(cmd)} against {REMOTE_DBT_PROJECT}")
+    subprocess.check_call(cmd)


@stub.local_entrypoint()
File renamed without changes.
70 changes: 70 additions & 0 deletions 10_integrations/dbt/sample_proj_duckdb_s3/models/customers.sql
@@ -0,0 +1,70 @@
with customers as (

    select * from {{ ref('stg_customers') }}

),

orders as (

    select * from {{ ref('stg_orders') }}

),

payments as (

    select * from {{ ref('stg_payments') }}

),

customer_orders as (

    select
        customer_id,

        min(order_date) as first_order,
        max(order_date) as most_recent_order,
        count(order_id) as number_of_orders
    from orders

    group by customer_id

),

customer_payments as (

    select
        orders.customer_id,
        sum(amount) as total_amount

    from payments

    left join orders on
        payments.order_id = orders.order_id

    group by orders.customer_id

),

final as (

    select
        customers.customer_id,
        customers.first_name,
        customers.last_name,
        customer_orders.first_order,
        customer_orders.most_recent_order,
        customer_orders.number_of_orders,
        customer_payments.total_amount as customer_lifetime_value

    from customers

    left join customer_orders
        on customers.customer_id = customer_orders.customer_id

    left join customer_payments
        on customers.customer_id = customer_payments.customer_id

)

{{ config(materialized='external', format='parquet', location='s3://modal-example-dbt-duckdb-s3/out/customers.parquet') }}
select * from final
57 changes: 57 additions & 0 deletions 10_integrations/dbt/sample_proj_duckdb_s3/models/orders.sql
@@ -0,0 +1,57 @@
{% set payment_methods = ['credit_card', 'coupon', 'bank_transfer', 'gift_card'] %}

with orders as (

    select * from {{ ref('stg_orders') }}

),

payments as (

    select * from {{ ref('stg_payments') }}

),

order_payments as (

    select
        order_id,

        {% for payment_method in payment_methods -%}
        sum(case when payment_method = '{{ payment_method }}' then amount else 0 end) as {{ payment_method }}_amount,
        {% endfor -%}

        sum(amount) as total_amount

    from payments

    group by order_id

),

final as (

    select
        orders.order_id,
        orders.customer_id,
        orders.order_date,
        orders.status,

        {% for payment_method in payment_methods -%}

        order_payments.{{ payment_method }}_amount,

        {% endfor -%}

        order_payments.total_amount as amount

    from orders

    left join order_payments
        on orders.order_id = order_payments.order_id

)

{{ config(materialized='external', format='parquet', location='s3://modal-example-dbt-duckdb-s3/out/orders.parquet') }}
select * from final
11 changes: 11 additions & 0 deletions 10_integrations/dbt/sample_proj_duckdb_s3/models/sources.yml
@@ -0,0 +1,11 @@
version: 2

sources:
  - name: external_source
    meta:
      external_location: "s3://modal-example-dbt-duckdb-s3/sources/{name}.parquet"
    tables:
      - name: raw_customers
      - name: raw_orders
      - name: raw_payments
