Add more unit tests for data pipeline utils in agent example (#29)
* WIP

* WIP

* Remove lint workflow

* Add comments to utils

* Fix test

* Delete autogenerated comments

* Switch to .show() which is defined locally

* Add more tests for file loading & parsing logic in data pipeline

* Add more tests for file parsing

* Update agent_app_sample_code/tests/test_file_loading.py

* Add developer README

---------

Signed-off-by: Sid Murching <[email protected]>
smurching authored Sep 27, 2024
1 parent dcbbe41 commit 2ed98a5
Showing 3 changed files with 61 additions and 6 deletions.
1 change: 0 additions & 1 deletion agent_app_sample_code/02_data_pipeline.py
@@ -216,7 +216,6 @@ def file_parser(
"""
try:
filename, file_extension = os.path.splitext(doc_path)
parsed_document = {}

if file_extension == ".pdf":
pdf = io.BytesIO(raw_doc_contents_bytes)
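For readers viewing only this hunk, a minimal sketch of how a PDF branch like the one above might continue, assuming the `pypdf` package; the cookbook's actual `file_parser` implementation may use a different PDF library:

```python
import io

from pypdf import PdfReader  # assumed PDF backend for this sketch


def parse_pdf_bytes(raw_doc_contents_bytes: bytes, doc_path: str) -> dict:
    """Extract plain text from raw PDF bytes and return a parser result dict."""
    pdf = io.BytesIO(raw_doc_contents_bytes)
    reader = PdfReader(pdf)
    text = "\n".join(page.extract_text() or "" for page in reader.pages)
    # Keys mirror the ParserReturnValue TypedDict used by the tests below.
    return {
        "doc_content": text,
        "parser_status": "SUCCESS",
        "doc_uri": doc_path,
    }
```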
55 changes: 51 additions & 4 deletions agent_app_sample_code/tests/test_file_loading.py
@@ -1,26 +1,35 @@
from datetime import datetime

import pytest
import pyspark
import pandas as pd
from typing import TypedDict

from agent_app_sample_code.utils.file_loading import load_files_to_df, apply_parsing_udf
from agent_app_sample_code.utils.typed_dicts_to_spark_schema import typed_dicts_to_spark_schema

@pytest.fixture(scope="module")
def spark():
    return (
        pyspark.sql.SparkSession.builder
        .master("local[1]")
        # Uncomment the following line for testing on Apple silicon locally
        # .config("spark.driver.bindAddress", "127.0.0.1")
        .config("spark.task.maxFailures", "1")  # avoid retrying failed Spark tasks
        .getOrCreate()
    )

@pytest.fixture()
def example_files_dir(tmpdir):
    temp_dir = tmpdir.mkdir("files_subdir")
    file_1 = temp_dir.join("file1.txt")
    file_2 = temp_dir.join("file2.txt")
    file_1.write("file1 content")
    file_2.write("file2 content")
    yield temp_dir, file_1, file_2


def test_load_files_to_df(spark, example_files_dir):
    temp_dir, file_1, file_2 = example_files_dir
    raw_files_df = load_files_to_df(spark, str(temp_dir)).drop("modificationTime").orderBy("path")
    assert raw_files_df.count() == 2
    raw_pandas_df = raw_files_df.toPandas()
@@ -44,3 +53,41 @@ def test_load_files_to_df_throws_if_no_files(spark, tmpdir):
    temp_dir = tmpdir.mkdir("files_subdir")
    with pytest.raises(Exception, match="does not contain any files"):
        load_files_to_df(spark, str(temp_dir))

class ParserReturnValue(TypedDict):
    # Parsed content of the document
    doc_content: str  # do not change this name
    # The status of whether the parser succeeds or fails, used to exclude failed files downstream
    parser_status: str  # do not change this name
    # Unique ID of the document
    doc_uri: str  # do not change this name

def test_apply_parsing_udf(spark, example_files_dir):
    def _mock_file_parser(
        raw_doc_contents_bytes: bytes,
        doc_path: str,
        modification_time: datetime,
        doc_bytes_length: int,
    ):
        return {
            "doc_content": raw_doc_contents_bytes.decode("utf-8"),
            "parser_status": "SUCCESS",
            "doc_uri": doc_path,
        }

    temp_dir, file_1, file_2 = example_files_dir
    raw_files_df = load_files_to_df(spark, str(temp_dir)).orderBy("path")
    parsed_df = apply_parsing_udf(raw_files_df, _mock_file_parser, parsed_df_schema=typed_dicts_to_spark_schema(ParserReturnValue))
    assert parsed_df.count() == 2
    parsed_pandas_df = parsed_df.toPandas()
    # Expected DataFrame
    expected_df = pd.DataFrame([{
        "doc_content": file_1.read_text(encoding="utf-8"),
        "parser_status": "SUCCESS",
        "doc_uri": f"file:{str(file_1)}",
    }, {
        "doc_content": file_2.read_text(encoding="utf-8"),
        "parser_status": "SUCCESS",
        "doc_uri": f"file:{str(file_2)}",
    }])
    pd.testing.assert_frame_equal(parsed_pandas_df, expected_df)
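For context, `apply_parsing_udf` is driven by the Spark schema built from the `ParserReturnValue` TypedDict. The helper's implementation is not part of this diff; below is a minimal sketch of what `typed_dicts_to_spark_schema` is assumed to do for string-only TypedDicts like this one.

```python
from typing import TypedDict, get_type_hints

from pyspark.sql.types import StringType, StructField, StructType

# Hypothetical mapping from Python type hints to Spark SQL types;
# the real utility in utils/typed_dicts_to_spark_schema.py may cover more types.
_SPARK_TYPES = {str: StringType()}


def typed_dict_to_spark_schema_sketch(typed_dict_cls: type) -> StructType:
    """Build a StructType with one StructField per TypedDict key."""
    hints = get_type_hints(typed_dict_cls)
    return StructType(
        [StructField(name, _SPARK_TYPES[field_type], nullable=True)
         for name, field_type in hints.items()]
    )


class ParserReturnValue(TypedDict):
    doc_content: str
    parser_status: str
    doc_uri: str


# Produces a schema with three string columns: doc_content, parser_status, doc_uri.
schema = typed_dict_to_spark_schema_sketch(ParserReturnValue)
```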
11 changes: 10 additions & 1 deletion dev/README.md
@@ -1,13 +1,22 @@
# Databricks Mosaic Generative AI Cookbook

## Dev env setup
- clone the repo; `cd cookbook`
- use your preferred approach to starting a new python environment
- in that environment, `pip install -r dev/dev_requirements.txt`

## Updating website content
To test updates to the site content at ai-cookbook.io:
- build and preview the site with `jupyter-book build --all genai_cookbook`

The homepage is at `genai_cookbook/index.md`.

The content pages are in `genai_cookbook/nbs/`.

Jupyter book is fairly flexible and offers a lot of different options for formatting, cross-referencing, adding formatted callouts, etc. Read more at the [Jupyter Book docs](https://jupyterbook.org/en/stable/intro.html).

## Updating code
Use the `databricks sync` CLI command ([docs](https://docs.databricks.com/en/dev-tools/cli/sync-commands.html)) to sync the code in this repo to
your Databricks workspace. You can then iterate on code in your IDE and test changes in
Databricks. Be sure to add unit tests (at the time of writing, tests are under `agent_app_sample_code/tests`).
You can run unit tests via `pytest`.
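For example, assuming you run from the repo root with the dev requirements installed: `pytest agent_app_sample_code/tests`.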
