
Commit

min working example (#1)
* πŸ“ basics

Signed-off-by: peter szemraj <[email protected]>

* πŸ› πŸ‘½οΈ fix doctr and update to package

Signed-off-by: peter szemraj <[email protected]>

* 🔊 improve logs

Signed-off-by: peter szemraj <[email protected]>

* 🙈 add custom ignores

Signed-off-by: peter szemraj <[email protected]>

* ✨ πŸ‘½οΈ load example inputs from url

Signed-off-by: peter szemraj <[email protected]>

* 💩 🥅 MWE app integration w blocks

Signed-off-by: peter szemraj <[email protected]>

Signed-off-by: peter szemraj <[email protected]>
pszemraj authored Dec 20, 2022
1 parent adc093d commit 0beb6c5
Showing 4 changed files with 132 additions and 69 deletions.
30 changes: 25 additions & 5 deletions .gitignore
@@ -1,3 +1,28 @@
# START CUSTOM IGNORES
# logs
*.log
*LOGFILE*
# output files need to be force-added
*.csv
*.png
*.jpg
*.jpeg
*.pkl
*.xlsx
*.txt
# cache
*__pycache__/
*.pyc
# reports folder - need to be force-added
*reports/
# scratch files and folders
*scratch*
*scratch/
# notebooks
*notebooks/
*.ipynb
# END CUSTOM IGNORES

# Temporary and binary files
*~
*.py[cod]
@@ -13,7 +38,6 @@ __pycache__/*
.*.swp
*/.ipynb_checkpoints/*
.DS_Store

# Project files
.ropeproject
.project
@@ -22,13 +46,11 @@ __pycache__/*
.idea
.vscode
tags

# Package files
*.egg
*.eggs/
.installed.cfg
*.egg-info

# Unittest and coverage
htmlcov/*
.coverage
@@ -37,7 +59,6 @@ htmlcov/*
junit*.xml
coverage.xml
.pytest_cache/

# Build and docs folder/files
build/*
dist/*
@@ -47,7 +68,6 @@ docs/_rst/*
docs/_build/*
cover/*
MANIFEST

# Per-project virtualenvs
.venv*/
.conda*/
32 changes: 32 additions & 0 deletions README.md
@@ -16,6 +32,38 @@
A continuation of the [document summarization](<https://huggingface.co/spaces/pszemraj/document-summarization>) space on huggingface.

## Installation

```bash
pip install -e .
```

To install all of the dependencies _(including PDF OCR and the Gradio UI demo)_, run:

```bash
pip install -e .[all]
```

## Usage

### UI Demo

Simply run the following command to start the UI demo:

```bash
ts-ui
```

Additional command-line arguments will be added soon.

## Roadmap

- [ ] add argparse CLI for UI demo
- [ ] add CLI for summarization of all text files in a directory _(see the sketch below)_
- [ ] API for summarization of text docs

and other things I haven't thought of yet
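
For reference, below is a very rough sketch of what the planned directory-summarization CLI could look like. It reuses `load_model_and_tokenizer` and `summarize_via_tokenbatches` from `textsum.summarize` the same way `app.py` does; the command name, flags, defaults, and output handling are illustrative assumptions and are not part of this commit.

```python
# Hypothetical sketch only -- nothing below exists in the package yet.
import argparse
from pathlib import Path

from textsum.summarize import load_model_and_tokenizer, summarize_via_tokenbatches


def main():
    parser = argparse.ArgumentParser(
        description="Summarize every .txt file in a directory (sketch)."
    )
    parser.add_argument("input_dir", type=Path, help="directory containing .txt files")
    parser.add_argument(
        "--model",
        default="pszemraj/long-t5-tglobal-base-16384-book-summary",
        help="Hugging Face model to load (assumed default)",
    )
    parser.add_argument(
        "--batch-length", type=int, default=2048, help="token batch length"
    )
    args = parser.parse_args()

    model, tokenizer = load_model_and_tokenizer(args.model)
    for txt_file in sorted(args.input_dir.glob("*.txt")):
        text = txt_file.read_text(encoding="utf-8", errors="ignore")
        summaries = summarize_via_tokenbatches(
            text, model, tokenizer, batch_length=args.batch_length
        )
        # the exact structure returned by summarize_via_tokenbatches is not shown
        # in this diff, so it is written out verbatim as a placeholder
        out_path = txt_file.parent / f"{txt_file.stem}_summary.txt"
        out_path.write_text(str(summaries), encoding="utf-8")
        print(f"wrote {out_path}")


if __name__ == "__main__":
    main()
```

Something like this could later be exposed as a console script alongside `ts-ui`.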

---

[![Project generated with PyScaffold](https://img.shields.io/badge/-PyScaffold-005CA0?logo=pyscaffold)](https://pyscaffold.org/)
80 changes: 24 additions & 56 deletions src/textsum/app.py
@@ -1,29 +1,30 @@
import os
import contextlib
import logging
import os
import random
import re
import time
from pathlib import Path

os.environ["USE_TORCH"] = "1"
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

import gradio as gr
import nltk
from cleantext import clean
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from pdf2text import convert_PDF_to_Text

from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
from utils import load_example_filenames, truncate_word_count, saves_summary
from textsum.pdf2text import convert_PDF_to_Text
from textsum.summarize import load_model_and_tokenizer, summarize_via_tokenbatches
from textsum.utils import load_example_filenames, saves_summary, truncate_word_count

_here = Path(__file__).parent

nltk.download("stopwords") # TODO=find where this requirement originates from

logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def proc_submission(
input_text: str,
@@ -51,7 +52,17 @@ def proc_submission(
Returns:
str in HTML format, string of the summary, str of score
"""
global model, tokenizer, model_sm, tokenizer_sm
# assert that the model is loaded and accessible
if "model" not in globals():
model, tokenizer = load_model_and_tokenizer(
"pszemraj/pegasus-x-large-book-summary"
)

if "model_sm" not in globals():
model_sm, tokenizer_sm = load_model_and_tokenizer(
"pszemraj/long-t5-tglobal-base-16384-book-summary"
)
settings = {
"length_penalty": float(length_penalty),
"repetition_penalty": float(repetition_penalty),
@@ -102,8 +113,8 @@ def proc_submission(

_summaries = summarize_via_tokenbatches(
tr_in,
model_sm if "base" in model_size.lower() else model,
tokenizer_sm if "base" in model_size.lower() else tokenizer,
model_sm if model_size == "LongT5-base" else model,
tokenizer_sm if model_size == "LongT5-base" else tokenizer,
batch_length=token_batch_length,
**settings,
)
@@ -131,37 +142,6 @@ def proc_submission(
return html, sum_text_out, scores_out, saved_file


def load_single_example_text(
example_path: str or Path,
max_pages=20,
):
"""
load_single_example - a helper function for the gradio module to load examples
Returns:
list of str, the examples
"""
global name_to_path
full_ex_path = name_to_path[example_path]
full_ex_path = Path(full_ex_path)
if full_ex_path.suffix == ".txt":
with open(full_ex_path, "r", encoding="utf-8", errors="ignore") as f:
raw_text = f.read()
text = clean(raw_text, lower=False)
elif full_ex_path.suffix == ".pdf":
logging.info(f"Loading PDF file {full_ex_path}")
conversion_stats = convert_PDF_to_Text(
full_ex_path,
ocr_model=ocr_model,
max_pages=max_pages,
)
text = conversion_stats["converted_text"]
else:
logging.error(f"Unknown file type {full_ex_path.suffix}")
text = "ERROR - check example path"

return text


def load_uploaded_file(file_obj, max_pages=20):
"""
load_uploaded_file - process an uploaded file
@@ -215,6 +195,7 @@ def main():
model_sm, tokenizer_sm = load_model_and_tokenizer(
"pszemraj/long-t5-tglobal-base-16384-book-summary"
)
# ensure that the models are global variables

logging.info("Loading OCR model")
with contextlib.redirect_stdout(None):
@@ -224,10 +205,7 @@ def main():
pretrained=True,
assume_straight_pages=True,
)
name_to_path = load_example_filenames(_here / "examples")
logging.info(f"Loaded {len(name_to_path)} examples")
demo = gr.Blocks()
_examples = list(name_to_path.keys())
with demo:

gr.Markdown("# Document Summarization with Long-Document Transformers")
@@ -254,11 +232,7 @@
value=2,
)
with gr.Column(variant="compact"):
example_name = gr.Dropdown(
_examples,
label="Examples",
value=random.choice(_examples),
)

uploaded_file = gr.File(
label="File Upload",
file_count="single",
@@ -271,9 +245,7 @@
placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
)
with gr.Column(min_width=100, scale=0.5):
load_examples_button = gr.Button(
"Load Example",
)

load_file_button = gr.Button("Upload File")

with gr.Column():
@@ -342,10 +314,6 @@ def main():
)
gr.Markdown("---")

load_examples_button.click(
fn=load_single_example_text, inputs=[example_name], outputs=[input_text]
)

load_file_button.click(
fn=load_uploaded_file, inputs=uploaded_file, outputs=[input_text]
)
59 changes: 51 additions & 8 deletions src/textsum/utils.py
@@ -2,11 +27,27 @@
utils.py - Utility functions for the project.
"""

import logging
import re
from pathlib import Path
import subprocess
from datetime import datetime
from pathlib import Path

logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
datefmt="%m/%d/%Y %I:%M:%S",
)
from natsort import natsorted
import subprocess

# ------------------------- #

TEXT_EXAMPLE_URLS = {
"whisper_lecture": "https://pastebin.com/raw/X9PEgS2w",
"hf_blog_clip": "https://pastebin.com/raw/1RMg1Naz",
}

# ------------------------- #


def get_timestamp() -> str:
@@ -41,7 +57,7 @@ def truncate_word_count(text, max_words=512):
return processed


def load_examples(src, filetypes=[".txt", ".pdf"]):
def load_pdf_examples(src, filetypes=[".txt", ".pdf"]):
"""
load_examples - a helper function for the gradio module to load examples
Returns:
@@ -66,15 +82,42 @@ def load_examples(src, filetypes=[".txt", ".pdf"]):
return text_examples


def load_example_filenames(example_path: str or Path):
def load_text_examples(
urls: dict = TEXT_EXAMPLE_URLS, target_dir: str or Path = None
) -> Path:
"""
load_example_filenames - a helper function for the gradio module to load examples
Returns:
dict, the examples (filename:full path)
load_text_examples - load the text examples from the web to a directory
:param dict urls: the urls to the text examples, defaults to TEXT_EXAMPLE_URLS
:param strorPath target_dir: the path to the target directory, defaults to the current working directory
:return Path: the path to the directory containing the text examples
"""
target_dir = Path.cwd() if target_dir is None else Path(target_dir)
target_dir.mkdir(exist_ok=True)

for name, url in urls.items(): # download the examples
subprocess.run(["wget", url, "-O", target_dir / f"{name}.txt"])

return target_dir


def load_example_filenames(example_path: str or Path, ext: list = [".txt", ".md"]):
"""
load_example_filenames - load the example filenames from a directory
:param strorPath example_path: the path to the examples directory
:param list ext: the file extensions to load (default: [".txt", ".md"])
:return dict: the example filenames
"""
example_path = Path(example_path)
if not example_path.exists():
# download the examples
logging.info("Downloading the examples...")
example_path = load_text_examples(target_dir=example_path)

# load the examples into a list
examples = {f.name: f for f in example_path.glob("*.txt")}
examples = {f.name: f.resolve() for f in example_path.glob("*") if f.suffix in ext}
logging.info(f"Loaded {len(examples)} examples from {example_path}")
return examples


Expand Down

