
Commit

min working example (#1)
* πŸ“ basics

Signed-off-by: peter szemraj <[email protected]>

* πŸ› πŸ‘½οΈ fix doctr and update to package

Signed-off-by: peter szemraj <[email protected]>

* 🔊 improve logs

Signed-off-by: peter szemraj <[email protected]>

* 🙈 add custom ignores

Signed-off-by: peter szemraj <[email protected]>

* ✨ πŸ‘½οΈ load example inputs from url

Signed-off-by: peter szemraj <[email protected]>

* 💩 🥅 MWE app integration w blocks

Signed-off-by: peter szemraj <[email protected]>

Signed-off-by: peter szemraj <[email protected]>
pszemraj authored Dec 20, 2022
1 parent adc093d commit 0beb6c5
Showing 4 changed files with 132 additions and 69 deletions.
30 changes: 25 additions & 5 deletions .gitignore
@@ -1,3 +1,28 @@
# START CUSTOM IGNORES
# logs
*.log
*LOGFILE*
# output files need to be force-added
*.csv
*.png
*.jpg
*.jpeg
*.pkl
*.xlsx
*.txt
# cache
*__pycache__/
*.pyc
# reports folder - need to be force-added
*reports/
# scratch files and folders
*scratch*
*scratch/
# notebooks
*notebooks/
*.ipynb
# END CUSTOM IGNORES

# Temporary and binary files
*~
*.py[cod]
@@ -13,7 +38,6 @@ __pycache__/*
.*.swp
*/.ipynb_checkpoints/*
.DS_Store

# Project files
.ropeproject
.project
@@ -22,13 +46,11 @@ __pycache__/*
.idea
.vscode
tags

# Package files
*.egg
*.eggs/
.installed.cfg
*.egg-info

# Unittest and coverage
htmlcov/*
.coverage
@@ -37,7 +59,6 @@ htmlcov/*
junit*.xml
coverage.xml
.pytest_cache/

# Build and docs folder/files
build/*
dist/*
@@ -47,7 +68,6 @@ docs/_rst/*
docs/_build/*
cover/*
MANIFEST

# Per-project virtualenvs
.venv*/
.conda*/
32 changes: 32 additions & 0 deletions README.md
@@ -16,6 +32,38 @@
A continuation of the [document summarization](<https://huggingface.co/spaces/pszemraj/document-summarization>) space on huggingface.

## Installation

```bash
pip install -e .
```

To install all of the dependencies _(including PDF OCR and the Gradio UI demo)_, run:

```bash
pip install -e .[all]
```

## Usage

### UI Demo

Simply run the following command to start the UI demo:

```bash
ts-ui
```

Additional command-line arguments will be added soon.

## Roadmap

- [ ] add argparse CLI for UI demo
- [ ] add CLI for summarization of all text files in a directory _(see the sketch below)_
- [ ] API for summarization of text docs

and other things I haven't thought of yet
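
For reference, below is a very rough sketch of what the planned directory-summarization CLI could look like. It reuses `load_model_and_tokenizer` and `summarize_via_tokenbatches` from `textsum.summarize` the same way `app.py` does; the command name, flags, defaults, and output handling are illustrative assumptions and are not part of this commit.

```python
# Hypothetical sketch only -- nothing below exists in the package yet.
import argparse
from pathlib import Path

from textsum.summarize import load_model_and_tokenizer, summarize_via_tokenbatches


def main():
    parser = argparse.ArgumentParser(
        description="Summarize every .txt file in a directory (sketch)."
    )
    parser.add_argument("input_dir", type=Path, help="directory containing .txt files")
    parser.add_argument(
        "--model",
        default="pszemraj/long-t5-tglobal-base-16384-book-summary",
        help="Hugging Face model to load (assumed default)",
    )
    parser.add_argument(
        "--batch-length", type=int, default=2048, help="token batch length"
    )
    args = parser.parse_args()

    model, tokenizer = load_model_and_tokenizer(args.model)
    for txt_file in sorted(args.input_dir.glob("*.txt")):
        text = txt_file.read_text(encoding="utf-8", errors="ignore")
        summaries = summarize_via_tokenbatches(
            text, model, tokenizer, batch_length=args.batch_length
        )
        # the exact structure returned by summarize_via_tokenbatches is not shown
        # in this diff, so it is written out verbatim as a placeholder
        out_path = txt_file.parent / f"{txt_file.stem}_summary.txt"
        out_path.write_text(str(summaries), encoding="utf-8")
        print(f"wrote {out_path}")


if __name__ == "__main__":
    main()
```

Something like this could later be exposed as a console script alongside `ts-ui`.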

---

[![Project generated with PyScaffold](https://img.shields.io/badge/-PyScaffold-005CA0?logo=pyscaffold)](https://pyscaffold.org/)
80 changes: 24 additions & 56 deletions src/textsum/app.py
@@ -1,29 +1,30 @@
import os
import contextlib
import logging
import os
import random
import re
import time
from pathlib import Path

os.environ["USE_TORCH"] = "1"
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

import gradio as gr
import nltk
from cleantext import clean
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from pdf2text import convert_PDF_to_Text

from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
from utils import load_example_filenames, truncate_word_count, saves_summary
from textsum.pdf2text import convert_PDF_to_Text
from textsum.summarize import load_model_and_tokenizer, summarize_via_tokenbatches
from textsum.utils import load_example_filenames, saves_summary, truncate_word_count

_here = Path(__file__).parent

nltk.download("stopwords") # TODO=find where this requirement originates from

logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def proc_submission(
input_text: str,
@@ -51,7 +52,17 @@ def proc_submission(
Returns:
str in HTML format, string of the summary, str of score
"""
global model, tokenizer, model_sm, tokenizer_sm
# assert that the model is loaded and accessible
if "model" not in globals():
model, tokenizer = load_model_and_tokenizer(
"pszemraj/pegasus-x-large-book-summary"
)

if "model_sm" not in globals():
model_sm, tokenizer_sm = load_model_and_tokenizer(
"pszemraj/long-t5-tglobal-base-16384-book-summary"
)
settings = {
"length_penalty": float(length_penalty),
"repetition_penalty": float(repetition_penalty),
@@ -102,8 +113,8 @@ def proc_submission(

_summaries = summarize_via_tokenbatches(
tr_in,
model_sm if "base" in model_size.lower() else model,
tokenizer_sm if "base" in model_size.lower() else tokenizer,
model_sm if model_size == "LongT5-base" else model,
tokenizer_sm if model_size == "LongT5-base" else tokenizer,
batch_length=token_batch_length,
**settings,
)
@@ -131,37 +142,6 @@ def proc_submission(
return html, sum_text_out, scores_out, saved_file


def load_single_example_text(
example_path: str or Path,
max_pages=20,
):
"""
load_single_example - a helper function for the gradio module to load examples
Returns:
list of str, the examples
"""
global name_to_path
full_ex_path = name_to_path[example_path]
full_ex_path = Path(full_ex_path)
if full_ex_path.suffix == ".txt":
with open(full_ex_path, "r", encoding="utf-8", errors="ignore") as f:
raw_text = f.read()
text = clean(raw_text, lower=False)
elif full_ex_path.suffix == ".pdf":
logging.info(f"Loading PDF file {full_ex_path}")
conversion_stats = convert_PDF_to_Text(
full_ex_path,
ocr_model=ocr_model,
max_pages=max_pages,
)
text = conversion_stats["converted_text"]
else:
logging.error(f"Unknown file type {full_ex_path.suffix}")
text = "ERROR - check example path"

return text


def load_uploaded_file(file_obj, max_pages=20):
"""
load_uploaded_file - process an uploaded file
@@ -215,6 +195,7 @@ def main():
model_sm, tokenizer_sm = load_model_and_tokenizer(
"pszemraj/long-t5-tglobal-base-16384-book-summary"
)
# ensure that the models are global variables

logging.info("Loading OCR model")
with contextlib.redirect_stdout(None):
@@ -224,10 +205,7 @@ def main():
pretrained=True,
assume_straight_pages=True,
)
name_to_path = load_example_filenames(_here / "examples")
logging.info(f"Loaded {len(name_to_path)} examples")
demo = gr.Blocks()
_examples = list(name_to_path.keys())
with demo:

gr.Markdown("# Document Summarization with Long-Document Transformers")
@@ -254,11 +232,7 @@
value=2,
)
with gr.Column(variant="compact"):
example_name = gr.Dropdown(
_examples,
label="Examples",
value=random.choice(_examples),
)

uploaded_file = gr.File(
label="File Upload",
file_count="single",
@@ -271,9 +245,7 @@
placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
)
with gr.Column(min_width=100, scale=0.5):
load_examples_button = gr.Button(
"Load Example",
)

load_file_button = gr.Button("Upload File")

with gr.Column():
@@ -342,10 +314,6 @@ def main():
)
gr.Markdown("---")

load_examples_button.click(
fn=load_single_example_text, inputs=[example_name], outputs=[input_text]
)

load_file_button.click(
fn=load_uploaded_file, inputs=uploaded_file, outputs=[input_text]
)
59 changes: 51 additions & 8 deletions src/textsum/utils.py
@@ -2,11 +27,27 @@
utils.py - Utility functions for the project.
"""

import logging
import re
from pathlib import Path
import subprocess
from datetime import datetime
from pathlib import Path

logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
datefmt="%m/%d/%Y %I:%M:%S",
)
from natsort import natsorted
import subprocess

# ------------------------- #

TEXT_EXAMPLE_URLS = {
"whisper_lecture": "https://pastebin.com/raw/X9PEgS2w",
"hf_blog_clip": "https://pastebin.com/raw/1RMg1Naz",
}

# ------------------------- #


def get_timestamp() -> str:
@@ -41,7 +57,7 @@ def truncate_word_count(text, max_words=512):
return processed


def load_examples(src, filetypes=[".txt", ".pdf"]):
def load_pdf_examples(src, filetypes=[".txt", ".pdf"]):
"""
load_examples - a helper function for the gradio module to load examples
Returns:
@@ -66,15 +82,42 @@ def load_examples(src, filetypes=[".txt", ".pdf"]):
return text_examples


def load_example_filenames(example_path: str or Path):
def load_text_examples(
urls: dict = TEXT_EXAMPLE_URLS, target_dir: str or Path = None
) -> Path:
"""
load_example_filenames - a helper function for the gradio module to load examples
Returns:
dict, the examples (filename:full path)
load_text_examples - load the text examples from the web to a directory
:param dict urls: the urls to the text examples, defaults to TEXT_EXAMPLE_URLS
:param strorPath target_dir: the path to the target directory, defaults to the current working directory
:return Path: the path to the directory containing the text examples
"""
target_dir = Path.cwd() if target_dir is None else Path(target_dir)
target_dir.mkdir(exist_ok=True)

for name, url in urls.items(): # download the examples
subprocess.run(["wget", url, "-O", target_dir / f"{name}.txt"])

return target_dir


def load_example_filenames(example_path: str or Path, ext: list = [".txt", ".md"]):
"""
load_example_filenames - load the example filenames from a directory
:param strorPath example_path: the path to the examples directory
:param list ext: the file extensions to load (default: [".txt", ".md"])
:return dict: the example filenames
"""
example_path = Path(example_path)
if not example_path.exists():
# download the examples
logging.info("Downloading the examples...")
example_path = load_text_examples(target_dir=example_path)

# load the examples into a list
examples = {f.name: f for f in example_path.glob("*.txt")}
examples = {f.name: f.resolve() for f in example_path.glob("*") if f.suffix in ext}
logging.info(f"Loaded {len(examples)} examples from {example_path}")
return examples


Expand Down

