diff --git a/.gitignore b/.gitignore index e9e1e9b..e83121a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,28 @@ +# START CUSTOM IGNORES +# logs +*.log +*LOGFILE* +# output files need to be force-added +*.csv +*.png +*.jpg +*.jpeg +*.pkl +*.xlsx +*.txt +# cache +*__pycache__/ +*.pyc +# reports folder - need to be force-added +*reports/ +# scratch files and folders +*scratch* +*scratch/ +# notebooks +*notebooks/ +*.ipynb +# END CUSTOM IGNORES + # Temporary and binary files *~ *.py[cod] @@ -13,7 +38,6 @@ __pycache__/* .*.swp */.ipynb_checkpoints/* .DS_Store - # Project files .ropeproject .project @@ -22,13 +46,11 @@ __pycache__/* .idea .vscode tags - # Package files *.egg *.eggs/ .installed.cfg *.egg-info - # Unittest and coverage htmlcov/* .coverage @@ -37,7 +59,6 @@ htmlcov/* junit*.xml coverage.xml .pytest_cache/ - # Build and docs folder/files build/* dist/* @@ -47,7 +68,6 @@ docs/_rst/* docs/_build/* cover/* MANIFEST - # Per-project virtualenvs .venv*/ .conda*/ diff --git a/README.md b/README.md index ec7f292..ca9432a 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,38 @@ A continuation of the [document summarization]() space on huggingface. +## Installation + +```bash +pip install -e . +``` + +To install all the dependencies _(includes PDF OCR, gradio UI demo)_, run: + +```bash +pip install -e .[all] +``` + +## Usage + +### UI Demo + +Simply run the following command to start the UI demo: + +```bash +ts-ui +``` + +Other args to be added soon + +## Roadmap + +- [ ] add argparse CLI for UI demo +- [ ] add CLI for summarization of all text files in a directory +- [ ] API for summarization of text docs + +and other things I haven't thought of yet + --- [![Project generated with PyScaffold](https://img.shields.io/badge/-PyScaffold-005CA0?logo=pyscaffold)](https://pyscaffold.org/) diff --git a/src/textsum/app.py b/src/textsum/app.py index f012d86..57d5c36 100644 --- a/src/textsum/app.py +++ b/src/textsum/app.py @@ -1,29 +1,30 @@ -import os import contextlib import logging +import os import random import re import time from pathlib import Path +os.environ["USE_TORCH"] = "1" +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) + import gradio as gr import nltk from cleantext import clean from doctr.io import DocumentFile from doctr.models import ocr_predictor -from pdf2text import convert_PDF_to_Text -from summarize import load_model_and_tokenizer, summarize_via_tokenbatches -from utils import load_example_filenames, truncate_word_count, saves_summary +from textsum.pdf2text import convert_PDF_to_Text +from textsum.summarize import load_model_and_tokenizer, summarize_via_tokenbatches +from textsum.utils import load_example_filenames, saves_summary, truncate_word_count _here = Path(__file__).parent nltk.download("stopwords") # TODO=find where this requirement originates from -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) - def proc_submission( input_text: str, @@ -51,7 +52,17 @@ def proc_submission( Returns: str in HTML format, string of the summary, str of score """ + global model, tokenizer, model_sm, tokenizer_sm + # assert that the model is loaded and accessible + if "model" not in globals(): + model, tokenizer = load_model_and_tokenizer( + "pszemraj/pegasus-x-large-book-summary" + ) + if "model_sm" not in globals(): + model_sm, tokenizer_sm = load_model_and_tokenizer( + "pszemraj/long-t5-tglobal-base-16384-book-summary" + ) settings = { "length_penalty": float(length_penalty), "repetition_penalty": float(repetition_penalty), @@ -102,8 +113,8 @@ def proc_submission( _summaries = summarize_via_tokenbatches( tr_in, - model_sm if "base" in model_size.lower() else model, - tokenizer_sm if "base" in model_size.lower() else tokenizer, + model_sm if model_size == "LongT5-base" else model, + tokenizer_sm if model_size == "LongT5-base" else tokenizer, batch_length=token_batch_length, **settings, ) @@ -131,37 +142,6 @@ def proc_submission( return html, sum_text_out, scores_out, saved_file -def load_single_example_text( - example_path: str or Path, - max_pages=20, -): - """ - load_single_example - a helper function for the gradio module to load examples - Returns: - list of str, the examples - """ - global name_to_path - full_ex_path = name_to_path[example_path] - full_ex_path = Path(full_ex_path) - if full_ex_path.suffix == ".txt": - with open(full_ex_path, "r", encoding="utf-8", errors="ignore") as f: - raw_text = f.read() - text = clean(raw_text, lower=False) - elif full_ex_path.suffix == ".pdf": - logging.info(f"Loading PDF file {full_ex_path}") - conversion_stats = convert_PDF_to_Text( - full_ex_path, - ocr_model=ocr_model, - max_pages=max_pages, - ) - text = conversion_stats["converted_text"] - else: - logging.error(f"Unknown file type {full_ex_path.suffix}") - text = "ERROR - check example path" - - return text - - def load_uploaded_file(file_obj, max_pages=20): """ load_uploaded_file - process an uploaded file @@ -215,6 +195,7 @@ def main(): model_sm, tokenizer_sm = load_model_and_tokenizer( "pszemraj/long-t5-tglobal-base-16384-book-summary" ) + # ensure that the models are global variables logging.info("Loading OCR model") with contextlib.redirect_stdout(None): @@ -224,10 +205,7 @@ def main(): pretrained=True, assume_straight_pages=True, ) - name_to_path = load_example_filenames(_here / "examples") - logging.info(f"Loaded {len(name_to_path)} examples") demo = gr.Blocks() - _examples = list(name_to_path.keys()) with demo: gr.Markdown("# Document Summarization with Long-Document Transformers") @@ -254,11 +232,7 @@ def main(): value=2, ) with gr.Column(variant="compact"): - example_name = gr.Dropdown( - _examples, - label="Examples", - value=random.choice(_examples), - ) + uploaded_file = gr.File( label="File Upload", file_count="single", @@ -271,9 +245,7 @@ def main(): placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)", ) with gr.Column(min_width=100, scale=0.5): - load_examples_button = gr.Button( - "Load Example", - ) + load_file_button = gr.Button("Upload File") with gr.Column(): @@ -342,10 +314,6 @@ def main(): ) gr.Markdown("---") - load_examples_button.click( - fn=load_single_example_text, inputs=[example_name], outputs=[input_text] - ) - load_file_button.click( fn=load_uploaded_file, inputs=uploaded_file, outputs=[input_text] ) diff --git a/src/textsum/utils.py b/src/textsum/utils.py index 6cb12fa..e7c2fe2 100644 --- a/src/textsum/utils.py +++ b/src/textsum/utils.py @@ -2,11 +2,27 @@ utils.py - Utility functions for the project. """ +import logging import re -from pathlib import Path +import subprocess from datetime import datetime +from pathlib import Path + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(message)s", + datefmt="%m/%d/%Y %I:%M:%S", +) from natsort import natsorted -import subprocess + +# ------------------------- # + +TEXT_EXAMPLE_URLS = { + "whisper_lecture": "https://pastebin.com/raw/X9PEgS2w", + "hf_blog_clip": "https://pastebin.com/raw/1RMg1Naz", +} + +# ------------------------- # def get_timestamp() -> str: @@ -41,7 +57,7 @@ def truncate_word_count(text, max_words=512): return processed -def load_examples(src, filetypes=[".txt", ".pdf"]): +def load_pdf_examples(src, filetypes=[".txt", ".pdf"]): """ load_examples - a helper function for the gradio module to load examples Returns: @@ -66,15 +82,42 @@ def load_examples(src, filetypes=[".txt", ".pdf"]): return text_examples -def load_example_filenames(example_path: str or Path): +def load_text_examples( + urls: dict = TEXT_EXAMPLE_URLS, target_dir: str or Path = None +) -> Path: """ - load_example_filenames - a helper function for the gradio module to load examples - Returns: - dict, the examples (filename:full path) + load_text_examples - load the text examples from the web to a directory + + :param dict urls: the urls to the text examples, defaults to TEXT_EXAMPLE_URLS + :param strorPath target_dir: the path to the target directory, defaults to the current working directory + :return Path: the path to the directory containing the text examples + """ + target_dir = Path.cwd() if target_dir is None else Path(target_dir) + target_dir.mkdir(exist_ok=True) + + for name, url in urls.items(): # download the examples + subprocess.run(["wget", url, "-O", target_dir / f"{name}.txt"]) + + return target_dir + + +def load_example_filenames(example_path: str or Path, ext: list = [".txt", ".md"]): + """ + load_example_filenames - load the example filenames from a directory + + :param strorPath example_path: the path to the examples directory + :param list ext: the file extensions to load (default: [".txt", ".md"]) + :return dict: the example filenames """ example_path = Path(example_path) + if not example_path.exists(): + # download the examples + logging.info("Downloading the examples...") + example_path = load_text_examples(target_dir=example_path) + # load the examples into a list - examples = {f.name: f for f in example_path.glob("*.txt")} + examples = {f.name: f.resolve() for f in example_path.glob("*") if f.suffix in ext} + logging.info(f"Loaded {len(examples)} examples from {example_path}") return examples