Commit

cir-reports works
tomlue committed Dec 12, 2024
1 parent 73d734f commit 1bd01bb
Showing 7 changed files with 82 additions and 9 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -3,4 +3,7 @@ logs
/list
/brick

__pycache__*
__pycache__*
.env
log/*
cache/*
2 changes: 2 additions & 0 deletions cache/.gitignore
@@ -0,0 +1,2 @@
/ingredient_page_links.json
/all_pdf_links.json
50 changes: 49 additions & 1 deletion dvc.lock
@@ -1,2 +1,50 @@
schema: '2.0'
stages: {}
stages:
  get_ingredient_links:
    cmd: python stages/01_get_ingredient_links.py
    deps:
    - path: stages/01_get_ingredient_links.py
      hash: md5
      md5: 419ee6fb2fbec591e466a1a58cd02af7
      size: 2527
      isexec: true
    outs:
    - path: cache/ingredient_page_links.json
      hash: md5
      md5: 8871c26dfaf39cdb150ba2a1dd8f3c0b
      size: 223951
  get_pdf_links:
    cmd: python stages/02_get_pdf_links.py
    deps:
    - path: cache/ingredient_page_links.json
      hash: md5
      md5: 8871c26dfaf39cdb150ba2a1dd8f3c0b
      size: 223951
    - path: stages/02_get_pdf_links.py
      hash: md5
      md5: a9f8b66eec9c4dba1b85aa531d19b651
      size: 2215
      isexec: true
    outs:
    - path: cache/all_pdf_links.json
      hash: md5
      md5: b0f1fc13abfa2e6d2e15d4ae34f58cc7
      size: 34390
  download_pdfs:
    cmd: python stages/03_download_pdfs.py
    deps:
    - path: cache/all_pdf_links.json
      hash: md5
      md5: b0f1fc13abfa2e6d2e15d4ae34f58cc7
      size: 34390
    - path: stages/03_download_pdfs.py
      hash: md5
      md5: b968f299304438b687e2d51c329cf5ab
      size: 775
      isexec: true
    outs:
    - path: brick/cir_reports.pdf
      hash: md5
      md5: 480ab5f19049da242446fff3044bb088.dir
      size: 395549978
      nfiles: 364
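
dvc.lock pins an MD5 hash (plus size, and an exec flag for scripts) for every dependency and output of each stage, so a later dvc repro can skip stages whose inputs have not changed. A minimal sketch of the kind of content hash recorded in the md5 fields above (illustrative only, not DVC's internal implementation):

import hashlib
import pathlib

# Illustrative only: hash a file's contents the way the md5 fields above
# fingerprint each dependency and output. This is not DVC's own code.
def md5_of_file(path, chunk_size=1 << 20):
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

# Example: md5_of_file(pathlib.Path('stages/01_get_ingredient_links.py')) should
# return 419ee6fb2fbec591e466a1a58cd02af7 for the version pinned in this commit.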
2 changes: 1 addition & 1 deletion dvc.yaml
@@ -20,4 +20,4 @@ stages:
    - stages/03_download_pdfs.py
    - cache/all_pdf_links.json
    outs:
    - brick/cir_reports_pdf
    - brick/cir_reports.pdf
16 changes: 11 additions & 5 deletions stages/02_get_pdf_links.py
@@ -5,13 +5,15 @@
import time
import json
import pathlib
import logging
import requests

from utils.simple_cache import simple_cache
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential

cachedir = pathlib.Path('./cache')
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(filename='log/get_pdf_links.log', level=logging.INFO, format=format)

# GET PDF LINKS ================================================================
ingredient_page_links = json.load(open(cachedir / 'ingredient_page_links.json'))
@@ -24,7 +26,6 @@
@simple_cache(simple_cache_dir.as_posix(), expiry_seconds=60*60*48)
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=15))
def download_pdfs_from_ingredient_page(ingredient_page_link):
    # response = scraperapi.scrape(ingredient_page_link, ultra_premium=True)
    response = requests.get(ingredient_page_link)
    response.raise_for_status()
    time.sleep(0.5)
@@ -37,17 +38,22 @@ def download_pdfs_from_ingredient_page(ingredient_page_link):
for ingredient_page_link in tqdm(ingredient_page_links):
    try:
        pdf_links = download_pdfs_from_ingredient_page(ingredient_page_link)
        if len(pdf_links) == 0:
            logging.info(f"No pdf links found for {ingredient_page_link}")
            continue
        all_pdf_links.extend(pdf_links)
    except Exception as e:
        print(f"Error downloading {ingredient_page_link}: {e}")
        continue

# write the deduplicated pdf links to cache/all_pdf_links.json
all_pdf_links = list(set(all_pdf_links))
json.dump(all_pdf_links, open(cachedir / 'all_pdf_links.json', 'w'))
unique_pdf_links = list(set(all_pdf_links))
logging.info(f"Total pdf links found: {len(all_pdf_links)}")
logging.info(f"Unique pdf links found: {len(unique_pdf_links)}")
json.dump(unique_pdf_links, open(cachedir / 'all_pdf_links.json', 'w'))

# TEST RESULT =====================================================================
res = json.load(open(cachedir / 'all_pdf_links.json'))
assert len(res) > 1000
assert len(res) == len(set(res))
assert len(res) == len(all_pdf_links)
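
The page fetch above is wrapped in the simple_cache decorator (48-hour expiry) on top of tenacity retries, so reruns within the window replay cached responses instead of re-requesting them. utils/simple_cache.py is not part of this diff; the following is a hypothetical sketch of what such a decorator could look like, assuming it keys entries on the function arguments and stores JSON files under the given directory:

# Hypothetical sketch of a simple_cache decorator like the one imported from
# utils.simple_cache; the real implementation is not shown in this commit.
import functools, hashlib, json, pathlib, time

def simple_cache(cache_dir, expiry_seconds=60 * 60 * 48):
    cache_path = pathlib.Path(cache_dir)
    cache_path.mkdir(parents=True, exist_ok=True)

    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            # Key the cache file on the function name and its arguments.
            key = hashlib.md5(repr((fn.__name__, args, kwargs)).encode()).hexdigest()
            entry = cache_path / f"{key}.json"
            if entry.exists() and time.time() - entry.stat().st_mtime < expiry_seconds:
                return json.load(open(entry))
            result = fn(*args, **kwargs)
            json.dump(result, open(entry, 'w'))
            return result
        return wrapper
    return decorator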

2 changes: 1 addition & 1 deletion stages/03_download_pdfs.py
@@ -6,7 +6,7 @@
from tqdm import tqdm

cachedir = pathlib.Path('./cache')
brickdir = pathlib.Path('./brick') / 'cir_reports_pdf'
brickdir = pathlib.Path('./brick') / 'cir_reports.pdf'
brickdir.mkdir(parents=True, exist_ok=True)

# DOWNLOAD PDFS ================================================================
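Only the renamed output directory is visible here; the actual body of stages/03_download_pdfs.py is collapsed in this diff. A hypothetical sketch of what such a download step could look like, assuming it fetches each link from cache/all_pdf_links.json into brickdir (not the actual script):

# Hypothetical sketch; not the committed stages/03_download_pdfs.py.
import json
import pathlib
import requests
from tqdm import tqdm

cachedir = pathlib.Path('./cache')
brickdir = pathlib.Path('./brick') / 'cir_reports.pdf'
brickdir.mkdir(parents=True, exist_ok=True)

pdf_links = json.load(open(cachedir / 'all_pdf_links.json'))
for link in tqdm(pdf_links):
    filename = brickdir / link.split('/')[-1]
    if filename.exists():
        continue  # skip files already downloaded
    response = requests.get(link)
    response.raise_for_status()
    filename.write_bytes(response.content)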
14 changes: 14 additions & 0 deletions stages/utils/scraperapi.py
@@ -0,0 +1,14 @@
import os, requests, re, sqlite3, boto3, json, pathlib, dotenv

dotenv.load_dotenv()
scraperapi_key = os.getenv('SCRAPER_API')

def scrape(scrape_url, autoparse=False, binary=False, ultra_premium=False):
    params = {
        'api_key': scraperapi_key,
        'url': scrape_url,
        'autoparse': autoparse,
        'binary_target': binary,
        'ultra_premium': ultra_premium
    }
    return requests.get('http://api.scraperapi.com', params=params)
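
scrape() returns the raw requests.Response from ScraperAPI's proxy endpoint and expects a SCRAPER_API key in .env (hence the new .env entry in .gitignore). A short usage sketch in the spirit of the commented-out call in 02_get_pdf_links.py; the import path is an assumption:

from utils import scraperapi  # import path assumed to match the scraperapi.scrape(...) call in stage 02

ingredient_page_link = '...'  # placeholder: one of the links from cache/ingredient_page_links.json
response = scraperapi.scrape(ingredient_page_link, ultra_premium=True)
response.raise_for_status()
html = response.text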
