Commit

cir-reports works
tomlue committed Dec 12, 2024
1 parent 73d734f commit 1bd01bb
Showing 7 changed files with 82 additions and 9 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -3,4 +3,7 @@ logs
/list
/brick

__pycache__*
__pycache__*
.env
log/*
cache/*
2 changes: 2 additions & 0 deletions cache/.gitignore
@@ -0,0 +1,2 @@
/ingredient_page_links.json
/all_pdf_links.json
50 changes: 49 additions & 1 deletion dvc.lock
@@ -1,2 +1,50 @@
schema: '2.0'
stages: {}
stages:
  get_ingredient_links:
    cmd: python stages/01_get_ingredient_links.py
    deps:
    - path: stages/01_get_ingredient_links.py
      hash: md5
      md5: 419ee6fb2fbec591e466a1a58cd02af7
      size: 2527
      isexec: true
    outs:
    - path: cache/ingredient_page_links.json
      hash: md5
      md5: 8871c26dfaf39cdb150ba2a1dd8f3c0b
      size: 223951
  get_pdf_links:
    cmd: python stages/02_get_pdf_links.py
    deps:
    - path: cache/ingredient_page_links.json
      hash: md5
      md5: 8871c26dfaf39cdb150ba2a1dd8f3c0b
      size: 223951
    - path: stages/02_get_pdf_links.py
      hash: md5
      md5: a9f8b66eec9c4dba1b85aa531d19b651
      size: 2215
      isexec: true
    outs:
    - path: cache/all_pdf_links.json
      hash: md5
      md5: b0f1fc13abfa2e6d2e15d4ae34f58cc7
      size: 34390
  download_pdfs:
    cmd: python stages/03_download_pdfs.py
    deps:
    - path: cache/all_pdf_links.json
      hash: md5
      md5: b0f1fc13abfa2e6d2e15d4ae34f58cc7
      size: 34390
    - path: stages/03_download_pdfs.py
      hash: md5
      md5: b968f299304438b687e2d51c329cf5ab
      size: 775
      isexec: true
    outs:
    - path: brick/cir_reports.pdf
      hash: md5
      md5: 480ab5f19049da242446fff3044bb088.dir
      size: 395549978
      nfiles: 364
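
dvc.lock pins an MD5 hash (plus size, and an exec flag for scripts) for every dependency and output of each stage, so a later dvc repro can skip stages whose inputs have not changed. A minimal sketch of the kind of content hash recorded in the md5 fields above (illustrative only, not DVC's internal implementation):

import hashlib
import pathlib

# Illustrative only: hash a file's contents the way the md5 fields above
# fingerprint each dependency and output. This is not DVC's own code.
def md5_of_file(path, chunk_size=1 << 20):
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

# Example: md5_of_file(pathlib.Path('stages/01_get_ingredient_links.py')) should
# return 419ee6fb2fbec591e466a1a58cd02af7 for the version pinned in this commit.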
2 changes: 1 addition & 1 deletion dvc.yaml
@@ -20,4 +20,4 @@ stages:
    - stages/03_download_pdfs.py
    - cache/all_pdf_links.json
    outs:
    - brick/cir_reports_pdf
    - brick/cir_reports.pdf
16 changes: 11 additions & 5 deletions stages/02_get_pdf_links.py
@@ -5,13 +5,15 @@
import time
import json
import pathlib
import logging
import requests

from utils.simple_cache import simple_cache
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential

cachedir = pathlib.Path('./cache')
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(filename='log/get_pdf_links.log', level=logging.INFO, format=format)

# GET PDF LINKS ================================================================
ingredient_page_links = json.load(open(cachedir / 'ingredient_page_links.json'))
@@ -24,7 +26,6 @@
@simple_cache(simple_cache_dir.as_posix(), expiry_seconds=60*60*48)
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=15))
def download_pdfs_from_ingredient_page(ingredient_page_link):
    # response = scraperapi.scrape(ingredient_page_link, ultra_premium=True)
    response = requests.get(ingredient_page_link)
    response.raise_for_status()
    time.sleep(0.5)
@@ -37,17 +38,22 @@ def download_pdfs_from_ingredient_page(ingredient_page_link):
for ingredient_page_link in tqdm(ingredient_page_links):
    try:
        pdf_links = download_pdfs_from_ingredient_page(ingredient_page_link)
        if len(pdf_links) == 0:
            logging.info(f"No pdf links found for {ingredient_page_link}")
            continue
        all_pdf_links.extend(pdf_links)
    except Exception as e:
        print(f"Error downloading {ingredient_page_link}: {e}")
        continue

# write the deduplicated pdf links to cache/all_pdf_links.json
all_pdf_links = list(set(all_pdf_links))
json.dump(all_pdf_links, open(cachedir / 'all_pdf_links.json', 'w'))
unique_pdf_links = list(set(all_pdf_links))
logging.info(f"Total pdf links found: {len(all_pdf_links)}")
logging.info(f"Unique pdf links found: {len(unique_pdf_links)}")
json.dump(unique_pdf_links, open(cachedir / 'all_pdf_links.json', 'w'))

# TEST RESULT =====================================================================
res = json.load(open(cachedir / 'all_pdf_links.json'))
assert len(res) > 1000
assert len(res) == len(set(res))
assert len(res) == len(all_pdf_links)
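
The page fetch above is wrapped in the simple_cache decorator (48-hour expiry) on top of tenacity retries, so reruns within the window replay cached responses instead of re-requesting them. utils/simple_cache.py is not part of this diff; the following is a hypothetical sketch of what such a decorator could look like, assuming it keys entries on the function arguments and stores JSON files under the given directory:

# Hypothetical sketch of a simple_cache decorator like the one imported from
# utils.simple_cache; the real implementation is not shown in this commit.
import functools, hashlib, json, pathlib, time

def simple_cache(cache_dir, expiry_seconds=60 * 60 * 48):
    cache_path = pathlib.Path(cache_dir)
    cache_path.mkdir(parents=True, exist_ok=True)

    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            # Key the cache file on the function name and its arguments.
            key = hashlib.md5(repr((fn.__name__, args, kwargs)).encode()).hexdigest()
            entry = cache_path / f"{key}.json"
            if entry.exists() and time.time() - entry.stat().st_mtime < expiry_seconds:
                return json.load(open(entry))
            result = fn(*args, **kwargs)
            json.dump(result, open(entry, 'w'))
            return result
        return wrapper
    return decorator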

2 changes: 1 addition & 1 deletion stages/03_download_pdfs.py
@@ -6,7 +6,7 @@
from tqdm import tqdm

cachedir = pathlib.Path('./cache')
brickdir = pathlib.Path('./brick') / 'cir_reports_pdf'
brickdir = pathlib.Path('./brick') / 'cir_reports.pdf'
brickdir.mkdir(parents=True, exist_ok=True)

# DOWNLOAD PDFS ================================================================
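Only the renamed output directory is visible here; the actual body of stages/03_download_pdfs.py is collapsed in this diff. A hypothetical sketch of what such a download step could look like, assuming it fetches each link from cache/all_pdf_links.json into brickdir (not the actual script):

# Hypothetical sketch; not the committed stages/03_download_pdfs.py.
import json
import pathlib
import requests
from tqdm import tqdm

cachedir = pathlib.Path('./cache')
brickdir = pathlib.Path('./brick') / 'cir_reports.pdf'
brickdir.mkdir(parents=True, exist_ok=True)

pdf_links = json.load(open(cachedir / 'all_pdf_links.json'))
for link in tqdm(pdf_links):
    filename = brickdir / link.split('/')[-1]
    if filename.exists():
        continue  # skip files already downloaded
    response = requests.get(link)
    response.raise_for_status()
    filename.write_bytes(response.content)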
14 changes: 14 additions & 0 deletions stages/utils/scraperapi.py
@@ -0,0 +1,14 @@
import os, requests, re, sqlite3, boto3, json, pathlib, dotenv

dotenv.load_dotenv()
scraperapi_key = os.getenv('SCRAPER_API')

def scrape(scrape_url, autoparse=False, binary=False, ultra_premium=False):
    params = {
        'api_key': scraperapi_key,
        'url': scrape_url,
        'autoparse': autoparse,
        'binary_target': binary,
        'ultra_premium': ultra_premium
    }
    return requests.get('http://api.scraperapi.com', params=params)
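
scrape() returns the raw requests.Response from ScraperAPI's proxy endpoint and expects a SCRAPER_API key in .env (hence the new .env entry in .gitignore). A short usage sketch in the spirit of the commented-out call in 02_get_pdf_links.py; the import path is an assumption:

from utils import scraperapi  # import path assumed to match the scraperapi.scrape(...) call in stage 02

ingredient_page_link = '...'  # placeholder: one of the links from cache/ingredient_page_links.json
response = scraperapi.scrape(ingredient_page_link, ultra_premium=True)
response.raise_for_status()
html = response.text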
