Skip to content

Commit

Permalink
EPA DERs discovered with bing
Browse files Browse the repository at this point in the history
  • Loading branch information
tomlue committed Jul 28, 2024
0 parents commit 14d4151
Show file tree
Hide file tree
Showing 12 changed files with 318 additions and 0 deletions.
57 changes: 57 additions & 0 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
FROM mcr.microsoft.com/devcontainers/python:3.9

# OS packages for headless browsing/scraping (xvfb, nss, fonts, ...) plus
# general tooling (curl, wget, unzip, awscli, python3-venv).
# --no-install-recommends keeps the image lean; the apt lists are removed in
# the same layer so they never persist in the image (hadolint DL3009/DL3015).
RUN apt-get update && apt-get install -y --no-install-recommends \
        awscli \
        ca-certificates \
        curl \
        fonts-liberation \
        gnupg \
        libappindicator1 \
        libasound2 \
        libatk-bridge2.0-0 \
        libatomic1 \
        libdpkg-perl \
        libgbm1 \
        libgconf-2-4 \
        libgtk-3-0 \
        libnss3-dev \
        libxi6 \
        libxss1 \
        python3-venv \
        unzip \
        wget \
        xvfb \
    && rm -rf /var/lib/apt/lists/*

# Install pipx (used below to install biobricks into an isolated venv).
# NOTE(review): this installs pipx for root ($HOME=/root), while the later
# `pipx install` runs as vscode; it appears to rely on the devcontainer base
# image already shipping pipx on the system PATH -- confirm before removing.
RUN python3 -m pip install --no-cache-dir --user pipx \
    && python3 -m pipx ensurepath

# Create the biobricks library directory and hand it to the vscode user.
RUN mkdir -p /mnt/biobricks/biobricks-ai \
    && chown -R vscode:vscode /mnt/biobricks

# Make per-user pipx binaries reachable from interactive shells
# ($HOME expands per-user at shell startup, not at build time).
RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> /etc/bash.bashrc \
    && echo 'export PATH="$PATH:$HOME/.local/bin"' >> /home/vscode/.bashrc

# Switch to vscode user to perform user-specific installations
USER vscode

# Python dependencies for the pipeline stages.
COPY .devcontainer/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

# Build-time token for biobricks. DEFAULT_TOKEN is the intentionally public
# demo token (devcontainer.json passes BIOBRICKS_PUBLIC_TOKEN); never bake a
# private token into an image -- ARG/ENV values are visible in `docker history`.
ARG BIOBRICKS_TOKEN
ENV DEFAULT_TOKEN=VQF6Q2U-NKktZ31ioVYa9w
# ${VAR:-default} covers the unset-or-empty case at build time.
ENV BIOBRICKS_TOKEN=${BIOBRICKS_TOKEN:-${DEFAULT_TOKEN}}

# Install biobricks and configure it. The "token too short" fallback must
# happen inside this RUN: the original separate `RUN ... export
# BIOBRICKS_TOKEN=...` layer was a no-op because shell exports do not
# persist across layers, so short tokens failed the build instead of
# falling back to the default as intended.
RUN /bin/bash -c 'source /etc/bash.bashrc \
    && pipx install biobricks \
    && biobricks version \
    && TOKEN="${BIOBRICKS_TOKEN}" \
    && if [ ${#TOKEN} -lt 5 ]; then TOKEN="${DEFAULT_TOKEN}"; fi \
    && biobricks configure --bblib=/mnt/biobricks --token=${TOKEN} --interactive=False'

# Switch back to root user to complete setup (devcontainer remoteUser is vscode)
USER root
29 changes: 29 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// Dev container for this brick: builds .devcontainer/Dockerfile with the
// repository root as the build context and attaches as the "vscode" user.
{
  "name": "SMRT Development Container",
  "build": {
    "dockerfile": "Dockerfile",
    "context": "..",
    "args": {
      // Build-time biobricks token; the Dockerfile falls back to a public
      // default when BIOBRICKS_PUBLIC_TOKEN is not set on the host.
      "BIOBRICKS_TOKEN": "${localEnv:BIOBRICKS_PUBLIC_TOKEN}"
    }
  },
  "features": {
    // Docker available inside the container (e.g. for containerized tooling).
    "ghcr.io/devcontainers/features/docker-in-docker:1": {}
  },
  "customizations": {
    "vscode": {
      "settings": {
        "terminal.integrated.defaultProfile.linux": "bash",
        // NOTE(review): "python.pythonPath" is deprecated in recent versions
        // of the VS Code Python extension; kept here for compatibility.
        "python.pythonPath": "/usr/local/bin/python"
      },
      "extensions": [
        "ms-python.python",
        "ms-toolsai.jupyter",
        "ms-vsliveshare.vsliveshare", // Live Share extension
        "github.copilot", // GitHub Copilot extension
        "insilica.vscode-pycmd"
      ]
    }
  },
  "remoteUser": "vscode"
}
7 changes: 7 additions & 0 deletions .devcontainer/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
python-dotenv==1.0.1
pandas==2.2.2
biobricks==0.3.7
fastparquet==2024.5.0
pyarrow==16.1.0
dvc==3.51.1
dvc-s3==3.2.0
3 changes: 3 additions & 0 deletions .dvc/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/config.local
/tmp
/cache
6 changes: 6 additions & 0 deletions .dvc/config
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[core]
remote = biobricks.ai
['remote "biobricks.ai"']
url = https://ins-dvc.s3.amazonaws.com/insdvc
['remote "s3.biobricks.ai"']
url = s3://ins-dvc/insdvc
10 changes: 10 additions & 0 deletions .github/workflows/bricktools-check.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
name: bricktools-check
on: [push, workflow_dispatch]
jobs:
bricktools-check:
runs-on: ubuntu-latest
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
steps:
- name: bricktools check
uses: biobricks-ai/github-actions/bricktools-check@main
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
logs
/download
/list
/brick
.env
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# How to build bricks

1. Create a brick named `{newbrick}` from this template
```
gh repo create biobricks-ai/{newbrick} -p biobricks-ai/brick-template --public
gh repo clone biobricks-ai/{newbrick}
cd {newbrick}
```

2. Edit stages according to your needs:
Recommended scripts:
- ``01_download.sh``
- ``02_unzip.sh``
- ``03_build.sh`` calling a function to process individual files like ``csv2parquet.R`` or ``csv2parquet.py``

3. Replace stages in dvc.yaml with your new stages

4. Build your brick
```
dvc repro # runs new stages
```

5. Push the data to biobricks.ai
```
dvc push -r s3.biobricks.ai
```

6. Commit the brick
```
git add -A && git commit -m "some message"
git push
```

7. Monitor the bricktools github action

9 changes: 9 additions & 0 deletions dvc.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
schema: '2.0'
stages:
bingsearch:
cmd: python stages/01_bingquery.py
outs:
- path: download/search_results.parquet
hash: md5
md5: cbb02378c80cdefe3f2a9d8754aac6a9
size: 434703
26 changes: 26 additions & 0 deletions dvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Brick DVC stages
# See https://dvc.org/doc/user-guide/project-structure/dvcyaml-files#pipelines-files-dvcyaml

# The complete process can be executed using:
# dvc repro
# If you want to force redoing the process use
# dvc repro -f
# Individual stage can be executed using:
# dvc repro <stage>

stages:

bingsearch:
cmd: python stages/01_bingquery.py
outs:
- download/search_results.parquet:
persist: true

download_pdfs:
cmd: python stages/02_download_pdf.py
deps:
- download/search_results.parquet
outs:
- brick/riskder.pdf:
persist: true
- brick/riskder.parquet
74 changes: 74 additions & 0 deletions stages/01_bingquery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import os, tqdm, time, pyarrow
import requests
import pandas as pd
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
bing_api_key = os.getenv('BINGAPI_KEY')

def bing_search(query, offset=0):
    """Run one page of a Bing Web Search and return (results, estimated_matches).

    Parameters
    ----------
    query : str
        The search query string.
    offset : int
        Pagination offset into the result set.

    Returns
    -------
    (list[dict], int)
        One dict per web result with keys query/name/url/snippet/
        license_name/license_url, and Bing's total estimated match count
        for the query (0 when the response has no 'webPages' section).

    Raises
    ------
    requests.HTTPError
        On any non-2xx response from the Bing API.
    """
    headers = {'Ocp-Apim-Subscription-Key': bing_api_key}
    params = {
        'q': query,
        'mkt': 'en-US', # Specify the market or region
        'count': 50, # 50 is the max value
        'offset': offset, # The offset for pagination
    }
    response = requests.get('https://api.bing.microsoft.com/v7.0/search', headers=headers, params=params)
    response.raise_for_status() # Raise an exception for HTTP errors
    results = response.json()
    urls = []
    # BUG FIX: the default was {} (a dict), which made the caller's
    # `offset >= estimated_total` comparison raise TypeError whenever the
    # response lacked 'webPages'. An integer 0 is the safe sentinel.
    estimated_matches = results.get('webPages', {}).get('totalEstimatedMatches', 0)
    for item in results.get('webPages', {}).get('value', []):
        # First LicenseAttribution contractual rule, if any; presumably a
        # dict with 'name' and 'url' keys -- TODO confirm against the API.
        license_info = next((rule['license'] for rule in item.get('contractualRules', []) if rule['_type'] == 'ContractualRules/LicenseAttribution'), None)
        urls.append({
            'query' : query,
            'name': item.get('name'),
            'url': item.get('url'),
            'snippet': item.get('snippet'),
            'license_name': license_info['name'] if license_info else '',
            'license_url': license_info['url'] if license_info else ''
        })
    return urls, estimated_matches

query = "epa data evaluation record filetype:pdf"

# The first page also carries Bing's estimated total hit count. Seed the
# result list with it instead of discarding it and refetching page 0 inside
# the loop (the original made the first API call twice).
res, estimated_total = bing_search(query)

all_results = list(res)
offset = len(res)
pbar = tqdm.tqdm(total=estimated_total, desc="Fetching results")
pbar.update(len(res))
while offset < estimated_total:
    results, _ = bing_search(query, offset)
    if not results:
        # Bing often stops returning results before the estimated total.
        break
    all_results.extend(results)
    offset += len(results)
    pbar.update(len(results))
    time.sleep(0.1)  # small delay to stay under the API rate limit

pbar.close()

# Merge with any previous run so re-running the stage only adds new URLs.
df = pd.DataFrame(columns=['query', 'name', 'url', 'snippet', 'license_name', 'license_url'])
if os.path.exists('download/search_results.parquet'):
    df = pd.read_parquet('download/search_results.parquet')


if all_results:
    new_df = pd.DataFrame(all_results)
    # Keep the first occurrence of each URL (prefer previously-stored rows).
    df = pd.concat([df, new_df]).drop_duplicates(subset='url', keep='first')
    df = df.reset_index(drop=True)

# Write DataFrame to parquet file
os.makedirs('download', exist_ok=True)
df.to_parquet('download/search_results.parquet', index=False)

# Read back and verify. BUG FIX: the original read back
# 'brick/search_results.parquet', a path this stage never writes; the output
# of this stage lives in download/ (see dvc.yaml / dvc.lock), so the
# verification always crashed with FileNotFoundError.
df_read = pd.read_parquet('download/search_results.parquet')
assert df.shape == df_read.shape, f"Shape mismatch: Original {df.shape}, Read {df_read.shape}"
print("Parquet file successfully written and verified.")

57 changes: 57 additions & 0 deletions stages/02_download_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import os, tqdm, time, pyarrow, hashlib
import requests
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

def scrape(scrape_url, autoparse=False, binary=False, ultra_premium=False, timeout=20):
    """Fetch `scrape_url` through the ScraperAPI proxy.

    The API key is read from the SCRAPERAPI_KEY environment variable; the
    boolean flags map one-to-one onto ScraperAPI query parameters. Returns
    the raw `requests.Response` from the proxy endpoint.
    """
    request_params = dict(
        api_key=os.getenv('SCRAPERAPI_KEY'),
        url=scrape_url,
        autoparse=autoparse,
        binary_target=binary,
        ultra_premium=ultra_premium,
    )
    return requests.get('http://api.scraperapi.com', params=request_params, timeout=timeout)


def _url_md5(url):
    """Hex MD5 digest of a URL -- used as a stable on-disk filename per PDF."""
    return hashlib.md5(url.encode('utf-8')).hexdigest()


df = pd.read_parquet('download/search_results.parquet')
df['pdf_path'] = ''


# brick/riskder.pdf is a *directory* holding one md5-named PDF per URL.
os.makedirs('brick/riskder.pdf', exist_ok=True)
# Loop through the URLs and download PDFs
for index, row in tqdm.tqdm(df.iterrows(), total=len(df), desc="Downloading PDFs"):
    url = row['url']
    # BUG FIX: the original called get_md5(url), which is defined nowhere in
    # this file and raised NameError on the first iteration; hashlib was
    # imported but never used. _url_md5 above supplies the intended behavior.
    md5_hash = _url_md5(url)
    pdf_path = f'brick/riskder.pdf/{md5_hash}.pdf'

    # Skip if the file already exists (makes the stage resumable)
    if os.path.exists(pdf_path):
        df.at[index, 'pdf_path'] = pdf_path
        continue

    try:
        # Make a request to download the PDF
        response = scrape(url, binary=True, timeout=30)

        # Check if the request was successful and the content is likely a PDF
        if response.status_code == 200 and response.headers.get('Content-Type', '').lower().startswith('application/pdf'):
            # Save the PDF
            with open(pdf_path, 'wb') as f:
                _ = f.write(response.content)
            df.at[index, 'pdf_path'] = pdf_path
        else:
            print(f"Failed to download PDF from {url}. Status code: {response.status_code}")

    except Exception as e:
        print(f"Error downloading PDF from {url}: {str(e)}")

    # Add a small delay to avoid overwhelming the server
    time.sleep(0.5)

# Save the DataFrame to brick/riskder.parquet
df.to_parquet('brick/riskder.parquet', index=False)
print("DataFrame successfully saved to brick/riskder.parquet")

0 comments on commit 14d4151

Please sign in to comment.