Commit 14d4151 (initial commit, 0 parents): showing 12 changed files with 318 additions and 0 deletions.
@@ -0,0 +1,57 @@
FROM mcr.microsoft.com/devcontainers/python:3.9

# Install necessary packages
RUN apt-get update && apt-get install -y \
    wget \
    unzip \
    xvfb \
    libxi6 \
    libgconf-2-4 \
    libnss3-dev \
    libxss1 \
    libappindicator1 \
    fonts-liberation \
    libatk-bridge2.0-0 \
    libgtk-3-0 \
    libgbm1 \
    libasound2 \
    libdpkg-perl \
    libatomic1 \
    ca-certificates \
    curl \
    gnupg \
    python3-venv \
    awscli

# Install pipx and biobricks
RUN python3 -m pip install --user pipx \
    && python3 -m pipx ensurepath

# Create /mnt/biobricks directory and set permissions
RUN mkdir -p /mnt/biobricks/biobricks-ai \
    && chown -R vscode:vscode /mnt/biobricks

# Add pipx binaries to the PATH for all users
RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> /etc/bash.bashrc \
    && echo 'export PATH="$PATH:$HOME/.local/bin"' >> /home/vscode/.bashrc

# Switch to vscode user to perform user-specific installations
USER vscode

# Install Python dependencies
COPY .devcontainer/requirements.txt /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt

# Accept build argument for BIOBRICKS_TOKEN and set it to the default value if it is not given
ARG BIOBRICKS_TOKEN
ENV DEFAULT_TOKEN=VQF6Q2U-NKktZ31ioVYa9w
ENV BIOBRICKS_TOKEN=${BIOBRICKS_TOKEN:-${DEFAULT_TOKEN}}
RUN if [ ${#BIOBRICKS_TOKEN} -lt 5 ]; then export BIOBRICKS_TOKEN=$DEFAULT_TOKEN; fi

# Install biobricks and configure it
RUN /bin/bash -c 'source /etc/bash.bashrc && pipx install biobricks && biobricks version' \
    && /bin/bash -c 'if [ -z "$BIOBRICKS_TOKEN" ] || [ ${#BIOBRICKS_TOKEN} -lt 5 ]; then echo "BIOBRICKS_TOKEN is not set or is too short (less than 5 characters)"; exit 1; fi' \
    && /bin/bash -c 'source /etc/bash.bashrc && biobricks configure --bblib=/mnt/biobricks --token=${BIOBRICKS_TOKEN} --interactive=False'

# Switch back to root user to complete setup
USER root
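
The image can also be built outside of VS Code. A minimal sketch, assuming this file lives at `.devcontainer/Dockerfile` (as the devcontainer.json below implies) and using an arbitrary image tag:

```
# Build from the repository root; the token argument is optional and falls back
# to the DEFAULT_TOKEN baked into the Dockerfile when it is not provided.
docker build \
  --build-arg BIOBRICKS_TOKEN="$BIOBRICKS_PUBLIC_TOKEN" \
  -f .devcontainer/Dockerfile \
  -t smrt-devcontainer .
```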
@@ -0,0 +1,29 @@
{
  "name": "SMRT Development Container",
  "build": {
    "dockerfile": "Dockerfile",
    "context": "..",
    "args": {
      "BIOBRICKS_TOKEN": "${localEnv:BIOBRICKS_PUBLIC_TOKEN}"
    }
  },
  "features": {
    "ghcr.io/devcontainers/features/docker-in-docker:1": {}
  },
  "customizations": {
    "vscode": {
      "settings": {
        "terminal.integrated.defaultProfile.linux": "bash",
        "python.pythonPath": "/usr/local/bin/python"
      },
      "extensions": [
        "ms-python.python",
        "ms-toolsai.jupyter",
        "ms-vsliveshare.vsliveshare", // Live Share extension
        "github.copilot", // GitHub Copilot extension
        "insilica.vscode-pycmd"
      ]
    }
  },
  "remoteUser": "vscode"
}
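
The `${localEnv:BIOBRICKS_PUBLIC_TOKEN}` reference is resolved from the host environment when the container is built, so the variable should be set before building or rebuilding; otherwise the Dockerfile falls back to its default token. A minimal sketch (the value is a placeholder):

```
# On the host, before "Dev Containers: Rebuild and Reopen in Container"
export BIOBRICKS_PUBLIC_TOKEN="<your-biobricks-token>"
```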
@@ -0,0 +1,7 @@
python-dotenv==1.0.1
pandas==2.2.2
biobricks==0.3.7
fastparquet==2024.5.0
pyarrow==16.1.0
dvc==3.51.1
dvc-s3==3.2.0
@@ -0,0 +1,3 @@
/config.local
/tmp
/cache
@@ -0,0 +1,6 @@
[core]
    remote = biobricks.ai
['remote "biobricks.ai"']
    url = https://ins-dvc.s3.amazonaws.com/insdvc
['remote "s3.biobricks.ai"']
    url = s3://ins-dvc/insdvc
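
These remotes match the README below: `biobricks.ai` is the default (read) remote and `s3.biobricks.ai` is the S3 remote used when pushing brick outputs. Typical usage, assuming AWS credentials are available for the push:

```
dvc pull                      # fetch tracked outputs from the default remote
dvc push -r s3.biobricks.ai   # upload newly built outputs to the S3 remote
```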
@@ -0,0 +1,10 @@
name: bricktools-check
on: [push, workflow_dispatch]
jobs:
  bricktools-check:
    runs-on: ubuntu-latest
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: bricktools check
        uses: biobricks-ai/github-actions/bricktools-check@main
@@ -0,0 +1,5 @@
logs
/download
/list
/brick
.env
@@ -0,0 +1,35 @@
# How to build bricks

1. Create a brick named `{newbrick}` from this template
   ```
   gh repo create biobricks-ai/{newbrick} -p biobricks-ai/brick-template --public
   gh repo clone biobricks-ai/{newbrick}
   cd {newbrick}
   ```

2. Edit stages according to your needs. Recommended scripts:
   - `01_download.sh`
   - `02_unzip.sh`
   - `03_build.sh`, calling a function to process individual files, like `csv2parquet.R` or `csv2parquet.py` (see the sketch below)
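
   A minimal sketch of what `03_build.sh` could look like; the helper script `stages/csv2parquet.py` is illustrative and not part of this template:
   ```
   #!/usr/bin/env bash
   set -euo pipefail

   # Convert every extracted CSV into a parquet file under brick/
   mkdir -p brick
   for csv in download/*.csv; do
       python stages/csv2parquet.py "$csv" "brick/$(basename "${csv%.csv}").parquet"
   done
   ```
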
3. Replace stages in `dvc.yaml` with your new stages

4. Build your brick
   ```
   dvc repro # runs new stages
   ```

5. Push the data to biobricks.ai
   ```
   dvc push -r s3.biobricks.ai
   ```

6. Commit the brick
   ```
   git add -A && git commit -m "some message"
   git push
   ```

7. Monitor the bricktools github action
@@ -0,0 +1,9 @@
schema: '2.0'
stages:
  bingsearch:
    cmd: python stages/01_bingquery.py
    outs:
    - path: download/search_results.parquet
      hash: md5
      md5: cbb02378c80cdefe3f2a9d8754aac6a9
      size: 434703
@@ -0,0 +1,26 @@
# Brick DVC stages
# See https://dvc.org/doc/user-guide/project-structure/dvcyaml-files#pipelines-files-dvcyaml

# The complete process can be executed using:
#   dvc repro
# If you want to force redoing the process use
#   dvc repro -f
# Individual stages can be executed using:
#   dvc repro <stage>

stages:

  bingsearch:
    cmd: python stages/01_bingquery.py
    outs:
    - download/search_results.parquet:
        persist: true

  download_pdfs:
    cmd: python stages/02_download_pdf.py
    deps:
    - download/search_results.parquet
    outs:
    - brick/riskder.pdf:
        persist: true
    - brick/riskder.parquet
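
For example, with the stages defined above the pipeline can be run end to end or one stage at a time:

```
dvc repro                 # run bingsearch and then download_pdfs as needed
dvc repro bingsearch      # run only the Bing search stage
dvc repro download_pdfs   # run the PDF download stage (and its dependency)
dvc dag                   # inspect the stage dependency graph
```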
@@ -0,0 +1,74 @@
import os, tqdm, time, pyarrow
import requests
import pandas as pd
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
bing_api_key = os.getenv('BINGAPI_KEY')

def bing_search(query, offset=0):
    headers = {'Ocp-Apim-Subscription-Key': bing_api_key}
    params = {
        'q': query,
        'mkt': 'en-US',   # Specify the market or region
        'count': 50,      # 50 is the max value
        'offset': offset, # The offset for pagination
    }
    response = requests.get('https://api.bing.microsoft.com/v7.0/search', headers=headers, params=params)
    response.raise_for_status()  # Raise an exception for HTTP errors
    results = response.json()
    urls = []
    estimated_matches = results.get('webPages', {}).get('totalEstimatedMatches', 0)
    for item in results.get('webPages', {}).get('value', []):
        license_info = next((rule['license'] for rule in item.get('contractualRules', []) if rule['_type'] == 'ContractualRules/LicenseAttribution'), None)
        urls.append({
            'query': query,
            'name': item.get('name'),
            'url': item.get('url'),
            'snippet': item.get('snippet'),
            'license_name': license_info['name'] if license_info else '',
            'license_url': license_info['url'] if license_info else ''
        })
    return urls, estimated_matches

query = "epa data evaluation record filetype:pdf"
res, estimated_total = bing_search(query)

# Page through the results until Bing stops returning hits or the estimated total is reached
all_results = []
offset = 0
pbar = tqdm.tqdm(total=estimated_total, desc="Fetching results")
while True:
    results, _ = bing_search(query, offset)
    if not results:
        break
    all_results.extend(results)
    offset += len(results)
    pbar.update(len(results))

    if offset >= estimated_total:
        break
    time.sleep(0.1)

pbar.close()

# Create a DataFrame, seeding it with any previously saved results
df = pd.DataFrame(columns=['query', 'name', 'url', 'snippet', 'license_name', 'license_url'])
if os.path.exists('download/search_results.parquet'):
    df = pd.read_parquet('download/search_results.parquet')

if all_results:
    new_df = pd.DataFrame(all_results)
    df = pd.concat([df, new_df]).drop_duplicates(subset='url', keep='first')
    df = df.reset_index(drop=True)

# Write DataFrame to parquet file
os.makedirs('download', exist_ok=True)
df.to_parquet('download/search_results.parquet', index=False)

# Read back and compare
df_read = pd.read_parquet('download/search_results.parquet')
assert df.shape == df_read.shape, f"Shape mismatch: Original {df.shape}, Read {df_read.shape}"
print("Parquet file successfully written and verified.")
@@ -0,0 +1,57 @@
import os, tqdm, time, pyarrow, hashlib
import requests
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

def scrape(scrape_url, autoparse=False, binary=False, ultra_premium=False, timeout=20):
    scraperapi_key = os.getenv('SCRAPERAPI_KEY')
    params = {
        'api_key': scraperapi_key,
        'url': scrape_url,
        'autoparse': autoparse,
        'binary_target': binary,
        'ultra_premium': ultra_premium
    }
    return requests.get('http://api.scraperapi.com', params=params, timeout=timeout)

def get_md5(text):
    # Hash each URL to get a stable, filesystem-safe PDF file name
    return hashlib.md5(text.encode('utf-8')).hexdigest()

df = pd.read_parquet('download/search_results.parquet')
df['pdf_path'] = ''

# Loop through the URLs and download PDFs
os.makedirs('brick/riskder.pdf', exist_ok=True)
for index, row in tqdm.tqdm(df.iterrows(), total=len(df), desc="Downloading PDFs"):
    url = row['url']
    md5_hash = get_md5(url)
    pdf_path = f'brick/riskder.pdf/{md5_hash}.pdf'

    # Skip if the file already exists
    if os.path.exists(pdf_path):
        df.at[index, 'pdf_path'] = pdf_path
        continue

    try:
        # Make a request to download the PDF
        response = scrape(url, binary=True, timeout=30)

        # Check if the request was successful and the content is likely a PDF
        if response.status_code == 200 and response.headers.get('Content-Type', '').lower().startswith('application/pdf'):
            # Save the PDF
            with open(pdf_path, 'wb') as f:
                _ = f.write(response.content)
            df.at[index, 'pdf_path'] = pdf_path
        else:
            print(f"Failed to download PDF from {url}. Status code: {response.status_code}")

    except Exception as e:
        print(f"Error downloading PDF from {url}: {str(e)}")

    # Add a small delay to avoid overwhelming the server
    time.sleep(0.5)

# Save the DataFrame to brick/riskder.parquet
df.to_parquet('brick/riskder.parquet', index=False)
print("DataFrame successfully saved to brick/riskder.parquet")