Skip to content

Commit

Permalink
EPA DERs discovered with bing
Browse files Browse the repository at this point in the history
  • Loading branch information
tomlue committed Jul 28, 2024
0 parents commit 14d4151
Show file tree
Hide file tree
Showing 12 changed files with 318 additions and 0 deletions.
57 changes: 57 additions & 0 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
FROM mcr.microsoft.com/devcontainers/python:3.9

# OS packages for headless browsing/scraping (xvfb, nss, fonts, ...) plus
# general tooling (curl, wget, unzip, awscli, python3-venv).
# --no-install-recommends keeps the image lean; the apt lists are removed in
# the same layer so they never persist in the image (hadolint DL3009/DL3015).
RUN apt-get update && apt-get install -y --no-install-recommends \
        awscli \
        ca-certificates \
        curl \
        fonts-liberation \
        gnupg \
        libappindicator1 \
        libasound2 \
        libatk-bridge2.0-0 \
        libatomic1 \
        libdpkg-perl \
        libgbm1 \
        libgconf-2-4 \
        libgtk-3-0 \
        libnss3-dev \
        libxi6 \
        libxss1 \
        python3-venv \
        unzip \
        wget \
        xvfb \
    && rm -rf /var/lib/apt/lists/*

# Install pipx (used below to install biobricks into an isolated venv).
# NOTE(review): this installs pipx for root ($HOME=/root), while the later
# `pipx install` runs as vscode; it appears to rely on the devcontainer base
# image already shipping pipx on the system PATH -- confirm before removing.
RUN python3 -m pip install --no-cache-dir --user pipx \
    && python3 -m pipx ensurepath

# Create the biobricks library directory and hand it to the vscode user.
RUN mkdir -p /mnt/biobricks/biobricks-ai \
    && chown -R vscode:vscode /mnt/biobricks

# Make per-user pipx binaries reachable from interactive shells
# ($HOME expands per-user at shell startup, not at build time).
RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> /etc/bash.bashrc \
    && echo 'export PATH="$PATH:$HOME/.local/bin"' >> /home/vscode/.bashrc

# Switch to vscode user to perform user-specific installations
USER vscode

# Python dependencies for the pipeline stages.
COPY .devcontainer/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

# Build-time token for biobricks. DEFAULT_TOKEN is the intentionally public
# demo token (devcontainer.json passes BIOBRICKS_PUBLIC_TOKEN); never bake a
# private token into an image -- ARG/ENV values are visible in `docker history`.
ARG BIOBRICKS_TOKEN
ENV DEFAULT_TOKEN=VQF6Q2U-NKktZ31ioVYa9w
# ${VAR:-default} covers the unset-or-empty case at build time.
ENV BIOBRICKS_TOKEN=${BIOBRICKS_TOKEN:-${DEFAULT_TOKEN}}

# Install biobricks and configure it. The "token too short" fallback must
# happen inside this RUN: the original separate `RUN ... export
# BIOBRICKS_TOKEN=...` layer was a no-op because shell exports do not
# persist across layers, so short tokens failed the build instead of
# falling back to the default as intended.
RUN /bin/bash -c 'source /etc/bash.bashrc \
    && pipx install biobricks \
    && biobricks version \
    && TOKEN="${BIOBRICKS_TOKEN}" \
    && if [ ${#TOKEN} -lt 5 ]; then TOKEN="${DEFAULT_TOKEN}"; fi \
    && biobricks configure --bblib=/mnt/biobricks --token=${TOKEN} --interactive=False'

# Switch back to root user to complete setup (devcontainer remoteUser is vscode)
USER root
29 changes: 29 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// Dev container for this brick: builds .devcontainer/Dockerfile with the
// repository root as the build context and attaches as the "vscode" user.
{
  "name": "SMRT Development Container",
  "build": {
    "dockerfile": "Dockerfile",
    "context": "..",
    "args": {
      // Build-time biobricks token; the Dockerfile falls back to a public
      // default when BIOBRICKS_PUBLIC_TOKEN is not set on the host.
      "BIOBRICKS_TOKEN": "${localEnv:BIOBRICKS_PUBLIC_TOKEN}"
    }
  },
  "features": {
    // Docker available inside the container (e.g. for containerized tooling).
    "ghcr.io/devcontainers/features/docker-in-docker:1": {}
  },
  "customizations": {
    "vscode": {
      "settings": {
        "terminal.integrated.defaultProfile.linux": "bash",
        // NOTE(review): "python.pythonPath" is deprecated in recent versions
        // of the VS Code Python extension; kept here for compatibility.
        "python.pythonPath": "/usr/local/bin/python"
      },
      "extensions": [
        "ms-python.python",
        "ms-toolsai.jupyter",
        "ms-vsliveshare.vsliveshare", // Live Share extension
        "github.copilot", // GitHub Copilot extension
        "insilica.vscode-pycmd"
      ]
    }
  },
  "remoteUser": "vscode"
}
7 changes: 7 additions & 0 deletions .devcontainer/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
python-dotenv==1.0.1
pandas==2.2.2
biobricks==0.3.7
fastparquet==2024.5.0
pyarrow==16.1.0
dvc==3.51.1
dvc-s3==3.2.0
3 changes: 3 additions & 0 deletions .dvc/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/config.local
/tmp
/cache
6 changes: 6 additions & 0 deletions .dvc/config
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[core]
remote = biobricks.ai
['remote "biobricks.ai"']
url = https://ins-dvc.s3.amazonaws.com/insdvc
['remote "s3.biobricks.ai"']
url = s3://ins-dvc/insdvc
10 changes: 10 additions & 0 deletions .github/workflows/bricktools-check.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
name: bricktools-check
on: [push, workflow_dispatch]
jobs:
bricktools-check:
runs-on: ubuntu-latest
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
steps:
- name: bricktools check
uses: biobricks-ai/github-actions/bricktools-check@main
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
logs
/download
/list
/brick
.env
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# How to build bricks

1. Create a brick named `{newbrick}` from this template
```
gh repo create biobricks-ai/{newbrick} -p biobricks-ai/brick-template --public
gh repo clone biobricks-ai/{newbrick}
cd {newbrick}
```

2. Edit stages according to your needs:
Recommended scripts:
- ``01_download.sh``
- ``02_unzip.sh``
- ``03_build.sh`` calling a function to process individual files like ``csv2parquet.R`` or ``csv2parquet.py``

3. Replace stages in dvc.yaml with your new stages

4. Build your brick
```
dvc repro # runs new stages
```

5. Push the data to biobricks.ai
```
dvc push -r s3.biobricks.ai
```

6. Commit the brick
```
git add -A && git commit -m "some message"
git push
```

7. Monitor the bricktools github action

9 changes: 9 additions & 0 deletions dvc.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
schema: '2.0'
stages:
bingsearch:
cmd: python stages/01_bingquery.py
outs:
- path: download/search_results.parquet
hash: md5
md5: cbb02378c80cdefe3f2a9d8754aac6a9
size: 434703
26 changes: 26 additions & 0 deletions dvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Brick DVC stages
# See https://dvc.org/doc/user-guide/project-structure/dvcyaml-files#pipelines-files-dvcyaml

# The complete process can be executed using:
# dvc repro
# If you want to force redoing the process use
# dvc repro -f
# Individual stage can be executed using:
# dvc repro <stage>

stages:

bingsearch:
cmd: python stages/01_bingquery.py
outs:
- download/search_results.parquet:
persist: true

download_pdfs:
cmd: python stages/02_download_pdf.py
deps:
- download/search_results.parquet
outs:
- brick/riskder.pdf:
persist: true
- brick/riskder.parquet
74 changes: 74 additions & 0 deletions stages/01_bingquery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import os, tqdm, time, pyarrow
import requests
import pandas as pd
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
bing_api_key = os.getenv('BINGAPI_KEY')

def bing_search(query, offset=0):
    """Run one page of a Bing Web Search and return (results, estimated_matches).

    Parameters
    ----------
    query : str
        The search query string.
    offset : int
        Pagination offset into the result set.

    Returns
    -------
    (list[dict], int)
        One dict per web result with keys query/name/url/snippet/
        license_name/license_url, and Bing's total estimated match count
        for the query (0 when the response has no 'webPages' section).

    Raises
    ------
    requests.HTTPError
        On any non-2xx response from the Bing API.
    """
    headers = {'Ocp-Apim-Subscription-Key': bing_api_key}
    params = {
        'q': query,
        'mkt': 'en-US', # Specify the market or region
        'count': 50, # 50 is the max value
        'offset': offset, # The offset for pagination
    }
    response = requests.get('https://api.bing.microsoft.com/v7.0/search', headers=headers, params=params)
    response.raise_for_status() # Raise an exception for HTTP errors
    results = response.json()
    urls = []
    # BUG FIX: the default was {} (a dict), which made the caller's
    # `offset >= estimated_total` comparison raise TypeError whenever the
    # response lacked 'webPages'. An integer 0 is the safe sentinel.
    estimated_matches = results.get('webPages', {}).get('totalEstimatedMatches', 0)
    for item in results.get('webPages', {}).get('value', []):
        # First LicenseAttribution contractual rule, if any; presumably a
        # dict with 'name' and 'url' keys -- TODO confirm against the API.
        license_info = next((rule['license'] for rule in item.get('contractualRules', []) if rule['_type'] == 'ContractualRules/LicenseAttribution'), None)
        urls.append({
            'query' : query,
            'name': item.get('name'),
            'url': item.get('url'),
            'snippet': item.get('snippet'),
            'license_name': license_info['name'] if license_info else '',
            'license_url': license_info['url'] if license_info else ''
        })
    return urls, estimated_matches

query = "epa data evaluation record filetype:pdf"

# The first page also carries Bing's estimated total hit count. Seed the
# result list with it instead of discarding it and refetching page 0 inside
# the loop (the original made the first API call twice).
res, estimated_total = bing_search(query)

all_results = list(res)
offset = len(res)
pbar = tqdm.tqdm(total=estimated_total, desc="Fetching results")
pbar.update(len(res))
while offset < estimated_total:
    results, _ = bing_search(query, offset)
    if not results:
        # Bing often stops returning results before the estimated total.
        break
    all_results.extend(results)
    offset += len(results)
    pbar.update(len(results))
    time.sleep(0.1)  # small delay to stay under the API rate limit

pbar.close()

# Merge with any previous run so re-running the stage only adds new URLs.
df = pd.DataFrame(columns=['query', 'name', 'url', 'snippet', 'license_name', 'license_url'])
if os.path.exists('download/search_results.parquet'):
    df = pd.read_parquet('download/search_results.parquet')


if all_results:
    new_df = pd.DataFrame(all_results)
    # Keep the first occurrence of each URL (prefer previously-stored rows).
    df = pd.concat([df, new_df]).drop_duplicates(subset='url', keep='first')
    df = df.reset_index(drop=True)

# Write DataFrame to parquet file
os.makedirs('download', exist_ok=True)
df.to_parquet('download/search_results.parquet', index=False)

# Read back and verify. BUG FIX: the original read back
# 'brick/search_results.parquet', a path this stage never writes; the output
# of this stage lives in download/ (see dvc.yaml / dvc.lock), so the
# verification always crashed with FileNotFoundError.
df_read = pd.read_parquet('download/search_results.parquet')
assert df.shape == df_read.shape, f"Shape mismatch: Original {df.shape}, Read {df_read.shape}"
print("Parquet file successfully written and verified.")

57 changes: 57 additions & 0 deletions stages/02_download_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import os, tqdm, time, pyarrow, hashlib
import requests
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

def scrape(scrape_url, autoparse=False, binary=False, ultra_premium=False, timeout=20):
    """Fetch `scrape_url` through the ScraperAPI proxy.

    The API key is read from the SCRAPERAPI_KEY environment variable; the
    boolean flags map one-to-one onto ScraperAPI query parameters. Returns
    the raw `requests.Response` from the proxy endpoint.
    """
    request_params = dict(
        api_key=os.getenv('SCRAPERAPI_KEY'),
        url=scrape_url,
        autoparse=autoparse,
        binary_target=binary,
        ultra_premium=ultra_premium,
    )
    return requests.get('http://api.scraperapi.com', params=request_params, timeout=timeout)


def _url_md5(url):
    """Hex MD5 digest of a URL -- used as a stable on-disk filename per PDF."""
    return hashlib.md5(url.encode('utf-8')).hexdigest()


df = pd.read_parquet('download/search_results.parquet')
df['pdf_path'] = ''


# brick/riskder.pdf is a *directory* holding one md5-named PDF per URL.
os.makedirs('brick/riskder.pdf', exist_ok=True)
# Loop through the URLs and download PDFs
for index, row in tqdm.tqdm(df.iterrows(), total=len(df), desc="Downloading PDFs"):
    url = row['url']
    # BUG FIX: the original called get_md5(url), which is defined nowhere in
    # this file and raised NameError on the first iteration; hashlib was
    # imported but never used. _url_md5 above supplies the intended behavior.
    md5_hash = _url_md5(url)
    pdf_path = f'brick/riskder.pdf/{md5_hash}.pdf'

    # Skip if the file already exists (makes the stage resumable)
    if os.path.exists(pdf_path):
        df.at[index, 'pdf_path'] = pdf_path
        continue

    try:
        # Make a request to download the PDF
        response = scrape(url, binary=True, timeout=30)

        # Check if the request was successful and the content is likely a PDF
        if response.status_code == 200 and response.headers.get('Content-Type', '').lower().startswith('application/pdf'):
            # Save the PDF
            with open(pdf_path, 'wb') as f:
                _ = f.write(response.content)
            df.at[index, 'pdf_path'] = pdf_path
        else:
            print(f"Failed to download PDF from {url}. Status code: {response.status_code}")

    except Exception as e:
        print(f"Error downloading PDF from {url}: {str(e)}")

    # Add a small delay to avoid overwhelming the server
    time.sleep(0.5)

# Save the DataFrame to brick/riskder.parquet
df.to_parquet('brick/riskder.parquet', index=False)
print("DataFrame successfully saved to brick/riskder.parquet")

0 comments on commit 14d4151

Please sign in to comment.