Initial commit

biobricks-ai · Nov 18, 2024 · 7947cc4 · 7947cc4
commit 7947cc4
Show file tree

Hide file tree

Showing 16 changed files with 298 additions and 0 deletions.
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
@@ -0,0 +1,57 @@
+FROM mcr.microsoft.com/devcontainers/python:3.9
+
+# Install necessary OS packages. The apt package index is removed in the same
+# layer that downloads it, so it does not persist in the image.
+RUN apt-get update && apt-get install -y \
+    wget \
+    unzip \
+    xvfb \
+    libxi6 \
+    libgconf-2-4 \
+    libnss3-dev \
+    libxss1 \
+    libappindicator1 \
+    fonts-liberation \
+    libatk-bridge2.0-0 \
+    libgtk-3-0 \
+    libgbm1 \
+    libasound2 \
+    libdpkg-perl \
+    libatomic1 \
+    ca-certificates \
+    curl \
+    gnupg \
+    python3-venv \
+    awscli \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install pipx for the current build user (root at this point) and register
+# its bin directory on that user's PATH.
+# NOTE(review): a --user install as root is not visible to the vscode user;
+# the later `pipx install` presumably relies on a pipx that is already on
+# PATH in the base image — confirm.
+RUN python3 -m pip install --no-cache-dir --user pipx \
+    && python3 -m pipx ensurepath
+
+# Create /mnt/biobricks directory and hand ownership to the vscode user
+RUN mkdir -p /mnt/biobricks/biobricks-ai \
+    && chown -R vscode:vscode /mnt/biobricks
+
+# Add pipx binaries ($HOME/.local/bin) to the PATH for all bash users
+RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> /etc/bash.bashrc \
+    && echo 'export PATH="$PATH:$HOME/.local/bin"' >> /home/vscode/.bashrc
+
+# Switch to vscode user to perform user-specific installations
+USER vscode
+
+# Install Python dependencies (no pip cache is kept in the layer)
+COPY .devcontainer/requirements.txt /tmp/requirements.txt
+RUN pip install --no-cache-dir -r /tmp/requirements.txt
+
+# Accept build argument for BIOBRICKS_TOKEN; an unset or empty value falls
+# back to DEFAULT_TOKEN through the ${VAR:-default} expansion below.
+# A non-empty token shorter than 5 characters is NOT replaced: a shell
+# `export` inside a RUN instruction does not persist to later instructions,
+# so such a token is rejected by the validation step further down instead.
+# NOTE(review): ARG/ENV values are stored in the image configuration, so only
+# a non-sensitive token belongs here — confirm DEFAULT_TOKEN is public.
+ARG BIOBRICKS_TOKEN
+ENV DEFAULT_TOKEN=VQF6Q2U-NKktZ31ioVYa9w
+ENV BIOBRICKS_TOKEN=${BIOBRICKS_TOKEN:-${DEFAULT_TOKEN}}
+
+# Install biobricks, fail the build if the token is missing or too short,
+# then configure biobricks to use /mnt/biobricks as its library directory.
+RUN /bin/bash -c 'source /etc/bash.bashrc && pipx install biobricks && biobricks version' \
+    && /bin/bash -c 'if [ -z "$BIOBRICKS_TOKEN" ] || [ ${#BIOBRICKS_TOKEN} -lt 5 ]; then echo "BIOBRICKS_TOKEN is not set or is too short (less than 5 characters)"; exit 1; fi' \
+    && /bin/bash -c 'source /etc/bash.bashrc && biobricks configure --bblib=/mnt/biobricks --token=${BIOBRICKS_TOKEN} --interactive=False'
+
+# Switch back to root user to complete setup
+USER root
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -0,0 +1,29 @@
+{
+    // Dev container definition: builds .devcontainer/Dockerfile with the
+    // repository root as build context.
+    "name": "SMRT Development Container",
+    "build": {
+        "dockerfile": "Dockerfile",
+        "context": "..",
+        "args": {
+            // Forwarded from the host environment variable BIOBRICKS_PUBLIC_TOKEN
+            "BIOBRICKS_TOKEN": "${localEnv:BIOBRICKS_PUBLIC_TOKEN}"
+        }
+    },
+    "features": {
+        "ghcr.io/devcontainers/features/docker-in-docker:1": {}
+    },
+    "customizations": {
+        "vscode": {
+            "settings": {
+                "terminal.integrated.defaultProfile.linux": "bash",
+                // python.pythonPath is deprecated; defaultInterpreterPath is its replacement
+                "python.defaultInterpreterPath": "/usr/local/bin/python"
+            },
+            "extensions": [
+                "ms-python.python",
+                "ms-toolsai.jupyter",
+                "ms-vsliveshare.vsliveshare",  // Live Share extension
+                "github.copilot",  // GitHub Copilot extension
+                "insilica.vscode-pycmd"
+            ]
+        }
+    },
+    "remoteUser": "vscode"
+}
diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt
@@ -0,0 +1,7 @@
+# Pinned Python dependencies for the dev container; installed by
+# .devcontainer/Dockerfile with `pip install -r`.
+python-dotenv==1.0.1
+pandas==2.2.2
+biobricks==0.3.7
+fastparquet==2024.5.0
+pyarrow==16.1.0
+dvc==3.51.1
+dvc-s3==3.2.0
diff --git a/.dvc/.gitignore b/.dvc/.gitignore
@@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
diff --git a/.dvc/config b/.dvc/config
@@ -0,0 +1,6 @@
+[core]
+  remote = biobricks.ai
+['remote "biobricks.ai"']
+  url = https://ins-dvc.s3.amazonaws.com/insdvc
+['remote "s3.biobricks.ai"']
+  url = s3://ins-dvc/insdvc
diff --git a/.dvcignore b/.dvcignore
@@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore
diff --git a/.github/workflows/bricktools-check.yaml b/.github/workflows/bricktools-check.yaml
@@ -0,0 +1,10 @@
+# Runs the shared bricktools check action on every push and on manual dispatch.
+name: bricktools-check
+on: [push, workflow_dispatch]
+jobs:
+  bricktools-check:
+    runs-on: ubuntu-latest
+    env:
+      # Exposes the workflow's built-in GITHUB_TOKEN to the action as GITHUB_PAT
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - name: bricktools check
+        # NOTE(review): @main is a moving ref; pin a tag or commit SHA for reproducible runs
+        uses: biobricks-ai/github-actions/bricktools-check@main
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+logs
+/download
+/list
+/brick
diff --git a/README.md b/README.md
@@ -0,0 +1,35 @@
+# How to build bricks
+
+1. Create a brick named `{newbrick}` from this template
+```
+gh repo create biobricks-ai/{newbrick} -p biobricks-ai/brick-template --public
+gh repo clone biobricks-ai/{newbrick}
+cd {newbrick}
+```
+
+2. Edit stages according to your needs:
+    Recommended scripts:
+    - ``01_download.sh``
+    - ``02_unzip.sh``
+    - ``03_build.sh`` calling a function to process individual files like ``csv2parquet.R`` or ``csv2parquet.py``
+
+3. Replace stages in dvc.yaml with your new stages
+
+4. Build your brick
+```
+dvc repro # runs new stages
+```
+
+5. Push the data to biobricks.ai
+```
+dvc push -r s3.biobricks.ai 
+```
+
+6. Commit the brick
+```
+git add -A && git commit -m "some message"
+git push
+```
+
+7. Monitor the bricktools github action
+
diff --git a/dvc.lock b/dvc.lock
@@ -0,0 +1,2 @@
+schema: '2.0'
+stages: {}
diff --git a/dvc.yaml b/dvc.yaml
@@ -0,0 +1,35 @@
+# Brick DVC stages
+# See https://dvc.org/doc/user-guide/project-structure/dvcyaml-files#pipelines-files-dvcyaml
+
+# The complete process can be executed using:
+# dvc repro
+# If you want to force redoing the process use 
+# dvc repro -f
+# An individual stage can be executed using:
+# dvc repro <stage>
+
+stages:
+  # Fetches the remote file listing into list/ and the files into download/
+  download:
+    cmd: stages/01_download.sh
+    deps:
+      - stages/01_download.sh
+    outs:
+      - download
+      - list
+  # Extracts the downloaded archives into raw/
+  unzip:
+    cmd: stages/02_unzip.sh
+    deps:
+      - stages/02_unzip.sh
+      - download
+      - list
+    outs:
+      - raw
+  # Converts the extracted files into parquet files under brick/
+  build:
+    cmd: stages/03_build.sh
+    deps:
+      - stages/03_build.sh
+      # 03_build.sh invokes the Python converter, so that is the file to track
+      - stages/csv2parquet.py
+      - raw
+      - list
+    outs:
+      - brick
diff --git a/stages/01_download.sh b/stages/01_download.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+# Script to download files
+
+# Get local path
+localpath=$(pwd)
+echo "Local path: $localpath"
+
+# Create the list directory to save list of remote files and directories
+listpath="$localpath/list"
+echo "List path: $listpath"
+mkdir -p $listpath
+cd $listpath;
+
+# Define the FTP base address
+export ftpbase=""
+
+# Retrieve the list of files to download from FTP base address
+wget --no-remove-listing $ftpbase
+cat index.html | grep -Po '(?<=href=")[^"]*' | sort | cut -d "/" -f 10 > files.txt
+rm .listing
+rm index.html
+
+# Create the download directory
+export downloadpath="$localpath/download"
+echo "Download path: $downloadpath"
+mkdir -p "$downloadpath"
+cd $downloadpath;
+
+# Download files in parallel
+cat $listpath/files.txt | xargs -P14 -n1 bash -c '
+  echo $0
+  wget -nH -q -nc -P $downloadpath $ftpbase$0
+'
+
+echo "Download done."
diff --git a/stages/02_unzip.sh b/stages/02_unzip.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+# Script to unzip files
+
+# Get local path
+localpath=$(pwd)
+echo "Local path: $localpath"
+
+# Set download path
+export downloadpath="$localpath/download"
+echo "Download path: $downloadpath"
+
+# Set list path
+listpath="$localpath/list"
+echo "List path: $listpath"
+
+# Create raw path
+export rawpath="$localpath/raw"
+mkdir -p $rawpath
+echo "Raw path: $rawpath"
+
+# Unzip files in parallel
+cat $listpath/files.txt | tail -n +2 | xargs -P14 -n1 bash -c '
+  filename="${0%.*}"
+  echo $downloadpath/$0
+  echo $rawpath/$filename
+  unzip $downloadpath/$0 -d $rawpath/$filename
+'
diff --git a/stages/03_build.sh b/stages/03_build.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+# Script to process unzipped files and build parquet files
+
+# Get local path
+localpath=$(pwd)
+echo "Local path: $localpath"
+
+# Set list path
+listpath="$localpath/list"
+mkdir -p $listpath
+echo "List path: $listpath"
+
+# Set raw path
+export rawpath="$localpath/raw"
+echo "Raw path: $rawpath"
+
+# Create brick directory
+export brickpath="$localpath/brick"
+mkdir -p $brickpath
+echo "Brick path: $brickpath"
+
+# Process raw files and create parquet files in parallel
+# calling a Python function with arguments input and output filenames
+cat $listpath/files.txt | tail -n +4 | xargs -P14 -n1 bash -c '
+  filename="${0%.*}"
+  echo $rawpath/$filename/$filename.txt
+  echo $brickpath/$filename.parquet
+  python stages/csv2parquet.py $rawpath/$filename.txt $brickpath/$filename.parquet
+'
diff --git a/stages/csv2parquet.R b/stages/csv2parquet.R
@@ -0,0 +1,2 @@
+# Example stage: writes the built-in mtcars data set to brick/mtcars.parquet.
+# Edit this stage to create new resources in the brick directory.
+mtcars |> arrow::write_parquet("brick/mtcars.parquet")
diff --git a/stages/csv2parquet.py b/stages/csv2parquet.py
@@ -0,0 +1,11 @@
+import pandas as pd
+import sys
+import pyarrow as pyarrow
+import fastparquet as fastparquet
+
+InFileName = sys.argv[1]
+OutFileName = sys.argv[2]
+
+print(f"csv2parquet: Converting file {InFileName}")
+DF = pd.read_csv(InFileName, sep=',')
+DF.to_parquet(OutFileName)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# edit this stage to create new resources in the data directory
		mtcars \|> arrow::write_parquet("brick/mtcars.parquet")