diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
new file mode 100644
index 0000000..3cdfe31
--- /dev/null
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,57 @@
+FROM mcr.microsoft.com/devcontainers/python:3.9
+
+# Install necessary packages
+RUN apt-get update && apt-get install -y \
+    wget \
+    unzip \
+    xvfb \
+    libxi6 \
+    libgconf-2-4 \
+    libnss3-dev \
+    libxss1 \
+    libappindicator1 \
+    fonts-liberation \
+    libatk-bridge2.0-0 \
+    libgtk-3-0 \
+    libgbm1 \
+    libasound2 \
+    libdpkg-perl \
+    libatomic1 \
+    ca-certificates \
+    curl \
+    gnupg \
+    python3-venv \
+    awscli
+
+# Install pipx and biobricks
+RUN python3 -m pip install --user pipx \
+    && python3 -m pipx ensurepath
+
+# Create /mnt/biobricks directory and set permissions
+RUN mkdir -p /mnt/biobricks/biobricks-ai \
+    && chown -R vscode:vscode /mnt/biobricks
+
+# Add pipx binaries to the PATH for all users
+RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> /etc/bash.bashrc \
+    && echo 'export PATH="$PATH:$HOME/.local/bin"' >> /home/vscode/.bashrc
+
+# Switch to vscode user to perform user-specific installations
+USER vscode
+
+# Install Python dependencies
+COPY .devcontainer/requirements.txt /tmp/requirements.txt
+RUN pip install -r /tmp/requirements.txt
+
+# Accept build argument for BIOBRICKS_TOKEN & set to the default value if it is not given.
+ARG BIOBRICKS_TOKEN
+ENV DEFAULT_TOKEN=VQF6Q2U-NKktZ31ioVYa9w
+ENV BIOBRICKS_TOKEN=${BIOBRICKS_TOKEN:-${DEFAULT_TOKEN}}
+# NOTE: a RUN-level "export" cannot modify ENV for later layers; empty tokens already fall back via the ENV default above.
+
+# Install biobricks and configure it
+RUN /bin/bash -c 'source /etc/bash.bashrc && pipx install biobricks && biobricks version' \
+    && /bin/bash -c 'if [ -z "$BIOBRICKS_TOKEN" ] || [ ${#BIOBRICKS_TOKEN} -lt 5 ]; then echo "BIOBRICKS_TOKEN is not set or is too short (less than 5 characters)"; exit 1; fi' \
+    && /bin/bash -c 'source /etc/bash.bashrc && biobricks configure --bblib=/mnt/biobricks --token=${BIOBRICKS_TOKEN} --interactive=False'
+
+# Switch back to root; the container runs as "vscode" via "remoteUser" in devcontainer.json
+USER root
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..ab26d65
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,29 @@
+{
+  "name": "SMRT Development Container",
+  "build": {
+    "dockerfile": "Dockerfile",
+    "context": "..",
+    "args": {
+      "BIOBRICKS_TOKEN": "${localEnv:BIOBRICKS_PUBLIC_TOKEN}"
+    }
+  },
+  "features": {
+    "ghcr.io/devcontainers/features/docker-in-docker:1": {}
+  },
+  "customizations": {
+    "vscode": {
+      "settings": {
+        "terminal.integrated.defaultProfile.linux": "bash",
+        "python.pythonPath": "/usr/local/bin/python"
+      },
+      "extensions": [
+        "ms-python.python",
+        "ms-toolsai.jupyter",
+        "ms-vsliveshare.vsliveshare", // Live Share extension
+        "github.copilot", // GitHub Copilot extension
+        "insilica.vscode-pycmd"
+      ]
+    }
+  },
+  "remoteUser": "vscode"
+}
diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt
new file mode 100644
index 0000000..b7f5142
--- /dev/null
+++ b/.devcontainer/requirements.txt
@@ -0,0 +1,7 @@
+python-dotenv==1.0.1
+pandas==2.2.2
+biobricks==0.3.7
+fastparquet==2024.5.0
+pyarrow==16.1.0
+dvc==3.51.1
+dvc-s3==3.2.0
diff --git a/.dvc/.gitignore b/.dvc/.gitignore
new file mode 100644
index 0000000..528f30c
--- /dev/null
+++ b/.dvc/.gitignore
@@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
diff --git a/.dvc/config b/.dvc/config
new file mode 100644
index 0000000..701a919
--- /dev/null
+++ b/.dvc/config
@@ -0,0 +1,6 @@
+[core]
+    remote = biobricks.ai
+['remote "biobricks.ai"']
+    url = https://ins-dvc.s3.amazonaws.com/insdvc
+['remote "s3.biobricks.ai"']
+    url = s3://ins-dvc/insdvc
diff --git a/.dvcignore b/.dvcignore
new file mode 100644
index 0000000..5197305
--- /dev/null
+++ b/.dvcignore
@@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore
diff --git a/.github/workflows/bricktools-check.yaml b/.github/workflows/bricktools-check.yaml
new file mode 100644
index 0000000..55ec739
--- /dev/null
+++ b/.github/workflows/bricktools-check.yaml
@@ -0,0 +1,10 @@
+name: bricktools-check
+on: [push, workflow_dispatch]
+jobs:
+  bricktools-check:
+    runs-on: ubuntu-latest
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - name: bricktools check
+        uses: biobricks-ai/github-actions/bricktools-check@main
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..856f6df
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+logs
+/download
+/list
+/raw
+/brick
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..134c9d1
--- /dev/null
+++ b/README.md
@@ -0,0 +1,35 @@
+# How to build bricks
+
+1. Create a brick named `{newbrick}` from this template
+```
+gh repo create biobricks-ai/{newbrick} -p biobricks-ai/brick-template --public
+gh repo clone biobricks-ai/{newbrick}
+cd {newbrick}
+```
+
+2. Edit stages according to your needs:
+   Recommended scripts:
+   - ``01_download.sh``
+   - ``02_unzip.sh``
+   - ``03_build.sh`` calling a function to process individual files like ``csv2parquet.R`` or ``csv2parquet.py``
+
+3. Replace stages in dvc.yaml with your new stages
+
+4. Build your brick
+```
+dvc repro # runs new stages
+```
+
+5. Push the data to biobricks.ai
+```
+dvc push -r s3.biobricks.ai
+```
+
+6. Commit the brick
+```
+git add -A && git commit -m "some message"
+git push
+```
+
+7. Monitor the bricktools github action
+
diff --git a/dvc.lock b/dvc.lock
new file mode 100644
index 0000000..3137d7a
--- /dev/null
+++ b/dvc.lock
@@ -0,0 +1,2 @@
+schema: '2.0'
+stages: {}
diff --git a/dvc.yaml b/dvc.yaml
new file mode 100644
index 0000000..e28b3d9
--- /dev/null
+++ b/dvc.yaml
@@ -0,0 +1,35 @@
+# Brick DVC stages
+# See https://dvc.org/doc/user-guide/project-structure/dvcyaml-files#pipelines-files-dvcyaml
+
+# The complete process can be executed using:
+# dvc repro
+# If you want to force redoing the process use
+# dvc repro -f
+# Individual stage can be executed using:
+# dvc repro
+
+stages:
+  download:
+    cmd: stages/01_download.sh
+    deps:
+      - stages/01_download.sh
+    outs:
+      - download
+      - list
+  unzip:
+    cmd: stages/02_unzip.sh
+    deps:
+      - stages/02_unzip.sh
+      - download
+      - list
+    outs:
+      - raw
+  build:
+    cmd: stages/03_build.sh
+    deps:
+      - stages/03_build.sh
+      - stages/csv2parquet.py
+      - raw
+      - list
+    outs:
+      - brick
diff --git a/stages/01_download.sh b/stages/01_download.sh
new file mode 100755
index 0000000..e901153
--- /dev/null
+++ b/stages/01_download.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+# Script to download files
+
+# Get local path
+localpath=$(pwd)
+echo "Local path: $localpath"
+
+# Create the list directory to save list of remote files and directories
+listpath="$localpath/list"
+echo "List path: $listpath"
+mkdir -p $listpath
+cd $listpath;
+
+# Define the FTP base address
+export ftpbase=""
+
+# Retrieve the list of files to download from FTP base address
+wget --no-remove-listing $ftpbase
+cat index.html | grep -Po '(?<=href=")[^"]*' | sort | cut -d "/" -f 10 > files.txt
+rm .listing
+rm index.html
+
+# Create the download directory
+export downloadpath="$localpath/download"
+echo "Download path: $downloadpath"
+mkdir -p "$downloadpath"
+cd $downloadpath;
+
+# Download files in parallel
+cat $listpath/files.txt | xargs -P14 -n1 bash -c '
+  echo $0
+  wget -nH -q -nc -P $downloadpath $ftpbase$0
+'
+
+echo "Download done."
diff --git a/stages/02_unzip.sh b/stages/02_unzip.sh
new file mode 100755
index 0000000..286198f
--- /dev/null
+++ b/stages/02_unzip.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+# Script to unzip files
+
+# Get local path
+localpath=$(pwd)
+echo "Local path: $localpath"
+
+# Set download path
+export downloadpath="$localpath/download"
+echo "Download path: $downloadpath"
+
+# Set list path
+listpath="$localpath/list"
+echo "List path: $listpath"
+
+# Create raw path
+export rawpath="$localpath/raw"
+mkdir -p $rawpath
+echo "Raw path: $rawpath"
+
+# Unzip files in parallel
+cat $listpath/files.txt | tail -n +2 | xargs -P14 -n1 bash -c '
+  filename="${0%.*}"
+  echo $downloadpath/$0
+  echo $rawpath/$filename
+  unzip $downloadpath/$0 -d $rawpath/$filename
+'
diff --git a/stages/03_build.sh b/stages/03_build.sh
new file mode 100755
index 0000000..60b2ab7
--- /dev/null
+++ b/stages/03_build.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+# Script to process unzipped files and build parquet files
+
+# Get local path
+localpath=$(pwd)
+echo "Local path: $localpath"
+
+# Set list path
+listpath="$localpath/list"
+mkdir -p $listpath
+echo "List path: $listpath"
+
+# Set raw path
+export rawpath="$localpath/raw"
+echo "Raw path: $rawpath"
+
+# Create brick directory
+export brickpath="$localpath/brick"
+mkdir -p $brickpath
+echo "Brick path: $brickpath"
+
+# Process raw files and create parquet files in parallel
+# calling a Python function with arguments input and output filenames
+cat $listpath/files.txt | tail -n +4 | xargs -P14 -n1 bash -c '
+  filename="${0%.*}"
+  echo $rawpath/$filename/$filename.txt
+  echo $brickpath/$filename.parquet
+  python stages/csv2parquet.py $rawpath/$filename/$filename.txt $brickpath/$filename.parquet
+'
diff --git a/stages/csv2parquet.R b/stages/csv2parquet.R
new file mode 100644
index 0000000..6576d98
--- /dev/null
+++ b/stages/csv2parquet.R
@@ -0,0 +1,2 @@
+# edit this stage to create new resources in the data directory
+mtcars |> arrow::write_parquet("brick/mtcars.parquet")
diff --git a/stages/csv2parquet.py b/stages/csv2parquet.py
new file mode 100644
index 0000000..f821fa8
--- /dev/null
+++ b/stages/csv2parquet.py
@@ -0,0 +1,11 @@
+import pandas as pd
+import sys
+import pyarrow  # imported so a missing parquet engine fails fast
+import fastparquet  # imported so a missing parquet engine fails fast
+
+InFileName = sys.argv[1]
+OutFileName = sys.argv[2]
+
+print(f"csv2parquet: Converting file {InFileName}")
+DF = pd.read_csv(InFileName, sep=',')
+DF.to_parquet(OutFileName)