generated from biobricks-ai/brick-template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 0945695
Showing
16 changed files
with
298 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
FROM mcr.microsoft.com/devcontainers/python:3.9

# Install necessary packages.
# The apt package index is removed in the same layer so it does not end up
# in the image.
RUN apt-get update && apt-get install -y \
        awscli \
        ca-certificates \
        curl \
        fonts-liberation \
        gnupg \
        libappindicator1 \
        libasound2 \
        libatk-bridge2.0-0 \
        libatomic1 \
        libdpkg-perl \
        libgbm1 \
        libgconf-2-4 \
        libgtk-3-0 \
        libnss3-dev \
        libxi6 \
        libxss1 \
        python3-venv \
        unzip \
        wget \
        xvfb \
    && rm -rf /var/lib/apt/lists/*

# Install pipx (still running as root at this point)
RUN python3 -m pip install --no-cache-dir --user pipx \
    && python3 -m pipx ensurepath

# Create /mnt/biobricks directory and set permissions
RUN mkdir -p /mnt/biobricks/biobricks-ai \
    && chown -R vscode:vscode /mnt/biobricks

# Add pipx binaries to the PATH for all users
RUN echo 'export PATH="$PATH:$HOME/.local/bin"' >> /etc/bash.bashrc \
    && echo 'export PATH="$PATH:$HOME/.local/bin"' >> /home/vscode/.bashrc

# Switch to vscode user to perform user-specific installations
USER vscode

# Install Python dependencies
COPY .devcontainer/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

# Accept build argument for BIOBRICKS_TOKEN and fall back to the default value
# when it is unset or empty.
# DEFAULT_TOKEN must be set in its own ENV instruction: substitution within a
# single ENV instruction only sees values defined by earlier instructions.
# NOTE(review): both values are stored in the image environment, so only a
# public token should ever be passed here.
ARG BIOBRICKS_TOKEN
ENV DEFAULT_TOKEN=VQF6Q2U-NKktZ31ioVYa9w
ENV BIOBRICKS_TOKEN=${BIOBRICKS_TOKEN:-${DEFAULT_TOKEN}}

# Install biobricks and configure it.
# A token shorter than 5 characters fails the build here. (An `export` in a
# separate RUN cannot repair the value, because every RUN starts a new shell.)
RUN /bin/bash -c 'source /etc/bash.bashrc && pipx install biobricks && biobricks version' \
    && /bin/bash -c 'if [ -z "$BIOBRICKS_TOKEN" ] || [ ${#BIOBRICKS_TOKEN} -lt 5 ]; then echo "BIOBRICKS_TOKEN is not set or is too short (less than 5 characters)"; exit 1; fi' \
    && /bin/bash -c 'source /etc/bash.bashrc && biobricks configure --bblib=/mnt/biobricks --token="${BIOBRICKS_TOKEN}" --interactive=False'

# Switch back to root user to complete setup
USER root
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
{
  "name": "SMRT Development Container",
  "build": {
    "dockerfile": "Dockerfile",
    // The build context is the parent directory, so the Dockerfile can COPY
    // .devcontainer/requirements.txt by its path from there.
    "context": "..",
    "args": {
      // Forwarded from the host environment; the Dockerfile falls back to
      // its default token when this is empty.
      "BIOBRICKS_TOKEN": "${localEnv:BIOBRICKS_PUBLIC_TOKEN}"
    }
  },
  "features": {
    "ghcr.io/devcontainers/features/docker-in-docker:1": {}
  },
  "customizations": {
    "vscode": {
      "settings": {
        "terminal.integrated.defaultProfile.linux": "bash",
        // Replaces the deprecated "python.pythonPath" setting.
        "python.defaultInterpreterPath": "/usr/local/bin/python"
      },
      "extensions": [
        "ms-python.python",
        "ms-toolsai.jupyter",
        "ms-vsliveshare.vsliveshare", // Live Share extension
        "github.copilot", // GitHub Copilot extension
        "insilica.vscode-pycmd"
      ]
    }
  },
  "remoteUser": "vscode"
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
python-dotenv==1.0.1 | ||
pandas==2.2.2 | ||
biobricks==0.3.7 | ||
fastparquet==2024.5.0 | ||
pyarrow==16.1.0 | ||
dvc==3.51.1 | ||
dvc-s3==3.2.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
/config.local | ||
/tmp | ||
/cache |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
[core] | ||
remote = biobricks.ai | ||
['remote "biobricks.ai"'] | ||
url = https://ins-dvc.s3.amazonaws.com/insdvc | ||
['remote "s3.biobricks.ai"'] | ||
url = s3://ins-dvc/insdvc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Add patterns of files dvc should ignore, which could improve | ||
# the performance. Learn more at | ||
# https://dvc.org/doc/user-guide/dvcignore |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Runs the bricktools-check action from biobricks-ai/github-actions
# on every push and on manual dispatch.
name: bricktools-check

on:
  push:
  workflow_dispatch:

jobs:
  bricktools-check:
    runs-on: ubuntu-latest
    env:
      # Exposes the workflow's built-in token to the job as GITHUB_PAT.
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: bricktools check
        uses: biobricks-ai/github-actions/bricktools-check@main
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
logs | ||
/download | ||
/list | ||
/brick |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# How to build bricks | ||
|
||
1. Create a brick named `{newbrick}` from this template | ||
``` | ||
gh repo create biobricks-ai/{newbrick} -p biobricks-ai/brick-template --public | ||
gh repo clone biobricks-ai/{newbrick} | ||
cd {newbrick} | ||
``` | ||
|
||
2. Edit stages according to your needs: | ||
Recommended scripts: | ||
- ``01_download.sh`` | ||
- ``02_unzip.sh`` | ||
- ``03_build.sh`` calling a function to process individual files like ``csv2parquet.R`` or ``csv2parquet.py`` | ||
|
||
3. Replace stages in dvc.yaml with your new stages | ||
|
||
4. Build your brick | ||
``` | ||
dvc repro # runs new stages | ||
``` | ||
|
||
5. Push the data to biobricks.ai | ||
``` | ||
dvc push -r s3.biobricks.ai | ||
``` | ||
|
||
6. Commit the brick | ||
``` | ||
git add -A && git commit -m "some message" | ||
git push | ||
``` | ||
|
||
7. Monitor the bricktools github action | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
schema: '2.0' | ||
stages: {} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Brick DVC stages
# See https://dvc.org/doc/user-guide/project-structure/dvcyaml-files#pipelines-files-dvcyaml

# The complete process can be executed using:
#   dvc repro
# If you want to force redoing the process use
#   dvc repro -f
# Individual stage can be executed using:
#   dvc repro <stage>

stages:
  # Fetches the remote file list into list/ and the files into download/.
  download:
    cmd: stages/01_download.sh
    deps:
      - stages/01_download.sh
    outs:
      - download
      - list
  # Extracts the downloaded archives into raw/.
  unzip:
    cmd: stages/02_unzip.sh
    deps:
      - stages/02_unzip.sh
      - download
      - list
    outs:
      - raw
  # Converts the raw files into parquet files under brick/.
  build:
    cmd: stages/03_build.sh
    deps:
      - stages/03_build.sh
      # The build script runs the Python converter, so that file is the
      # dependency to track.
      - stages/csv2parquet.py
      - raw
      - list
    outs:
      - brick
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
#!/usr/bin/env bash

# Script to download files
#
# Writes the names of the remote files to list/files.txt and downloads each
# of them into download/ (both relative to the current working directory).

# Stop on the first failing command, unset variable or failing pipeline
# stage, so a failed download gives a non-zero exit status instead of
# reaching the final "Download done." message.
set -euo pipefail

# Get local path
localpath=$(pwd)
echo "Local path: $localpath"

# Create the list directory to save list of remote files and directories
listpath="$localpath/list"
echo "List path: $listpath"
mkdir -p "$listpath"
cd "$listpath"

# Define the FTP base address (exported for the per-file shells started below)
export ftpbase=""
if [ -z "$ftpbase" ]; then
  echo "ftpbase is empty: set the base address in $0 before running this stage" >&2
  exit 1
fi

# Retrieve the list of files to download from FTP base address
wget --no-remove-listing "$ftpbase"
# Keep the 10th "/"-separated field of every href found in the index page
grep -Po '(?<=href=")[^"]*' index.html | sort | cut -d "/" -f 10 > files.txt
# .listing is only left behind by FTP retrievals, so its absence is not an error
rm -f .listing index.html

# Create the download directory
export downloadpath="$localpath/download"
echo "Download path: $downloadpath"
mkdir -p "$downloadpath"
cd "$downloadpath"

# Download files in parallel (up to 14 at a time, one file name per wget).
# -r skips the command entirely when files.txt is empty; "_" fills $0 so the
# file name arrives as $1.
xargs -r -P14 -n1 bash -c '
  echo "$1"
  wget -nH -q -nc -P "$downloadpath" "$ftpbase$1"
' _ < "$listpath/files.txt"

echo "Download done."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/usr/bin/env bash

# Script to unzip files
#
# Extracts every archive named in list/files.txt from download/ into its own
# directory raw/<name without extension>/.

# Stop on the first failing command, unset variable or failing pipeline
# stage (e.g. a missing list/files.txt).
set -euo pipefail

# Get local path
localpath=$(pwd)
echo "Local path: $localpath"

# Set download path (exported for the per-file shells started below)
export downloadpath="$localpath/download"
echo "Download path: $downloadpath"

# Set list path
listpath="$localpath/list"
echo "List path: $listpath"

# Create raw path (exported for the per-file shells started below)
export rawpath="$localpath/raw"
mkdir -p "$rawpath"
echo "Raw path: $rawpath"

# Unzip files in parallel (up to 14 at a time).
# tail -n +2 skips the first line of files.txt.
# NOTE(review): presumably that line is not an archive - confirm for your source.
# "_" fills $0 so the file name arrives as $1.
tail -n +2 "$listpath/files.txt" | xargs -r -P14 -n1 bash -c '
  filename="${1%.*}"
  echo "$downloadpath/$1"
  echo "$rawpath/$filename"
  unzip "$downloadpath/$1" -d "$rawpath/$filename"
' _
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
#!/usr/bin/env bash

# Script to process unzipped files and build parquet files
#
# For every entry of list/files.txt, converts raw/<name>/<name>.txt into
# brick/<name>.parquet using stages/csv2parquet.py.

# Stop on the first failing command, unset variable or failing pipeline
# stage (e.g. a missing list/files.txt).
set -euo pipefail

# Get local path
localpath=$(pwd)
echo "Local path: $localpath"

# Set list path
listpath="$localpath/list"
mkdir -p "$listpath"
echo "List path: $listpath"

# Set raw path (exported for the per-file shells started below)
export rawpath="$localpath/raw"
echo "Raw path: $rawpath"

# Create brick directory (exported for the per-file shells started below)
export brickpath="$localpath/brick"
mkdir -p "$brickpath"
echo "Brick path: $brickpath"

# Process raw files and create parquet files in parallel
# calling a Python function with arguments input and output filenames.
# tail -n +4 skips the first three lines of files.txt.
# NOTE(review): the unzip stage skips only one line - confirm which is intended.
# "_" fills $0 so the file name arrives as $1.
tail -n +4 "$listpath/files.txt" | xargs -r -P14 -n1 bash -c '
  filename="${1%.*}"
  # The unzip stage extracts each archive into $rawpath/$filename/, so the
  # input file is read from inside that directory.
  infile="$rawpath/$filename/$filename.txt"
  outfile="$brickpath/$filename.parquet"
  echo "$infile"
  echo "$outfile"
  python stages/csv2parquet.py "$infile" "$outfile"
' _
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Edit this stage to create new resources.
# Writes the built-in mtcars data frame to brick/mtcars.parquet.
arrow::write_parquet(mtcars, "brick/mtcars.parquet")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
import pandas as pd | ||
import sys | ||
import pyarrow as pyarrow | ||
import fastparquet as fastparquet | ||
|
||
def convert(in_file_name, out_file_name):
    """Read a comma-separated file and write its contents as a parquet file.

    Args:
        in_file_name: Path of the CSV file to read.
        out_file_name: Path of the parquet file to create.
    """
    print(f"csv2parquet: Converting file {in_file_name}")
    frame = pd.read_csv(in_file_name, sep=',')
    frame.to_parquet(out_file_name)


def main(argv):
    """Command-line entry point.

    Args:
        argv: Argument list in ``sys.argv`` layout: program name, input
            path, output path. Any further arguments are ignored.

    Returns:
        0 after a successful conversion, 2 when the input or output path
        is missing.
    """
    if len(argv) < 3:
        # Print a usage message instead of failing with a bare IndexError.
        print("usage: csv2parquet.py <input.csv> <output.parquet>", file=sys.stderr)
        return 2
    convert(argv[1], argv[2])
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))