Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
tomlue authored Oct 24, 2024
0 parents commit 0945695
Show file tree
Hide file tree
Showing 16 changed files with 298 additions and 0 deletions.
57 changes: 57 additions & 0 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
FROM mcr.microsoft.com/devcontainers/python:3.9

# Install the OS packages used inside the dev container: download/archive
# tools (wget, curl, unzip), X/GTK/audio shared libraries (presumably for a
# headless browser - confirm), Python venv support and the AWS CLI.
# The apt package index is deleted in the same layer so it is not stored in
# the image.
# NOTE(review): libgconf-2-4 and libappindicator1 may not be packaged in newer
# Debian releases - confirm they resolve against this base image.
RUN apt-get update && apt-get install -y \
    wget \
    unzip \
    xvfb \
    libxi6 \
    libgconf-2-4 \
    libnss3-dev \
    libxss1 \
    libappindicator1 \
    fonts-liberation \
    libatk-bridge2.0-0 \
    libgtk-3-0 \
    libgbm1 \
    libasound2 \
    libdpkg-perl \
    libatomic1 \
    ca-certificates \
    curl \
    gnupg \
    python3-venv \
    awscli \
    && rm -rf /var/lib/apt/lists/*

# Install pipx with pip and let pipx add its bin directory to the shell PATH.
# --no-cache-dir keeps pip's download cache out of the image layer.
# NOTE(review): no USER instruction precedes this step, so `--user` installs
# into the home of the base image's default user (presumably root) - confirm
# that the later `pipx install` run as vscode can actually see this pipx.
RUN python3 -m pip install --no-cache-dir --user pipx \
    && python3 -m pipx ensurepath

# Prepare the biobricks library directory and hand the whole tree to vscode
RUN mkdir -p /mnt/biobricks/biobricks-ai && chown -R vscode:vscode /mnt/biobricks

# Append the per-user ~/.local/bin directory to PATH in both the system-wide
# bashrc and the vscode user's bashrc
RUN printf '%s\n' 'export PATH="$PATH:$HOME/.local/bin"' \
    | tee -a /etc/bash.bashrc /home/vscode/.bashrc > /dev/null

# Switch to vscode user to perform user-specific installations
USER vscode

# Install the pinned Python dependencies from the dev container's
# requirements file. --no-cache-dir keeps pip's download cache out of the
# image layer.
COPY .devcontainer/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

# Accept build argument for BIOBRICKS_TOKEN and fall back to DEFAULT_TOKEN when
# it is unset or empty. The fallback has to happen in ENV: a shell `export`
# inside a RUN step does not persist to later instructions.
# NOTE(review): both values are stored in the image environment and build
# history; presumably DEFAULT_TOKEN is a public/shared token - confirm. A
# private token should be passed with a BuildKit secret mount instead.
ARG BIOBRICKS_TOKEN
ENV DEFAULT_TOKEN=VQF6Q2U-NKktZ31ioVYa9w
ENV BIOBRICKS_TOKEN=${BIOBRICKS_TOKEN:-${DEFAULT_TOKEN}}

# Install biobricks with pipx, verify that the token is present and at least
# 5 characters long, then configure biobricks non-interactively with
# /mnt/biobricks as its library directory.
# The token expansions are quoted so the value is never word-split or globbed.
# NOTE(review): /etc/bash.bashrc may return early in non-interactive shells,
# in which case sourcing it here does not change PATH - confirm.
RUN /bin/bash -c 'source /etc/bash.bashrc && pipx install biobricks && biobricks version' \
    && /bin/bash -c 'if [ -z "$BIOBRICKS_TOKEN" ] || [ "${#BIOBRICKS_TOKEN}" -lt 5 ]; then echo "BIOBRICKS_TOKEN is not set or is too short (less than 5 characters)"; exit 1; fi' \
    && /bin/bash -c 'source /etc/bash.bashrc && biobricks configure --bblib=/mnt/biobricks --token="${BIOBRICKS_TOKEN}" --interactive=False'

# Switch back to root user so that any remaining image setup runs as root.
# NOTE(review): devcontainer.json sets "remoteUser": "vscode", so presumably
# the interactive dev session still runs as vscode - confirm.
USER root
29 changes: 29 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
  "name": "SMRT Development Container",
  // Build from .devcontainer/Dockerfile with the repository root as context;
  // the token build argument is read from the host's BIOBRICKS_PUBLIC_TOKEN.
  "build": {
    "dockerfile": "Dockerfile",
    "context": "..",
    "args": {
      "BIOBRICKS_TOKEN": "${localEnv:BIOBRICKS_PUBLIC_TOKEN}"
    }
  },
  "features": {
    "ghcr.io/devcontainers/features/docker-in-docker:1": {}
  },
  "customizations": {
    "vscode": {
      "settings": {
        "terminal.integrated.defaultProfile.linux": "bash",
        // "python.pythonPath" is deprecated in the Python extension;
        // "python.defaultInterpreterPath" is its replacement.
        "python.defaultInterpreterPath": "/usr/local/bin/python"
      },
      "extensions": [
        "ms-python.python",
        "ms-toolsai.jupyter",
        "ms-vsliveshare.vsliveshare", // Live Share extension
        "github.copilot", // GitHub Copilot extension
        "insilica.vscode-pycmd"
      ]
    }
  },
  "remoteUser": "vscode"
}
7 changes: 7 additions & 0 deletions .devcontainer/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Pinned Python dependencies for the dev container; installed by
# .devcontainer/Dockerfile with `pip install -r`.
python-dotenv==1.0.1
pandas==2.2.2
biobricks==0.3.7
fastparquet==2024.5.0
pyarrow==16.1.0
dvc==3.51.1
dvc-s3==3.2.0
3 changes: 3 additions & 0 deletions .dvc/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/config.local
/tmp
/cache
6 changes: 6 additions & 0 deletions .dvc/config
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[core]
remote = biobricks.ai
['remote "biobricks.ai"']
url = https://ins-dvc.s3.amazonaws.com/insdvc
['remote "s3.biobricks.ai"']
url = s3://ins-dvc/insdvc
3 changes: 3 additions & 0 deletions .dvcignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore
10 changes: 10 additions & 0 deletions .github/workflows/bricktools-check.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Runs the shared biobricks-ai "bricktools-check" action on every push and on
# manual dispatch.
name: bricktools-check

on:
  - push
  - workflow_dispatch

jobs:
  bricktools-check:
    runs-on: ubuntu-latest
    env:
      # Exposes the workflow's built-in token to the action as GITHUB_PAT
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: bricktools check
        uses: biobricks-ai/github-actions/bricktools-check@main
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
logs
/download
/list
/brick
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# How to build bricks

1. Create a brick named `{newbrick}` from this template
```
gh repo create biobricks-ai/{newbrick} -p biobricks-ai/brick-template --public
gh repo clone biobricks-ai/{newbrick}
cd {newbrick}
```

2. Edit stages according to your needs:
Recommended scripts:
- ``01_download.sh``
- ``02_unzip.sh``
- ``03_build.sh`` calling a function to process individual files like ``csv2parquet.R`` or ``csv2parquet.py``

3. Replace stages in dvc.yaml with your new stages

4. Build your brick
```
dvc repro # runs new stages
```

5. Push the data to biobricks.ai
```
dvc push -r s3.biobricks.ai
```

6. Commit the brick
```
git add -A && git commit -m "some message"
git push
```

7. Monitor the bricktools GitHub Action

2 changes: 2 additions & 0 deletions dvc.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
schema: '2.0'
stages: {}
35 changes: 35 additions & 0 deletions dvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Brick DVC stages
# See https://dvc.org/doc/user-guide/project-structure/dvcyaml-files#pipelines-files-dvcyaml

# The complete process can be executed using:
# dvc repro
# If you want to force redoing the process use
# dvc repro -f
# An individual stage can be executed using:
# dvc repro <stage>

stages:
  # Fetch the remote file list (list/) and the files themselves (download/)
  download:
    cmd: stages/01_download.sh
    deps:
      - stages/01_download.sh
    outs:
      - download
      - list
  # Extract every downloaded archive into raw/
  unzip:
    cmd: stages/02_unzip.sh
    deps:
      - stages/02_unzip.sh
      - download
      - list
    outs:
      - raw
  # Convert the extracted files into parquet files under brick/
  build:
    cmd: stages/03_build.sh
    deps:
      - stages/03_build.sh
      # 03_build.sh invokes `python stages/csv2parquet.py`, so that is the
      # helper this stage depends on
      - stages/csv2parquet.py
      - raw
      - list
    outs:
      - brick
36 changes: 36 additions & 0 deletions stages/01_download.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env bash

# Script to download files.
#
# Fetches the index page at $ftpbase, extracts the linked file names into
# list/files.txt, then downloads every listed file into download/ using up to
# 14 parallel wget processes.

# Stop at the first failing command, unset variable or failed pipeline stage,
# so a broken download is never reported as "Download done."
set -euo pipefail

# Get local path
localpath=$(pwd)
echo "Local path: $localpath"

# Create the list directory to save list of remote files and directories
listpath="$localpath/list"
echo "List path: $listpath"
mkdir -p "$listpath"
cd "$listpath"

# Define the FTP base address (must be filled in before running this stage)
export ftpbase=""
: "${ftpbase:?ftpbase is empty - set the base address in stages/01_download.sh}"

# Retrieve the list of files to download from FTP base address.
# Field 10 of each "/"-separated href is kept as the file name; adjust the
# field number to the layout of the address being listed.
wget --no-remove-listing "$ftpbase"
grep -Po '(?<=href=")[^"]*' index.html | sort | cut -d "/" -f 10 > files.txt
# -f: the .listing file may not have been written by wget
rm -f .listing index.html

# Create the download directory
export downloadpath="$localpath/download"
echo "Download path: $downloadpath"
mkdir -p "$downloadpath"
cd "$downloadpath"

# Download files in parallel. Each file name is passed to the inner shell as
# $1 ("_" fills $0); -r skips the run entirely when the list is empty.
xargs -r -P14 -n1 bash -c '
echo "$1"
wget -nH -q -nc -P "$downloadpath" "$ftpbase$1"
' _ < "$listpath/files.txt"

echo "Download done."
28 changes: 28 additions & 0 deletions stages/02_unzip.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash

# Script to unzip files.
#
# Extracts each archive named in list/files.txt from download/ into its own
# directory under raw/, using up to 14 parallel unzip processes.

# Stop at the first failing command, unset variable or failed pipeline stage
set -euo pipefail

# Get local path
localpath=$(pwd)
echo "Local path: $localpath"

# Set download path
export downloadpath="$localpath/download"
echo "Download path: $downloadpath"

# Set list path
listpath="$localpath/list"
echo "List path: $listpath"

# Create raw path
export rawpath="$localpath/raw"
mkdir -p "$rawpath"
echo "Raw path: $rawpath"

# Unzip files in parallel. The first line of files.txt is skipped
# (presumably a non-archive entry of the listing - confirm). Each archive name
# is passed to the inner shell as $1 ("_" fills $0) and is extracted into a
# directory named after the archive without its extension.
tail -n +2 "$listpath/files.txt" | xargs -r -P14 -n1 bash -c '
filename="${1%.*}"
echo "$downloadpath/$1"
echo "$rawpath/$filename"
unzip "$downloadpath/$1" -d "$rawpath/$filename"
' _
30 changes: 30 additions & 0 deletions stages/03_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

# Script to process unzipped files and build parquet files.
#
# For each entry of list/files.txt, converts raw/<name>/<name>.txt into
# brick/<name>.parquet with stages/csv2parquet.py, using up to 14 parallel
# processes.

# Stop at the first failing command, unset variable or failed pipeline stage
set -euo pipefail

# Get local path
localpath=$(pwd)
echo "Local path: $localpath"

# Set list path
listpath="$localpath/list"
mkdir -p "$listpath"
echo "List path: $listpath"

# Set raw path
export rawpath="$localpath/raw"
echo "Raw path: $rawpath"

# Create brick directory
export brickpath="$localpath/brick"
mkdir -p "$brickpath"
echo "Brick path: $brickpath"

# Process raw files and create parquet files in parallel
# calling a Python function with arguments input and output filenames.
# The input path is the one echoed below: 02_unzip.sh extracts every archive
# into raw/<name>/, so the text file lives inside that directory.
# NOTE(review): this stage skips the first 3 lines of files.txt while
# 02_unzip.sh skips only 1 - confirm which offset is intended.
tail -n +4 "$listpath/files.txt" | xargs -r -P14 -n1 bash -c '
filename="${1%.*}"
echo "$rawpath/$filename/$filename.txt"
echo "$brickpath/$filename.parquet"
python stages/csv2parquet.py "$rawpath/$filename/$filename.txt" "$brickpath/$filename.parquet"
' _
2 changes: 2 additions & 0 deletions stages/csv2parquet.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Template stage: writes the built-in mtcars data frame to brick/mtcars.parquet.
# Replace it with the conversion your brick needs.
arrow::write_parquet(mtcars, "brick/mtcars.parquet")
11 changes: 11 additions & 0 deletions stages/csv2parquet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import pandas as pd
import sys
import pyarrow as pyarrow
import fastparquet as fastparquet

def main(argv=None):
    """Convert one comma-separated text file to a parquet file.

    Args:
        argv: Optional argument list ``[input_file, output_file, ...]``.
            Defaults to ``sys.argv[1:]``. Arguments after the first two are
            ignored.

    Returns:
        0 after the parquet file has been written, 2 when fewer than two
        file names were supplied.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        # Report a usage error instead of failing with a bare IndexError
        print("usage: csv2parquet.py <input.csv> <output.parquet>", file=sys.stderr)
        return 2
    in_file_name, out_file_name = args[0], args[1]

    print(f"csv2parquet: Converting file {in_file_name}")
    frame = pd.read_csv(in_file_name, sep=',')
    frame.to_parquet(out_file_name)
    return 0


# Only run the conversion when executed as a script, not when imported
if __name__ == "__main__":
    sys.exit(main())

0 comments on commit 0945695

Please sign in to comment.