Skip to content

Commit

Permalink
Merge pull request #1 from biobricks-ai/finished-version
Browse files Browse the repository at this point in the history
Finished version
  • Loading branch information
tomlue authored Jun 4, 2024
2 parents 5a5f0d9 + 47aed30 commit c0f14bf
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
/download
/brick
status.txt
status.txt
8 changes: 8 additions & 0 deletions code/00_status.py
Original file line number Diff line number Diff line change
@@ -1 +1,9 @@
# PURPOSE: CHECK IF THE SOURCE HAS CHANGED
import requests

# Landing page of the dataset on figshare. Its HTML is snapshotted into
# status.txt so that a change in the page content shows up as a changed file.
source_url = "https://figshare.com/articles/dataset/The_METLIN_small_molecule_dataset_for_machine_learning-based_retention_time_prediction/8038913"

# request.get source_url to status.txt
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() prevents an HTTP error page from being recorded as
# if it were the current state of the source.
response = requests.get(source_url, timeout=60)
response.raise_for_status()

# Write with an explicit encoding so the snapshot does not depend on the
# platform's default locale.
with open("status.txt", "w", encoding="utf-8") as file:
    file.write(response.text)
9 changes: 9 additions & 0 deletions code/01_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,12 @@
os.makedirs('download', exist_ok=True)

# download the raw archive into the ./download directory
import requests
source_url = "https://figshare.com/ndownloader/articles/8038913/versions/1"

# stream the source_url to the ./download/smrt_data.zip
# The body is first written to a ".part" file and only moved to its final
# name once the transfer has finished, so an interrupted download never
# leaves a truncated smrt_data.zip behind. A leftover ".part" file is simply
# overwritten on the next run.
partial_path = 'download/smrt_data.zip.part'

# timeout=(connect, read): fail instead of hanging if the server stalls.
with requests.get(source_url, stream=True, timeout=(10, 300)) as r:
    r.raise_for_status()
    with open(partial_path, 'wb') as f:
        # 1 MiB chunks: the archive is large, so avoid many tiny writes.
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

# Atomic rename within the same directory publishes the finished archive.
os.replace(partial_path, 'download/smrt_data.zip')
17 changes: 15 additions & 2 deletions code/02_process.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,20 @@
# PURPOSE: CHANGE THE DOWNLOADED DATA TO ONE OR MORE PARQUET FILES
import os
import zipfile

import pandas as pd

# exports to the ./brick directory
os.makedirs('brick', exist_ok=True)

# transform SMRT_DATA to PARQUET
# The CSV is read straight out of download/smrt_data.zip instead of unpacking
# the whole archive into a scratch ./temp directory. This avoids extracting
# members that are never used, leaves nothing to clean up if a later step
# fails, and cannot delete a pre-existing ./temp directory.
with zipfile.ZipFile('download/smrt_data.zip', 'r') as zip_ref:
    with zip_ref.open('SMRT_dataset.csv') as csv_file:
        # the source file is semicolon-separated
        smrt_dataset = pd.read_csv(csv_file, sep=';')

smrt_dataset.to_parquet('brick/smrt_dataset.parquet')

# skip other assets which are all generated or redundant to this raw data
37 changes: 17 additions & 20 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,36 @@ schema: '2.0'
stages:
status:
cmd: python code/00_status.py
deps:
- path: code/00_status.py
hash: md5
md5: 95a09d63c054eb185a1408771f4ee8a3
size: 43
download:
cmd: python code/01_download.py
deps:
- path: code/01_download.py
hash: md5
md5: f82fd5fc2597b90ed411991180e4ac30
size: 195
md5: 03b63774ca433e50ef222f35b826943a
size: 555
- path: status.txt
hash: md5
md5: 386705179ebe331fa00c2cb2d510ad5a
size: 167822
outs:
- path: download/
- path: download/smrt_data.zip
hash: md5
md5: d751713988987e9331980363e24189ce.dir
size: 0
nfiles: 0
md5: 06a13c7cf182b8be650feec1e620804b
size: 1007003829
process:
cmd: python code/02_process.py
deps:
- path: code/02_process.py
hash: md5
md5: c520d6a17cb1fb7e47155d606ce80701
size: 197
- path: download/
md5: 5e746adccd6d4f20fa5c5f7cdcb34328
size: 655
- path: download/smrt_data.zip
hash: md5
md5: d751713988987e9331980363e24189ce.dir
size: 0
nfiles: 0
md5: 06a13c7cf182b8be650feec1e620804b
size: 1007003829
outs:
- path: brick/
hash: md5
md5: d751713988987e9331980363e24189ce.dir
size: 0
nfiles: 0
md5: c66ba598b117d1c2c28a1cc4754cfe60.dir
size: 6655466
nfiles: 1
4 changes: 2 additions & 2 deletions dvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ stages:
- status.txt
- code/01_download.py
outs:
- download/
- download/smrt_data.zip
process:
cmd: python code/02_process.py
deps:
- download/
- download/smrt_data.zip
- code/02_process.py
outs:
- brick/
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ biobricks==0.3.7
fastparquet==2024.5.0
pyarrow==16.1.0
dvc==3.51.1
dvc-s3==3.2.0
dvc-s3==3.2.0

0 comments on commit c0f14bf

Please sign in to comment.