Skip to content

Commit

Permalink
Pipeline cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
zmughal committed Dec 6, 2024
1 parent c87f329 commit 8ec5779
Show file tree
Hide file tree
Showing 8 changed files with 41 additions and 41 deletions.
2 changes: 1 addition & 1 deletion .bb/dependencies.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
https://github.com/biobricks-ai/pubchem-annotations#4bc6de375ba7dbdbd2986b53d952e41ace592551
https://github.com/biobricks-ai/pubchem-annotations#3b1b417e43b17b6caf75a730f8c4119aa76f8c16
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ logs
/download
/list
/brick
/cache
14 changes: 12 additions & 2 deletions dvc.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
stages:

update_biobricks_dependencies:
cmd: stages/00_update_biobricks_dependencies.py
cmd: stages/00_update_biobricks_deps.sh
deps:
- stages/00_update_biobricks_deps.sh

process-annotations:
cmd: stages/01_process.py
deps:
- .bb/dependencies.txt
- stages/01_process.py
outs:
- cache/process/combined_annotations.ttl
build:
cmd: stages/02_build.sh
deps:
- cache/process/combined_annotations.ttl
- stages/02_build.sh
outs:
- brick/pubchem-annotations.hdt
- brick/annotations.hdt
7 changes: 0 additions & 7 deletions stages/00_update_biobricks_deps.py

This file was deleted.

3 changes: 3 additions & 0 deletions stages/00_update_biobricks_deps.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env bash
#
# Stage 00: fetch the BioBricks dependencies used by the later stages.
#
# Resolve bash through env instead of hard-coding /usr/bin/bash, which does
# not exist on every system; this also matches stages/02_build.sh.

# Strict mode, as in the other stage scripts: stop on the first failing
# command, on use of an unset variable, and on a failure inside a pipeline.
set -euo pipefail

# Download the dependency bricks (presumably those listed in
# .bb/dependencies.txt -- confirm against the biobricks CLI docs).
biobricks pull
33 changes: 2 additions & 31 deletions stages/01_process.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

import biobricks as bb
import pandas as pd
import pyarrow.parquet as pq
Expand Down Expand Up @@ -148,34 +150,3 @@
with open(file, "r") as infile:
outfile.write(infile.read())
outfile.write("\n") # Ensure separation between files

print("Creating HDT file ...")
# Convert the Turtle file to an HDT file
hdt_file = str(outdir / 'annotations.hdt')
# # # create empty HDT file
# with open(hdt_file, "w") as f:
# pass

# # # Create an HDT store
# store = HDTStore(hdt_file)

# # # load the entire graph into the store
# g = Graph()
# g.parse(combined_file, format='ttl')

# # # add the graph to the store
# store.load_graph(g) # or add_graph?

# # close the store to finalize the HDT file
# store.close()

# hdt = HDTDocument().from_graph(g)

# Conversion using the command-line tool rdf2hdtcat
subprocess.run(["rdf2hdtcat -p", combined_file, hdt_file], check=True)
# # Conversion using the command-line tool rdf2hdt
# subprocess.run(["rdf2hdt -p", combined_file, hdt_file], check=True)
print(f"Done writing HDT file to {hdt_file}")

# # delete cache directory
# shutil.rmtree(pathlib.Path('cache'))
20 changes: 20 additions & 0 deletions stages/02_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env bash
#
# Stage 02: convert the combined Turtle file written by the process stage
# into an HDT file under ./brick.
#
# Input:  cache/process/combined_annotations.ttl
# Output: brick/annotations.hdt
#
# Must be run from the repository root: every path is derived from $(pwd).

# Stop on the first failing command, on use of an unset variable, and on a
# failure inside a pipeline.
set -euo pipefail

# Get local path
localpath=$(pwd)
echo "Local path: $localpath"

# Get cache path
cachepath="$localpath/cache"
echo "Cache path: $cachepath"

# Create brick directory
brickpath="$localpath/brick"
# Quoted so a checkout path containing spaces or glob characters is passed
# as a single argument rather than being word-split.
mkdir -p "$brickpath"
echo "Brick path: $brickpath"

# Base URI handed to rdf2hdt via -B below.
export base_uri="http://rdf.ncbi.nlm.nih.gov/pubchem/annotations.hdt"

# rdf2hdt flags (per the hdt-cpp usage text -- confirm for the installed
# version): -i also builds the index, -p prints progress, -B sets the base URI.
# Input and output paths are quoted for the same reason as the mkdir above.
rdf2hdt -i -p -B "$base_uri" \
    "$cachepath/process/combined_annotations.ttl" \
    "$brickpath/annotations.hdt"
2 changes: 2 additions & 0 deletions stages/02_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

import rdflib
import pathlib
from rdflib_hdt import HDTStore
Expand Down

0 comments on commit 8ec5779

Please sign in to comment.