From 8ca1a133a7017ed96e56f8bda8bb3cb803533462 Mon Sep 17 00:00:00 2001 From: Zakariyya Mughal Date: Tue, 10 Dec 2024 11:17:30 -0500 Subject: [PATCH 1/3] refactor[m]: Formatting of flake --- flake.nix | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/flake.nix b/flake.nix index 4e089d8..6fbab8d 100644 --- a/flake.nix +++ b/flake.nix @@ -19,17 +19,17 @@ outputs = { self, nixpkgs, flake-utils, hdt-cpp, hdt-java }: flake-utils.lib.eachDefaultSystem (system: with import nixpkgs { inherit system; }; { - devShells.default = mkShell { - buildInputs = [ + devShells.default = mkShell { + buildInputs = [ hdt-cpp.packages.${system}.default - hdt-java.packages.${system}.default - apache-jena - apache-jena-fuseki - jq - ]; - env = { - JENA_HOME = "${apache-jena}"; - }; - }; + hdt-java.packages.${system}.default + apache-jena + apache-jena-fuseki + jq + ]; + env = { + JENA_HOME = "${apache-jena}"; + }; + }; }); } From 75f86301f6975ac767b37d22a48a57d7f9cc91dc Mon Sep 17 00:00:00 2001 From: Zakariyya Mughal Date: Fri, 6 Dec 2024 13:06:55 -0500 Subject: [PATCH 2/3] Pipeline cleanup --- .bb/dependencies.txt | 2 +- .gitignore | 1 + dvc.yaml | 15 ++++++++++++-- stages/00_update_biobricks_deps.py | 7 ------- stages/00_update_biobricks_deps.sh | 3 +++ stages/01_process.py | 33 ++---------------------------- stages/02_build.sh | 23 +++++++++++++++++++++ stages/02_test.py | 2 ++ 8 files changed, 45 insertions(+), 41 deletions(-) delete mode 100644 stages/00_update_biobricks_deps.py create mode 100755 stages/00_update_biobricks_deps.sh create mode 100755 stages/02_build.sh diff --git a/.bb/dependencies.txt b/.bb/dependencies.txt index 5d5eecf..9bc139f 100644 --- a/.bb/dependencies.txt +++ b/.bb/dependencies.txt @@ -1 +1 @@ -https://github.com/biobricks-ai/pubchem-annotations#4bc6de375ba7dbdbd2986b53d952e41ace592551 +https://github.com/biobricks-ai/pubchem-annotations#3b1b417e43b17b6caf75a730f8c4119aa76f8c16 diff --git a/.gitignore b/.gitignore index 856f6df..7f9eb21 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ logs /download /list /brick +/cache diff --git a/dvc.yaml b/dvc.yaml index 68a60ef..78abfc2 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -1,11 +1,22 @@ stages: update_biobricks_dependencies: - cmd: stages/00_update_biobricks_dependencies.py + cmd: stages/00_update_biobricks_deps.sh + deps: + - stages/00_update_biobricks_deps.sh process-annotations: cmd: stages/01_process.py deps: - .bb/dependencies.txt + - stages/01_process.py + outs: + - cache/process/combined_annotations.ttl + build: + cmd: stages/02_build.sh + deps: + - cache/process/combined_annotations.ttl + - stages/02_build.sh outs: - - brick/pubchem-annotations.hdt \ No newline at end of file + - brick/annotations.hdt + - brick/annotations.hdt.index.v1-1 diff --git a/stages/00_update_biobricks_deps.py b/stages/00_update_biobricks_deps.py deleted file mode 100644 index d5808af..0000000 --- a/stages/00_update_biobricks_deps.py +++ /dev/null @@ -1,7 +0,0 @@ -import os -import shutil -import subprocess - -# Clean up and initialize biobricks -shutil.rmtree('.bb', ignore_errors=True) -subprocess.run('biobricks init && biobricks add pubchem-annotations', shell=True) diff --git a/stages/00_update_biobricks_deps.sh b/stages/00_update_biobricks_deps.sh new file mode 100755 index 0000000..667701e --- /dev/null +++ b/stages/00_update_biobricks_deps.sh @@ -0,0 +1,3 @@ +#!/usr/bin/bash + +biobricks pull diff --git a/stages/01_process.py b/stages/01_process.py index 0195a36..62242df 100755 --- a/stages/01_process.py +++ b/stages/01_process.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import biobricks as bb import pandas as pd import pyarrow.parquet as pq @@ -148,34 +150,3 @@ with open(file, "r") as infile: outfile.write(infile.read()) outfile.write("\n") # Ensure separation between files - -print("Creating HDT file ...") -# Convert the Turtle file to an HDT file -hdt_file = str(outdir / 'annotations.hdt') -# # # create empty HDT file -# with open(hdt_file, "w") as f: -# pass - -# # # Create an HDT store -# store = HDTStore(hdt_file) - -# # # load the entire graph into the store -# g = Graph() -# g.parse(combined_file, format='ttl') - -# # # add the graph to the store -# store.load_graph(g) # or add_graph? - -# # close the store to finalize the HDT file -# store.close() - -# hdt = HDTDocument().from_graph(g) - -# Conversion using the command-line tool rdf2hdtcat -subprocess.run(["rdf2hdtcat -p", combined_file, hdt_file], check=True) -# # Conversion using the command-line tool rdf2hdt -# subprocess.run(["rdf2hdt -p", combined_file, hdt_file], check=True) -print(f"Done writing HDT file to {hdt_file}") - -# # delete cache directory -# shutil.rmtree(pathlib.Path('cache')) diff --git a/stages/02_build.sh b/stages/02_build.sh new file mode 100755 index 0000000..9e7334d --- /dev/null +++ b/stages/02_build.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Get local path +localpath=$(pwd) +echo "Local path: $localpath" + +# Get cache path +cachepath="$localpath/cache" +echo "Cache path: $cachepath" + +# Create brick directory +brickpath="$localpath/brick" +mkdir -p $brickpath +echo "Brick path: $brickpath" + +export base_uri="http://rdf.ncbi.nlm.nih.gov/pubchem/annotations.hdt" + +input_path="$cachepath/process/combined_annotations.ttl" +output_path="$brickpath/annotations.hdt" + +rdf2hdt -i -p -B "$base_uri" "$input_path" "$output_path" diff --git a/stages/02_test.py b/stages/02_test.py index 8228541..98d9fa1 100755 --- a/stages/02_test.py +++ b/stages/02_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import rdflib import pathlib from rdflib_hdt import HDTStore From 985674b58b4a902a66e58ca85fbe0edd0be6ebe2 Mon Sep 17 00:00:00 2001 From: Zakariyya Mughal Date: Fri, 6 Dec 2024 17:36:17 -0500 Subject: [PATCH 3/3] Update dvc.lock --- dvc.lock | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/dvc.lock b/dvc.lock index 3137d7a..8c9aa88 100644 --- a/dvc.lock +++ b/dvc.lock @@ -1,2 +1,45 @@ schema: '2.0' -stages: {} +stages: + update_biobricks_dependencies: + cmd: stages/00_update_biobricks_deps.sh + deps: + - path: stages/00_update_biobricks_deps.sh + hash: md5 + md5: fc8078fa10a515838c0bdd46f3d89127 + size: 32 + process-annotations: + cmd: stages/01_process.py + deps: + - path: .bb/dependencies.txt + hash: md5 + md5: 8da1cbf5465d917e74655708a4357ddf + size: 93 + - path: stages/01_process.py + hash: md5 + md5: ca5bfb5063f33d221f14e6b010a896be + size: 6025 + outs: + - path: cache/process/combined_annotations.ttl + hash: md5 + md5: 9e407f729c445a864cf3ef5cb5803568 + size: 7063736995 + build: + cmd: stages/02_build.sh + deps: + - path: cache/process/combined_annotations.ttl + hash: md5 + md5: 9e407f729c445a864cf3ef5cb5803568 + size: 7063736995 + - path: stages/02_build.sh + hash: md5 + md5: 9e3974eba3e2118f230e10967dec1987 + size: 515 + outs: + - path: brick/annotations.hdt + hash: md5 + md5: 188f7cc4f3b623e1a213c6aa4a12569b + size: 1026674753 + - path: brick/annotations.hdt.index.v1-1 + hash: md5 + md5: daa1e09f9599fb6db5979580d2e8ea41 + size: 500214214