From e057ff8288231a50182ac0fe330325c1f594639e Mon Sep 17 00:00:00 2001
From: Thomas Luechtefeld <tom@insilica.co>
Date: Thu, 21 Nov 2024 20:41:32 +0000
Subject: [PATCH] end of pair coding session

---
 dvc.yaml                           |  3 ++
 stages/00_update_biobricks_deps.py |  7 ++++
 stages/01_process.py               | 58 +++++++++++++++++-------------
 stages/02_test.py                  | 38 ++++++++++++++++++++
 4 files changed, 81 insertions(+), 25 deletions(-)
 create mode 100644 stages/00_update_biobricks_deps.py
 create mode 100644 stages/02_test.py
diff --git a/dvc.yaml b/dvc.yaml
index b96a27b..68a60ef 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -1,5 +1,8 @@
 stages:
   
+  update_biobricks_dependencies:  # resets .bb and re-adds the pubchem-annotations brick
+    cmd: python stages/00_update_biobricks_deps.py
+    
   process-annotations:
     cmd: stages/01_process.py
     deps:
diff --git a/stages/00_update_biobricks_deps.py b/stages/00_update_biobricks_deps.py
new file mode 100644
index 0000000..d5808af
--- /dev/null
+++ b/stages/00_update_biobricks_deps.py
@@ -0,0 +1,7 @@
+import os
+import shutil
+import subprocess
+
+# Reset the local biobricks workspace (.bb) and re-add the brick; check=True fails the stage if either command fails
+shutil.rmtree('.bb', ignore_errors=True)
+subprocess.run('biobricks init && biobricks add pubchem-annotations', shell=True, check=True)
diff --git a/stages/01_process.py b/stages/01_process.py
index 6e3c9a1..c948c08 100755
--- a/stages/01_process.py
+++ b/stages/01_process.py
@@ -3,6 +3,7 @@
 import json
 import pathlib 
 from tqdm import tqdm
+import random
 
 tqdm.pandas()
 
@@ -19,37 +20,44 @@
 pa1 = rawpa[rawpa['PubChemCID'].progress_apply(lambda x: len(x) == 1)]
 pa1['PubChemCID'] = pa1['PubChemCID'].progress_apply(lambda x: int(x[0]))
 
-# let's look at a single row of the table
-rawrow = pa1.iloc[0]
-row = rawrow.apply(str).to_dict()
-rowstr = json.dumps(row, indent=4)
-print(rowstr)
-
-data = pa1['Data'].values
-data_obj = [json.loads(d) for d in tqdm(data)]
+# Pretty-print the first row of pa1 as indented JSON; each cell is converted with str() before dumping
+print(json.dumps(pa1.iloc[0].apply(str).to_dict(), indent=4))
 
 # create annotations
-annotations = []
-for obj in tqdm(data_obj):
-    # Check if the 'Value' key exists and contains 'StringWithMarkup'
-    if 'Value' in obj and 'StringWithMarkup' in obj['Value']:
-        # Extract the value from the 'StringWithMarkup' key
-        value = obj['Value']['StringWithMarkup'][0]['String']
-        # Create an annotation with 'have_value' and the extracted value
-        annotation = {'has_value': True, 'value': value}
-        annotations.append(annotation)
-
-# create chemical
-# create chemical has_annotation annotation
+# annotations = []
+# for obj in tqdm(data_obj):
+#     # Check if the 'Value' key exists and contains 'StringWithMarkup'
+#     if 'Value' in obj and 'StringWithMarkup' in obj['Value']:
+#         # Extract the value from the 'StringWithMarkup' key
+#         value = obj['Value']['StringWithMarkup'][0]['String']
+#         # Create an annotation with 'has_value' and the extracted value
+#         annotation = {'has_value': True, 'value': value}
+#         annotations.append(annotation)
+
+# - [x] create chemical
+# - [x] create annotation
+# - [x] create annotation has_subject chemical
+# - [ ] create annotation has_value value
 # loop through pa1 creating a chemical for each
 row = pa1.iloc[0]
 for index, row in tqdm(pa1.iterrows()):
     cid = row['PubChemCID']
-    chem_iri = f"https://pubchem.ncbi.nlm.nih.gov/rest/rdf/compound/{cid}.html"    
+    chem_iri = f"http://rdf.ncbi.nlm.nih.gov/pubchem/compound/CID{cid}"    
+
+    # create an annotation IRI; the id below is a random 6-digit placeholder
+    # anid = row['ANID']  # NOTE(review): random ids differ between runs and can collide - confirm the ANID column and use it
+    anid = random.randint(100000, 999999)
+    annotation_iri = f"http://rdf.ncbi.nlm.nih.gov/pubchem/annotation/ANID{anid}"
+
+    # TODO: create the value for the annotation (not implemented yet)
 
-# chemical has_identifier <build the pubchem cid>
-# chemical has_annotation <annotation>
-# annotation
-# annotation has_value "starkljksjdf" 
+    # link the annotation to its chemical: <annotation> dcterms:subject <chemical>
+    has_subject = "http://purl.org/dc/terms/subject"
+    triple = (annotation_iri, has_subject, chem_iri)
 
+    # append the triple as one line ('a' keeps earlier content); NOTE(review): the file is reopened for every row - consider opening it once
+    with open(outdir / 'annotations.ttl', 'a') as f:
+        f.write(f"<{triple[0]}> <{triple[1]}> <{triple[2]}> .\n")
+    
+    # TODO: add a has_annotation
 
diff --git a/stages/02_test.py b/stages/02_test.py
new file mode 100644
index 0000000..328e942
--- /dev/null
+++ b/stages/02_test.py
@@ -0,0 +1,38 @@
+import rdflib
+import pathlib
+
+outdir = pathlib.Path('cache/test')
+outdir.mkdir(parents=True, exist_ok=True)
+
+# Read the turtle file into a graph
+graph = rdflib.Graph()
+turtle_file = outdir / 'annotations.ttl'
+
+# Only the parse is guarded, so later I/O errors are not misreported as parse failures
+try:
+    graph.parse(source=turtle_file.as_posix(), format="turtle")
+except Exception as e:
+    # Explicitly fail if the graph fails to load
+    print(f"Failed to parse the graph: {e}")
+    raise
+
+# Generate metadata. zip(range(5), graph) samples at most five triples without
+# materialising the whole graph the way list(graph)[:5] would.
+metadata = {
+    "triple_count": len(graph),
+    "namespaces": list(graph.namespaces()),
+    "sample_triples": [triple for _, triple in zip(range(5), graph)],
+}
+
+# Write metadata to a file (explicit UTF-8 so the output does not depend on the locale)
+metadata_file = outdir / "test.txt"
+with open(metadata_file, "w", encoding="utf-8") as f:
+    f.write(f"Triple Count: {metadata['triple_count']}\n")
+    f.write("Namespaces:\n")
+    for prefix, uri in metadata['namespaces']:
+        f.write(f"  {prefix}: {uri}\n")
+    f.write("Sample Triples:\n")
+    for s, p, o in metadata['sample_triples']:
+        f.write(f"  {s} {p} {o}\n")
+
+print(f"Metadata written to {metadata_file}")