Skip to content

Commit

Permalink
end of pair coding session
Browse files Browse the repository at this point in the history
  • Loading branch information
tomlue committed Nov 21, 2024
1 parent 05a953c commit e057ff8
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 25 deletions.
3 changes: 3 additions & 0 deletions dvc.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
stages:

update_biobricks_dependencies:
  # The script added in this commit is stages/00_update_biobricks_deps.py and
  # has no shebang line, so it is run through the interpreter explicitly.
  cmd: python stages/00_update_biobricks_deps.py

process-annotations:
cmd: stages/01_process.py
deps:
Expand Down
7 changes: 7 additions & 0 deletions stages/00_update_biobricks_deps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import os
import shutil
import subprocess

# biobricks CLI invocations, run in order. Each is an argument list so no
# shell is involved.
BIOBRICKS_COMMANDS = (
    ('biobricks', 'init'),
    ('biobricks', 'add', 'pubchem-annotations'),
)


def main():
    """Remove any existing .bb directory and re-initialize biobricks.

    Raises:
        subprocess.CalledProcessError: if a biobricks command exits non-zero.
            Later commands are then not run, matching the previous ``&&``
            chaining, but the failure now propagates to the caller instead
            of being ignored.
        FileNotFoundError: if the ``biobricks`` executable is not on PATH.
    """
    # Clean up and initialize biobricks
    shutil.rmtree('.bb', ignore_errors=True)
    for command in BIOBRICKS_COMMANDS:
        subprocess.run(command, check=True)


if __name__ == '__main__':
    main()
58 changes: 33 additions & 25 deletions stages/01_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
import pathlib
from tqdm import tqdm
import random

tqdm.pandas()

Expand All @@ -19,37 +20,44 @@
# Keep only the rows whose PubChemCID entry has exactly one element, then
# replace that one-element container with the integer it holds.
has_single_cid = rawpa['PubChemCID'].progress_apply(lambda x: len(x) == 1)
# .copy() makes pa1 an independent frame, so the column assignment below does
# not write into a slice of rawpa (pandas chained-assignment warning).
pa1 = rawpa[has_single_cid].copy()
pa1['PubChemCID'] = pa1['PubChemCID'].progress_apply(lambda x: int(x[0]))

# let's look at a single row of the table
rawrow = pa1.iloc[0]
row = rawrow.apply(str).to_dict()
rowstr = json.dumps(row, indent=4)
print(rowstr)

data = pa1['Data'].values
data_obj = [json.loads(d) for d in tqdm(data)]
# get row1 and make it json for a pretty print
print(json.dumps(pa1.iloc[0].apply(str).to_dict(), indent=4))

# create annotations
annotations = []
for obj in tqdm(data_obj):
# Check if the 'Value' key exists and contains 'StringWithMarkup'
if 'Value' in obj and 'StringWithMarkup' in obj['Value']:
# Extract the value from the 'StringWithMarkup' key
value = obj['Value']['StringWithMarkup'][0]['String']
# Create an annotation with 'have_value' and the extracted value
annotation = {'has_value': True, 'value': value}
annotations.append(annotation)

# create chemical
# create chemical has_annotation annotation
# annotations = []
# for obj in tqdm(data_obj):
# # Check if the 'Value' key exists and contains 'StringWithMarkup'
# if 'Value' in obj and 'StringWithMarkup' in obj['Value']:
# # Extract the value from the 'StringWithMarkup' key
# value = obj['Value']['StringWithMarkup'][0]['String']
# # Create an annotation with 'has_value' and the extracted value
# annotation = {'has_value': True, 'value': value}
# annotations.append(annotation)

# - [x] create chemical
# - [x] create annotation
# - [x] create annotation has_subject chemical
# - [ ] create annotation has_value value
# loop through pa1 creating a chemical for each
# Predicate linking each annotation to the chemical it describes.
has_subject = "http://purl.org/dc/terms/subject"

# Open the turtle file once for the whole loop instead of once per row.
# Append mode is kept, so an existing file is extended exactly as before.
with open(outdir / 'annotations.ttl', 'a') as f:
    for _, row in tqdm(pa1.iterrows(), total=len(pa1)):
        cid = row['PubChemCID']
        chem_iri = f"http://rdf.ncbi.nlm.nih.gov/pubchem/compound/CID{cid}"

        # create an annotation
        # NOTE(review): a random 6-digit id is not reproducible between runs
        # and can repeat across rows, which would merge unrelated annotations
        # under one IRI. A commented-out alternative read row['ANID'];
        # confirm that column exists and prefer it.
        anid = random.randint(100000, 999999)
        annotation_iri = f"http://rdf.ncbi.nlm.nih.gov/pubchem/annotation/ANID{anid}"

        # create a relationship between the chemical and the annotation
        triple = (annotation_iri, has_subject, chem_iri)

        # write the triple as one line of the turtle file
        f.write(f"<{triple[0]}> <{triple[1]}> <{triple[2]}> .\n")

# add a has_annotation

38 changes: 38 additions & 0 deletions stages/02_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import itertools
import pathlib

import rdflib

def summarize_graph(graph, sample_size=5):
    """Collect summary metadata for an RDF graph.

    Args:
        graph: object supporting ``len()``, iteration over triples, and a
            ``namespaces()`` method yielding ``(prefix, uri)`` pairs.
        sample_size: maximum number of triples to include in the sample.

    Returns:
        dict with keys ``triple_count``, ``namespaces`` and ``sample_triples``.
    """
    return {
        "triple_count": len(graph),
        "namespaces": list(graph.namespaces()),
        # islice stops after sample_size triples instead of copying the
        # whole graph into a list just to slice off the first few.
        "sample_triples": list(itertools.islice(graph, sample_size)),
    }


def write_metadata(metadata, metadata_file):
    """Write the metadata produced by ``summarize_graph`` as plain text."""
    with open(metadata_file, "w") as f:
        f.write(f"Triple Count: {metadata['triple_count']}\n")
        f.write("Namespaces:\n")
        for prefix, uri in metadata['namespaces']:
            f.write(f" {prefix}: {uri}\n")
        f.write("Sample Triples:\n")
        for s, p, o in metadata['sample_triples']:
            f.write(f" {s} {p} {o}\n")


def main():
    """Parse cache/test/annotations.ttl and write a summary to test.txt.

    Raises:
        Exception: whatever ``graph.parse`` raises is re-raised after a
            message is printed, so the stage fails when the turtle file
            cannot be loaded.
    """
    outdir = pathlib.Path('cache/test')
    outdir.mkdir(parents=True, exist_ok=True)

    # Read the turtle file into a graph
    graph = rdflib.Graph()
    turtle_file = outdir / 'annotations.ttl'

    # Only the parse is guarded, so the "failed to parse" message is never
    # printed for unrelated errors such as a failure writing the report.
    try:
        graph.parse(source=turtle_file.as_posix(), format="turtle")
    except Exception as e:
        # Explicitly fail if the graph fails to load
        print(f"Failed to parse the graph: {e}")
        raise

    metadata_file = outdir / "test.txt"
    write_metadata(summarize_graph(graph), metadata_file)
    print(f"Metadata written to {metadata_file}")


if __name__ == "__main__":
    main()

0 comments on commit e057ff8

Please sign in to comment.