diff --git a/dvc.yaml b/dvc.yaml index b96a27b..68a60ef 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -1,5 +1,8 @@ stages: + update_biobricks_dependencies: + cmd: python stages/00_update_biobricks_deps.py + process-annotations: cmd: stages/01_process.py deps: diff --git a/stages/00_update_biobricks_deps.py b/stages/00_update_biobricks_deps.py new file mode 100644 index 0000000..d5808af --- /dev/null +++ b/stages/00_update_biobricks_deps.py @@ -0,0 +1,7 @@ +import os +import shutil +import subprocess + +# Clean up and initialize biobricks +shutil.rmtree('.bb', ignore_errors=True) +subprocess.run('biobricks init && biobricks add pubchem-annotations', shell=True) diff --git a/stages/01_process.py b/stages/01_process.py index 6e3c9a1..c948c08 100755 --- a/stages/01_process.py +++ b/stages/01_process.py @@ -3,6 +3,7 @@ import json import pathlib from tqdm import tqdm +import random tqdm.pandas() @@ -19,37 +20,44 @@ pa1 = rawpa[rawpa['PubChemCID'].progress_apply(lambda x: len(x) == 1)] pa1['PubChemCID'] = pa1['PubChemCID'].progress_apply(lambda x: int(x[0])) -# let's look at a single row of the table -rawrow = pa1.iloc[0] -row = rawrow.apply(str).to_dict() -rowstr = json.dumps(row, indent=4) -print(rowstr) - -data = pa1['Data'].values -data_obj = [json.loads(d) for d in tqdm(data)] +# get row1 and make it json for a pretty print +print(json.dumps(pa1.iloc[0].apply(str).to_dict(), indent=4)) # create annotations -annotations = [] -for obj in tqdm(data_obj): - # Check if the 'Value' key exists and contains 'StringWithMarkup' - if 'Value' in obj and 'StringWithMarkup' in obj['Value']: - # Extract the value from the 'StringWithMarkup' key - value = obj['Value']['StringWithMarkup'][0]['String'] - # Create an annotation with 'have_value' and the extracted value - annotation = {'has_value': True, 'value': value} - annotations.append(annotation) - -# create chemical -# create chemical has_annotation annotation +# annotations = [] +# for obj in tqdm(data_obj): +# # Check if the 
'Value' key exists and contains 'StringWithMarkup' +# if 'Value' in obj and 'StringWithMarkup' in obj['Value']: +# # Extract the value from the 'StringWithMarkup' key +# value = obj['Value']['StringWithMarkup'][0]['String'] +# # Create an annotation with 'have_value' and the extracted value +# annotation = {'has_value': True, 'value': value} +# annotations.append(annotation) + +# - [x] create chemical +# - [x] create annotation +# - [x] create annotation has_subject chemical +# - [ ] create annotation has_value value # loop through pa1 creating a chemical for each row = pa1.iloc[0] for index, row in tqdm(pa1.iterrows()): cid = row['PubChemCID'] - chem_iri = f"https://pubchem.ncbi.nlm.nih.gov/rest/rdf/compound/{cid}.html" + chem_iri = f"http://rdf.ncbi.nlm.nih.gov/pubchem/compound/CID{cid}" + + # create an annotation + # anid = row['ANID'] + anid = random.randint(100000, 999999) + annotation_iri = f"http://rdf.ncbi.nlm.nih.gov/pubchem/annotation/ANID{anid}" + + # create the value for the annotation -# chemical has_identifier -# chemical has_annotation -# annotation -# annotation has_value "starkljksjdf" + # create a relationship between the chemical and the annotation + has_subject = "http://purl.org/dc/terms/subject" + triple = (annotation_iri, has_subject, chem_iri) + # write the triple to a turtle file + with open(outdir / 'annotations.ttl', 'a') as f: + f.write(f"<{triple[0]}> <{triple[1]}> <{triple[2]}> .\n") + + # add a has_annotation diff --git a/stages/02_test.py b/stages/02_test.py new file mode 100644 index 0000000..328e942 --- /dev/null +++ b/stages/02_test.py @@ -0,0 +1,38 @@ +import rdflib +import pathlib + +outdir = pathlib.Path('cache/test') +outdir.mkdir(parents=True, exist_ok=True) + +# Read the turtle file into a graph +graph = rdflib.Graph() +turtle_file = outdir / 'annotations.ttl' + +try: + # Parse the Turtle file into the RDF graph + graph.parse(source=turtle_file.as_posix(), format="turtle") + + # Generate metadata + metadata = { + 
"triple_count": len(graph), + "namespaces": list(graph.namespaces()), + "sample_triples": list(graph)[:5] # Limit to first 5 triples + } + + # Write metadata to a file + metadata_file = outdir / "test.txt" + with open(metadata_file, "w") as f: + f.write(f"Triple Count: {metadata['triple_count']}\n") + f.write("Namespaces:\n") + for prefix, uri in metadata['namespaces']: + f.write(f" {prefix}: {uri}\n") + f.write("Sample Triples:\n") + for s, p, o in metadata['sample_triples']: + f.write(f" {s} {p} {o}\n") + + print(f"Metadata written to {metadata_file}") + +except Exception as e: + # Explicitly fail if the graph fails to load + print(f"Failed to parse the graph: {e}") + raise