getting started on processing pubchem-annotations

biobricks-ai · Nov 20, 2024 · 05a953c · 05a953c
1 parent 2f738c3
commit 05a953c
Show file tree

Hide file tree

Showing 9 changed files with 62 additions and 139 deletions.
diff --git a/.bb/.gitignore b/.bb/.gitignore
@@ -0,0 +1 @@
+/*/
diff --git a/.bb/dependencies.txt b/.bb/dependencies.txt
@@ -0,0 +1 @@
+https://github.com/biobricks-ai/pubchem-annotations#4bc6de375ba7dbdbd2986b53d952e41ace592551
diff --git a/dvc.yaml b/dvc.yaml
@@ -1,35 +1,11 @@
-# Brick DVC stages
-# See https://dvc.org/doc/user-guide/project-structure/dvcyaml-files#pipelines-files-dvcyaml
-
-# The complete process can be executed using:
-# dvc repro
-# If you want to force redoing the process use 
-# dvc repro -f
-# Individual stage can be executed using: 
-# dvc repro <stage>
-
 stages:
-  download:
-    cmd: stages/01_download.sh
+
+  # Process the pubchem-annotations asset pinned in .bb/dependencies.txt.
+  process-annotations:
+    cmd: python stages/01_process.py
     deps:
-      - stages/01_download.sh
+      - stages/01_process.py
+      - .bb/dependencies.txt
     outs:
-      - download
-      - list
-  unzip: 
-    cmd: stages/02_unzip.sh
-    deps:
-      - stages/02_unzip.sh
-      - download
-      - list
-    outs:
-      - raw
-  build: 
-    cmd: stages/03_build.sh
-    deps:
-      - stages/03_build.sh
-      - stages/csv2parquet.sh
-      - raw
-      - list
-    outs:
-      - brick
+      # NOTE(review): stages/01_process.py does not write this file yet
+      - brick/pubchem-annotations.hdt
diff --git a/stages/01_download.sh b/stages/01_download.sh
diff --git a/stages/01_process.py b/stages/01_process.py
@@ -0,0 +1,73 @@
+"""Explore the pubchem-annotations brick and sketch its conversion to RDF.
+
+Work in progress: loads the annotations table, keeps rows that have exactly
+one PubChem CID, extracts string values from the JSON `Data` column and builds
+a compound IRI per row. The cache directory is created, but no output file is
+written yet.
+"""
+import biobricks as bb
+import pandas as pd
+import json
+import pathlib
+from tqdm import tqdm
+
+tqdm.pandas()
+
+# Development cap on the rows taken from the brick; lift it to process everything.
+SAMPLE_ROWS = 1000
+
+outdir = pathlib.Path('cache/process')
+outdir.mkdir(parents=True, exist_ok=True)
+pa_brick = bb.assets('pubchem-annotations')
+
+# pa_brick has a single table `annotations_parquet`
+# TODO pubchem-annotations isn't big but in the future spark or dask is a better choice.
+rawpa = pd.read_parquet(pa_brick.annotations_parquet)
+rawpa = rawpa.head(SAMPLE_ROWS)
+
+# filter pa to rows where there is a single pubchem_cid and map to an int
+# .copy() makes pa1 an independent frame, so the column assignment below does
+# not write into a slice of rawpa (avoids pandas' SettingWithCopyWarning).
+pa1 = rawpa[rawpa['PubChemCID'].progress_apply(lambda x: len(x) == 1)].copy()
+pa1['PubChemCID'] = pa1['PubChemCID'].progress_apply(lambda x: int(x[0]))
+
+# let's look at a single row of the table (skipped when the filter kept nothing)
+if not pa1.empty:
+    rawrow = pa1.iloc[0]
+    row = rawrow.apply(str).to_dict()
+    rowstr = json.dumps(row, indent=4)
+    print(rowstr)
+
+data = pa1['Data'].values
+data_obj = [json.loads(d) for d in tqdm(data)]
+
+# create annotations
+# NOTE: objects without a usable string are skipped, so `annotations` can be
+# shorter than pa1 and its positions do not line up with pa1's rows.
+annotations = []
+for obj in tqdm(data_obj):
+    # Skip objects that lack the 'Value' -> 'StringWithMarkup' path
+    if 'Value' not in obj or 'StringWithMarkup' not in obj['Value']:
+        continue
+    markup = obj['Value']['StringWithMarkup']
+    # An empty markup list has no first entry; indexing it would raise IndexError
+    if not markup:
+        continue
+    # Create an annotation with 'has_value' and the first markup string
+    annotations.append({'has_value': True, 'value': markup[0]['String']})
+
+# create chemical
+# create chemical has_annotation annotation
+# loop through pa1 creating a chemical for each
+# total= gives tqdm a length, which the iterrows() generator does not provide
+for _, row in tqdm(pa1.iterrows(), total=len(pa1)):
+    cid = row['PubChemCID']
+    # TODO chem_iri is built but not stored or emitted yet
+    # NOTE(review): PubChem RDF names compounds `CID<cid>` - confirm whether the
+    # 'CID' prefix is missing from this IRI
+    chem_iri = f"https://pubchem.ncbi.nlm.nih.gov/rest/rdf/compound/{cid}.html"
+
+# chemical has_identifier <build the pubchem cid>
+# chemical has_annotation <annotation>
+# annotation
+# annotation has_value "<annotation string>"
diff --git a/stages/02_unzip.sh b/stages/02_unzip.sh
diff --git a/stages/03_build.sh b/stages/03_build.sh
diff --git a/stages/csv2parquet.R b/stages/csv2parquet.R
diff --git a/stages/csv2parquet.py b/stages/csv2parquet.py
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		https://github.com/biobricks-ai/pubchem-annotations#4bc6de375ba7dbdbd2986b53d952e41ace592551