Skip to content

Commit

Permalink
getting started on processing pubchem-annotations
Browse files Browse the repository at this point in the history
  • Loading branch information
tomlue committed Nov 20, 2024
1 parent 2f738c3 commit 05a953c
Show file tree
Hide file tree
Showing 9 changed files with 62 additions and 139 deletions.
1 change: 1 addition & 0 deletions .bb/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/*/
1 change: 1 addition & 0 deletions .bb/dependencies.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
https://github.com/biobricks-ai/pubchem-annotations#4bc6de375ba7dbdbd2986b53d952e41ace592551
37 changes: 5 additions & 32 deletions dvc.yaml
Original file line number Diff line number Diff line change
@@ -1,35 +1,8 @@
# Brick DVC stages
# See https://dvc.org/doc/user-guide/project-structure/dvcyaml-files#pipelines-files-dvcyaml

# The complete process can be executed using:
# dvc repro
# If you want to force redoing the process use
# dvc repro -f
# Individual stage can be executed using:
# dvc repro <stage>

stages:
download:
cmd: stages/01_download.sh

process-annotations:
cmd: stages/01_process.py
deps:
- stages/01_download.sh
- .bb/dependencies.txt
outs:
- download
- list
unzip:
cmd: stages/02_unzip.sh
deps:
- stages/02_unzip.sh
- download
- list
outs:
- raw
build:
cmd: stages/03_build.sh
deps:
- stages/03_build.sh
- stages/csv2parquet.sh
- raw
- list
outs:
- brick
- brick/pubchem-annotations.hdt
36 changes: 0 additions & 36 deletions stages/01_download.sh

This file was deleted.

55 changes: 55 additions & 0 deletions stages/01_process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import biobricks as bb
import pandas as pd
import json
import pathlib
from tqdm import tqdm

# Register `progress_apply` on pandas objects so the row-wise steps below
# show a progress bar.
tqdm.pandas()

# Make sure the cache directory for this stage exists.
outdir = pathlib.Path('cache/process')
outdir.mkdir(parents=True, exist_ok=True)
pa_brick = bb.assets('pubchem-annotations')

# pa_brick has a single table `annotations_parquet`
# TODO pubchem-annotations isn't big but in the future spark or dask is a better choice.
rawpa = pd.read_parquet(pa_brick.annotations_parquet)
# NOTE(review): only the first 1000 rows are kept -- this looks like a
# development limit; confirm and remove before a full run.
rawpa = rawpa.head(1000)

# Filter to rows that have exactly one PubChem CID and map that CID to an int.
# `.copy()` makes pa1 an independent frame, so the column assignment below
# writes to pa1 itself instead of a slice of rawpa (avoids pandas'
# SettingWithCopyWarning / chained-assignment pitfalls).
pa1 = rawpa[rawpa['PubChemCID'].progress_apply(lambda x: len(x) == 1)].copy()
pa1['PubChemCID'] = pa1['PubChemCID'].progress_apply(lambda x: int(x[0]))

# Sanity check: pretty-print the first filtered row as JSON.
# Every cell is passed through str() first so that values json cannot
# serialize natively can still be displayed.
first_row = pa1.iloc[0]
printable = {column: str(cell) for column, cell in first_row.items()}
print(json.dumps(printable, indent=4))

# Each `Data` cell is parsed with json.loads; decode all of them up front.
data = pa1['Data'].values
data_obj = [json.loads(d) for d in tqdm(data)]

# create annotations
# Only records carrying Value -> StringWithMarkup text produce an annotation,
# so `annotations` may be shorter than `data_obj`.
annotations = []
for obj in tqdm(data_obj):
    # Skip records without a 'Value' -> 'StringWithMarkup' entry.
    if 'Value' not in obj or 'StringWithMarkup' not in obj['Value']:
        continue
    markup = obj['Value']['StringWithMarkup']
    if not markup:
        # Key is present but the list is empty: there is no first element to
        # read, and indexing [0] would raise IndexError.
        continue
    # Create an annotation with 'has_value' and the first markup string.
    annotations.append({'has_value': True, 'value': markup[0]['String']})

# create chemical
# create chemical has_annotation annotation
# loop through pa1 creating a chemical for each
# `iterrows()` is a generator without a length, so the row count is passed
# explicitly to let tqdm show a complete progress bar.
for _, row in tqdm(pa1.iterrows(), total=len(pa1)):
    cid = row['PubChemCID']
    # NOTE(review): PubChem compound identifiers are commonly written as
    # `CID<number>`; confirm whether this path needs the `CID` prefix.
    chem_iri = f"https://pubchem.ncbi.nlm.nih.gov/rest/rdf/compound/{cid}.html"

    # TODO: the statements sketched below are planned but not emitted yet:
    # chemical has_identifier <build the pubchem cid>
    # chemical has_annotation <annotation>
    # annotation
    # annotation has_value "<annotation text>"


28 changes: 0 additions & 28 deletions stages/02_unzip.sh

This file was deleted.

30 changes: 0 additions & 30 deletions stages/03_build.sh

This file was deleted.

2 changes: 0 additions & 2 deletions stages/csv2parquet.R

This file was deleted.

11 changes: 0 additions & 11 deletions stages/csv2parquet.py

This file was deleted.

0 comments on commit 05a953c

Please sign in to comment.