Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
benstear authored Nov 16, 2023
1 parent 3b6dbbb commit 9caac85
Showing 1 changed file with 129 additions and 0 deletions.
129 changes: 129 additions & 0 deletions tests/reduce_petagraph_csv_size_github_actions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
#import polars as pl
import pandas as pd
#import glob as glob
#from collections import Counter
#import matplotlib.pyplot as plt



# In[2]:


#path='/Users/stearb/Desktop/DESKTOP_TRANSFER/R03_local/Petagraph_Sept2023/data/base_csvs/basecontext10Sep2023/'
# Directory prefix (with trailing slash) that every input CSV is read from.
path = "/var/lib/neo4j/import/"


# In[ ]:





# In[3]:


# Load the CUI-CODEs table from the directory named by `path`.
cuicodes = pd.read_csv(f"{path}CUI-CODEs.csv")


# In[4]:


#c=dict(Counter([i.split(':')[0] for i in cuicodes[':END_ID']]).most_common())
#plt.bar(height=list(c.values()),x=range(len(list(c.values()))))


# In[5]:


# SAB prefixes to remove: CUI-CODEs rows whose :END_ID starts with one of
# these (the text before the first ':') are filtered out further down.
dropsabs = [
    'HSCLO', 'MSH', 'MEDCIN', 'LNC', 'MTH',
    'NDC', 'ICD10PCS', 'MTHSPL', 'REFSEQ', 'RXNORM',
    'ICD10CM', 'ICPC2ICD10ENG', 'MDR', 'MMSL', 'UWDA',
    'CHV', 'CPT', 'SNOMEDCT_VET', 'GS', 'ORPHANET',
    'NDDF', 'VANDF', 'UMD', 'MMX', 'ICD10AM',
    'ICD9CM', 'HCPT', 'MTHICD9', 'PDQ', 'CSP',
    'NIC', 'LCH_NW', 'ICD10', 'NOC', 'DRUGBANK',
]


# ## Reduce CUI-CODEs.csv

# In[6]:


# Tag each CUI-CODE row with the text before the first ':' of its :END_ID.
# That prefix is what gets compared against `dropsabs` here and against the
# 'SAB' column of CODEs.csv later in this script.
cuicodes['sabs'] = [end_id.split(':')[0] for end_id in cuicodes[':END_ID']]

# Keep only the rows whose prefix is NOT on the drop list.
cuicodes_reduced = cuicodes[~cuicodes['sabs'].isin(dropsabs)].reset_index(drop=True)

# Report the share of rows that was removed. The kept/total ratio is the
# share that REMAINS, so it is subtracted from 1 to match the "reduced by"
# wording. An empty input file is reported as 0% instead of dividing by zero.
cuicodes_rows_before = len(cuicodes)
cuicodes_pct_removed = (
    100 * (1 - len(cuicodes_reduced) / cuicodes_rows_before)
    if cuicodes_rows_before else 0.0
)
print('CUI-CODEs.csv reduced by ' + str(np.round(cuicodes_pct_removed)) + '%')

# NOTE(review): the helper 'sabs' column is still part of the frame at this
# point, so it is written into the output CSV as an extra column. It must stay
# in memory (the CODEs step reads it), but confirm whether it is wanted on disk.
cuicodes_reduced.to_csv('/var/lib/neo4j/import/CUI-CODEs.csv', index=False)

# Free the full table before the next large CSV is loaded.
del cuicodes


# In[7]:


codes = pd.read_csv(path + 'CODEs.csv')

# Keep only the codes whose SAB still occurs in the reduced CUI-CODEs table.
# The lookup set is de-duplicated first; the result of isin() is the same,
# but it no longer receives one entry per CUI-CODE row.
kept_sabs = cuicodes_reduced['sabs'].unique()
codes_reduced = codes[codes['SAB'].isin(kept_sabs)]

# Report the share of rows that was removed. The kept/total ratio is the
# share that REMAINS, so it is subtracted from 1 to match the "reduced by"
# wording. An empty input file is reported as 0% instead of dividing by zero.
codes_rows_before = len(codes)
codes_pct_removed = (
    100 * (1 - len(codes_reduced) / codes_rows_before)
    if codes_rows_before else 0.0
)
print('CODEs.csv reduced by ' + str(np.round(codes_pct_removed)) + '%')

# Free the full table; only the filtered rows are needed from here on.
del codes

# Drop the columns that are not written to the reduced file. A non-in-place
# drop is used because `codes_reduced` is the result of boolean filtering,
# and mutating such a frame in place can raise SettingWithCopyWarning.
codes_reduced = codes_reduced.drop(
    columns=['CODE', 'value:float', 'lowerbound:float', 'upperbound:float', 'unit']
)

codes_reduced.to_csv('/var/lib/neo4j/import/CODEs.csv', index=False)

del codes_reduced


# ## Keep only the CUIs that remain in the reduced CUI-CODEs df when filtering CUIs.csv and CUI-CUIs.csv

# In[8]:


# The helper 'sabs' column is not used by the CUI filters below; remove it
# from the in-memory frame.
cuicodes_reduced.drop(columns=['sabs'], inplace=True)


# In[9]:


cuis = pd.read_csv(path + 'CUIs.csv')

# Keep only the CUIs that appear as :START_ID in the reduced CUI-CODEs table.
kept_cui_ids = cuicodes_reduced[':START_ID'].unique()
cuis_reduced = cuis[cuis['CUI:ID'].isin(kept_cui_ids)].reset_index(drop=True)

# Report the share of rows that was removed. The kept/total ratio is the
# share that REMAINS, so it is subtracted from 1 to match the "reduced by"
# wording. An empty input file is reported as 0% instead of dividing by zero.
cuis_rows_before = len(cuis)
cuis_pct_removed = (
    100 * (1 - len(cuis_reduced) / cuis_rows_before)
    if cuis_rows_before else 0.0
)
print('CUIs.csv reduced by ' + str(np.round(cuis_pct_removed)) + '%')

# Free the full table; only the filtered rows are needed from here on.
del cuis

cuis_reduced.to_csv('/var/lib/neo4j/import/CUIs.csv', index=False)

del cuis_reduced


# # CUI-CUIs

# In[10]:


cuicuis = pd.read_csv(path + 'CUI-CUIs.csv')

# Keep a CUI-CUI row only when BOTH of its endpoints are CUIs that appear as
# :START_ID in the reduced CUI-CODEs table. The two membership tests are
# combined into one mask, which selects the same rows as filtering on
# :START_ID first and on :END_ID second.
kept_cuis = cuicodes_reduced[':START_ID'].unique()
both_ends_kept = (
    cuicuis[':START_ID'].isin(kept_cuis) & cuicuis[':END_ID'].isin(kept_cuis)
)
cuicuis_reduced = cuicuis[both_ends_kept]

# Report the share of rows that was removed. This is computed BEFORE the full
# table is deleted; the kept/total ratio is subtracted from 1 so the number
# matches the "reduced by" wording, and an empty input file is reported as 0%.
cuicuis_rows_before = len(cuicuis)
cuicuis_pct_removed = (
    100 * (1 - len(cuicuis_reduced) / cuicuis_rows_before)
    if cuicuis_rows_before else 0.0
)
print('CUI-CUIs.csv reduced by ' + str(np.round(cuicuis_pct_removed)) + '%')

# Free the full table; only the filtered rows are needed from here on.
del cuicuis

# Non-in-place drop: `cuicuis_reduced` is the result of boolean filtering,
# and mutating such a frame in place can raise SettingWithCopyWarning.
cuicuis_reduced = cuicuis_reduced.drop(columns=['evidence_class:string'])

# Write the reduced relationship table. The frame written here must be
# `cuicuis_reduced`; `cuis_reduced` is the CUIs node table and has already
# been deleted by the time this step runs.
cuicuis_reduced.to_csv('/var/lib/neo4j/import/CUI-CUIs.csv', index=False)

del cuicuis_reduced





0 comments on commit 9caac85

Please sign in to comment.