forked from asappresearch/kbc-pomr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
compute_cpt_baseline.py
231 lines (211 loc) · 10.3 KB
/
compute_cpt_baseline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
from collections import Counter
from collections import defaultdict
import csv
import pickle
import re
import typing as t
import requests as rq
from umls_api_auth import Authentication
import numpy as np
from tqdm import tqdm
import pandas as pd
problem_snomed, problem_icd10, problem_icd9 = defaultdict(list), defaultdict(list), defaultdict(list)
print("loading problem definitions")
with open('data/problem_codes_all.csv') as f:
r = csv.reader(f)
header = next(r)
for row in r:
if row[1] == 'icd10':
problem_icd10[row[0]].append(row[2])
if row[1] == 'icd9':
problem_icd9[row[0]].append(row[2])
if row[1] == 'snomed':
problem_snomed[row[0]].append(row[2])
CPT_TO_DISCIPLINE = {
# surgeries
"1010115": "Audiology", # "Surgical Procedures on the Auditory System"
"1006056": "Circulatory", # "Surgical Procedures on the Cardiovascular System"
"1006964": "Gastroenterology", # "Surgical Procedures on the Digestive System"
"1009024": "Endocrinology", # "Surgical Procedures on the Endocrine System"
"1009727": "Ophthalmology", # "Surgical Procedures on the Eye and Ocular Adnexa"
"1008681": "Genitourinary", # "Surgical Procedures on the Female Genital System"
"1006843": "Hematology", # "Surgical Procedures on the Hemic and Lymphatic Systems"
"1003148": "Dermatology", # "Surgical Procedures on the Integumentary System"
"1008470": "Genitourinary", # "Surgical Procedures on the Male Genital System"
"1006933": "Respiratory", # "Surgical Procedures on the Mediastinum and Diaphragm"
"1003679": "Musculoskeletal", # "Surgical Procedures on the Musculoskeletal System"
"1009068": "Neurology", # "Surgical Procedures on the Nervous System"
"1005690": "Respiratory", # "Surgical Procedures on the Respiratory System"
"1008061": "Genitourinary", # "Surgical Procedures on the Urinary System"
# radiology procedures
"1015091": "Musculoskeletal", # "Bone/Joint Studies"
"1010334": "Respiratory", # Diagnostic Radiology (Diagnostic Imaging) Procedures of the Chest
"1010537": "Gastroenterology", # Diagnostic Radiology (Diagnostic Imaging) Procedures of the Gastrointestinal Tract
"1010594": "Circulatory", # Diagnostic Radiology (Diagnostic Imaging) Procedures of the Heart
"1010602": "Circulatory", # Diagnostic Radiology (Diagnostic Imaging) Procedures of the Vascular System
"1010574": "Genitourinary", # Diagnostic Radiology (Diagnostic Imaging) Procedures of the Urinary Tract
"1015090": "Genitourinary", # Breast, Mammography
"1010843": "Oncology", # Radiation Oncology Treatment
# medicine services and procedures
"1013263": "Immunology", # Allergy and Clinical Immunology Procedures
"1012974": "Circulatory", # Cardiovascular Procedures
"1013417": "Mental Health", # Central Nervous System Assessments/Tests (eg, Neuro-Cognitive, Mental Status, Speech Testing)
"1013565": "Musculoskeletal", # Chiropractic Manipulative Treatment Procedures
"1012740": "Genitourinary", # Dialysis Services and Procedures
"1013306": "Endocrinology", # Endocrinology Services
"1012764": "Gastroenterology", # Gastroenterology Procedures
"1013309": "Neurology", # Neurology and Neuromuscular Procedures
"1012793": "Ophthalmology", # Ophthalmology Services and Procedures
"1013483": "Musculoskeletal", # Physical Medicine and Rehabilitation Evaluations
"1012681": "Mental Health", # Psychiatry Services and Procedures
"1013214": "Respiratory", # Pulmonary Procedures
"1013471": "Dermatology", # Special Dermatological Procedures
}
class ICDPrefix(t.NamedTuple):
lowLetter: str
lowNumber: int
highLetter: str
highNumber: int
ICD_10_PREF_TO_DISCIPLINE = {
ICDPrefix("A", 0, "B", 99): "Infectious Disease",
ICDPrefix("C", 0, "D", 49): "Oncology",
ICDPrefix("D", 0, "D", 89): "Hematology",
ICDPrefix("E", 0, "E", 89): "Endocrinology",
ICDPrefix("F", 1, "F", 99): "Mental Health",
ICDPrefix("G", 0, "G", 99): "Neurology",
ICDPrefix("H", 0, "H", 59): "Ophthalmology",
ICDPrefix("H", 60, "H", 95): "Otology",
ICDPrefix("I", 00, "I", 99): "Circulatory",
ICDPrefix("J", 0, "J", 99): "Respiratory",
ICDPrefix("K", 0, "K", 14): "Dentistry",
ICDPrefix("K", 20, "K", 95): "Gastroenterology",
ICDPrefix("L", 0, "L", 99): "Dermatology",
ICDPrefix("M", 0, "M", 99): "Musculoskeletal",
ICDPrefix("N", 0, "N", 99): "Genitourinary",
ICDPrefix("R", 0, "R", 9): "Circulatory",
ICDPrefix("R", 10, "R", 19): "Gastroenterology",
ICDPrefix("R", 20, "R", 23): "Dermatology",
ICDPrefix("R", 25, "R", 29): "Musculoskeletal",
ICDPrefix("R", 30, "R", 39): "Genitourinary",
ICDPrefix("R", 40, "R", 46): "Mental Health",
ICDPrefix("S", 0, "T", 98): "External",
ICDPrefix("Z", 0, "Z", 29): "Health Maintenance",
}
ICD_9_PREF_TO_DISCIPLINE = {
ICDPrefix("", 1, "", 139): "Infectious Disease",
ICDPrefix("", 140, "", 239): "Oncology",
ICDPrefix("", 240, "", 279): "Endocrinology",
ICDPrefix("", 280, "", 289): "Hematology",
ICDPrefix("", 290, "", 319): "MentalHealth",
ICDPrefix("", 320, "", 389): "Neurology",
ICDPrefix("", 390, "", 459): "Circulatory",
ICDPrefix("", 460, "", 519): "Respiratory",
ICDPrefix("", 520, "", 529): "Dentistry",
ICDPrefix("", 530, "", 579): "Gastroenterology",
ICDPrefix("", 580, "", 629): "Genitourinary",
ICDPrefix("", 630, "", 679): "Other",
ICDPrefix("", 680, "", 709): "Dermatology",
ICDPrefix("", 710, "", 739): "Musculoskeletal",
ICDPrefix("", 740, "", 759): "Other",
ICDPrefix("", 760, "", 779): "Other",
ICDPrefix("", 780, "", 799): "Other",
ICDPrefix("", 800, "", 999): "External",
ICDPrefix("V", 1, "V", 91): "Health Maintenance",
ICDPrefix("E", 0, "E", 999): "External",
}
def cpt_to_discipline(px_code, tgt):
# get discipline for a CPT code using our manually created map
disciplines = []
# take part of code before modifier
code = px_code.split('.')[0]
route = f'/content/current/source/CPT/{code}/ancestors'
query = {'ticket': auth.getst(tgt)}
anc = rq.get(URI+route, params=query)
if anc.status_code == 200:
disciplines.extend([CPT_TO_DISCIPLINE[a['ui']] for a in anc.json()['result'] if a['ui'] in CPT_TO_DISCIPLINE])
return set(disciplines)
def icd9_to_disc(dx_code):
prefix = dx_code[:3]
if dx_code[0].isnumeric():
prefix = int(prefix)
for pref, disc in ICD_9_PREF_TO_DISCIPLINE.items():
if prefix_number >= pref.lowNumber and prefix_number <= pref.highNumber:
return disc
else:
prefix_letter = dx_code[0]
# just manually do this
if prefix_letter == "V":
return "Health Maintenance"
elif prefix_letter == "E":
return "External"
return
def icd10_to_disc(dx_code):
prefix = dx_code[:3]
prefix_letter = prefix[0]
prefix_number = int(prefix[1:])
for pref, disc in ICD_10_PREF_TO_DISCIPLINE.items():
if prefix_letter >= pref.lowLetter and prefix_letter <= pref.highLetter and prefix_number >= pref.lowNumber and prefix_number <= pref.highNumber:
return disc
return
def dxs_to_discipline(dx_codes, dx_systems):
# get discipline for ICD code using manually created maps
disciplines = []
for code, system in zip(dx_codes, dx_systems):
if system == "ICD-9-CM":
icd9_disc = icd9_to_disc(code)
disciplines.append(icd9_disc)
elif system == "ICD-10-CM":
icd10_disc = icd10_to_disc(code)
disciplines.append(icd10_disc)
return set(disciplines)
def px_to_prob_match_by_discipline(px, problem):
tgt = auth.gettgt()
px_discs = cpt_to_discipline(px, tgt)
dx_discs = dxs_to_discipline(problem_icd10[problem], ["ICD-10-CM"] * len(problem_icd10[problem]))
return len(px_discs.intersection(dx_discs)) > 0
def is_cpt(code):
return re.match("^[A-Z]\d{4}(\.\d+)?", code) is not None or re.match("^\d{4}[\dFMTU](\.\d+)?$", code) is not None
auth = Authentication('MY-SECRET-KEY')
tgt = auth.gettgt()
URI = "https://uts-ws.nlm.nih.gov/rest"
df = pd.read_csv('data/test_probs.csv')
adf = pd.read_csv('data/all.csv', names=['problem', 'relationType', 'target', 'label'])
procs = sorted([proc.split('_')[1] for proc in adf[adf['relationType'] == 'procedure']['target'].unique()])
print(f"num all procs: {len(set(procs))}")
cpt_procs = [p for p in procs if is_cpt(p)]
print(f"num cpt procs: {len(set(cpt_procs))}")
num_negs = Counter()
num_pos_negs = Counter()
for ix, triple in enumerate(df.itertuples()):
if triple.label == 1:
if triple.relationType != 'procedure':
continue
target = triple.target.split('_')[1]
num_negs[(triple.problem, triple.relationType)] += 1
if is_cpt(target):
if px_to_prob_match_by_discipline(target, triple.problem):
num_pos_negs[(triple.problem, triple.relationType)] += 1
ranks = []
matches = 0
for triple in df.itertuples():
if triple.label != 2 or triple.relationType != 'procedure':
continue
target = triple.target.split('_')[1]
nn = num_negs[(triple.problem, triple.relationType)]
npn = num_pos_negs[(triple.problem, triple.relationType)]
if is_cpt(target):
if px_to_prob_match_by_discipline(target, triple.problem):
rank = npn / 2 + 1
else:
rank = np.median(np.arange(npn+1, nn+1+1))
else:
rank = np.median(np.arange(npn+1, nn+1+1))
ranks.append(rank)
ranks = np.array(ranks)
mr = np.mean(ranks)
mrr = np.mean(1./ranks)
hits_at_1 = np.mean([rank <= 1 for rank in ranks])
hits_at_5 = np.mean([rank <= 5 for rank in ranks])
hits_at_10 = np.mean([rank <= 10 for rank in ranks])
hits_at_30 = np.mean([rank <= 30 for rank in ranks])
print(f"MR: {mr}, MRR: {mrr}, H@1: {hits_at_1}, H@5: {hits_at_5}, H@10: {hits_at_10}, H@30: {hits_at_30}")