-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathterm_fixes.R
93 lines (67 loc) · 3.19 KB
/
term_fixes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# owl parsing
# test case DOID:3363 http://purl.obolibrary.org/obo/DOID_3393
library(xml2)
library(magrittr)
devtools::load_all()
data.holder <- readRDS(file.path(RAWDIR,'DATA.HOLDER.rds'))
# list and fetch ontologies form Gemma's cache -------
ontology_server <- 'moe'
ontology_path <- '/space/gemmaData/ontologyCache/ontology'
ontologies <- system2("ssh" ,paste0(ontology_server," ls ", ontology_path),stdout = TRUE)
ontologies <- ontologies[!grepl('tmp',ontologies)]
dir.create('data-raw/ontologies',showWarnings = FALSE)
ontologies %>% lapply(function(x){
RCurl::scp(ontology_server,file.path(ontology_path,x)) %>%
writeBin(file.path('data-raw/ontologies',x))
})
# temporarily add obi manually until processing is complete
# file is taken from the original source for now
# ontologies %<>% c('obiOntology')
ontologies = ontologies[ontologies %in% list.files('data-raw/ontologies/')]
# get all hasAlternativeId terms from the ontologies to replace within data.holder
alternative_terms <- ontologies %>% lapply(function(x){
print(x)
onto <- xml2::read_xml(file.path('data-raw/ontologies/',x))
children <- xml2::xml_children(onto)
classes <- xml2::xml_name(children) == 'Class'
children <- children[classes]
term_links <- xml2::xml_attr(children,'about')
alternatives <- children %>% lapply(function(y){
term_children <- xml2::xml_children(y)
term_contents <- xml2::xml_name(term_children)
term_children[term_contents =='hasAlternativeId'] %>% xml2::xml_text()
})
names(alternatives) <- term_links
return(alternatives)
})
names(alternative_terms) <- ontologies
alternatives <- alternative_terms[-13] %>% lapply(\(x){
x %>% sapply(length) %>% rep(names(x),.)
}) %>% unlist
to_replace <- alternative_terms[-13] %>% lapply(\(x){
x %>% unlist
}) %>% unlist
original_terms <- alternative_terms[-13] %>% lapply(names) %>% unlist %>% unique
data.holder %<>% lapply(function(x){
val_terms <- [email protected]$cf.ValLongUri %>% {.[grepl('obo',.)]} %>% unique
base_terms <- [email protected]$cf.BaseLongUri %>% {.[grepl('obo',.)]} %>% unique
terms <- c(val_terms,base_terms) %>% unique
missing_terms <- terms[!terms %in% original_terms]
length(missing_terms)
compact_names <- missing_terms %>%
stringr::str_extract('(?<=obo/).*') %>%
stringr::str_replace('_',':')
replacement_terms <- alternatives[match(compact_names,to_replace)]
names(replacement_terms) <- missing_terms
replacement_terms %<>% na.omit()
levels([email protected]$cf.BaseLongUri) = c(levels([email protected]$cf.BaseLongUri),replacement_terms) %>% unique
levels([email protected]$cf.ValLongUri) = c(levels([email protected]$cf.ValLongUri),replacement_terms) %>% unique
[email protected]$cf.BaseLongUri[[email protected]$cf.BaseLongUri %in% names(replacement_terms)] %<>%
{replacement_terms[match(.,names(replacement_terms))]}
[email protected]$cf.BaseLongUri %<>% droplevels()
[email protected]$cf.ValLongUri[[email protected]$cf.ValLongUri%in% names(replacement_terms)] %<>%
{replacement_terms[match(.,names(replacement_terms))]}
[email protected]$cf.ValLongUri %<>% droplevels()
return(x)
})
saveRDS(data.holder, file.path(RAWDIR, 'DATA.HOLDER_fixed_terms.rds'))