-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdump.R
153 lines (132 loc) · 6.78 KB
/
dump.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
library(gemmaAPI, lib.loc = '~/R/x86_64-redhat-linux-gnu-library/3.6/')
library(data.table)
library(gemmaAPI)
library(async)
library(glue)
library(dplyr)
library(jsonlite)
MAX_EXP <- 20000
BATCH_SIZE <- 1000
synchronise({
async_map(split(as.character(1:MAX_EXP), ceiling(seq_along(as.character(1:MAX_EXP)) / BATCH_SIZE)),
async(function(x) {
gemmaAPI::getDatasets(x, limit = 0, async = T)
}))
}) %>% rbindlist(fill = T) -> experiments
synchronise({
async_map(experiments[, ee.ID], async(function(x) {
gemmaAPI::getDatasetDEA(x, async = T)$then(function(anns) {
if(length(anns) != 0 && anns[, any(is.na(ee.ID))])
anns[, ee.ID := as.integer(ee.ID)] %>% .[, ee.ID := x]
anns
})
}), .limit = floor(BATCH_SIZE / 5))
}) %>% rbindlist -> annotations
gemmaAPI::getPlatforms(limit = 0) %>%
.[, .(ad.ID = platform.ID, ad.ShortName = platform.ShortName,
ad.Troubled = platform.Troubled, ad.Sequencing = technology.Type == 'SEQUENCING',
ad.Type = technology.Type, ad.Company = sapply(strsplit(platform.Name, ' '), '[[', 1))] -> platforms
synchronise({
async_map(experiments[, ee.ID], async(function(x) {
gemmaAPI::getDatasetAnnotations(x, async = T)$then(function(tags) {
data.table(ee.ID = x, tags)
})
}), .limit = floor(BATCH_SIZE / 5))
}) %>% rbindlist(fill = T) -> tags
lapply(unique(annotations[, ad.ID]), function(ads) {
platforms[ad.ID %in% ads,
.(ad.ID = paste0(ads, collapse = ', '),
ad.Name = paste0(ad.ShortName, collapse = ', '),
ad.Company = paste0(ad.Company, collapse = ', '),
ad.Sequencing = all(ad.Sequencing),
ad.Troubled = any(ad.Troubled))]
}) %>% rbindlist %>%
merge(annotations %>% .[, .(ad.ID = paste0(unlist(ad.ID), collapse = ', ')), setdiff(names(annotations), 'ad.ID')], by = 'ad.ID') %>%
merge(experiments, by = 'ee.ID') %>%
merge(tags[class.Type == 'ExperimentTag',
.(ee.Cat = paste0(class.Name, collapse = ', '),
ee.CatLongUri = paste0(class.URL, collapse = ', '),
ee.Tag = paste0(term.Name, collapse = ', '),
ee.TagLongUri = paste0(term.URL, collapse = ', ')), ee.ID],
by = 'ee.ID') %>%
.[, .(ee.ID, rsc.ID, ee.Public, ee.Troubled, ee.Taxon = taxon.Name, ee.Database,
ee.Description, ee.ShortName, ee.Accession, ee.Name,
ee.BatchCorrected = geeq.batchCorrected, ee.Batch.P, ee.Batch.PC,
ee.qScore = geeq.qScore, ee.sScore = geeq.sScore,
ee.NumSamples = ee.Samples, ee.Cat, ee.CatLongUri, ee.Tag, ee.TagLongUri,
ee.numDE = stats.DE, ee.numDown = stats.Down, ee.numUp = stats.Up,
ee.numProbes = probes.Analyzed, ee.numGenes = genes.Analyzed,
ad.Name, ad.Sequencing, ad.Troubled, ad.Type = technology.Type,
sf.Subset, sf.Cat, sf.CatLongUri, sf.Val, sf.ValLongUri,
cf.Cat, cf.CatLongUri, cf.Val, cf.ValLongUri, cf.Baseline, cf.BaseLongUri
)] %>% setorder(ee.Taxon, rsc.ID) -> experiment.meta
saveRDS(experiment.meta, '/space/scratch/jsicherman/Gemma/meta_exp.rds')
rm(experiments, platforms, tags, annotations)
# This bootlegged auto-retry is pretty gross, but should work
# Initialize with RSC IDs to fetch
# TODO Update later to only redownload updated ones.
for(taxon in na.omit(experiment.meta[, unique(ee.Taxon)])) {
assign(paste0('QUEUED_', taxon), experiment.meta[ee.ID %in% experiment.meta[ee.Taxon == taxon, unique(ee.ID)]] %>%
.[, rsc.ID])
}
rm(list = ls(pattern = 'diffEx_'))
# Keep trying to pull data down as long as we have some left
while(length(ls(pattern = 'QUEUED_')) > 0) {
for(taxon in na.omit(experiment.meta[, unique(ee.Taxon)])) {
# Finish a taxon
if(!exists(paste0('QUEUED_', taxon)) || length(get(paste0('QUEUED_', taxon))) == 0) {
rm(list = paste0('QUEUED_', taxon))
next
}
message(taxon)
anns <- experiment.meta[ee.ID %in% experiment.meta[ee.Taxon == taxon, unique(ee.ID)]] %>%
.[rsc.ID %in% get(paste0('QUEUED_', taxon))] %>%
.[, result.ID := as.integer(sapply(strsplit(rsc.ID, '.', T), '[[', 2))] %>%
.[, .(result.ID, ee.ID, rsc.ID, cf.Cat, cf.Baseline, cf.Val)]
synchronise({
async_map(1:nrow(anns), async(function(i) {
message(paste0(i, '/', nrow(anns), '...'))
passthrough <- anns[i, .(rsc.ID, cf.Cat, cf.Baseline, cf.Val)]
# Pull the DE results
gemmaAPI::getDatasetDE(anns[i, ee.ID], keepNonSpecific = F, consolidate = 'average', limit = 0, diffExSet = anns[i, result.ID], async = T)$then(function(de) {
# Test if our HTTP request worked...
# Sometimes we get 500 errors if the server times out on us,
# so we'll have to come back to those
if('data.table' %in% class(de)) {
# Unqueue safely if there are no associated vectors
if(nrow(de[, unlist(expr, F)]) == 0)
list(unqueue = passthrough[, rsc.ID])
else { # Compute fold changes otherwise
sampleInfo <- gemmaAPI::getDatasetSamples(de[, ee.ID]) %>%
.[, .(sample.Name, sample.Characteristics, sample.FactorValues)]
# Decide if each sample is in the baseline or the contrast
baselines <- sampleInfo[sampleInfo[, sapply(sample.FactorValues, function(x) {
sum(sapply(x[, name], grepl, passthrough[, cf.Baseline])) > sum(sapply(x[, name], grepl, passthrough[, cf.Val]))
})], sample.Name]
list(unqueue = passthrough[, rsc.ID],
data = de[, unlist(expr, F)] %>% .[, setdiff(sampleInfo[, sample.Name], baselines), with = F] %>%
rowMeans(na.rm = T) %>% `/`(de[, unlist(expr, F)] %>% .[, baselines, with = F] %>% rowMeans(na.rm = T)) %>% {
data.table(entrez.ID = de[, unlist(expr, F)][, gene.ID], fc = log2(.))
} %>% setnames(2, passthrough[, rsc.ID])
)
}
}
})
}), .limit = floor(BATCH_SIZE / 30))
}) %>% {
# Unqueue non-failures
assign(paste0('QUEUED_', taxon),
Filter(function(x) !(x %in% unlist(lapply(., '[[', 'unqueue'))), get(paste0('QUEUED_', taxon))), envir = globalenv())
# Recompute the differential expression set if there are any successes
if(any(sapply(., length) == 2)) {
diffEx <- Reduce(function(...) merge(..., all = T, by = 'entrez.ID'), lapply(.[lengths(.) == 2], '[[', 'data'))
if(exists(paste0('diffEx_', taxon)))
assign(paste0('diffEx_', taxon), merge(get(paste0('diffEx_', taxon)), diffEx, by = 'entrez.ID', all = T), envir = globalenv())
else
assign(paste0('diffEx_', taxon), diffEx, envir = globalenv())
saveRDS(get(paste0('diffEx_', taxon)), paste0('/space/scratch/jsicherman/Gemma/diffEx_', taxon, '.rds'))
}
invisible(NULL)
}
}
}