-
Notifications
You must be signed in to change notification settings - Fork 1
/
35_format_phyloseq.r
315 lines (253 loc) · 12 KB
/
35_format_phyloseq.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
#' ---
#' title: "Inclusion of sample data into CSS'd `phyloseq` object"
#' author: "Paul Czechowski"
#' date: "October 22nd, 2016"
#' output: pdf_document
#' toc: true
#' highlight: zenburn
#' bibliography: ./references.bib
#' ---
#'
#' # Preface
#'
#' This code is tested using a raw R terminal. Path names are defined relative
#' to the project directory. This code commentary is included in the R code itself
#' and can be rendered at any stage using `rmarkdown::render ("./35_format_phyloseq.R")`.
#' Please check the session info at the end of the document for further notes
#' on the coding environment. This script reads CSS'd data.
#'
#' # Prerequisites
#'
#' **This script uses the `phyloseq` object from repository `pcm_modelling`**.
#' In `pcm_modelling` the generation of .`560_psob_18S_css_filtered.RData`
#' and `560_psob_18S_default_filtered.RData` is thoroughly
#' documented. Here the former object is used, in which abundance were corrected
#' using cumulative sum scaling [@Paulson2013]. Further prerequisites:
#'
#' * This script is located in the project parent directory.
#' * `10_import_predictors.pdf` and `20_format_predictors.R` were run.
#' * Object generated by these scripts are available in
#' `/Zenodo/R_Objects`.
#'
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' # Environment preparation
#'
#' ## Package loading and cleaning of workspace
#'
#+ message=FALSE, results='hide'
library ("phyloseq") # required package for import of .biom tables from QIIME
library ("gridExtra") # package for re-arrangement of plots on page
library ("ggplot2") # for plotting of trees
rm(list=ls()) # clear R environment
# working directory is current directory by default
# and needs not to be set
#' ## Setting locations for data import and export
#'
#' This script uses the objects generated by `20_format_predictors.R` that are located
#' in the `Zenodo` directory tree. It will also write to that location. The
#' number in front of the file name denotes the source script. The `phyloseq`
#' object used here is from the repository `pcm_modelling` and has complete
#' eukaryotic data available, abundance corrected with the `css` algorithm in Qiime
#' 1.9.
#'
#' ### Import Locations
#'
#' Reading CSS'd data.
path_phsq_18S_in <- "./Zenodo/R_Objects/560_psob_18S_css_filtered.RData"
#' Diagnostic message.:
message("Loading: ", path_phsq_18S_in)
#' Formatted predictor data.:
path_predictors <- "./Zenodo/R_Objects/20_predictors.Rdata"
#' ### Export Locations
#'
#' Processed object with corrected dimension names and predictors.
path_phsq_ob <- "./Zenodo/R_Objects/35_phsq_ob.Rdata"
#' Work space for future reference.
path_workspace <- "./Zenodo/R_Objects/35_workspace.Rdata"
#' ## Functions
#'
#' Load script with functions.
source (file.path ("00_functions.r", fsep = .Platform$file.sep))
#' This function renames data in slots of phyloseq objects. Necessary for
#' downstream analysis, and when two genes are analysed in parallel.
label_slots <- function (psq_obj, gene = c("18S","COI")) {
# for function testing
# ====================
# load (path_phsq_18S_in)
# psq_obj <- ps_ob_18S
# gene <- "18S"
# define maker prefix in merged data
if (gene == "18S" ){
prefix = "rDNA_"
} else if (gene == "COI"){
prefix = "mtDNA_"
}
# modify ** PHYLOTYPE ** identifiers to be UNIQUE FOR EACH GENE
# =============================================================
## add prefix to OTU table in current Phyloseq object
dimnames (psq_obj@[email protected])[[1]] <-
paste0 (prefix, dimnames (psq_obj@[email protected])[[1]])
## add prefix to taxonomy table in current Phyloseq object
dimnames (psq_obj@[email protected])[[1]] <-
paste0 (prefix, dimnames (psq_obj@[email protected])[[1]])
## add prefix to reference sequences table
psq_obj@refseq@ranges@NAMES <- paste0 (prefix, psq_obj@refseq@ranges@NAMES)
## Show a diagnostic message
message("Made PHYLOTYPE names UNIQUE in Phyloseq object slots for gene ", gene,
" with prefix ", prefix)
# modify ** SAMPLE ** identifiers to be IDENTICAL FOR EACH GENE
# =============================================================
## strip suffix from OTU table in current Phyloseq object
dimnames (psq_obj@[email protected])[[2]] <-
gsub ('.{4}$', '', dimnames (psq_obj@[email protected])[[2]])
## Show a diagnostic message
message("Modified SAMPLE names Phyloseq object by stripping the last four characters.")
# return modified phyloseq object
return (psq_obj)
}
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' # Data import
#'
#' Phylotype data is imported using basic R functionality. Imported are phyloseq
#' objects [@McMurdie2013].
load (path_phsq_18S_in) # object name is "phsq_ob"
#' Also import the predictors.
load (path_predictors) # object name is "predictors"
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' # Filtering `phyloseq` data
#'
#' And hand is a `phyloseq` object from a project dealing with all eukaryotes, and with
#' all locations. In order for this object to be congruent with the downstream work flow.
#' Sample area "Reinbolt Hills" needs to be removed. Also everything apart from
#' invertebrates needs to be removed.
# remove superfluous sample region
phsq_ob <- subset_samples (phsq_ob, sample_data (phsq_ob)$AREA != "Reinbolt_Hills")
#' Give a first count to know how many samples were processed.
length(sample_names(phsq_ob)) # how many samples...
length (sample_data(phsq_ob)$AREA) # ... again jsut to be sure?
length (which (sample_data(phsq_ob)$AREA == "Mount_Menzies")) # 20 samples MM
length (which (sample_data(phsq_ob)$AREA == "Mawson_Escarpment")) # 64 samples ME
length (which (sample_data(phsq_ob)$AREA == "Lake_Terrasovoje")) # 64 samples LT
#' Marker coverage
levels (sample_data(phsq_ob)$GENE) # of two gene combinations
length (which (sample_data(phsq_ob)$GENE == "both")) # 58 both genes
length (which (sample_data(phsq_ob)$GENE == "18Sonly")) # 77 18Sonly
length (which (sample_data(phsq_ob)$GENE == "COIonly")) # 1 COIonly,
# retain invertebrates
phsq_ob <- subset_taxa (phsq_ob, Phylum == "Rotifera" | Phylum == "Tardigrada"
| Phylum == "Arthropoda" | Phylum == "Nematoda")
# remove classes that must be mis-identifications, since data was subset to non-
# control, Antarctic samples only.
phsq_ob <- subset_taxa (phsq_ob, Class != "Insecta" & Class != "Diplopoda" &
Class != "Maxillopoda" ) # & Class != "Collembola"
#' As a precaution, filtering for empty samples and phylotypes, again.
phsq_ob <- remove_empty (phsq_ob)
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' # Merging of data-set with abiotic data
#'
#' ## Deleting the superfluous `sample_data()` slots
#'
#' The predictors used in this project are re-formatted in
#' `20_format_predictors.R` and will be included into the analysis work-flow later.
#' To avoid confusion, the similar data contained in the `sample_data()`
#' slots of the current `phyloseq` objects will be discarded here.
phsq_ob <- phyloseq (otu_table(phsq_ob), tax_table(phsq_ob), refseq(phsq_ob))
#' ## Re-labelling sample and phylotype identifiers in phyloseq objects.
#'
#' The following calls will rename the phylotype names and sample names in the
#' two phylotype objects.
#' <!-- To test this un-comment the following lines -->
#' <!-- tax_table(phsq_ob) -->
#' <!-- tax_table(prepare_merging (phsq_ob, "18S")) -->
#' <!-- dimnames(phsq_ob@[email protected])[[1]] -->
#' <!-- dimnames(phsq_ob@[email protected])[[1]] -->
#' <!-- phsq_ob@refseq@ranges@NAMES -->
#' <!-- dimnames(phsq_ob@[email protected])[[2]] -->
phsq_ob <- label_slots (phsq_ob, "18S")
#' <!-- check the results here -->
#' <!-- sample_names(phsq_ob) -->
#' <!-- dimnames(phsq_ob@[email protected])[[1]] -->
#' <!-- dimnames(phsq_ob@[email protected])[[1]] -->
#' <!-- phsq_ob@refseq@ranges@NAMES -->
#' <!-- dimnames(phsq_ob@[email protected])[[2]] -->
#' ## Merging data-sets into new phyloseq object(s)
#'
#' As a precaution, filtering for empty samples and phylotypes, again.
phsq_ob <- remove_empty (phsq_ob)
#' The following `phyloseq` object contains the predictor data. The abundance
#' values are added up to give proportional reads. Sequence data is not carried
#' through at this point anymore.
phsq_ob <- phyloseq (otu_table(phsq_ob), tax_table(phsq_ob), sample_data(predictors))
#' <!-- checking object -->
#' <!-- otu_table (phsq_ob) -->
#' <!-- tax_table (phsq_ob) -->
#' <!-- rank_names (phsq_ob) -->
#' <!-- sample_data (phsq_ob) -->
#' <!-- THIS MISTAKE IS CORRECTED IN 20_FIELD_DATA.R -->
#' <!-- -------------------------------------------- -->
#' <!-- # some samples have incorrect information regarding locus availability -->
#' <!-- sample_data(phsq_ob)$GENE == "none" -->
#' <!-- sample_names(phsq_ob)[114:115] # "2.10.E" "2.10.C" -->
#' <!-- # there is phylotype information available for 18S -->
#' <!-- otu_table(phsq_ob_comb)[ , which (colnames(otu_table(phsq_ob_comb)) %in% c("2.10.E", "2.10.C"))] -->
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' Undoing the log scaling of CSS'd abundance counts. This was experimental
#' and is not used anymore:
# phsq_ob <- get_exp(phsq_ob)
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' # Checking output object
#'
#' ## Some basic counts from the newly modified phyloseq object
#'
# 90 samples in the data set
length(sample_names(phsq_ob))
# sample locations
length (sample_data(phsq_ob)$AREA) # of 90 samples
length (which (sample_data(phsq_ob)$AREA == "MM")) # 8 samples from MM
length (which (sample_data(phsq_ob)$AREA == "ME")) # 38 samples from ME
length (which (sample_data(phsq_ob)$AREA == "LT")) # 44 samples from ME
# marker coverage
levels (sample_data(phsq_ob)$GENE) # of two gene combinations
length (which (sample_data(phsq_ob)$GENE == "both")) # 42 both genes
length (which (sample_data(phsq_ob)$GENE == "18Sonly")) # 48 18Sonly
length (which (sample_data(phsq_ob)$GENE == "COIonly")) # 0 COIonly,
#' ## Plotting the newly modified phyloseq object
#'
#' The following plots are agglomerated on a specified level by calling
#' `agglomerate()`. The first plot shows proportional abundance of invertebrates
#' in rarefied data. In the second plot, all abundances are converted to "1",
#' (via `make_binary ()`)and the rank composition is more visible. Create the
#' plots.:
pl1 <- barplot_samples (agglomerate (phsq_ob, "Class"), "Class")
pl2 <- barplot_samples (make_binary (agglomerate (phsq_ob, "Class")), "Class")
#' Showing the plots.:
#+ message=FALSE, results='hide', warning=FALSE, fig.width=7, fig.height=7, dpi=200, fig.align='center', fig.cap="Invertebrate abundances at class level. Counts are RAW sequences, CSS not applied here. (approx. code line 293)"
grid.arrange(pl1, pl2, nrow = 2)
#' <!-- #################################################################### -->
#' <!-- #################################################################### -->
#'
#' # Write data to disk
#'
#' Saved are object created by this script as well as command history and work-space
#' image.
save (phsq_ob, file = path_phsq_ob) # phyloseq object
save.image (path_workspace) # work-space
#' # Session info
#'
#' The code and output in this document were tested and generated in the
#' following computing environment:
#+ echo=FALSE
sessionInfo()
#' # References