35_format_phyloseq.r

#' ---
#' title: "Inclusion of sample data into CSS'd `phyloseq` object"
#' author: "Paul Czechowski"
#' date: "October 22nd, 2016"
#' output: pdf_document
#' toc: true
#' highlight: zenburn
#' bibliography: ./references.bib
#' ---
#'
#' # Preface
#' 
#' This code is tested using a raw R terminal. Path names are defined relative 
#' to the project directory. This code commentary is included in the R code itself
#' and can be rendered at any stage using  `rmarkdown::render ("./35_format_phyloseq.R")`.
#' Please check the session info  at the end of the document for further notes 
#' on the coding environment. This script reads CSS'd data.
#'
#' # Prerequisites
#' 
#' **This script uses the `phyloseq` object from repository `pcm_modelling`**.
#' In `pcm_modelling` the generation of .`560_psob_18S_css_filtered.RData`
#' and `560_psob_18S_default_filtered.RData` is thoroughly 
#' documented. Here the former object is used, in which abundance were corrected
#' using cumulative sum scaling [@Paulson2013]. Further prerequisites: 
#'
#' * This script is located in the project parent directory.
#' * `10_import_predictors.pdf` and `20_format_predictors.R` were run.
#' * Object generated by these scripts are available in 
#' `/Zenodo/R_Objects`.
#' 
#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#'
#' # Environment preparation
#'
#' ## Package loading and cleaning of workspace
#' 
#+ message=FALSE, results='hide'
library ("phyloseq")  # required package for import of .biom tables from QIIME
library ("gridExtra") # package for re-arrangement of plots on page
library ("ggplot2")   # for plotting of trees
rm(list=ls())         # clear R environment
                      # working directory is current directory by default 
                      # and needs not to be set

#' ## Setting locations for data import and export
#' 
#' This script uses the objects generated by `20_format_predictors.R` that are located
#' in the `Zenodo` directory tree. It will also write to that location. The 
#' number in front of the file name denotes the source script. The `phyloseq`
#' object used here is from the repository `pcm_modelling` and has complete 
#' eukaryotic data available, abundance corrected with the `css` algorithm in Qiime
#' 1.9.
#'
#' ### Import  Locations
#'
#' Reading CSS'd data.
path_phsq_18S_in <- "./Zenodo/R_Objects/560_psob_18S_css_filtered.RData" 
                      
#' Diagnostic message.:
message("Loading: ", path_phsq_18S_in) 

#' Formatted predictor data.:
path_predictors  <- "./Zenodo/R_Objects/20_predictors.Rdata"

#' ### Export Locations
#'
#' Processed object with corrected dimension names and predictors.
path_phsq_ob   <- "./Zenodo/R_Objects/35_phsq_ob.Rdata"
                 
#' Work space for future reference. 
path_workspace <- "./Zenodo/R_Objects/35_workspace.Rdata"

#' ## Functions
#'
#' Load script with functions.
source (file.path ("00_functions.r", fsep = .Platform$file.sep))

#' This function renames data in slots of phyloseq objects. Necessary for 
#' downstream analysis, and when two genes are analysed in parallel.
label_slots <- function (psq_obj, gene = c("18S","COI")) {
  
  # for function testing 
  # ====================
  # load (path_phsq_18S_in)
  # psq_obj <- ps_ob_18S 
  # gene <- "18S"
  
  # define maker prefix in merged data
  if (gene  == "18S" ){
      prefix = "rDNA_"
  } else if (gene  == "COI"){
      prefix = "mtDNA_"
  } 
  
  # modify ** PHYLOTYPE ** identifiers to be UNIQUE FOR EACH GENE
  # =============================================================
  ## add prefix to OTU table in current Phyloseq object
  dimnames (psq_obj@otu_table@.Data)[[1]] <- 
    paste0 (prefix, dimnames (psq_obj@otu_table@.Data)[[1]])
   
  ## add prefix to taxonomy table in current Phyloseq object
  dimnames (psq_obj@tax_table@.Data)[[1]] <- 
    paste0 (prefix, dimnames (psq_obj@tax_table@.Data)[[1]])
   
  ## add prefix to reference sequences table
	psq_obj@refseq@ranges@NAMES <- paste0 (prefix, psq_obj@refseq@ranges@NAMES)	
	 
  ## Show a diagnostic message 
  message("Made PHYLOTYPE names UNIQUE in Phyloseq object slots for gene ", gene,
    " with prefix ", prefix)
  
  # modify ** SAMPLE ** identifiers to be IDENTICAL FOR EACH GENE
  # =============================================================
  ## strip suffix from OTU table in current Phyloseq object
  dimnames (psq_obj@otu_table@.Data)[[2]] <- 
    gsub ('.{4}$', '', dimnames (psq_obj@otu_table@.Data)[[2]])

  ## Show a diagnostic message 
  message("Modified SAMPLE names Phyloseq object by stripping the last four characters.")
  
  # return modified phyloseq object
  return (psq_obj)
}

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#'
#' # Data import
#' 
#' Phylotype data is imported using basic R functionality. Imported are phyloseq
#' objects [@McMurdie2013].
load (path_phsq_18S_in) # object name is "phsq_ob"

#' Also import the predictors.
load (path_predictors) # object name is "predictors" 

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#'
#' # Filtering `phyloseq` data
#'
#' And hand is a `phyloseq` object from a project dealing with all eukaryotes, and with
#' all locations. In order for this object to be congruent with the downstream work flow.
#' Sample area "Reinbolt Hills" needs to be removed. Also everything apart from 
#' invertebrates needs to be removed.

# remove superfluous sample region
phsq_ob <- subset_samples (phsq_ob, sample_data (phsq_ob)$AREA != "Reinbolt_Hills")

#' Give a first count to know how many samples were processed.

length(sample_names(phsq_ob))                      # how many samples...
length (sample_data(phsq_ob)$AREA)                 # ... again jsut to be sure? 
length (which (sample_data(phsq_ob)$AREA == "Mount_Menzies"))     # 20 samples MM
length (which (sample_data(phsq_ob)$AREA == "Mawson_Escarpment")) # 64 samples ME
length (which (sample_data(phsq_ob)$AREA == "Lake_Terrasovoje"))  # 64 samples LT

#' Marker coverage  
levels (sample_data(phsq_ob)$GENE)                      # of two gene combinations
length (which (sample_data(phsq_ob)$GENE == "both"))    # 58 both genes
length (which (sample_data(phsq_ob)$GENE == "18Sonly")) # 77 18Sonly
length (which (sample_data(phsq_ob)$GENE == "COIonly")) #  1 COIonly,

# retain invertebrates
phsq_ob <- subset_taxa (phsq_ob, Phylum == "Rotifera" | Phylum ==  "Tardigrada"
  |  Phylum == "Arthropoda" |  Phylum == "Nematoda")

# remove classes that must be mis-identifications, since data was subset to non-
#   control, Antarctic samples only. 
phsq_ob <- subset_taxa (phsq_ob, Class != "Insecta" & Class !=  "Diplopoda" & 
  Class != "Maxillopoda" ) # & Class != "Collembola"

#' As a precaution, filtering for empty samples and phylotypes, again.
phsq_ob <- remove_empty (phsq_ob)

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#' 
#' # Merging of data-set with abiotic data  
#'
#' ## Deleting the superfluous `sample_data()` slots
#'
#' The predictors used in this project are re-formatted in 
#' `20_format_predictors.R` and will be included into the analysis work-flow later.
#' To avoid confusion, the similar data contained in the `sample_data()`
#' slots of the current `phyloseq` objects will be discarded here.
phsq_ob <- phyloseq (otu_table(phsq_ob), tax_table(phsq_ob), refseq(phsq_ob))

#' ## Re-labelling sample and phylotype identifiers in phyloseq objects.
#' 
#' The following calls will rename the phylotype names and sample names in the
#' two phylotype objects.
#' <!-- To test this un-comment the following lines --> 
#' <!-- tax_table(phsq_ob) -->
#' <!-- tax_table(prepare_merging (phsq_ob, "18S")) -->
#' <!-- dimnames(phsq_ob@otu_table@.Data)[[1]] -->
#' <!-- dimnames(phsq_ob@tax_table@.Data)[[1]] -->
#' <!-- phsq_ob@refseq@ranges@NAMES -->
#' <!-- dimnames(phsq_ob@otu_table@.Data)[[2]] -->

phsq_ob <- label_slots (phsq_ob, "18S")
 
#' <!-- check the results here -->
#' <!-- sample_names(phsq_ob) -->
#' <!-- dimnames(phsq_ob@otu_table@.Data)[[1]] -->
#' <!-- dimnames(phsq_ob@tax_table@.Data)[[1]] -->
#' <!-- phsq_ob@refseq@ranges@NAMES -->
#' <!-- dimnames(phsq_ob@otu_table@.Data)[[2]] -->

#' ## Merging data-sets into new phyloseq object(s)
#' 
#' As a precaution, filtering for empty samples and phylotypes, again.
phsq_ob <- remove_empty (phsq_ob)
  
#' The following `phyloseq` object contains the predictor data. The abundance 
#' values are added up to give proportional reads. Sequence data is not carried
#' through at this point anymore. 
phsq_ob <- phyloseq (otu_table(phsq_ob), tax_table(phsq_ob), sample_data(predictors))

#' <!-- checking object -->

#' <!-- otu_table  (phsq_ob) -->
#' <!-- tax_table  (phsq_ob) -->
#' <!-- rank_names (phsq_ob) -->
#' <!-- sample_data (phsq_ob) -->

#' <!-- THIS MISTAKE IS CORRECTED IN 20_FIELD_DATA.R -->
#' <!-- -------------------------------------------- -->
#' <!-- # some samples have incorrect information regarding locus availability -->
#' <!-- sample_data(phsq_ob)$GENE == "none" -->
#' <!-- sample_names(phsq_ob)[114:115] # "2.10.E" "2.10.C" -->

#' <!-- # there is phylotype information available for 18S -->
#' <!-- otu_table(phsq_ob_comb)[ , which (colnames(otu_table(phsq_ob_comb)) %in% c("2.10.E", "2.10.C"))] -->

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#'
#' Undoing the log scaling of CSS'd abundance counts. This was experimental 
#' and is not used anymore:

# phsq_ob <- get_exp(phsq_ob)

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#'
#' # Checking output object  
#'
#' ## Some basic counts from the newly modified phyloseq object 
#'

# 90 samples in the data set
length(sample_names(phsq_ob))

# sample locations 
length (sample_data(phsq_ob)$AREA)                 # of 90 samples
length (which (sample_data(phsq_ob)$AREA == "MM")) # 8 samples from MM
length (which (sample_data(phsq_ob)$AREA == "ME")) # 38 samples from ME
length (which (sample_data(phsq_ob)$AREA == "LT")) # 44 samples from ME

# marker coverage  
levels (sample_data(phsq_ob)$GENE)                      # of two gene combinations
length (which (sample_data(phsq_ob)$GENE == "both"))    # 42 both genes
length (which (sample_data(phsq_ob)$GENE == "18Sonly")) # 48 18Sonly
length (which (sample_data(phsq_ob)$GENE == "COIonly")) #  0 COIonly,

#' ## Plotting the newly modified phyloseq object 
#' 
#' The following plots are agglomerated on a specified level by calling 
#' `agglomerate()`. The first plot shows proportional abundance of invertebrates
#' in rarefied data. In the second plot, all abundances are converted to "1",
#' (via `make_binary ()`)and the rank composition is more visible. Create the 
#' plots.:
pl1 <- barplot_samples (agglomerate (phsq_ob, "Class"), "Class")
pl2 <- barplot_samples (make_binary (agglomerate (phsq_ob, "Class")), "Class")

#' Showing the plots.:
#+ message=FALSE, results='hide', warning=FALSE, fig.width=7, fig.height=7, dpi=200, fig.align='center',  fig.cap="Invertebrate abundances at class level. Counts are RAW sequences, CSS not applied here. (approx. code line 293)"
grid.arrange(pl1, pl2, nrow = 2)

#' <!-- #################################################################### -->


#' <!-- #################################################################### -->
#'
#' # Write data to disk 
#' 
#' Saved are object created by this script as well as command history and work-space
#' image.
save (phsq_ob, file = path_phsq_ob) # phyloseq object
save.image (path_workspace)         # work-space

#' # Session info
#' 
#' The code and output in this document were tested and generated in the 
#' following computing environment:
#+ echo=FALSE

sessionInfo()

#' # References