Skip to content

Latest commit

 

History

History
1135 lines (820 loc) · 42.5 KB

OMZ_SAG_Compendium_Figures_Markdown_And_Tables.md

File metadata and controls

1135 lines (820 loc) · 42.5 KB

OMZ Compendium Figures And Tables

Julia Anstett 2023-03-21

First, we will import the libraries used to generate all figures for this paper

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
library(dendextend)
## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan
## 
## ---------------------
## Welcome to dendextend version 1.16.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## 
## Attaching package: 'dendextend'
## 
## The following object is masked from 'package:permute':
## 
##     shuffle
## 
## The following object is masked from 'package:stats':
## 
##     cutree
library(sf)
## Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
library("rnaturalearth")
library("rnaturalearthdata")
## 
## Attaching package: 'rnaturalearthdata'
## 
## The following object is masked from 'package:rnaturalearth':
## 
##     countries110
library(egg)
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

Next, we will import the data used to generate all files in this paper

#All figures use OMZ
OMZ<-read.csv("Data/Update_Global_SAGs_Mar_21_2023_GTDB.csv", header = TRUE)

#Figures 3, 4, S2-6 use the truncated Phylum labels for ploting
Tax_list_labs<-read.csv("Data/Tax_List_Labs_Sept_23_2021.csv", header = FALSE)

#For Figure 1B
oxygen1<-read.csv("Data/woa18_all_o00mn01.csv", header = TRUE)
oxygen5<-read.csv("Data/woa18_all_o00mn5d.csv", header = TRUE)


#For Figure 3
CD_Hits<-read.csv("Data/CD-Hit_OTUS_July_05_2022.csv", header= TRUE)
SSU_Seqs<-read.csv("Data/SSU_Seqs.csv", header = TRUE)

###Figures

Figure 1B

Here, I will make the map with the OMZ Counts

#First, we'll round up the Latitude and Longitide to the nearest degree
OMZ<- OMZ %>% mutate(Round_Lat=round(OMZ$Lat))
OMZ<- OMZ %>% mutate(Round_Long=round(OMZ$Long))

plot_points<-OMZ %>% select(Region,Round_Lat, Round_Long)
plot_points<-unique(plot_points)


counter<-1
for (i in 1:dim(plot_points)[1]){
  input<-tally(OMZ %>% filter (Region==plot_points[i,1],Round_Lat==plot_points[i,2], Round_Long==plot_points[i,3]))
  plot_points[counter,4]<-input
  counter<- counter+1 
}

colnames(plot_points)[4]<-"Total"

counter<-1
for (i in 1:dim(plot_points)[1]){
  input<-tally(OMZ %>% filter (Region==plot_points[i,1],Round_Lat==plot_points[i,2], Round_Long==plot_points[i,3], Status=="Sequenced"))
  plot_points[counter,5]<-input
  counter<- counter+1 
}

colnames(plot_points)[5]<-"Sequenced"

min_O2_1_test<-oxygen1 %>% select(-LATITUDE, -LONGITUDE)
min_O2_5_test<-oxygen5 %>% select(-LATITUDE, -LONGITUDE)


min_O2_1<-oxygen1 %>% select(LATITUDE, LONGITUDE) %>% mutate(uM=apply(min_O2_1_test, 1, FUN=min, na.rm=TRUE))
min_O2_plot_1<-min_O2_1%>% mutate(uM=replace(min_O2_1$uM, min_O2_1$uM>=200, 200))




min_O2_5<-oxygen5 %>% select (LATITUDE, LONGITUDE) %>% mutate(uM=apply(min_O2_5_test, 1, FUN=min, na.rm=TRUE))
min_O2_plot_5<-min_O2_5%>%mutate(uM=replace(min_O2_5$uM, min_O2_5$uM>=200, 200))



rainbow1<-c('#9e0142','#d53e4f','#f46d43','#fdae61','#fee08b','#ffffbf','#e6f598','#abdda4','#66c2a5','#3288bd','#5e4fa2', '#000066')
revrainbow1<-rev(rainbow1)

mp <- NULL



world <- ne_countries(scale = "medium", returnclass = "sf")


colnames(plot_points)[2]<-"Latitude"
colnames(plot_points)[3]<-"Longitude"

sites<-plot_points
sites <- st_as_sf(sites, coords = c("Longitude", "Latitude"), 
                  crs = 4326, agr = "constant")

sf_use_s2(FALSE)
## Spherical geometry (s2) switched off
#mapWorld <- borders("world", colour="grey40", fill="grey40") # create a layer of borders
mp <- ggplot() +

  
  geom_raster(aes(x=min_O2_plot_5$LONGITUDE, y=min_O2_plot_5$LATITUDE, fill=min_O2_plot_5$uM)) +
  geom_raster(aes(x=min_O2_plot_1$LONGITUDE, y=min_O2_plot_1$LATITUDE, fill=min_O2_plot_1$uM)) +
  scale_fill_gradientn(colours =revrainbow1, name="Minimum Oxygen Concentration (uM)")+
  
  geom_sf(data=world, color = "grey40", fill = "grey40") +
  
  
  geom_sf(data = sites, aes(size = sites$Total), shape = 21, fill = "white") +  
  geom_sf(data = sites, aes(size = sites$Sequenced), shape = 21, fill = "black") +
  scale_size(range = c(1, 10))+
  labs(size="Number of SAGs")+
  geom_sf()+
  coord_sf(expand = FALSE) + 
  xlab("Longitude") + 
  ylab("Latitude")

mp

ggsave("Outputs/FIG-1B-OMZ_Map.pdf",mp, width=15, height = 14, units = "in")

Figure 3A

This will produce the dendrogram and the clustering annotation bars.

#First, we need to convert the CD-Hit table to a readable and assign a cluster ID to each SSU sequence
counter<-0
start_ID<-0
for (i in 1:dim(CD_Hits)[1]){
  tmp<-CD_Hits$Cluster_ID[i]
  
  if (tmp==counter){
    CD_Hits$Cluster_ID[i]<-paste("Cluster", start_ID, sep="_")
    
  }else{
    counter<-0
    start_ID<-start_ID+1
    CD_Hits$Cluster_ID[i]<-paste("Cluster", start_ID, sep="_")
  }
  counter<-counter+1
}

#Variable Declaration so we can get ready to get the proportions of the amplified and sequenced representatives used in the distance matrix
props_Amp<-data.frame()
props_Seq<-data.frame()

props_Amp_WGS<-data.frame()
props_Amp_SSU<-data.frame()

SSU_plot<-data.frame()

#Grab all of the unique SILVA 138.1 taxonomies
Tax_list<-unique(OMZ %>% select(SILVA_138_1_Tax) %>% arrange (SILVA_138_1_Tax))

#Reduce the dataset
wga_apporach<-OMZ%>% select(Site_ID, O2._.uM., WGA_Approach, Sorting_Method, OMZ_Phenotype, Topography)
wga_apporach_DNA_Only<-wga_apporach %>% filter(Sorting_Method=="DNA")
wga_apporach_DNA_Only<-wga_apporach_DNA_Only %>%  distinct(Site_ID, .keep_all = TRUE)


#Variable Declaration to get ready to plot and count

Plot_labs<-c()


OMZ_Counts_Amp<-data.frame()
OMZ_Counts_Seq<-data.frame()

OMZ_plot<-data.frame()


#Set up the Taxonomy List


for (i in 1: dim(OMZ)[1]){
  for (j in 1:dim (Tax_list_labs)[1]){
    pattern<-as.character(Tax_list_labs[j,1])
    tmp_match<-grepl(pattern, as.character(OMZ$SILVA_138_1_Tax[i]))
    
    if (tmp_match==TRUE){
      OMZ$Plot_Taxa[i]<-pattern
    }
  }
}


#Next, we're going to subset for the anonymously sorted SAGs, and exclude SAGs that did not cover the SSU rRNA V4-V5 region, as well as
#any unclassified SAGs

DNA_Sorted<- OMZ %>% filter(Sorting_Method=="DNA")
DNA_Sites<- as.character(unique(DNA_Sorted$Site_ID))
SSU_DNA_Only<-DNA_Sorted %>% filter(Primary_Tax_Method=="16S", Target_Region!="V6-V8", SILVA_138_1_Tax !="Unclassified")
SSU_Sites<- as.character(unique(SSU_DNA_Only$Site_ID))


#This brings in the appropriate cluster IDs
for (i in 1:dim(SSU_DNA_Only)[1]){
  pattern<-as.character(SSU_DNA_Only$Sample_ID[i])
  tmp_match<- CD_Hits %>% filter (Sample_ID==as.character(pattern))
  SSU_DNA_Only$Cluster_ID[i]<-tmp_match$Cluster_ID
    
}

#Here, we're going to identify and record all of the representative SAGs for each cluster
OTU_Reps_Phylum<-data.frame()

OTU_Reps_Phylum<-CD_Hits %>% filter(Identity=="*")


cluster_names<-c()
counter<-1
for (i in 1:dim(OTU_Reps_Phylum)[1]){
    pattern<-as.character(OTU_Reps_Phylum$Sample_ID[i])
    tmp_match<- DNA_Sorted %>% filter(Sample_ID==pattern)
    
    OTU_Reps_Phylum[i,4]<-as.character(tmp_match$SILVA_138_1_Tax)
    OTU_Reps_Phylum[i,5]<-as.character(tmp_match$Plot_Taxa)
 
}

colnames(OTU_Reps_Phylum)[4]<-"SILVA_138_1_Tax"
colnames(OTU_Reps_Phylum)[5]<-"Phylum"

OTU_Reps_Phylum<-OTU_Reps_Phylum %>% filter(SILVA_138_1_Tax!="Unclassified")

SSU_Seqs_Rep<-data.frame()

for (i in 1:dim(OTU_Reps_Phylum)[1]){
  tmp_SSU_Seqs_Rep<-SSU_Seqs %>% filter(Sample_ID==as.character(OTU_Reps_Phylum$Sample_ID[i]))
  SSU_Seqs_Rep<-rbind(SSU_Seqs_Rep,tmp_SSU_Seqs_Rep)
}

write.csv(SSU_Seqs_Rep, "Outputs/SSU_Reps_Mar_21_2023.csv")
SSU_wga_apporach_DNA_Only<-SSU_DNA_Only %>%  distinct(Site_ID, .keep_all = TRUE)

SSU_Clusters<-unique(SSU_DNA_Only$Cluster_ID)

#######################################################################################################


#calculate amplifed 16S abundances for Full Taxonomy
for (i in 1:dim(Tax_list)[1]){
  for (j in 1: length(SSU_Sites)){
    input<-tally(SSU_DNA_Only %>% filter (SILVA_138_1_Tax==Tax_list[i,1], Site_ID==SSU_Sites[j]))/
      tally (SSU_DNA_Only %>% filter (Site_ID==SSU_Sites[j]))
    props_Amp_SSU[j,i]<-input
  }
} 



rownames(props_Amp_SSU)<-SSU_Sites
colnames(props_Amp_SSU)<-Tax_list[,1]




props_Amp_SSU_OTU<-data.frame()
OTU_List<-unique(SSU_DNA_Only$Cluster_ID)


#calculate amplifed 16S abundances for Full Taxonomy
for (i in 1:length(OTU_List)){
  for (j in 1: length(SSU_Sites)){
    input<-tally(SSU_DNA_Only %>% filter (Cluster_ID==OTU_List[i], Site_ID==SSU_Sites[j]))/
      tally (SSU_DNA_Only %>% filter (Site_ID==SSU_Sites[j]))
    props_Amp_SSU_OTU[j,i]<-input
  }
} 


rownames(props_Amp_SSU_OTU)<-SSU_Sites
colnames(props_Amp_SSU_OTU)<-OTU_List



################################################################################3
#Cluster By OTU Cluster

#################################################################
dist_SSU_prop_bray_otu<-vegdist(props_Amp_SSU_OTU, method = "bray")
clust_SSU_prop_bray_a_otu<-hclust(dist_SSU_prop_bray_otu, method="average")

dend1<-(rotate(as.dendrogram(clust_SSU_prop_bray_a_otu), order = c(1:7,18:34, 16:17 ,8:15)))

#Set Annotation Bar for Oxytype
strip_labels<-labels(dend1)
wga_colors<-c()
phenotype_colours<-c()

#calculate amplifed 16S abundances for Full Taxonomy
for (j in 1: length(strip_labels)){
  y<-SSU_wga_apporach_DNA_Only %>% filter(Site_ID==strip_labels[j]) 
  
  tmp_wga_approach<-as.character(y$WGA_Approach)
  
  
  wga_colors<-c(wga_colors, tmp_wga_approach)
  phenotype_colours<-c(phenotype_colours, as.character(y$OMZ_Phenotype))
  
  
}

wga_colors<-replace(wga_colors, wga_colors=="MDA", "#f4a582")
wga_colors<-replace(wga_colors, wga_colors=="WGA-X", "#d1e5f0")

phenotype_colours<-replace(phenotype_colours, phenotype_colours=="Open_Ocean_OMZ", "#ffffb3")
phenotype_colours<-replace(phenotype_colours, phenotype_colours=="Low_Oxygen_OMZ", "#fccde5")
phenotype_colours<-replace(phenotype_colours, phenotype_colours=="AMZ", "#8dd3c7")
phenotype_colours<-replace(phenotype_colours, phenotype_colours=="Sulfidic_Bottom", "#bebada")

plot_colours<-cbind(phenotype_colours, wga_colors)

pdf("Outputs/FIG-3A_Dend_avg_Mar_21_2023.pdf", width = 15, height = 5)
par(mar = c(15,4,1,1))
plot(dend1)
colored_bars(colors = plot_colours, dend=dend1, sort_by_labels_order = FALSE)
dev.off()
## png 
##   2

##Figure 3B Dot Plot for anonymously sorted SSU Samples

#Since we've already clustered our data, we simply need the order the order from the clustering for the sites, the taxonomic labels, 
#and the count data

SSU_Tax_labs<-as.data.frame(unique(SSU_DNA_Only$Plot_Taxa))
colnames(SSU_Tax_labs)<-"Plot_Taxa"

Taxa_to_drop<-c()

for (i in 1:dim(Tax_list_labs)[1]){
  tmp_tax<-Tax_list_labs[i,1]
  tmp_ssu_tax<-SSU_Tax_labs %>% filter (Plot_Taxa==as.character(tmp_tax))
  
  if (dim(tmp_ssu_tax)[1]==0){
    Taxa_to_drop<-append(Taxa_to_drop,as.character(tmp_tax))
  }
}

colnames(Tax_list_labs)<-"Plot_Taxa"
Tax_list_labs<-Tax_list_labs[Tax_list_labs!=Taxa_to_drop,]
Tax_list_labs<-as.vector(Tax_list_labs)
Tax_list_labs<-Tax_list_labs[!Tax_list_labs %in% Taxa_to_drop]

#calculate amplifed 16S abundances for Taxonomy at Phylum Level
counter<-1
for (i in 1:length(Tax_list_labs)){
  for (j in 1: length(SSU_Sites)){
    input<-tally(SSU_DNA_Only %>% filter (Plot_Taxa==as.character(Tax_list_labs[i]), Site_ID==as.character(SSU_Sites[j])))
    
    SSU_plot[counter,1]<-as.character(SSU_Sites[j])
    SSU_plot[counter,2]<-as.character(Tax_list_labs[i])
    SSU_plot[counter,3]<-NA
    SSU_plot[counter,4]<-input
    
    
    counter<-counter+1 
  }
}

oxytype_ssu<-SSU_DNA_Only%>%  distinct(Site_ID, .keep_all = TRUE)


for (i in 1:length(Tax_list_labs)){
  for (j in 1: length(SSU_Sites)){
    
    input<-tally(SSU_DNA_Only %>% filter (Plot_Taxa==Tax_list_labs[i], Site_ID==as.character(SSU_Sites[j]), Status=="Sequenced"))
    
    SSU_plot[counter,1]<-SSU_Sites[j]
    SSU_plot[counter,2]<-Tax_list_labs[i]
    SSU_plot[counter,3]<-oxytype_ssu$O2._.uM.[j]
    SSU_plot[counter,4]<-input
    
    counter<- counter+1 
  }
}

colnames(SSU_plot)<-c("Site", "Taxonomy", "O2_uM", "Count")
SSU_plot$Count[SSU_plot$Count==0]<-NA

SSU_plot<-SSU_plot%>%mutate(O2_uM=replace(SSU_plot$O2_uM,SSU_plot$O2_uM>=200, 200))

#This is the colour pallet we're using, notice that it's the same as Figure 1B
rainbow1<-c('#9e0142','#d53e4f','#f46d43','#fdae61','#fee08b','#ffffbf','#e6f598','#abdda4','#66c2a5','#3288bd','#5e4fa2', '#000066')
revrainbow1<-rev(rainbow1)


p_SSU_h<- ggplot(SSU_plot, aes(x =Site, y = Taxonomy))+ 
  geom_point(na.rm=TRUE, aes(size=Count, fill=O2_uM), alpha=0.7, shape =21, colour="black")+
  
  scale_y_discrete(limits = rev(Tax_list_labs)) +
  scale_x_discrete(limits= strip_labels,position="top") +
  scale_fill_gradientn(colours =revrainbow1, name="Oxygen Concentration (uM)", na.value = "grey")+
  theme_bw()+
  theme(axis.text.x = element_blank())+
  scale_size_continuous(breaks = c(25, 50, 75, 100, 125), range = c(3,15) )+
  labs(size="Number of SAGs")
p_SSU_h

ggsave("Outputs/FIG-3B_Dot_O2_Map_Colour_Mar_21_2023.pdf",p_SSU_h, width=20, height = 15, units = "in")

##Figure 4 CheckM Completeness and Contamination Estimate Plots Here, we’re going to plot panels A-F

Tax_list_labs<-read.csv("Data/Tax_List_Labs_Sept_23_2021.csv", header = FALSE)

for (i in 1: dim(OMZ)[1]){
  for (j in 1:dim (Tax_list_labs)[1]){
    pattern<-as.character(Tax_list_labs[j,1])
    match<-grepl(pattern, as.character(OMZ$SILVA_138_1_Tax[i]))
    
    if (match==TRUE){
      OMZ$Plot_Taxa[i]<-pattern
      Plot_labs<-pattern
    }
  }
}

Seq_OMZ<-OMZ %>% filter(Status=="Sequenced")

unique_phylum<-unique(Seq_OMZ$Plot_Taxa)

Phylum_drop<-c()
counter<-1
for (i in 1:dim(Tax_list_labs)[1]){
  pattern<-as.character(Tax_list_labs[i,1])
  tmp_match<-match(pattern, unique_phylum)
  
  if(is.na(tmp_match)==TRUE){
    Phylum_drop<-c(Phylum_drop,i)
  }
}

Tax_list_labs<-as.vector(Tax_list_labs[-Phylum_drop,])


##############################################################

#We're going to make one base plot, and make each sub plot individually, then arrange them together with ggarrange.

Seq_OMZ$Plot_Taxa<-as.factor(Seq_OMZ$Plot_Taxa)
Seq_OMZ$Plot_Taxa<- factor(Seq_OMZ$Plot_Taxa, levels=as.vector(Tax_list_labs))

Seq_OMZ$OMZ_Phenotype<-as.factor(Seq_OMZ$OMZ_Phenotype)
Seq_OMZ$OMZ_Phenotype<- factor(Seq_OMZ$OMZ_Phenotype, levels=c("Open_Ocean_OMZ", "Low_Oxygen_OMZ", "AMZ", "Sulfidic_Bottom"))

OMZ_CC<-ggplot(Seq_OMZ, aes(x=Completeness, y=Contamination))
OMZ_CC<-Seq_OMZ %>% filter(Contamination<=5) %>% ggplot( aes(x=Completeness, y=Contamination))

####All_CCs
All_Tax<-OMZ_CC +
  geom_point(na.rm = TRUE, aes(fill=Plot_Taxa, size=Assembly_Length_MBP), shape =21, colour="black") +
  geom_vline(xintercept = 50)+
  geom_vline(xintercept= 90, linetype="dashed")+
  geom_hline(yintercept = 5, linetype="dashed")+
  guides(fill = guide_legend(order=1,override.aes = list(size=5)))+
  theme_classic()+
  labs(size="Assembly Length (Mbp)", fill="Taxonomy")+
  ylab("% Contamination")+
  xlab("% Completeness")
All_Tax

All_Region<-OMZ_CC +
  geom_point(na.rm = TRUE, aes(fill=Region, size=Assembly_Length_MBP), shape =21, colour="black") +
  guides(fill = guide_legend(order=1,override.aes = list(size=5)))+
  geom_vline(xintercept = 50)+
  geom_vline(xintercept= 90, linetype="dashed")+
  geom_hline(yintercept = 5, linetype="dashed")+
  guides(fill = guide_legend(order=1,override.aes = list(size=5)))+
  theme_classic()+
  labs(size="Assembly Length (Mbp)", fill="Region")+
  ylab("% Contamination")+
  theme(axis.title.x=element_blank())

All_Region

All_Pheno<-OMZ_CC +
  geom_point(na.rm = TRUE, aes(fill=OMZ_Phenotype, size=Assembly_Length_MBP), shape =21, colour="black") +
  guides(fill = guide_legend(order=1,override.aes = list(size=5)))+
  scale_fill_manual(values = c("Open_Ocean_OMZ" = "#ffffb3", "Low_Oxygen_OMZ"="#fccde5", "AMZ"= "#8dd3c7", "Sulfidic_Bottom"="#bebada"))+
  geom_vline(xintercept = 50)+
  geom_vline(xintercept= 90, linetype="dashed")+
  geom_hline(yintercept = 5, linetype="dashed")+
  guides(fill = guide_legend(order=1,override.aes = list(size=5)))+
  theme_classic()+
  labs(size="Assembly Length (Mbp)", fill="OMZ Type")+
  ylab("% Contamination")+
  theme(axis.title.x=element_blank())

All_Pheno

All_Oxy<-OMZ_CC +
  geom_point(na.rm = TRUE, aes(fill=O2._.uM., size=Assembly_Length_MBP), shape =21, colour="black") +
  scale_fill_gradientn(colours =revrainbow1, name="Oxygen Concentration (uM)")+
  guides(fill = guide_legend(order=1, override.aes = list(size=5)))+
  geom_vline(xintercept = 50)+
  geom_vline(xintercept= 90, linetype="dashed")+
  geom_hline(yintercept = 5, linetype="dashed")+
  theme_classic()+
  labs(size="Assembly Length (Mbp)", fill="Environmental O2 Level")+
  ylab("% Contamination")+
  theme(axis.title.x=element_blank())

All_Oxy

All_MDA_CC<-OMZ_CC +
  geom_point(na.rm = TRUE, aes(fill=WGA_Approach, size=Assembly_Length_MBP), shape =21, colour="black") +
  scale_fill_manual(values = c("MDA" = "#f4a582", "WGA-X"="#d1e5f0"), labels=c("L-MDA", "WGA-X"))+
  guides(fill = guide_legend(order=1, override.aes = list(size=5)))+
  geom_vline(xintercept = 50)+
  geom_vline(xintercept= 90, linetype="dashed")+
  geom_hline(yintercept = 5, linetype="dashed")+
  theme_classic()+
  labs(size="Assembly Length (Mbp)",fill="Amplification Method")+
  ylab("% Contamination")+
  xlab("% Completeness")

All_MDA_CC

All_Depth_CC<-OMZ_CC +
  geom_point(na.rm = TRUE, aes(fill=Depth, size=Assembly_Length_MBP), shape =21, colour="black") +
  scale_fill_continuous(low="navy", high="lightblue", trans = 'reverse')+
  geom_vline(xintercept = 50)+
  geom_vline(xintercept= 90, linetype="dashed")+
  geom_hline(yintercept = 5, linetype="dashed")+
  theme_classic()+
  guides(colour = guide_legend(reverse=T))+
  labs(size="Assembly Length (Mbp)",fill="Depth (m)")+
  ylab("% Contamination")+
  theme(axis.title.x=element_blank())


All_Depth_CC

arr1<-ggarrange (All_Region, All_Pheno,All_Depth_CC, All_Oxy,All_MDA_CC, All_Tax, nrow = 3)

arr1

ggsave("Outputs/FIG-4_All_CCs.pdf",arr1,
       width=15, height = 14, units = "in")

##Figure S4 CheckM Estimate Box Plots

#Split Between MDA/WGA-X and make box plots for Completeness, Conamination, and Assembly Length

MDA_Seq<-Seq_OMZ %>% filter (WGA_Approach=="MDA")
WGA_Seq<-Seq_OMZ %>% filter (WGA_Approach=="WGA-X")



p1_Comp_MDA<-ggplot(MDA_Seq, aes(x=Plot_Taxa, y=Completeness)) + geom_boxplot(na.rm = TRUE, fill="#f4a582") +
  #theme(axis.text.x = element_text(angle=45, hjust = 1))+
  xlab("L-MDA")+
  coord_flip()+ scale_x_discrete(limits = rev(Tax_list_labs))+
  theme_classic()+
  theme(axis.title.x=element_blank())

p2_Contam_MDA<-ggplot(MDA_Seq, aes(x=Plot_Taxa, y=Contamination)) + geom_boxplot(na.rm = TRUE, fill="#f4a582")+
  # theme(axis.text.x = element_text(angle=45, hjust = 1)) +  
  #ylab("% Contamination")+
  scale_y_continuous(limits=c(0,8), breaks = seq(0, 8, by = 2))+
  coord_flip()+ scale_x_discrete(limits = rev(Tax_list_labs))+
  theme_classic()+
  theme(axis.title.x=element_blank())


p3_Len_MDA<-ggplot(MDA_Seq, aes(x=Plot_Taxa, y=Assembly_Length_MBP)) + 
  geom_boxplot(na.rm = TRUE, fill="#f4a582")+
  #  theme(axis.text.x = element_text(angle=45, hjust = 1)) +  
  ylab("Assembly Length (MBP)")+
  scale_y_continuous(limits=c(0,4), breaks = seq(0, 4, by = 2))+
  coord_flip()+ scale_x_discrete(limits = rev(Tax_list_labs))+
  theme_classic()+
  theme(axis.title.x=element_blank())


p4_Avg_Contig_MDA<-ggplot(MDA_Seq, aes(x=Plot_Taxa, y=Mean_contig_length_bp/1000))+
  geom_boxplot(na.rm = TRUE,  fill="#f4a582")+
  #  theme(axis.text.x = element_text(angle=45, hjust = 1)) +  
  ylab("Mean Contig Length (KBP)")+
  scale_y_continuous(limits=c(0,150), breaks = seq(0, 150, by = 50))+
  coord_flip()+ scale_x_discrete(limits = rev(Tax_list_labs))+
  theme_classic()+
  theme(axis.title.x=element_blank())


p1_Comp_WGA<-ggplot(WGA_Seq, aes(x=Plot_Taxa, y=Completeness)) + geom_boxplot(na.rm = TRUE, fill="#d1e5f0") +
  #theme(axis.text.x = element_text(angle=45, hjust = 1))+
  xlab("WGA-X")+
  ylab("% Completeness")+
  coord_flip()+ scale_x_discrete(limits = rev(Tax_list_labs))+
  theme_classic()


p2_Contam_WGA<-ggplot(WGA_Seq, aes(x=Plot_Taxa, y=Contamination)) + geom_boxplot(na.rm = TRUE, fill="#d1e5f0")+
  # theme(axis.text.x = element_text(angle=45, hjust = 1)) +  
  ylab("% Contamination")+
  scale_y_continuous(limits=c(0,8), breaks = seq(0, 8, by = 2))+
  coord_flip()+ scale_x_discrete(limits = rev(Tax_list_labs))+
  theme_classic()


p3_Len_WGA<-ggplot(WGA_Seq, aes(x=Plot_Taxa, y=Assembly_Length_MBP)) + 
  geom_boxplot(na.rm = TRUE,  fill="#d1e5f0")+
  #  theme(axis.text.x = element_text(angle=45, hjust = 1)) +  
  ylab("Assembly Length (MBP)")+
  scale_y_continuous(limits=c(0,4), breaks = seq(0, 4, by = 2))+
  coord_flip()+ scale_x_discrete(limits = rev(Tax_list_labs))+
  theme_classic()


p4_Avg_Contig_WGA<-ggplot(WGA_Seq, aes(x=Plot_Taxa, y=Mean_contig_length_bp/1000))+
  geom_boxplot(na.rm = TRUE,  fill="#d1e5f0")+
  #  theme(axis.text.x = element_text(angle=45, hjust = 1)) +  
  ylab("Mean Contig Length (KBP)")+
  scale_y_continuous(limits=c(0,150), breaks = seq(0, 150, by = 50))+
  coord_flip()+ scale_x_discrete(limits = rev(Tax_list_labs))+
  theme_classic()


arr1<-ggarrange(p1_Comp_MDA + theme(panel.background = element_blank()), 
          p2_Contam_MDA + 
            theme(axis.text.y = element_blank(),
                  axis.ticks.y = element_blank(),
                  axis.title.y = element_blank() ),
          p3_Len_MDA + 
            theme(axis.text.y = element_blank(),
                  axis.ticks.y = element_blank(),
                  axis.title.y = element_blank() ), 
          p4_Avg_Contig_MDA +
            theme(axis.text.y = element_blank(),
                  axis.ticks.y = element_blank(),
                  axis.title.y = element_blank() ), 

          
          p1_Comp_WGA + theme(panel.background = element_blank()),
          
          p2_Contam_WGA + 
            theme(axis.text.y = element_blank(),
                  axis.ticks.y = element_blank(),
                  axis.title.y = element_blank() ), 
          
          p3_Len_WGA + 
            theme(axis.text.y = element_blank(),
                  axis.ticks.y = element_blank(),
                  axis.title.y = element_blank() ),
          p4_Avg_Contig_WGA +
            theme(axis.text.y = element_blank(),
                  axis.ticks.y = element_blank(),
                  axis.title.y = element_blank() ), 
          
          nrow=2)

ggsave("Outputs/FIG-S4_MDA_Boxplots.pdf",
       arr1,width=14, height = 8.5, units = "in")

##Figure S5

####Breakdown CC plots
OMZ_Region_CC<-OMZ_CC+facet_wrap(.~Region) + 
  geom_point(na.rm = TRUE, aes(fill=Plot_Taxa, size=Assembly_Length_MBP), shape =21, colour="black") +
  guides(fill = guide_legend(order=1,override.aes = list(size=5)))+
  labs(size="Assembly Length (MBP)", fill="Taxonomy")+
  theme_classic()+
  ylab("% Contamination")+
  xlab("% Completeness")+
  geom_vline(xintercept= 90, linetype="dashed")+
  geom_vline(xintercept = 50)
  
OMZ_Region_CC

ggsave("Outputs/FIG-S5_Region_Tax_CC.pdf", OMZ_Region_CC, width=14, height = 8.5, units = "in")

##Figure S6

OMZ_Taxa_CC<-OMZ_CC+facet_wrap(.~Plot_Taxa) + 
  geom_point(na.rm = TRUE, aes(fill=OMZ_Phenotype, size=Assembly_Length_MBP), shape =21, colour="black") +
  scale_fill_manual(values = c("Sulfidic_Bottom"="#bebada","AMZ"= "#8dd3c7", "Low_Oxygen_OMZ"="#fccde5", "Open_Ocean_OMZ" = "#ffffb3"))+
  guides(fill = guide_legend(order=1,override.aes = list(size=5)))+
  labs(size="Assembly Length (MBP)", fill="OMZ Type")+
  theme_classic()+
  ylab("% Contamination")+
  xlab("% Completeness")+
  geom_vline(xintercept= 90, linetype="dashed")+
  geom_vline(xintercept = 50)

OMZ_Taxa_CC

ggsave("Outputs/FIG-S6_Tax_Pheno_CC.pdf", OMZ_Taxa_CC, width=14, height = 8.5, units = "in")

##Figure S7

OMZ_Taxa_Oxy_CC<-OMZ_CC+facet_wrap(.~Plot_Taxa) + 
  geom_point(na.rm = TRUE, aes(fill=O2._.uM., size=Assembly_Length_MBP), shape =21, colour="black") +
  scale_fill_gradientn(colours =revrainbow1, name="Oxygen Concentration (uM)")+
  guides(fill = guide_legend(order=1, override.aes = list(size=5)))+
  labs(size="Assembly Length (MBP)", fill="Environmental O2 Level")+
  theme_classic()+
  ylab("% Contamination")+
  xlab("% Completeness")+
  geom_vline(xintercept= 90, linetype="dashed")+
  geom_vline(xintercept = 50)

OMZ_Taxa_Oxy_CC

ggsave("Outputs/FIG-S7_Tax_Oxy_CC.pdf", OMZ_Taxa_Oxy_CC, width=14, height = 8.5, units = "in")

##Figure S8

OMZ_Taxa_Region_CC<-OMZ_CC+facet_wrap(.~Plot_Taxa) + 
  geom_point(na.rm = TRUE, aes(fill=Region, size=Assembly_Length_MBP), shape =21, colour="black") +
  guides(fill = guide_legend(order=1, override.aes = list(size=5)))+
  labs(size="Assembly Length (MBP)", fill="Region")+
  theme_classic()+
  ylab("% Contamination")+
  xlab("% Completeness")+
  geom_vline(xintercept= 90, linetype="dashed")+
  geom_vline(xintercept = 50)

OMZ_Taxa_Region_CC

ggsave("Outputs/FIG-S8_Tax_Region_CC.pdf", OMZ_Taxa_Region_CC, width=14, height = 8.5, units = "in")

###Tables

OMZ_Trim <- unique(OMZ %>% select(Region, Depth, Month, Year, Site_ID, Lat, Long))

for (i in 1: dim (OMZ_Trim)[1]){
  OMZ_Trim[i,8]<-tally(OMZ %>% filter(Site_ID==as.character(OMZ_Trim$Site_ID[i])))
  OMZ_Trim[i,9]<-tally(OMZ %>% filter(Site_ID==as.character(OMZ_Trim$Site_ID[i]), Status=="Sequenced"))
}

colnames(OMZ_Trim)[8]<-"Total Number of SAGs"
colnames(OMZ_Trim)[9]<-"Total Number of Sequenced SAGs"

#Total Number of SAGs in the Catalog
OMZ_Counts<-unique(OMZ%>%select(Primary_Tax_Method))

for (i in 1: dim (OMZ_Counts)[1]){
  OMZ_Counts[i,2]<-tally(OMZ %>% filter(Primary_Tax_Method==as.character(OMZ_Counts$Primary_Tax_Method[i])))
  OMZ_Counts[i,3]<-tally(OMZ %>% filter(Primary_Tax_Method==as.character(OMZ_Counts$Primary_Tax_Method[i]), Status=="Sequenced"))
}
colnames(OMZ_Counts)<-c("Primary Taxonomic Assignment Method", "Total Number of SAGs", "Total Number of Sequenced SAGs")


OMZ_Amps<-unique(OMZ%>%select(WGA_Approach))

for (i in 1: dim (OMZ_Amps)[1]){
  OMZ_Amps[i,2]<-tally(OMZ %>% filter(WGA_Approach==as.character(OMZ_Amps$WGA_Approach[i])))
  OMZ_Amps[i,3]<-tally(OMZ %>% filter(WGA_Approach==as.character(OMZ_Amps$WGA_Approach[i]), Status=="Sequenced"))
}

colnames(OMZ_Amps)<-c("Whole Genome Amplification Method", "Total Number of SAGs", "Total Number of Sequenced SAGs")

#Counts of Classified SAGs
OMZ_Class<-OMZ %>% select(GTDB_Tax, SILVA_138_1_Tax, Completeness, Contamination, WGA_Approach)
OMZ_Class$GTDB_Tax<-OMZ_Class$GTDB_Tax %>% replace_na("Unclassified")

All_GTDB<-as.numeric(tally(OMZ_Class %>% filter(GTDB_Tax!="Unclassified")))
All_SILVA<-as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified")))

SILVA_And_GTDB<-as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified", GTDB_Tax!="Unclassified")))
No_SILVA_And_GTDB<-as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax=="Unclassified", GTDB_Tax=="Unclassified")))

Only_Silva<-as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified", GTDB_Tax=="Unclassified")))
Only_GTDB<-as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax=="Unclassified", GTDB_Tax!="Unclassified")))




OMZ_Class_Out<-as.data.frame(c(All_GTDB, All_SILVA,SILVA_And_GTDB,Only_GTDB,  Only_Silva, No_SILVA_And_GTDB))

rownames(OMZ_Class_Out)<-c("SAGs With GTDB Classifications",
                           "SAGs With SILVA Classifications",
                           "SAGs With Both Classifications",
                           "SAGs With Only GTDB Classifications",
                           "SAGs With Only SILVA Classifications",
                           "Unclassified SAGs")
#Max Completeness
OMZ_Class_Out[1,2] <- max(OMZ_Class %>% filter(GTDB_Tax!="Unclassified") %>% select(Completeness), na.rm = TRUE)
OMZ_Class_Out[2,2] <- max(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified") %>% select(Completeness), na.rm = TRUE)
OMZ_Class_Out[3,2] <- max(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified", GTDB_Tax!="Unclassified") %>% 
                            select(Completeness), na.rm = TRUE)
OMZ_Class_Out[4,2] <- max(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified", GTDB_Tax=="Unclassified") %>% 
                            select(Completeness), na.rm = TRUE)
OMZ_Class_Out[5,2] <- max(OMZ_Class %>% filter(SILVA_138_1_Tax=="Unclassified", GTDB_Tax!="Unclassified") 
                          %>% select(Completeness), na.rm = TRUE)
OMZ_Class_Out[6,2] <- max(OMZ_Class %>% filter(SILVA_138_1_Tax=="Unclassified", GTDB_Tax=="Unclassified") %>% 
                            select(Completeness), na.rm = TRUE)
#Min Completeness
OMZ_Class_Out[1,3] <- min((OMZ_Class %>% filter(GTDB_Tax!="Unclassified") %>% select(Completeness)), na.rm = TRUE)
OMZ_Class_Out[2,3] <- min(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified") %>% select(Completeness), na.rm = TRUE)
OMZ_Class_Out[3,3] <- min(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified", GTDB_Tax!="Unclassified") %>% 
                            select(Completeness), na.rm = TRUE)
OMZ_Class_Out[4,3] <- min(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified", GTDB_Tax=="Unclassified") %>% 
                            select(Completeness), na.rm = TRUE)
OMZ_Class_Out[5,3] <- min(OMZ_Class %>% filter(SILVA_138_1_Tax=="Unclassified", GTDB_Tax!="Unclassified") 
                          %>% select(Completeness), na.rm = TRUE)
OMZ_Class_Out[6,3] <- min(OMZ_Class %>% filter(SILVA_138_1_Tax=="Unclassified", GTDB_Tax=="Unclassified") %>% 
                            select(Completeness), na.rm = TRUE)

#Number of SAGs that are Med+ quality
OMZ_Class_Out[1,4] <- as.numeric(tally((OMZ_Class %>% filter(GTDB_Tax!="Unclassified") %>% select(Completeness, Contamination) %>% 
                                          filter(Completeness >=50, Contamination <10))))

OMZ_Class_Out[2,4] <- as.numeric(tally((OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified") %>% select(Completeness) %>%
                                          filter(Completeness >=50))))

OMZ_Class_Out[3,4] <- as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified", GTDB_Tax!="Unclassified") %>% 
                                         select(Completeness, Contamination) %>% filter (Completeness >=50, Contamination <10)))

OMZ_Class_Out[4,4] <- as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified", GTDB_Tax=="Unclassified") %>% 
                                         select(Completeness, Contamination) %>% filter(Completeness>=50, Contamination <10)))

OMZ_Class_Out[5,4] <- as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax=="Unclassified", GTDB_Tax!="Unclassified") 
                                       %>% select(Completeness, Contamination) %>% filter(Completeness >=50, Contamination <10)))

OMZ_Class_Out[6,4] <- as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax=="Unclassified", GTDB_Tax=="Unclassified") 
                                       %>% select(Completeness, Contamination) %>% filter(Completeness >=50, Contamination <10)))

#Number of SAGs that are <5% Contamination
OMZ_Class_Out[1,5] <- as.numeric(tally((OMZ_Class %>% filter(GTDB_Tax!="Unclassified") %>% select(Completeness) %>% 
                                          filter(Completeness >=50))))

OMZ_Class_Out[2,5] <- as.numeric(tally((OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified") %>% select(Contamination) %>%
                                          filter(Contamination <5))))

OMZ_Class_Out[3,5] <- as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified", GTDB_Tax!="Unclassified") %>% 
                                         select(Contamination) %>% filter (Contamination <5)))

OMZ_Class_Out[4,5] <- as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified", GTDB_Tax=="Unclassified") %>% 
                                         select(Contamination) %>% filter (Contamination <5)))

OMZ_Class_Out[5,5] <- as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax=="Unclassified", GTDB_Tax!="Unclassified") %>%
                                       select(Contamination) %>% filter (Contamination <5)))

OMZ_Class_Out[6,5] <- as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax=="Unclassified", GTDB_Tax=="Unclassified") %>%
                                         select(Contamination) %>% filter (Contamination <5)))

#Number of SAGs that are >5 and <10% contamination
OMZ_Class_Out[1,6] <- as.numeric(tally(OMZ_Class %>% filter(GTDB_Tax!="Unclassified") %>% select(Contamination) %>% 
                                          filter(Contamination >=5, Contamination <10)))

OMZ_Class_Out[2,6] <- as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified") %>% select(Contamination) %>%
                                          filter(Contamination >=5, Contamination <10)))

OMZ_Class_Out[3,6] <- as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified", GTDB_Tax!="Unclassified") %>% 
                                         select(Contamination) %>% filter(Contamination >=5, Contamination <10)))

OMZ_Class_Out[4,6] <- as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified", GTDB_Tax=="Unclassified") %>% 
                                         select(Contamination) %>% filter(Contamination >=5, Contamination <10)))

OMZ_Class_Out[5,6] <- as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax=="Unclassified", GTDB_Tax!="Unclassified") 
                                       %>%  select(Contamination) %>% filter(Contamination >=5, Contamination <10)))

OMZ_Class_Out[6,6] <- as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax=="Unclassified", GTDB_Tax=="Unclassified") 
                                       %>%  select(Contamination) %>% filter(Contamination >=5, Contamination <10)))



colnames(OMZ_Class_Out)<-c("Count", "Max Completeness", "Min Completeness", "Number of SAGs with Completeness >=50%",
                           "Number of SAGs with <5% Contamination", "Number of SAGs >=5% and <=10% Contamination")



SSU_avail_MDA<-as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified", WGA_Approach=="MDA")))
SSU_avail_WGA<-as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax!="Unclassified", WGA_Approach=="WGA-X")))


small_SSU<-as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax=="Unclassified", WGA_Approach=="MDA")))
no_SSU_WGA<-as.numeric(tally(OMZ_Class %>% filter(SILVA_138_1_Tax=="Unclassified", WGA_Approach=="WGA-X")))


OMZ_Amps[1,4]<-SSU_avail_MDA
OMZ_Amps[2,4]<-SSU_avail_WGA
colnames(OMZ_Amps)[4]<-"Total Number of SAGs With Recoverable SSU rRNA Sequences"

OMZ_Amps[1,5]<-small_SSU
OMZ_Amps[2,5]<-no_SSU_WGA

colnames(OMZ_Amps)[5]<-"Total Number of SAGs Without Assignable SSU rRNA Sequences"

Amplicon_MDA<-as.numeric(tally(OMZ %>% filter(Primary_Tax_Method=="16S", WGA_Approach=="MDA")))
Amplicon_WGA<-as.numeric(tally(OMZ %>% filter(Primary_Tax_Method=="16S", WGA_Approach=="WGA-X")))

OMZ_Amps[1,6]<-Amplicon_MDA
OMZ_Amps[2,6]<-Amplicon_WGA
colnames(OMZ_Amps)[6]<-"Total Number of Recovered SSU Amplicon rRNA Sequences"


OMZ_Amps[1,7]<-as.numeric(tally(OMZ%>% filter(Primary_Tax_Method=="WGS", WGA_Approach=="MDA", Status=="Sequenced")))
OMZ_Amps[2,7]<-as.numeric(tally(OMZ%>% filter(Primary_Tax_Method=="WGS", WGA_Approach=="WGA-X", Status=="Sequenced")))
colnames(OMZ_Amps)[7]<-"Number of Sequenced SAGs Without Pre-screening"

OMZ_Amps[1,8]<-as.numeric(tally(OMZ%>% filter(Primary_Tax_Method=="16S", WGA_Approach=="MDA", Status=="Sequenced")))
OMZ_Amps[2,8]<-as.numeric(tally(OMZ%>% filter(Primary_Tax_Method=="16S", WGA_Approach=="WGA-X", Status=="Sequenced")))
colnames(OMZ_Amps)[8]<-"Number of Sequenced SAGs With Pre-screening"

OMZ_Amps[1,9]<-as.numeric(tally(OMZ%>% filter(Primary_Tax_Method=="WGS",WGA_Approach=="MDA",
                                               Status=="Sequenced", SILVA_138_1_Tax=="Unclassified")))

OMZ_Amps[2,9]<-as.numeric(tally(OMZ%>% filter(Primary_Tax_Method=="WGS", WGA_Approach=="WGA-X",
                                              Status=="Sequenced", SILVA_138_1_Tax=="Unclassified")))
colnames(OMZ_Amps)[9]<-"Number of Unclassified WGS Recovered SSU"


OMZ_Amps[1,10]<-as.numeric(tally(OMZ%>% filter(Primary_Tax_Method=="16S", WGA_Approach=="MDA", 
                                                SILVA_138_1_Tax=="Unclassified")))

OMZ_Amps[2,10]<-as.numeric(tally(OMZ%>% filter(Primary_Tax_Method=="16S", WGA_Approach=="WGA-X",
                                                SILVA_138_1_Tax=="Unclassified")))
colnames(OMZ_Amps)[10]<-"Number of Unclassifed SSU Amplicons"


OMZ_Amps[1,11]<-as.numeric(tally(OMZ%>% filter(Status=="Sequenced", Primary_Tax_Method=="16S", WGA_Approach=="MDA", 
                                                Completeness>=50, Contamination <10)))

OMZ_Amps[2,11]<-as.numeric(tally(OMZ%>% filter(Status =="Sequenced", Primary_Tax_Method=="16S", WGA_Approach=="WGA-X",
                                                 Completeness>=50, Contamination <10)))
colnames(OMZ_Amps)[11]<-"Number of Medium QUality and Above SAGs with SSU Amplicons"

OMZ_Amps[1,12]<-as.numeric(tally(OMZ%>% filter(Status=="Sequenced", Primary_Tax_Method=="WGS", WGA_Approach=="MDA", 
                                                Completeness>=50, Contamination <10)))

OMZ_Amps[2,12]<-as.numeric(tally(OMZ%>% filter(Status =="Sequenced", Primary_Tax_Method=="WGS", WGA_Approach=="WGA-X",
                                                 Completeness>=50, Contamination <10)))
colnames(OMZ_Amps)[12]<-"Number of  Medium QUality and Above SAGs with WGS SSU"


OMZ_Amps[1,13]<-as.numeric(tally(OMZ%>% filter(Status=="Sequenced", Primary_Tax_Method=="16S", WGA_Approach=="MDA", 
                                                Completeness>90, Contamination <5, 
                                                WGS_5S!=0, WGS_16S!=0, WGS_23S!=0, WGS_tRNA>=18)))

OMZ_Amps[2,13]<-as.numeric(tally(OMZ%>% filter(Status =="Sequenced", Primary_Tax_Method=="16S", WGA_Approach=="WGA-X",
                                                 Completeness>90, Contamination <5,
                                                 WGS_5S!=0, WGS_16S!=0, WGS_23S!=0, WGS_tRNA>=18)))

colnames(OMZ_Amps)[13]<-"Number of High SAGs with SSU Amplicons"

OMZ_Amps[1,14]<-as.numeric(tally(OMZ%>% filter(Status=="Sequenced", Primary_Tax_Method=="WGS", WGA_Approach=="MDA", 
                                                Completeness>90, Contamination <5,
                                                WGS_5S!=0, WGS_16S!=0, WGS_23S!=0, WGS_tRNA>=18)))

OMZ_Amps[2,14]<-as.numeric(tally(OMZ%>% filter(Status =="Sequenced", Primary_Tax_Method=="WGS", WGA_Approach=="WGA-X",
                                                Completeness>90, Contamination <5,
                                                WGS_5S!=0, WGS_16S!=0, WGS_23S!=0, WGS_tRNA>=18)))
colnames(OMZ_Amps)[14]<-"Number of High QUality SAGs with  with WGS SSU"



Sort_Counts<-c(tally(OMZ %>% filter (Sorting_Method=="DNA")),
               tally(OMZ %>% filter (Sorting_Method=="Pre-sort CHL")),
               tally(OMZ %>% filter (Sorting_Method=="CHL")),
               tally(OMZ %>% filter (Sorting_Method=="PHYCO")))

names(Sort_Counts)<-c("DNA", "Pre-sort CHL", "CHL", "PHYCO")

write_csv(OMZ_Trim, file  = "Outputs/Table_1_Summary_Table_Sites_Mar_21_2023.csv")
write_csv(OMZ_Counts, file= "Outputs/Table_For_Fig2_Summary_Table_Primary_Tax_Mar_21_2023.csv")
write_csv(OMZ_Amps, file= "Outputs/Table_S2_Summary_Table_WGA_Approach_Mar_21_2023.csv")
write.csv(OMZ_Class_Out, file = "Outputs/Table_S3_QA_QC_Summary_Mar_21_2023.csv")
write.csv(Sort_Counts, file = "Outputs/Table_For_Fig2_Sorting_Counts_Mar_21_2023.csv")

OMZ_Quality_Score<-OMZ %>% filter (Status=="Sequenced") %>% 
  select(Sample_ID, Completeness, Contamination, WGS_5S, WGS_16S, WGS_23S, WGS_tRNA) %>% 
  mutate(Quality_Classification= ifelse(Completeness >90 & Contamination <5 & WGS_5S !=0 & WGS_16S!=0 & WGS_23S!=0 & WGS_tRNA>=18, "High",
           ifelse(Completeness>=50 & Contamination <10, "Medium", "Low")))

write_csv(OMZ_Quality_Score, file = "Outputs/QUality_Scores_For_Table_S1_Mar_21_2023.csv")