-
Notifications
You must be signed in to change notification settings - Fork 0
/
Bulk_RNAseq_DESeq2_V1.rmd
323 lines (210 loc) · 8.44 KB
/
Bulk_RNAseq_DESeq2_V1.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
---
title: "Generalized script for DESeq2 analysis of gene counts matrix from nf-core/rnaseq pipeline"
subtitle: "Following instructions from official tutorial for DESeq2: [Analyzing RNA-seq data with DESeq2](https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html)"
author: Katherine Beigel
date: "`r Sys.Date()`"
---
```{r Setup, message=FALSE, results = 'hide'}
# Utility
library(tidyverse)
library(readr)
library(Matrix)
# Analysis
library(WGCNA)
library(DESeq2)
library(vsn)
library(AnnotationDbi)
# Plotting
library(pheatmap)
library(RColorBrewer)
library(ggplot2)
library(ggrepel)
```
# Project set up
```{r Directory paths}
version = "V1"
proj_name = "My_Project"
dir_proj = "/project/directory/"
dir_results = "/project/directory/subdir_for_results/"
infile_counts = "/path/to/input/counts_file.tsv" # path to the count matrix from nf-core/rnaseq
infile_metadata = "/path/to/input/metadata_file.tsv" # path to metadata file with information about the samples
```
# Load count matrix and metadata
``` {r Load data}
# RSEM count file from nf-core/rnaseq: genes (rows) x samples (col), counts rounded to integers
countdata_raw = read_tsv(infile_counts)
countdata = countdata_raw %>%
dplyr::select(-`transcript_id(s)`) %>% # drop column of transcript IDs
column_to_rownames('gene_id') %>% # make the gene_id col the rownames
round() # round the counts to integers
```
# Filter out any genes that have too low counts
```{r Pre-filtering}
# Filter samples and genes with too many missing entries (WGCNA)
to_keep = goodSamplesGenes(t(countdata)) # transpose, need rows of samples x columns of genes
# If any samples need to be removed (probably not):
# countdata_filt = countdata[,to_keep$goodSamples]
# To remove genes with too many missing entries
countdata_filt = countdata[to_keep$goodGenes,]
# Filter cm for genes (rows) that have greater than 50 counts across all samples
# 50 is arbitrary-ish, if this is too high/low, this can be changed
countmat = countdata_filt[which(rowSums(countdata_filt) > 50), ]
```
# Save the filtered count matrix
``` {r Write count CSV (not required)}
write.csv(countmat,
file = paste0(dir_results, proj_name, "_", "FilteredCountMatrix", "_", version, ".csv"))
```
# Prepare metadata
```{r Metadata prep}
# Make metadata table or read in table of experiment design/metadata (tsv)
# Col 1: sample IDs that match the Ids in the count matrix
# Other columns: metadata categories and information
# OPTION 1
# Read in metadata file
metadata_raw = read.table(infile_metadata, header = TRUE, sep = "\t")
# OPTION 2:
# Metadata table should look something like this:
# metadata_raw = data.frame(sample = c("sampleA", "sampleB", "sampleC", "sampleD"),
# cell_type = c("celltype1", "celltype2", "celltype1", "celltype2"),
# treatment = c("control", "treated", "treated", "control"))
# Arrange rows in metadata so the sample order matches the countmat column order
metadata = metadata_raw %>%
arrange(factor(sample, levels = colnames(countmat)))
# Integers, characters, etc need to be converted to factors in the metadata
metadata[sapply(metadata, is.character)] = lapply(metadata[sapply(metadata, is.character)], as.factor)
# Relevel the sample data
# Here I have a column called "treatment" which specifies treated vs control samples
# Setting the control as the reference level (so we can compare treatment to control)
metadata$treatment = relevel(metadata$treatment,
ref = "control")
```
# Prepare DESeq2 Analysis
```{r DESeq2}
# Following DESeq2 instructions
# https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
# Specify the design formula
# Set this to whatever condition you want to compare in the metadata, e.g. `~ treatment` or `cell_type + treatment`
design_formula = ~ cell_type + treatment
# Make DESeq data set from the filtered count matrix
dds = DESeqDataSetFromMatrix(countData = countmat,
colData = metadata,
design = design_formula)
# DESeq2 variance stabilizing transformation for plotting heatmaps
vsd = vst(dds, blind = FALSE)
ntd = normTransform(dds)
# rld = rlog(dds, blind = FALSE) #rld() can be used if vst() doesn't look good
# Plot to see how they look
meanSdPlot(assay(vsd))
meanSdPlot(assay(ntd))
# Estimate size factors to account for sequencing depth
dds = estimateSizeFactors(dds)
# Run differential expression pipeline on the raw counts
dds = DESeq(dds)
# Produce results table by specifying the contrast of interest
# Change the contrast to whatever you want to compare
res = results(dds, contrast = c("treatment", "treated", "control")) # e.g. contrast = c("condition","treated","untreated"))
# Order based on adjusted pvalue (pdj)
res_df = as.data.frame(res) %>%
arrange(padj)
```
# Get symbols for the features
```{r Get symbols for the features}
# Get symbols instead of ENSEMBL IDs
require(org.Mm.eg.db)
id_conversions = (
AnnotationDbi::mapIds(
org.Mm.eg.db,
keys = rownames(res_df),
column = c('SYMBOL'),
keytype = 'ENSEMBL')
)
for (row in 1:nrow(res_df)){
if (!is.na(id_conversions[row])){
res_df[row, 'symbol'] = as.vector(unlist(id_conversions[row], use.names = FALSE))
} else if (is.na(id_conversions[row])){
res_df[row, 'symbol'] = rownames(res_df)[row]
}
}
# write to file (.csv)
write.csv(res_df,
file = paste0(dir_results, proj_name, "_", "DESeq2Results", "_", version, ".csv"))
```
# Heatmap plotting of samples
## Heatmap plot sample distances function
```{r Heatmap plot smaple distances function}
plot_heatmap_sampledist = function(deseq_transform_obj, col_anno_1, col_anno_2){
sampleDists <- dist(t(assay(deseq_transform_obj)))
df = as.data.frame(colData(dds)[, c(col_anno_1, col_anno_2)])
sampleDistMatrix <- as.matrix(sampleDists)
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors,
annotation_col = df,
cellwidth = 50,
cellheight = 50)
}
```
## Heatmap plot sample distances
```{r Heatmap plot sample distances}
plot_heatmap_sampledist(vsd, "treatment", "cell_type")
```
# PCA plotting of samples
## PCA plot function
```{r PCA function}
# PCA plotting function
plot_pca = function(deseq_transform_obj, variable_pca_color, variable_pca_shape){
# Get PCA data
pcaData = plotPCA(deseq_transform_obj,
intgroup = c(variable_pca_color, variable_pca_shape),
returnData = TRUE)
# Get percent variance
percentVar = round(100 * attr(pcaData, "percentVar"))
# Plot PCA
ggplot(pcaData,
aes(x = PC1,
y = PC2,
color = !!sym(variable_pca_color),
shape = !!sym(variable_pca_shape),
label = rownames(pcaData))) +
geom_point(size = 3) +
geom_text_repel() +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
theme(aspect.ratio = 1)
}
```
## PCA plot by treatment and cell_type
``` {r PCA plot by treatment}
plot_pca(vsd, "treatment", "cell_type")
```
# Heatmp plotting
## Heatmp plot function
```{r Heatmap of genes function}
plot_heatmap_genevst = function(dds_obj, deseq_transform_obj, list_of_genes, col_anno_1, col_anno_2){
genes_oi = res_df %>%
filter(symbol %in% list_of_genes | rownames(.) %in% list_of_genes)
gene_sym_key = genes_oi %>%
dplyr::select(symbol)
df = as.data.frame(colData(dds)[, c(col_anno_1, col_anno_2)])
pheatmap(as.data.frame(assay(vsd)) %>%
filter(rownames(.) %in% rownames(gene_sym_key)) %>%
`rownames<-`(gene_sym_key$symbol),
cluster_rows = FALSE,
show_rownames = TRUE,
cluster_cols = TRUE,
annotation_col = df,
cellwidth = 25,
cellheight = 25)
}
```
## Heatmap of a list of genes of interest
``` {r Heatmap plot of genes of interest}
list_of_genes = c("Vipr1", "ENSMUSG00000011171", "Vipr2", "Foxp3", "Rorc") # put your gene symbols or ENSEMBL IDs here
# This will plot the vsd values for the specified list of genes
# If your gene does not appear on the heatmap, try using the ENSMUSG ID
# If neither symbol nor ENSMUSG ID work, the gene is not in the results
plot_heatmap_genevst(dds, vsd, list_of_genes, "treatment", "cell_type")
```