-
Notifications
You must be signed in to change notification settings - Fork 0
/
ch07_DESeq2.R
109 lines (72 loc) · 2.71 KB
/
ch07_DESeq2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
## Assumes you have run the RSubread script or
# you have a feature count file made from a .bam file
# Counts should be RAW!
library(DESeq2)
library(purrr)
library(tidyverse)
library(stringr)
library(dplyr)
## File preparation-------------------------------------------------------------
dir <- getwd()
names <- c("C48_1","C96_1","C48_2","C96_2",
"S48_1","S96_1","S48_2","S96_2")
files = list.files("./data/RNAseq/", pattern = ".csv")
files # check that the order of names and files is correct or you will
# assign wrong names to samples
for(i in 1:length(files)){
x = read_csv(paste0("./data/RNAseq/",files[i]), col_names = F)
x = x[-1,-c(2,4:9)]
colnames(x) = c("ensembl",paste0(names[i]))
assign(names[i],x)
}
counts <- list(C48_1,C96_1,C48_2,C96_2,S48_1,S96_1,S48_2,S96_2) %>%
purrr::reduce(full_join, by = "ensembl") %>%
as.data.frame()
# If you get a gene version and you need to remove that info to get only
# the ENSG code
# strrep =
# sub(pattern = "\\.(.*)","",counts$ensembl)
#
# counts$ensembl = strrep
# counts = counts[!duplicated(counts$ensembl),]
rownames(counts) = counts$ensembl
counts = counts[,-1]
class(counts) = 'numeric'
head(counts)
# Create the coldata for the high level data structure
coldata <- data.frame(
celltype=c(rep("ctrl",4),rep("selected",4)),
timeline = as.factor(c(48,48,96,96,48,48,96,96)),
replicate=rep(c(1,2),4))
rownames(coldata) <- colnames(counts)
## Expression analysis----------------------------------------------------------
# Create the DESeq object
dds <- DESeqDataSetFromMatrix(countData = counts, colData = coldata,
design = ~celltype )
# Generate a linear model
dds <- DESeq(dds)
# Quality check-----------------------------------------------------------------
sizeFactors(dds)
plotDispEsts(dds)
resultsNames(dds)
# dds$celltype <- relevel(dds$celltype, ref="ctrl") # donc on lui dit quelle est la ref
# dds <- DESeq(dds)
# resultsNames(dds)
# Check the distribution of residues
as_tibble(assay(dds)) %>%
gather(sample, value = counts) %>%
ggplot(aes(x = log2(counts + 1), fill = sample)) +
geom_histogram(bins = 20) +
facet_wrap(~ sample)
# PCA preparation
rld <- rlogTransformation(dds)
plotPCA(rld,intgroup="celltype")
plotPCA(rld,intgroup="timeline")
plotPCA(rld,intgroup="replicate")
# Results can be extracted
res <- results(dds, name = "celltype_selected_vs_ctrl")
res_tbl <- as_tibble(res, rownames="ENSMUG")
# Save the tibbles etc
saveRDS(dds, file = "./data/210906_dds.rds")
saveRDS(res_tbl, file = "./data/210906_res_tbl.rds")
write_tsv(res_tbl, file = "./data/210906_res_tbl.tsv")