-
Notifications
You must be signed in to change notification settings - Fork 0
/
scRNA-seq.R
190 lines (135 loc) · 5.42 KB
/
scRNA-seq.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
library(Seurat)
library(SeuratDisk)
library(ggplot2)
library(tidyverse)
library(gridExtra)
setwd("C:/Users/dimit/Desktop/scRNA_seq/")
directories <- list.dirs(path = 'data/', recursive = F, full.names = F)
# Create a for loop and in each iteration
# it should go in each folder and fetch
# the files to create counts matrix
# and then using the count matrix
# we create a Seurat object and assign
# a variable that has both the patient
# information as well as the type of the tissue.
### Remember that it takes some time to loop all the folders ###
for(x in directories){
name <- gsub('_filtered_feature_bc_matrix','', x)
counts <- ReadMtx(mtx = paste0('data/',x,'/matrix.mtx.gz'),
features = paste0('data/',x,'/features.tsv.gz'),
cells = paste0('data/',x,'/barcodes.tsv.gz'))
# create seurat objects
assign(name, CreateSeuratObject(counts = counts))
}
## Merge these objects together into one object
# Why we merge them?
# Because there is a big chance to
# make mistakes by proceeding to the
# quality control immediately after
# creating the Seurat object we prefer
# to merge all the objects into one!!
# very important step
# Under in the console type ls()
# and check the names of all
# the objects that have created.
# Based on them merge the files
merge_seurat <- merge(HB17_background, y=c(HB17_PDX, HB17_tumor,
HB30_PDX, HB30_tumor,
HB53_background, HB53_tumor),
# add.cell.ids --> we use it because when we merge we want to know
# what
add.cell.ids = ls()[3:9],
project = "HB")
merge_seurat
# the result is 33538 features (genes)
# across 77936 samples(cells)
# Preprocesing the data for QC & Trimming
view([email protected])
# based on the information now
# I want to create a column that
# will tell me what patient and
# tissue is the cell barcode origin
# from.
# Create a sample column
merge_seurat$sample <- rownames(
)
view([email protected])
# Split the column (Sample)
# that we just created.
# ??separate()
[email protected] <- separate([email protected],
col = sample,
into= c("Patient_ID","Tissue","Barcodes"),
sep = '_')
view([email protected])
# Check that everything has merged perfectly
unique([email protected]$Patient_ID)
unique([email protected]$Tissue)
## QC & Trimming
# How to filter low quality cells?
# The seurat function PercentageFeatureSet()
# enables you to calculate the mitochondrial transcript
merge_seurat$mitoPer <- PercentageFeatureSet(
merge_seurat, pattern = "^MT-"
)
view(merge_seurat$mitoPer)
## EXPLORE THE DATA##
# Filtering
# Based on the paper:
# Cells with fewer than 500
# expressed genes or 800 UMIs,
# or greater than 10%
# mitochondrial counts were removed
merge_seurat_filtered <- subset(merge_seurat, subset = nCount_RNA > 800 &
nFeature_RNA > 500 &
mitoPer < 10)
merge_seurat_filtered
# how can i find how many genes
# I had before filtering?
merge_seurat
# Now we want to figure if we have
# any batch effects
merge_seurat_filtered <- NormalizeData(
object = merge_seurat_filtered
)
merge_seurat_filtered <- FindVariableFeatures(
object = merge_seurat_filtered
)
merge_seurat_filtered <- ScaleData(
object = merge_seurat_filtered
)
merge_seurat_filtered <- RunPCA(
object = merge_seurat_filtered
)
# find the dimension of the dataset
ElbowPlot(merge_seurat_filtered)
merge_seurat_filtered <- FindNeighbors(object = merge_seurat_filtered, dims = 1:20)
merge_seurat_filtered <- FindClusters(object = merge_seurat_filtered)
merge_seurat_filtered <- RunUMAP(object = merge_seurat_filtered, dims = 1:20)
# plot
p1 <- DimPlot(merge_seurat_filtered, reduction = 'umap', group.by = 'Patient_ID')
p2 <- DimPlot(merge_seurat_filtered, reduction = 'umap', group.by = 'Tissue',
cols = c('red','green','blue'))
grid.arrange(p1, p2, ncol = 2, nrow = 2)
# perform integration to correct for batch effects ------
obj.list <- SplitObject(merge_seurat_filtered, split.by = 'Patient_ID')
for(i in 1:length(obj.list)){
obj.list[[i]] <- NormalizeData(object = obj.list[[i]])
obj.list[[i]] <- FindVariableFeatures(object = obj.list[[i]])
}
# select integration features
features <- SelectIntegrationFeatures(object.list = obj.list)
# find integration anchors (CCA)
anchors <- FindIntegrationAnchors(object.list = obj.list,
anchor.features = features)
# integrate data
seurat.integrated <- IntegrateData(anchorset = anchors)
# Scale data, run PCA and UMAP and visualize integrated data
seurat.integrated <- ScaleData(object = seurat.integrated)
seurat.integrated <- RunPCA(object = seurat.integrated)
seurat.integrated <- RunUMAP(object = seurat.integrated, dims = 1:50)
p3 <- DimPlot(seurat.integrated, reduction = 'umap', group.by = 'Patient_ID')
p4 <- DimPlot(seurat.integrated, reduction = 'umap', group.by = 'Tissue',
cols = c('red','green','blue'))
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)