Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Overhaul tximport.r, output length tables #1123

Merged
merged 10 commits into from
Nov 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 118 additions & 69 deletions bin/tximport.r
Original file line number Diff line number Diff line change
@@ -1,94 +1,143 @@
#!/usr/bin/env Rscript

# Written by Lorena Pantano and released under the MIT license.
# Script for importing and processing transcript-level quantifications.
# Written by Lorena Pantano, later modified by Jonathan Manning, and released
# under the MIT license.

# Loading required libraries
library(SummarizedExperiment)
library(tximport)

args = commandArgs(trailingOnly=TRUE)
# Parsing command line arguments
args <- commandArgs(trailingOnly=TRUE)
if (length(args) < 4) {
stop("Usage: tximport.r <coldata> <path> <sample_name> <quant_type> <tx2gene_path>", call.=FALSE)
stop("Usage: tximport.r <coldata_path> <path> <prefix> <quant_type> <tx2gene_path>",
call.=FALSE)
}

coldata = args[1]
path = args[2]
sample_name = args[3]
quant_type = args[4]
tx2gene_path = args[5]

prefix = sample_name

info = file.info(tx2gene_path)
if (info$size == 0) {
tx2gene = NULL
} else {
rowdata = read.csv(tx2gene_path, sep="\t", header = FALSE)
colnames(rowdata) = c("tx", "gene_id", "gene_name")
tx2gene = rowdata[,1:2]
# Assigning command line arguments to variables
coldata_path <- args[1]
path <- args[2]
prefix <- args[3]
quant_type <- args[4]
tx2gene_path <- args[5]

## Functions

# Build a table from a SummarizedExperiment object
build_table <- function(se.obj, slot) {
cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]])
}

pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf")
fns = list.files(path, pattern = pattern, recursive = T, full.names = T)
names = basename(dirname(fns))
names(fns) = names

if (file.exists(coldata)) {
coldata = read.csv(coldata, sep="\t")
coldata = coldata[match(names, coldata[,1]),]
coldata = cbind(files = fns, coldata)
} else {
message("ColData not available: ", coldata)
coldata = data.frame(files = fns, names = names)
# Write a table to a file with given parameters
write_se_table <- function(params) {
file_name <- paste0(prefix, ".", params$suffix)
write.table(build_table(params$obj, params$slot), file_name,
sep="\t", quote=FALSE, row.names = FALSE)
}

dropInfReps = quant_type == "kallisto"
# Read transcript metadata from a given path
read_transcript_info <- function(tinfo_path){
info <- file.info(tinfo_path)
if (info$size == 0) {
stop("tx2gene file is empty")
}

transcript_info <- read.csv(tinfo_path, sep="\t", header = FALSE,
col.names = c("tx", "gene_id", "gene_name"))

txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps)
rownames(coldata) = coldata[["names"]]
extra = setdiff(rownames(txi[[1]]), as.character(rowdata[["tx"]]))
if (length(extra) > 0) {
rowdata = rbind(rowdata, data.frame(tx=extra, gene_id=extra, gene_name=extra))
extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]]))
transcript_info <- rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra))
transcript_info <- transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ]
rownames(transcript_info) <- transcript_info[["tx"]]

list(transcript = transcript_info,
gene = unique(transcript_info[,2:3]),
tx2gene = transcript_info[,1:2])
}
rowdata = rowdata[match(rownames(txi[[1]]), as.character(rowdata[["tx"]])),]
rownames(rowdata) = rowdata[["tx"]]
se = SummarizedExperiment(assays = list(counts = txi[["counts"]], abundance = txi[["abundance"]], length = txi[["length"]]),
colData = DataFrame(coldata),
rowData = rowdata)
if (!is.null(tx2gene)) {
gi = summarizeToGene(txi, tx2gene = tx2gene)
gi.ls = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM")
gi.s = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM")
growdata = unique(rowdata[,2:3])
growdata = growdata[match(rownames(gi[[1]]), growdata[["gene_id"]]),]
rownames(growdata) = growdata[["tx"]]
gse = SummarizedExperiment(assays = list(counts = gi[["counts"]], abundance = gi[["abundance"]], length = gi[["length"]]),
colData = DataFrame(coldata),
rowData = growdata)
gse.ls = SummarizedExperiment(assays = list(counts = gi.ls[["counts"]], abundance = gi.ls[["abundance"]], length = gi.ls[["length"]]),
colData = DataFrame(coldata),
rowData = growdata)
gse.s = SummarizedExperiment(assays = list(counts = gi.s[["counts"]], abundance = gi.s[["abundance"]], length = gi.s[["length"]]),
colData = DataFrame(coldata),
rowData = growdata)

# Read and process sample/column data from a given path
read_coldata <- function(coldata_path){
if (file.exists(coldata_path)) {
coldata <- read.csv(coldata_path, sep="\t")
coldata <- coldata[match(names, coldata[,1]),]
coldata <- cbind(files = fns, coldata)
} else {
message("ColData not available: ", coldata_path)
coldata <- data.frame(files = fns, names = names)
}
rownames(coldata) <- coldata[["names"]]
}

build_table = function(se.obj, slot) {
cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]])
# Create a SummarizedExperiment object with given data
create_summarized_experiment <- function(counts, abundance, length, col_data, row_data) {
SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length),
colData = col_data,
rowData = row_data)
}

if(exists("gse")){
write.table(build_table(gse, "abundance"), paste(c(prefix, "gene_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(gse, "counts"), paste(c(prefix, "gene_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(gse.ls, "abundance"), paste(c(prefix, "gene_tpm_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(gse.ls, "counts"), paste(c(prefix, "gene_counts_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(gse.s, "abundance"), paste(c(prefix, "gene_tpm_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(gse.s, "counts"), paste(c(prefix, "gene_counts_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
# Main script starts here

# Define pattern for file names based on quantification type
pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf")
fns <- list.files(path, pattern = pattern, recursive = T, full.names = T)
names <- basename(dirname(fns))
names(fns) <- names
dropInfReps <- quant_type == "kallisto"

# Import transcript-level quantifications
txi <- tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps)

# Read transcript and sample data
transcript_info <- read_transcript_info(tx2gene_path)
coldata <- read_coldata(coldata_path)

# Create initial SummarizedExperiment object
se <- create_summarized_experiment(txi[["counts"]], txi[["abundance"]], txi[["length"]],
DataFrame(coldata), transcript_info$transcript)

# Setting parameters for writing tables
params <- list(
list(obj = se, slot = "abundance", suffix = "transcript_tpm.tsv"),
list(obj = se, slot = "counts", suffix = "transcript_counts.tsv"),
list(obj = se, slot = "length", suffix = "transcript_lengths.tsv")
)

# Process gene-level data if tx2gene mapping is available
if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) {
tx2gene <- transcript_info$tx2gene
gi <- summarizeToGene(txi, tx2gene = tx2gene)
gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM")
gi.s <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM")

gene_info <- transcript_info$gene[match(rownames(gi[[1]]), transcript_info$gene[["gene_id"]]),]
rownames(gene_info) <- gene_info[["tx"]]

col_data_frame <- DataFrame(coldata)

# Create gene-level SummarizedExperiment objects
gse <- create_summarized_experiment(gi[["counts"]], gi[["abundance"]], gi[["length"]],
col_data_frame, gene_info)
gse.ls <- create_summarized_experiment(gi.ls[["counts"]], gi.ls[["abundance"]], gi.ls[["length"]],
col_data_frame, gene_info)
gse.s <- create_summarized_experiment(gi.s[["counts"]], gi.s[["abundance"]], gi.s[["length"]],
col_data_frame, gene_info)

params <- c(params, list(
list(obj = gse, slot = "length", suffix = "gene_lengths.tsv"),
list(obj = gse, slot = "abundance", suffix = "gene_tpm.tsv"),
list(obj = gse, slot = "counts", suffix = "gene_counts.tsv"),
list(obj = gse.ls, slot = "abundance", suffix = "gene_tpm_length_scaled.tsv"),
list(obj = gse.ls, slot = "counts", suffix = "gene_counts_length_scaled.tsv"),
list(obj = gse.s, slot = "abundance", suffix = "gene_tpm_scaled.tsv"),
list(obj = gse.s, slot = "counts", suffix = "gene_counts_scaled.tsv")
))
}

write.table(build_table(se, "abundance"), paste(c(prefix, "transcript_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(se, "counts"), paste(c(prefix, "transcript_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
# Writing tables for each set of parameters
done <- lapply(params, write_se_table)

# Print sessioninfo to standard out
# Output session information and citations
citation("tximeta")
sessionInfo()

2 changes: 2 additions & 0 deletions modules/local/tximport/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ process TXIMPORT {
path "*gene_counts.tsv" , emit: counts_gene
path "*gene_counts_length_scaled.tsv", emit: counts_gene_length_scaled
path "*gene_counts_scaled.tsv" , emit: counts_gene_scaled
path "*gene_lengths.tsv" , emit: lengths_gene
path "*transcript_tpm.tsv" , emit: tpm_transcript
path "*transcript_counts.tsv" , emit: counts_transcript
path "*transcript_lengths.tsv" , emit: lengths_transcript
path "versions.yml" , emit: versions

when:
Expand Down
14 changes: 8 additions & 6 deletions subworkflows/local/quantify_pseudo/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,14 @@ workflow QUANTIFY_PSEUDO_ALIGNMENT {
results = ch_pseudo_results // channel: [ val(meta), results_dir ]
multiqc = ch_pseudo_multiqc // channel: [ val(meta), files_for_multiqc ]

tpm_gene = TXIMPORT.out.tpm_gene // channel: [ val(meta), counts ]
counts_gene = TXIMPORT.out.counts_gene // channel: [ val(meta), counts ]
counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // channel: [ val(meta), counts ]
counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // channel: [ val(meta), counts ]
tpm_transcript = TXIMPORT.out.tpm_transcript // channel: [ val(meta), counts ]
counts_transcript = TXIMPORT.out.counts_transcript // channel: [ val(meta), counts ]
tpm_gene = TXIMPORT.out.tpm_gene // path *gene_tpm.tsv
counts_gene = TXIMPORT.out.counts_gene // path *gene_counts.tsv
lengths_gene = TXIMPORT.out.lengths_gene // path *gene_lengths.tsv
counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // path *gene_counts_length_scaled.tsv
counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // path *gene_counts_scaled.tsv
tpm_transcript = TXIMPORT.out.tpm_transcript // path *gene_tpm.tsv
counts_transcript = TXIMPORT.out.counts_transcript // path *transcript_counts.tsv
lengths_transcript = TXIMPORT.out.lengths_transcript // path *transcript_lengths.tsv

merged_gene_rds = SE_GENE.out.rds // path: *.rds
merged_gene_rds_length_scaled = SE_GENE_LENGTH_SCALED.out.rds // path: *.rds
Expand Down
12 changes: 12 additions & 0 deletions tower.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ reports:
display: "All samples STAR Salmon DESeq2 QC PDF plots"
"**/star_salmon/salmon.merged.gene_counts.tsv":
display: "All samples STAR Salmon merged gene raw counts"
"**/star_salmon/salmon.merged.gene_lengths.tsv":
display: "All samples STAR Salmon mean transcript length per gene"
"**/star_salmon/salmon.merged.transcript_lengths.tsv":
display: "All samples STAR Salmon transcript lengths"
"**/star_salmon/salmon.merged.gene_tpm.tsv":
display: "All samples STAR Salmon merged gene TPM counts"
"**/star_salmon/salmon.merged.transcript_counts.tsv":
Expand All @@ -15,6 +19,10 @@ reports:
display: "All samples Salmon DESeq2 QC PDF plots"
"**/salmon/salmon.merged.gene_counts.tsv":
display: "All samples Salmon merged gene raw counts"
"**/salmon/salmon.merged.gene_lengths.tsv":
display: "All samples Salmon mean transcript length per gene"
"**/salmon/salmon.merged.transcript_lengths.tsv":
display: "All samples Salmon transcript lengths"
"**/salmon/salmon.merged.gene_tpm.tsv":
display: "All samples Salmon merged gene TPM counts"
"**/salmon/salmon.merged.transcript_counts.tsv":
Expand All @@ -25,6 +33,10 @@ reports:
display: "All samples Kallisto DESeq2 QC PDF plots"
"**/kallisto/kallisto.merged.gene_counts.tsv":
display: "All samples Kallisto merged gene raw counts"
"**/kallisto/kallisto.merged.gene_lengths.tsv":
display: "All samples Kallisto mean transcript length per gene"
"**/kallisto/kallisto.merged.transcript_lengths.tsv":
display: "All samples Kallisto transcript lengths"
"**/kallisto/kallisto.merged.gene_tpm.tsv":
display: "All samples Kallisto merged gene TPM counts"
"**/kallisto/kallisto.merged.transcript_counts.tsv":
Expand Down
Loading