Skip to content

Commit

Permalink
Add tool for BREW3R.r bioconductor package (galaxyproject#6058)
Browse files Browse the repository at this point in the history
* add tool for BREW3R.r bioconductor package

* lintr

* fix tests

* added the help section and lint

* simplify brew3r docker url

Co-authored-by: Björn Grüning <[email protected]>

* use required_files

* fix single quote thanks to @bernt-matthias

* fix exclude pattern

* exclude \ from exclude_pattern

---------

Co-authored-by: Björn Grüning <[email protected]>
  • Loading branch information
lldelisle and bgruening authored Jun 11, 2024
1 parent dc8ebbe commit 3e3c47b
Show file tree
Hide file tree
Showing 7 changed files with 635 additions and 0 deletions.
11 changes: 11 additions & 0 deletions tools/brew3r_r/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
categories:
- Transcriptomics
- RNA
description: Extend 3' end of a GTF using another GTF as a template
homepage_url: https://bioconductor.org/packages/release/bioc/html/BREW3R.r.html
long_description: |
  This tool uses the BREW3R.r package the same way it is used in the BREW3R workflow. It extends a GTF using information from another GTF. The process extends gene annotations without increasing the overlap between different genes.
name: brew3r_r
owner: iuc
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/brew3r_r
type: unrestricted
122 changes: 122 additions & 0 deletions tools/brew3r_r/brew3r.r_script.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
library("getopt")
suppressPackageStartupMessages(library("rtracklayer"))
library(GenomicRanges)
library("BREW3R.r")

# Global options for the whole script.
# `stringsAsFactors` is the option name R recognises; the former spelling
# (`stringAsFactors`) only stored an unrelated option that nothing reads.
# `useFancyQuotes = FALSE` keeps sQuote()/dQuote() output as plain ASCII quotes.
options(stringsAsFactors = FALSE, useFancyQuotes = FALSE)
# Command-line arguments that follow the script name.
# NOTE(review): `args` is not referenced by the getopt(spec) call in this
# script; it is kept in case something else relies on it.
args <- commandArgs(trailingOnly = TRUE)
# Option specification consumed by getopt(), one row per flag:
#   1. long flag name (multi-character string)
#   2. short flag alias (single-character string)
#   3. argument mask (integer): 0 = no argument, 1 = required argument,
#      2 = optional argument
#   4. data type the argument is cast to with storage.mode(); only considered
#      when the mask is 1 or 2. Possible values: logical, integer, double,
#      complex, character (numeric is converted to double)
#   5. (optional) brief description of the purpose of the option
spec <- rbind(
  c("help", "h", 0, "logical", "display help"),
  c(
    "gtf_to_extend", "i", 1, "character",
    "input gtf file to be extended on 3'"
  ),
  c(
    "gtf_to_overlap", "g", 1, "character",
    "input gtf file that will be used to extend"
  ),
  c("output", "o", 1, "character", "output extended gtf"),
  c(
    "sup_output", "s", 1, "character",
    "supplementary output file with resolution of overlaps"
  ),
  c("no_add", "n", 0, "logical", "do not add new exons"),
  c(
    "exclude_pattern", "e", 1, "character",
    "do not extend genes with names matching this pattern"
  ),
  c(
    "filter_unstranded", "f", 0, "logical",
    "remove unstranded intervals from gtf_to_overlap which overlap intervals from gtf_to_extend of both strands"
  ),
  c("quiet", "q", 0, "logical", "decrease verbosity"),
  c("verbose", "v", 0, "logical", "increase verbosity")
)
# Parse the command line according to the specification
opt <- getopt(spec)

# When help is requested, print the usage text
# and leave with a non-zero exit status
if (!is.null(opt[["help"]])) {
  cat(getopt(spec, usage = TRUE))
  q(status = 1)
}

# Mandatory arguments: both input GTFs and the output path
if (is.null(opt[["gtf_to_extend"]])) {
  stop("--gtf_to_extend is required")
}
if (is.null(opt[["gtf_to_overlap"]])) {
  stop("--gtf_to_overlap is required")
}
if (is.null(opt[["output"]])) {
  stop("--output is required")
}

# --quiet and --verbose cannot be combined
quiet_requested <- !is.null(opt[["quiet"]])
verbose_requested <- !is.null(opt[["verbose"]])
if (quiet_requested && verbose_requested) {
  stop("quiet and verbose are mutually exclusive options")
}

# Translate the verbosity flags into the corresponding R options
if (quiet_requested) {
  options(rlib_message_verbosity = "quiet")
}

if (verbose_requested) {
  options(BREW3R.r.verbose = "progression")
}

# Read the annotation to extend as a GenomicRanges object
input_gr_to_extend <- rtracklayer::import(
  con = opt[["gtf_to_extend"]],
  format = "gtf"
)

# Keep the CDS records aside so they can be added back before export
input_gr_CDS <- subset(input_gr_to_extend, type == "CDS")

# Read the template annotation used to guide the extension
input_gr_template <- rtracklayer::import(
  con = opt[["gtf_to_overlap"]],
  format = "gtf"
)

# Optionally drop ambiguous unstranded template intervals, i.e. those that
# overlap features of gtf_to_extend annotated on more than one strand.
if (!is.null(opt$filter_unstranded)) {
  # Indices (within the template) of intervals without strand information
  unstranded_idx <- which(strand(input_gr_template) == "*")
  if (length(unstranded_idx) > 0) {
    # Overlaps between the unstranded template intervals (query) and the
    # annotation to extend (subject). queryHits index into unstranded_idx.
    # NOTE(review): suppressWarnings() presumably hides sequence-level
    # mismatch warnings between the two GTFs -- confirm.
    ov <- suppressWarnings(
      as.data.frame(findOverlaps(
        input_gr_template[unstranded_idx],
        input_gr_to_extend
      ))
    )
    # Attach the strand of each overlapped feature
    ov$strand <- as.factor(strand(input_gr_to_extend))[ov$subjectHits]
    # Keep one row per (query, strand) combination
    ov_simple <- unique(ov[, c("queryHits", "strand")])
    # A query listed more than once overlaps features on different strands
    multi_strand_query <- unique(
      ov_simple$queryHits[duplicated(ov_simple$queryHits)]
    )
    to_remove <- unstranded_idx[multi_strand_query]
    # Negative indexing with a zero-length vector selects nothing, which
    # would silently empty the whole template. Only subset when there is
    # at least one interval to remove.
    if (length(to_remove) > 0) {
      input_gr_template <- input_gr_template[-to_remove]
    }
  }
}

# Extend the 3' ends with the main BREW3R.r function
extended_exons <- extend_granges(
  input_gr_to_extend = input_gr_to_extend,
  input_gr_to_overlap = input_gr_template,
  add_new_exons = is.null(opt[["no_add"]]),
  overlap_resolution_fn = opt[["sup_output"]]
)

# Genes whose name matches the exclusion pattern keep their original exons
if (!is.null(opt[["exclude_pattern"]])) {
  # Extended exons of genes that are not excluded
  kept_extended_exons <- subset(
    extended_exons,
    !grepl(opt[["exclude_pattern"]], gene_name)
  )
  # Original (non-extended) exons of the excluded genes
  original_excluded_exons <- subset(
    input_gr_to_extend,
    type == "exon" & grepl(opt[["exclude_pattern"]], gene_name)
  )
  extended_exons <- c(kept_extended_exons, original_excluded_exons)
}

# Put the CDS records back next to the exons
full_annotation <- c(extended_exons, input_gr_CDS)

# Write the result sorted by position, regardless of strand
sorted_annotation <- sort(full_annotation, ignore.strand = TRUE)
rtracklayer::export.gff(sorted_annotation, opt[["output"]])
152 changes: 152 additions & 0 deletions tools/brew3r_r/brew3r_r.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
<tool id="brew3r_r" name="BREW3R.r" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
<description>Extend GTF</description>
<macros>
<token name="@TOOL_VERSION@">1.0.1</token>
<token name="@VERSION_SUFFIX@">0</token>
</macros>
<edam_topics>
<edam_topic>topic_3308</edam_topic>
</edam_topics>
<edam_operations>
<edam_operation>operation_0362</edam_operation>
</edam_operations>
<xrefs>
<!-- <xref type="bio.tools">BREW3R.r</xref> -->
<xref type="bioconductor">BREW3R.r</xref>
</xrefs>
<requirements>
<!-- <requirement type="package" version="@TOOL_VERSION@">bioconductor-brew3r.r</requirement>
<requirement type="package" version="1.64.0">bioconductor-rtracklayer</requirement>
<requirement type="package" version="1.20.4">r-getopt</requirement> -->
<container type="docker">lldelisle/brew3r:v2</container>
</requirements>
<required_files>
<include path="brew3r.r_script.R" />
</required_files>
<version_command><![CDATA[
echo $(R --version | grep version | grep -v GNU)", BREW3R.r version" $(R --vanilla --slave -e "library(BREW3R.r); cat(sessionInfo()\$otherPkgs\$BREW3R.r\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
]]></version_command>
<command detect_errors="exit_code"><![CDATA[
Rscript '${__tool_directory__}/brew3r.r_script.R'
--gtf_to_extend '$gtf_to_extend'
--gtf_to_overlap '$gtf_to_overlap'
#if '$sup_output' == 'true':
--sup_output '$output_table'
#end if
#if str($no_add) != '':
'$no_add'
#end if
#if str($exclude_pattern) != '':
--exclude_pattern '$exclude_pattern'
#end if
#if str($filter_unstranded) != '':
'$filter_unstranded'
#end if
-o output.gtf
]]></command>
<inputs>
<param argument="--gtf_to_extend" type="data" format="gtf" label="Input gtf file to be extended on 3'" help="Usually coming from public resource." />
<param argument="--gtf_to_overlap" type="data" format="gtf" label="Input gtf file that will be used to extend" help="Coming from StringTie or another public resource." />
<param argument="--sup_output" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Get a supplementary output table with resolution of overlaps" />
<param argument="--no_add" type="boolean" truevalue="--no_add" falsevalue="" checked="false" label="Do not add new exons" />
<param argument="--exclude_pattern" type="text" value="" label="Do not extend genes with names matching this pattern" help="Leave empty if you want to extend all genes.">
<sanitizer>
<valid initial="string.printable">
<remove value="'"/>
<remove value="\"/>
</valid>
</sanitizer>
</param>
<!-- Boolean flag passed verbatim to the R script when checked; empty otherwise -->
<param argument="--filter_unstranded" type="boolean" truevalue="--filter_unstranded" falsevalue="" checked="false" label="Filter unstranded intervals that overlap genes of both strands" help="Recommended if you used StringTie on unstranded libraries." />
</inputs>
<outputs>
<data name="output" format="gtf" from_work_dir="output.gtf" label="${tool.name} on ${gtf_to_extend.name} and ${gtf_to_overlap.name}: GTF" />
<data name="output_table" format="tabular" label="${tool.name} on ${gtf_to_extend.name} and ${gtf_to_overlap.name}: overlap resolution">
<filter>sup_output == True</filter>
</data>
</outputs>
<tests>
<test expect_num_outputs="1">
<param name="gtf_to_extend" value="input.gtf"/>
<param name="gtf_to_overlap" value="second_input.gtf"/>
<output name="output" value="output.gtf" compare="diff" lines_diff="2"/>
</test>
<test expect_num_outputs="1">
<param name="gtf_to_extend" value="input.gtf"/>
<param name="gtf_to_overlap" value="second_input.gtf"/>
<param name="no_add" value="true"/>
<output name="output">
<assert_contents>
<has_n_lines n="31"/>
<not_has_text text="BREW3R"/>
</assert_contents>
</output>
</test>
<test expect_num_outputs="1">
<param name="gtf_to_extend" value="input.gtf"/>
<param name="gtf_to_overlap" value="second_input.gtf"/>
<param name="exclude_pattern" value="^Gm"/>
<output name="output">
<assert_contents>
<has_n_lines n="34"/>
<not_has_text text="exon111.ext"/>
</assert_contents>
</output>
</test>
<test expect_num_outputs="1">
<param name="gtf_to_extend" value="input.gtf"/>
<param name="gtf_to_overlap" value="second_input.gtf"/>
<param name="exclude_pattern" value="Gm$"/>
<output name="output" value="output.gtf" compare="diff" lines_diff="2"/>
<assert_command>
<has_text text="--exclude_pattern 'Gm$'"/>
</assert_command>
</test>
<test expect_num_outputs="1">
<param name="gtf_to_extend" value="input.gtf"/>
<param name="gtf_to_overlap" value="second_input.gtf"/>
<param name="filter_unstranded" value="true"/>
<output name="output">
<assert_contents>
<has_n_lines n="36"/>
<not_has_text text="exon121.ext"/>
</assert_contents>
</output>
</test>
</tests>
<help><![CDATA[
.. class:: infomark

**What it does**

This tool extends the annotations of an input GTF file at the 3' end using annotations from another input GTF. During the process, it makes sure that no new overlaps are created between different genes.

Usage
.....

**Input**

2 GTF files:

- The first one, to be extended, usually comes from a public resource.
- The second one, used as template, may come from a public resource or from StringTie.

**Output**

1 GTF file with all exons from the input GTF, where some of them have been extended (the exon_id ends with '.ext'), and potentially new exons (the exon_id contains BREW3R).
]]></help>
<citations>
<citation type="bibtex">
@unpublished{None,
author = {Lucille Lopez-Delisle},
title = {None},
year = {None},
eprint = {None},
url = {https://github.com/lldelisle/BREW3R.r}
}</citation>
</citations>
</tool>
Loading

0 comments on commit 3e3c47b

Please sign in to comment.