Skip to content

Commit

Permalink
Update EdgeR: Provide optional formula and contrasts file for multipl…
Browse files Browse the repository at this point in the history
…e contrasts (#5512)

* Provide optional formula

* Pass contrasts file

* Make sure tilde is there

* Linting

* Wrong variable name fixed

* Try different sanitizer for formula

* Just to check if formula works

* Empty repeat to use contrasts file

* Some documentation.

* Put contrasts options inside select / when

* Improve sanitizer

* Improved sanitizer
  • Loading branch information
pcm32 authored Oct 11, 2023
1 parent 3425546 commit 0c79416
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 13 deletions.
26 changes: 20 additions & 6 deletions tools/edger/edger.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# matrixPath", "m", 2, "character" -Path to count matrix
# factFile", "f", 2, "character" -Path to factor information file
# factInput", "i", 2, "character" -String containing factors if manually input
# formula", "F", 2, "character". -String containing a formula to override default use of factInput
# annoPath", "a", 2, "character" -Path to input containing gene annotations
# contrastData", "C", 1, "character" -String containing contrasts of interest
# cpmReq", "c", 2, "double" -Float specifying cpm requirement
Expand Down Expand Up @@ -159,6 +160,7 @@ spec <- matrix(c(
"filesPath", "j", 2, "character",
"matrixPath", "m", 2, "character",
"factFile", "f", 2, "character",
"formula", "F", 2, "character",
"factInput", "i", 2, "character",
"annoPath", "a", 2, "character",
"contrastData", "C", 1, "character",
Expand Down Expand Up @@ -312,8 +314,13 @@ if (have_anno) {
out_path <- opt$outPath
dir.create(out_path, showWarnings = FALSE)

# Split up contrasts separated by comma into a vector then sanitise
contrast_data <- unlist(strsplit(opt$contrastData, split = ","))
# Check if contrastData is a file or not
if (file.exists(opt$contrastData)) {
contrast_data <- unlist(read.table(opt$contrastData, sep = "\t", header = TRUE)[[1]])
} else {
# Split up contrasts separated by comma into a vector then sanitise
contrast_data <- unlist(strsplit(opt$contrastData, split = ","))
}
contrast_data <- sanitise_equation(contrast_data)
contrast_data <- gsub(" ", ".", contrast_data, fixed = TRUE)

Expand Down Expand Up @@ -397,10 +404,17 @@ data$samples <- factors
data$genes <- genes



formula <- "~0"
for (i in seq_along(factor_list)) {
formula <- paste(formula, factor_list[i], sep = "+")
if (!is.null(opt$formula)) {
formula <- opt$formula
# sanitisation can be getting rid of the "~"
if (!startsWith(formula, "~")) {
formula <- paste0("~", formula)
}
} else {
formula <- "~0"
for (i in seq_along(factor_list)) {
formula <- paste(formula, factor_list[i], sep = "+")
}
}

formula <- formula(formula)
Expand Down
104 changes: 97 additions & 7 deletions tools/edger/edger.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
</description>
<macros>
<token name="@TOOL_VERSION@">3.36.0</token>
<token name="@VERSION_SUFFIX@">2</token>
<token name="@VERSION_SUFFIX@">3</token>
</macros>
<edam_topics>
<edam_topic>topic_3308</edam_topic>
Expand Down Expand Up @@ -68,7 +68,15 @@ Rscript '$__tool_directory__/edger.R'
-a '$anno.geneanno'
#end if
-C '${ ','.join( ['%s' % $x.contrast for x in $rep_contrast] ) }'
#if $formula:
-F '$formula'
#end if
#if $contrasts.contrastOpt == 'file':
-C '$contrasts.cinfo'
#else:
-C '${ ','.join( ['%s' % $x.contrast for x in $contrasts.rep_contrast] ) }'
#end if
#if $f.filt.filt_select == 'yes':
#if $f.filt.cformat.format_select == 'cpm':
Expand Down Expand Up @@ -176,13 +184,45 @@ cp '$outReport.files_path'/*.tsv output_dir/
</when>
<when value="no"/>
</conditional>
<!-- Optional formula -->
<param name="formula" type="text" optional="true" label="Formula for linear model" help="An optional formula for the EdgeR linear model, this will override the use of the fields in factors as a simple sum. The formula can only use elements available in the factors file. This needs to be exactly as EdgeR expect the formula, ie. `~ 0 + factor_A + factor_B:factor_C`. See EdgeR documentation for more details.">
<sanitizer invalid_char="">
<valid initial="string.letters,string.digits">
<add value="_"/>
<add value="-"/>
<add value="+"/>
<add value="*"/>
<add value="/"/>
<add value="^"/>
<add value=":"/>
<add value="."/>
<add value="~"/>
<add value=" "/>
<add value="("/>
<add value=")"/>
<add value="@"/>
<add value="$"/>
</valid>
</sanitizer>
</param>
<!-- Contrasts -->
<repeat name="rep_contrast" title="Contrast" min="1" default="1">
<param name="contrast" type="text" label="Contrast of Interest" help="Names of two groups to compare separated by a hyphen e.g. Mut-WT. If the order is Mut-WT the fold changes in the results will be up/down in Mut relative to WT. If you have more than one contrast enter each separately using the Insert Contrast button below. For differences between contrasts use e.g. (MT.t1-MT.t0)-(WT.t1-WT.t0). For more info, see Chapter 8 in the limma User's guide: https://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf or https://bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf page 36 for nested comparisons.">
<validator type="empty_field"/>
<validator type="regex" message="Please only use letters, numbers, parentheses or underscores">^[\w\-()]+$</validator>
<conditional name="contrasts">
<param name="contrastOpt" type="select" label="Input contrasts manually or through a file">
<option value="manual">manually</option>
<option value="file">file</option>
</param>
</repeat>
<when value="manual">
<repeat name="rep_contrast" title="Contrast" min="1" default="1">
<param name="contrast" type="text" label="Contrast of Interest" help="Names of two groups to compare separated by a hyphen e.g. Mut-WT. If the order is Mut-WT the fold changes in the results will be up/down in Mut relative to WT. If you have more than one contrast enter each separately using the Insert Contrast button below. For differences between contrasts use e.g. (MT.t1-MT.t0)-(WT.t1-WT.t0). For more info, see Chapter 8 in the limma User's guide: https://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf or https://bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf page 36 for nested comparisons.">
<validator type="empty_field"/>
<validator type="regex" message="Please only use letters, numbers, parentheses or underscores">^[\w\-()]+$</validator>
</param>
</repeat>
</when>
<when value="file">
<param name="cinfo" optional="true" type="data" format="tabular" label="Contrasts File" help="Setting this file will ignore any manually added contrasts above, make sure to remove any contrast fields above pressing the trash bin icon, or the tool will fail. First line of the file must be a header, below that each separate contrast should be on a line. Contrast formulas need to be based on ther factors data and potentially the formula provided. See EdgeR documentation on contrasts for more details."/>
</when>
</conditional>
<!-- Filter Options -->
<section name="f" expanded="false" title="Filter Low Counts">
<conditional name="filt">
Expand Down Expand Up @@ -264,6 +304,7 @@ cp '$outReport.files_path'/*.tsv output_dir/
<param name="factorName" value="Genotype"/>
<param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT"/>
</repeat>
<param name="contrastOpt" value="manual"/>
<repeat name="rep_contrast">
<param name="contrast" value="Mut-WT"/>
</repeat>
Expand Down Expand Up @@ -302,6 +343,7 @@ cp '$outReport.files_path'/*.tsv output_dir/
<param name="factorName" value="Genotype"/>
<param name="groupNames" value="MutA,MutA,MutA,MutB,MutB,MutB,WTA,WTA,WTA,WTB,WTB,WTB"/>
</repeat>
<param name="contrastOpt" value="manual"/>
<repeat name="rep_contrast">
<param name="contrast" value="(MutA-MutB)-(WTA-WTB)"/>
</repeat>
Expand Down Expand Up @@ -333,6 +375,7 @@ cp '$outReport.files_path'/*.tsv output_dir/
<param name="factorName" value="Genotype"/>
<param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT"/>
</repeat>
<param name="contrastOpt" value="manual"/>
<repeat name="rep_contrast">
<param name="contrast" value="Mut-WT"/>
</repeat>
Expand All @@ -356,6 +399,7 @@ cp '$outReport.files_path'/*.tsv output_dir/
<param name="factorName" value="Genotype"/>
<param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT"/>
</repeat>
<param name="contrastOpt" value="manual"/>
<repeat name="rep_contrast">
<param name="contrast" value="Mut-WT"/>
</repeat>
Expand Down Expand Up @@ -383,6 +427,7 @@ cp '$outReport.files_path'/*.tsv output_dir/
<param name="factorName" value="Batch"/>
<param name="groupNames" value="b1,b2,b3,b1,b2,b3"/>
</repeat>
<param name="contrastOpt" value="manual"/>
<repeat name="rep_contrast">
<param name="contrast" value="Mut-WT"/>
</repeat>
Expand All @@ -402,6 +447,7 @@ cp '$outReport.files_path'/*.tsv output_dir/
<param name="ffile" value="yes"/>
<param name="finfo" value="factorinfo.txt"/>
<param name="counts" value="matrix.txt"/>
<param name="contrastOpt" value="manual"/>
<repeat name="rep_contrast">
<param name="contrast" value="Mut-WT"/>
</repeat>
Expand All @@ -424,6 +470,7 @@ cp '$outReport.files_path'/*.tsv output_dir/
<param name="factorName" value="Genotype"/>
<param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT"/>
</repeat>
<param name="contrastOpt" value="manual"/>
<repeat name="rep_contrast">
<param name="contrast" value="Mut-WT"/>
</repeat>
Expand Down Expand Up @@ -451,6 +498,7 @@ cp '$outReport.files_path'/*.tsv output_dir/
<param name="factorName" value="Genotype"/>
<param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT"/>
</repeat>
<param name="contrastOpt" value="manual"/>
<repeat name="rep_contrast">
<param name="contrast" value="Mut-WT"/>
</repeat>
Expand Down Expand Up @@ -494,6 +542,7 @@ cp '$outReport.files_path'/*.tsv output_dir/
</repeat>
<param name="annoOpt" value="yes"/>
<param name="geneanno" value="anno.txt"/>
<param name="contrastOpt" value="manual"/>
<repeat name="rep_contrast">
<param name="contrast" value="Mut-WT"/>
</repeat>
Expand Down Expand Up @@ -530,6 +579,7 @@ cp '$outReport.files_path'/*.tsv output_dir/
<param name="factorName" value="Genotype"/>
<param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT"/>
</repeat>
<param name="contrastOpt" value="manual"/>
<repeat name="rep_contrast">
<param name="contrast" value="Mut-WT"/>
</repeat>
Expand Down Expand Up @@ -565,6 +615,7 @@ cp '$outReport.files_path'/*.tsv output_dir/
<param name="factorName" value="Genotype"/>
<param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT"/>
</repeat>
<param name="contrastOpt" value="manual"/>
<repeat name="rep_contrast">
<param name="contrast" value="Mut-WT"/>
</repeat>
Expand Down Expand Up @@ -599,6 +650,7 @@ cp '$outReport.files_path'/*.tsv output_dir/
<param name="factorName" value="Genotype"/>
<param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT"/>
</repeat>
<param name="contrastOpt" value="manual"/>
<repeat name="rep_contrast">
<param name="contrast" value="Mut-WT"/>
</repeat>
Expand Down Expand Up @@ -626,6 +678,36 @@ cp '$outReport.files_path'/*.tsv output_dir/
</element>
</output_collection>
</test>
<!-- Ensure formula and contrast file work -->
<test expect_num_outputs="2">
<param name="format" value="matrix"/>
<param name="counts" value="matrix.txt"/>
<repeat name="rep_factor">
<param name="factorName" value="Genotype"/>
<param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT"/>
</repeat>
<repeat name="rep_factor">
<param name="factorName" value="Batch"/>
<param name="groupNames" value="b1,b2,b3,b1,b2,b3"/>
</repeat>
<param name="contrastOpt" value="file"/>
<param name="cinfo" value="contrasts_file.txt"/>
<param name="formula" value="~ 0 + Genotype + Batch"/>
<param name="normalisationOption" value="TMM"/>
<output_collection name="outTables" count="2">
<element name="edgeR_Mut-WT" ftype="tabular">
<assert_contents>
<has_text_matching expression="GeneID.*logFC.*logCPM.*F.*PValue.*FDR"/>
<has_text_matching expression="11304.*0.4584"/>
</assert_contents>
</element>
<element name="edgeR_WT-Mut" ftype="tabular">
<assert_contents>
<has_text_matching expression="GeneID.*logFC.*logCPM.*F.*PValue.*FDR"/>
</assert_contents>
</element>
</output_collection>
</test>
</tests>
<help><![CDATA[
.. class:: infomark
Expand Down Expand Up @@ -713,12 +795,20 @@ Example:
*Groups:* The names of the groups for the factor. The names should start with a letter, and only contain letters, numbers and underscores, other characters such as spaces and hyphens must not be used. If entered into the tool form above, the order must be the same as the samples (to which the groups correspond) are listed in the columns of the counts matrix, with the values separated by commas.
**Formula:**
By default the tool will construct a formula for modelling counts based on the contents of the factors files or the factors given.
This can be overriden by directly providing the EdgeR formula in section named Formula.
**Contrasts of Interest:**
The contrasts you wish to make between levels.
A common contrast would be a simple difference between two levels: "Mut-WT"
represents the difference between the mutant and wild type genotypes.
Multiple contrasts must be entered separately using the Insert Contrast button, spaces must not be used.
Alternatively, you can specify a file with contrasts. The file must contain a header (it's value is irrelevant)
and one contrast per line on the first column (other columns are ignored). If using this option, make sure to
remove any contrast section from the manual part, or the tool will fail.
**Filter Low Counts:**
Genes with very low counts across all libraries provide little evidence for differential expression.
In the biological point of view, a gene must be expressed at some minimal level before
Expand Down
3 changes: 3 additions & 0 deletions tools/edger/test-data/contrasts_file.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Contrasts
Mut-WT
WT-Mut

0 comments on commit 0c79416

Please sign in to comment.