Skip to content

Commit

Permalink
Split meryl (#5942)
Browse files Browse the repository at this point in the history
* split meryl

* fix linting errors

in original and split tools

* split description help

* add fastq test data

generated using reformat.sh from bbmap

reformat.sh in=test-data/child.fasta out=test-data/child.fastq qfake=40 fastareadlen=150 qout=33 addcolon=t trimreaddescription=t uniquenames=t

* fix bug in count-kmers

* add test for fastq(sanger)

* gzip test data

to stay in size limits

* fix tests

* deprecate monolithic meryl

* make suite, finish tools

- add assumptions of meryldb zip files
- output names

* bump profile
  • Loading branch information
bernt-matthias authored Apr 25, 2024
1 parent 431ba94 commit a95f5b0
Show file tree
Hide file tree
Showing 11 changed files with 811 additions and 10 deletions.
11 changes: 5 additions & 6 deletions tools/meryl/meryl.xml → deprecated/tools/meryl/meryl.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
<macros>
<import>macros.xml</import>
</macros>
<expand macro='edam_ontology' />
<xrefs>
<xref type="bio.tools">meryl</xref>
</xrefs>
<expand macro='edam_ontology' />
<expand macro='requirements' />
<version_command>meryl --version</version_command>
<command detect_errors='exit_code'><![CDATA[
Expand All @@ -21,7 +21,7 @@
ln -s '$operation_type.input_reads' ./input.${operation_type.input_reads.ext} &&
meryl
$operation_type.count_operations
$operation_type.count_operation
k=$size
memory=\$GALAXY_MEMORY_GB
threads=\${GALAXY_SLOTS:-1}
Expand Down Expand Up @@ -148,7 +148,7 @@
<option value="trio-mode">Build hap-mer dbs for trios</option>
</param>
<when value="count-kmers">
<param name="count_operations" type="select" label="Count operations" help="Select an operation to be executed">
<param name="count_operation" type="select" label="Count operation" help="Select an operation to be executed">
<option value="count">Count: count the occurrences of canonical k-mers</option>
<option value="count-forward">Count-forward: count the occurreces of forward k-mers</option>
<option value="count-reverse">Count-reverse: count the occurreces of reverse k-mers</option>
Expand Down Expand Up @@ -446,7 +446,6 @@
<param name="command_type" value="groups-kmers" />
<param name="groups_operations" value="union" />
<param name="input_meryldb_02" value="output_02.read-db.meryldb,output_03.read-db.meryldb" ftype="meryldb" />
<param name="input_meryldb_03" value="" ftype="meryldb" />
</conditional>
<output name="read_db" ftype="meryldb">
<assert_contents>
Expand Down Expand Up @@ -588,13 +587,13 @@
</conditional>
<output name="read_db" ftype="meryldb">
<assert_contents>
<has_size value="3362942" delta="300" />
<has_size value="3362942" delta="2000" />
</assert_contents>
</output>
<output name="read_db_hist" file="output_23.read-db.hist" />
<output name="pat_db" ftype="meryldb">
<assert_contents>
<has_size value="120610" delta="300" />
<has_size value="120610" delta="400" />
</assert_contents>
</output>
<output name="pat_db_hist" file="output_23.pat.hist" />
Expand Down
16 changes: 12 additions & 4 deletions tools/meryl/.shed.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
categories:
- Assembly
name: meryl
owner: iuc
description: Meryl a k-mer counter.
long_description: |
Meryl a k-mer counter. It is built into the Celera assembler and is also available as a stand-alone application. Meryl uses a sorting-based approach that sorts k-mers in lexicographical order.
name: meryl
owner: iuc
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/meryl
homepage_url: https://github.com/marbl/meryl
type: unrestricted
categories:
- Assembly
auto_tool_repositories:
name_template: "{{ tool_id }}"
description_template: "Wrapper for Meryl function: {{ tool_name }}."
suite:
name: "suite_meryl"
description: "A suite of tools that brings the Meryl project into Galaxy."
long_description: |
Meryl a k-mer counter. It is built into the Celera assembler and is also available as a stand-alone application. Meryl uses a sorting-based approach that sorts k-mers in lexicographical order.
123 changes: 123 additions & 0 deletions tools/meryl/arithmetic-kmers.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
<tool id="meryl_arithmetic_kmers" name="Meryl" version="@TOOL_VERSION@+@GALAXY_TOOL_VERSION@@SUFFIX_VERSION@" profile="@PROFILE@">
    <description>apply arithmetic operations to k-mer counts</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="edam_ontology"/>
    <xrefs>
        <xref type="bio.tools">meryl</xref>
    </xrefs>
    <expand macro="requirements"/>
    <version_command>meryl --version</version_command>
    <!--
        Command outline:
          1. unpack the gzipped tar archive of the input meryldb into ./temp_db/
          2. rename the extracted entry to tmp.meryl
          3. run the selected meryl operation with the operand X on tmp.meryl
          4. pack the resulting read-db.meryl into read-db.meryldb (gzipped tar)
        The input dataset path is single-quoted so the shell always sees it as one word.
        NOTE(review): step 2 presumably relies on the archive holding exactly one
        top-level entry; confirm against how meryldb archives are produced.
        NOTE(review): GALAXY_MEMORY_GB is exported but not referenced again in this command.
    -->
    <command detect_errors="exit_code"><![CDATA[
        export GALAXY_MEMORY_GB=\$((\${GALAXY_MEMORY_MB:-8192}/1024)) &&
        mkdir -p ./temp_db/ &&
        tar -zxf '$input_meryldb_02' -C ./temp_db/ &&
        mv ./temp_db/* tmp.meryl &&
        meryl
        $arithmetic_operations
        $X
        tmp.meryl
        output read-db.meryl &&
        tar -zcf read-db.meryldb read-db.meryl
    ]]></command>
    <inputs>
        <param name="arithmetic_operations" type="select" label="Arithmetic operations" help="Select an operation to be executed">
            <option value="increase">Increase: add x to the count of each k-mer</option>
            <option value="decrease">Decrease: subtract x from the count of each k-mer</option>
            <option value="multiply">Multiply: multiply the count of each k-mer by x</option>
            <option value="divide">Divide: divide the count of each k-mer by x</option>
            <option value="divide-round">Divide-round: divide the count of each k-mer by x and round the results</option>
            <option value="modulo">Modulo: set the count of each k-mer to the remainder of the count divided by x</option>
        </param>
        <param name="input_meryldb_02" type="data" format="meryldb" label="Input meryldb" help="Select a meryldb dataset"/>
        <!-- NOTE(review): the operand is optional with an empty default, yet every operation above takes x; confirm what meryl does when it is omitted. -->
        <param name="X" type="integer" min="1" max="1000000" value="" optional="true" label="Operand"/>
    </inputs>
    <outputs>
        <data name="read_db" format="meryldb" from_work_dir="read-db.meryldb"/>
    </outputs>
    <!-- One test per arithmetic operation; each checks the archive size and the shared meryldb archive assertions. -->
    <tests>
        <test expect_num_outputs="1">
            <param name="X" value="100000"/>
            <param name="input_meryldb_02" value="read-db.meryldb" ftype="meryldb"/>
            <param name="arithmetic_operations" value="increase"/>
            <output name="read_db" ftype="meryldb">
                <assert_contents>
                    <has_size value="59500" delta="1000"/>
                    <expand macro="meryldb_archive_assumptions"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="1">
            <param name="X" value="100"/>
            <param name="input_meryldb_02" value="read-db.meryldb" ftype="meryldb"/>
            <param name="arithmetic_operations" value="decrease"/>
            <output name="read_db" ftype="meryldb">
                <assert_contents>
                    <has_size value="42313" delta="1000"/>
                    <expand macro="meryldb_archive_assumptions"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="1">
            <param name="X" value="3"/>
            <param name="input_meryldb_02" value="read-db.meryldb" ftype="meryldb"/>
            <param name="arithmetic_operations" value="multiply"/>
            <output name="read_db" ftype="meryldb">
                <assert_contents>
                    <has_size value="60530" delta="1000"/>
                    <expand macro="meryldb_archive_assumptions"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="1">
            <param name="X" value="2"/>
            <param name="input_meryldb_02" value="read-db.meryldb" ftype="meryldb"/>
            <param name="arithmetic_operations" value="divide"/>
            <output name="read_db" ftype="meryldb">
                <assert_contents>
                    <has_size value="56200" delta="1000"/>
                    <expand macro="meryldb_archive_assumptions"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="1">
            <param name="X" value="2"/>
            <param name="input_meryldb_02" value="read-db.meryldb" ftype="meryldb"/>
            <param name="arithmetic_operations" value="divide-round"/>
            <output name="read_db" ftype="meryldb">
                <assert_contents>
                    <has_size value="56100" delta="1000"/>
                    <expand macro="meryldb_archive_assumptions"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="1">
            <param name="X" value="3"/>
            <param name="input_meryldb_02" value="read-db.meryldb" ftype="meryldb"/>
            <param name="arithmetic_operations" value="modulo"/>
            <output name="read_db" ftype="meryldb">
                <assert_contents>
                    <has_size value="37501" delta="1000"/>
                    <expand macro="meryldb_archive_assumptions"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>

.. class:: infomark

**Purpose**

Meryl is the k-mer counter. This tool applies arithmetic operations on k-mer counts:

- Increase: add x to the count of each k-mer
- Decrease: subtract x from the count of each k-mer
- Multiply: multiply the count of each k-mer by x
- Divide: divide the count of each k-mer by x
- Divide-round: divide the count of each k-mer by x and round the results
- Modulo: set the count of each k-mer to the remainder of the count divided by x
    </help>
    <expand macro="citations"/>
</tool>
122 changes: 122 additions & 0 deletions tools/meryl/count-kmers.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
<tool id="meryl_count_kmers" name="Meryl" version="@TOOL_VERSION@+@GALAXY_TOOL_VERSION@@SUFFIX_VERSION@" profile="@PROFILE@">
    <description>count k-mers</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="edam_ontology"/>
    <xrefs>
        <xref type="bio.tools">meryl</xref>
    </xrefs>
    <expand macro="requirements"/>
    <version_command>meryl --version</version_command>
    <!--
        Command outline:
          1. choose the k-mer size: either the value entered by the user, or
             int(log(G * (1 - p) / p) / log(4)) computed from the genome size G
             and the tolerable collision rate p
          2. link the input dataset under a name that carries its datatype extension
          3. run the selected meryl count operation with the memory and thread limits
          4. print the k-mer size to stdout (the tests assert on this line)
          5. pack the resulting read-db.meryl into read-db.meryldb (gzipped tar)
    -->
    <command detect_errors="exit_code"><![CDATA[
        export GALAXY_MEMORY_GB=\$((\${GALAXY_MEMORY_MB:-8192}/1024)) &&
        #if $options_kmer_size.kmer_size == 'estimate'
            #from math import log
            #set size=int(log(int($options_kmer_size.genome_size)*(1-float($options_kmer_size.collision_rate))/float($options_kmer_size.collision_rate))/log(4))
        #elif $options_kmer_size.kmer_size == 'provide'
            #set size=$options_kmer_size.input_kmer_size
        #end if
        ln -s '$input_reads' ./input.${input_reads.ext} &&
        meryl
        $count_operation
        k=$size
        memory=\$GALAXY_MEMORY_GB
        threads=\${GALAXY_SLOTS:-1}
        ./input.${input_reads.ext}
        output read-db.meryl &&
        echo 'K-mer size: ${size}' &&
        tar -zcf read-db.meryldb read-db.meryl
    ]]></command>
    <inputs>
        <param name="count_operation" type="select" label="Count operations" help="Select an operation to be executed">
            <option value="count">Count: count the occurrences of canonical k-mers</option>
            <option value="count-forward">Count-forward: count the occurrences of forward k-mers</option>
            <option value="count-reverse">Count-reverse: count the occurrences of reverse k-mers</option>
        </param>
        <param name="input_reads" type="data" format="fastq,fastq.gz,fasta,fasta.gz" label="Input sequences" help="Select your reads in FASTA/FASTQ format."/>
        <conditional name="options_kmer_size">
            <param name="kmer_size" type="select" label="K-mer size selector">
                <option value="provide">Set a k-mer size</option>
                <option value="estimate">Estimate the best k-mer size</option>
            </param>
            <when value="provide">
                <param name="input_kmer_size" type="integer" min="1" max="50" value="" label="K-mer size" help="For a human genome, the best k-mer size is k=21 for both haploid (3.1G) or diploid (6.2G)."/>
            </when>
            <when value="estimate">
                <param name="genome_size" type="integer" min="1000" max="70000000000" value="1000" label="Genome size" help="Haploid genome size or diploid genome size, depending on what we evaluate. In bp. Only required if the k-mer size is not provided."/>
                <param name="collision_rate" type="float" min="0.0001" max="0.01" value="0.001" label="Tolerable collision rate" help="Tolerable collision rate. By default is 0.001."/>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data name="read_db" format="meryldb" from_work_dir="read-db.meryldb"/>
    </outputs>
    <!-- The same count with k=7 on three input flavours: plain FASTA, gzipped FASTA and gzipped FASTQ (sanger). -->
    <tests>
        <test expect_num_outputs="1">
            <conditional name="options_kmer_size">
                <param name="kmer_size" value="provide"/>
                <param name="input_kmer_size" value="7"/>
            </conditional>
            <param name="input_reads" value="child.fasta"/>
            <param name="count_operation" value="count"/>
            <output name="read_db" ftype="meryldb">
                <assert_contents>
                    <has_size value="22152" delta="1000"/>
                    <expand macro="meryldb_archive_assumptions"/>
                </assert_contents>
            </output>
            <assert_stdout>
                <has_line line="K-mer size: 7"/>
            </assert_stdout>
        </test>
        <test expect_num_outputs="1">
            <conditional name="options_kmer_size">
                <param name="kmer_size" value="provide"/>
                <param name="input_kmer_size" value="7"/>
            </conditional>
            <param name="input_reads" value="child.fasta.gz" ftype="fasta.gz"/>
            <param name="count_operation" value="count"/>
            <output name="read_db" ftype="meryldb">
                <assert_contents>
                    <has_size value="22200" delta="1000"/>
                    <expand macro="meryldb_archive_assumptions"/>
                </assert_contents>
            </output>
            <assert_stdout>
                <has_line line="K-mer size: 7"/>
            </assert_stdout>
        </test>
        <test expect_num_outputs="1">
            <conditional name="options_kmer_size">
                <param name="kmer_size" value="provide"/>
                <param name="input_kmer_size" value="7"/>
            </conditional>
            <param name="input_reads" value="child.fastq.gz" ftype="fastqsanger.gz"/>
            <param name="count_operation" value="count"/>
            <output name="read_db" ftype="meryldb">
                <assert_contents>
                    <has_size value="22200" delta="1000"/>
                    <expand macro="meryldb_archive_assumptions"/>
                </assert_contents>
            </output>
            <assert_stdout>
                <has_line line="K-mer size: 7"/>
            </assert_stdout>
        </test>
    </tests>
    <help>

.. class:: infomark

**Purpose**

Meryl is the k-mer counter. This tool can be used to count kmers.

- Count: count the occurrences of canonical k-mers
- Count-forward: count the occurrences of forward k-mers
- Count-reverse: count the occurrences of reverse k-mers
    </help>
    <expand macro="citations"/>
</tool>
Loading

0 comments on commit a95f5b0

Please sign in to comment.