Skip to content

Commit

Permalink
Add bwa-mem2 data manager
Browse files Browse the repository at this point in the history
  • Loading branch information
natefoo committed Sep 26, 2024
1 parent a9a143b commit cb911d8
Show file tree
Hide file tree
Showing 12 changed files with 340 additions and 0 deletions.
14 changes: 14 additions & 0 deletions data_managers/data_manager_bwa_mem2_index_builder/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
categories:
- Data Managers
description: Bwa-mem2 is the next version of the bwa-mem algorithm in bwa.
homepage_url: https://github.com/bwa-mem2/bwa-mem2
long_description: |
Bwa-mem2 is the next version of the bwa-mem algorithm in bwa. It produces
alignment identical to bwa and is ~1.3-3.1x faster depending on the use-case,
dataset and the running machine. Bwa-mem2 uses a different index format that
is efficient on disk space and runtime memory but requires larger amounts of
memory (roughly 27x the reference) when building.
name: data_manager_bwa_mem2_index_builder
owner: iuc
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_bwa_mem2_index_builder
type: unrestricted
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/usr/bin/env python
# Based heavily on the bwa-mem data manager wrapper script by Dan Blankenberg
from __future__ import print_function

import argparse
import json
import os
import subprocess
import sys

DEFAULT_DATA_TABLE_NAME = "bwa_mem2_indexes"


def get_id_name(params, dbkey, fasta_description=None):
# TODO: ensure sequence_id is unique and does not already appear in location file
sequence_id = params['param_dict']['sequence_id']
if not sequence_id:
sequence_id = dbkey

sequence_name = params['param_dict']['sequence_name']
if not sequence_name:
sequence_name = fasta_description
if not sequence_name:
sequence_name = dbkey
return sequence_id, sequence_name


def build_bwa_mem2_index(data_manager_dict, options, params, sequence_id, sequence_name):
data_table_name = options.data_table_name or DEFAULT_DATA_TABLE_NAME
target_directory = params['output_data'][0]['extra_files_path']
if not os.path.exists(target_directory):
os.mkdir(target_directory)
fasta_base_name = os.path.split(options.fasta_filename)[-1]
sym_linked_fasta_filename = os.path.join(target_directory, fasta_base_name)
os.symlink(options.fasta_filename, sym_linked_fasta_filename)
args = ['bwa-mem2', 'index', sym_linked_fasta_filename]
proc = subprocess.Popen(args=args, shell=False, cwd=target_directory)
return_code = proc.wait()
if return_code:
print("Error building index.", file=sys.stderr)
sys.exit(return_code)
data_table_entry = dict(value=sequence_id, dbkey=options.fasta_dbkey, name=sequence_name, path=fasta_base_name)
_add_data_table_entry(data_manager_dict, data_table_name, data_table_entry)


def _add_data_table_entry(data_manager_dict, data_table_name, data_table_entry):
data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get(data_table_name, [])
data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
return data_manager_dict


def main():
# Parse Command Line
parser = argparse.ArgumentParser()
parser.add_argument('--output', dest='output', action='store', type=str, default=None)
parser.add_argument('--fasta_filename', dest='fasta_filename', action='store', type=str, default=None)
parser.add_argument('--fasta_dbkey', dest='fasta_dbkey', action='store', type=str, default=None)
parser.add_argument('--fasta_description', dest='fasta_description', action='store', type=str, default=None)
parser.add_argument('--data_table_name', dest='data_table_name', action='store', type=str, default='bwa_mem2_indexes')
options = parser.parse_args()

filename = options.output

with open(filename) as fh:
params = json.load(fh)
data_manager_dict = {}

if options.fasta_dbkey in [None, '', '?']:
raise Exception('"%s" is not a valid dbkey. You must specify a valid dbkey.' % (options.fasta_dbkey))

sequence_id, sequence_name = get_id_name(params, dbkey=options.fasta_dbkey, fasta_description=options.fasta_description)

# build the index
build_bwa_mem2_index(data_manager_dict, options, params, sequence_id, sequence_name)

# save info to json file
with open(filename, 'w') as fh:
json.dump(data_manager_dict, fh, sort_keys=True)


if __name__ == "__main__":
main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
<tool id="bwa_mem2_index_builder_data_manager" name="BWA-MEM2 index" tool_type="manage_data" version="0.0.1" profile="19.05">
<description>builder</description>
<requirements>
<requirement type="package" version="2.2.1">bwa-mem2</requirement>
<requirement type="package" version="3.9">python</requirement>
</requirements>
<command detect_errors="exit_code"><![CDATA[
python '$__tool_directory__/bwa_mem2_index_builder.py'
--output '${out_file}'
--fasta_filename '${all_fasta_source.fields.path}'
--fasta_dbkey '${all_fasta_source.fields.dbkey}'
--fasta_description '${all_fasta_source.fields.name}'
--data_table_name bwa_mem2_indexes
]]>
</command>
<inputs>
<param name="all_fasta_source" type="select" label="Source FASTA Sequence">
<options from_data_table="all_fasta"/>
</param>
<param name="sequence_name" type="text" value="" label="Name of sequence" />
<param name="sequence_id" type="text" value="" label="ID for sequence" />
</inputs>
<outputs>
<data name="out_file" format="data_manager_json" />
</outputs>
<tests>
<test>
<param name="all_fasta_source" value="phiX174"/>
<output name="out_file" file="bwa_mem2_data_manager.json"/>
</test>
</tests>
<help>
<![CDATA[
.. class:: infomark
**Notice:** If you leave name, description, or id blank, it will be generated automatically.
Bwa-mem2 is the next version of the bwa-mem algorithm in bwa. It produces
alignment identical to bwa and is ~1.3-3.1x faster depending on the use-case,
dataset and the running machine. Bwa-mem2 uses a different index format that
is efficient on disk space and runtime memory but requires larger amounts of
memory (roughly 27x the reference) when building.
]]>
</help>
<citations>
<citation type="doi">10.1038/nmeth.3317</citation>
</citations>
</tool>
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<?xml version="1.0"?>
<data_managers>
<data_manager tool_file="data_manager/bwa_mem2_index_builder.xml" id="bwa_mem2_index_builder">
<data_table name="bwa_mem2_indexes">
<output>
<column name="value" />
<column name="dbkey" />
<column name="name" />
<column name="path" output_ref="out_file" >
<move type="directory" relativize_symlinks="True">
<!-- <source>${path}</source>--> <!-- out_file.extra_files_path is used as base by default --> <!-- if no source, eg for type=directory, then refers to base -->
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">${dbkey}/bwa_mem2_index/${value}</target>
</move>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/${dbkey}/bwa_mem2_index/${value}/${path}</value_translation>
<value_translation type="function">abspath</value_translation>
</column>
</output>
</data_table>
</data_manager>
</data_managers>
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#This file lists the locations and dbkeys of all the fasta files
#under the "genome" directory (a directory that contains a directory
#for each build). The script extract_fasta.py will generate the file
#all_fasta.loc. This file has the format (white space characters are
#TAB characters):
#
#<unique_build_id> <dbkey> <display_name> <file_path>
#
#So, all_fasta.loc could look something like this:
#
#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa
#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa
#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa
#
#Your all_fasta.loc file should contain an entry for each individual
#fasta file. So there will be multiple fasta files for each build,
#such as with hg19 above.
#
phiX174 phiX174 phiX174 ${__HERE__}/phiX174.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"data_tables": {"bwa_mem2_indexes": [{"dbkey": "phiX174", "name": "phiX174", "path": "phiX174.fasta", "value": "phiX174"}]}}
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
>phiX174
GAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTT
GATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAA
ATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTG
TCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTA
GATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATC
TGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTT
TCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTT
CGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCT
TGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCG
TCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTAC
GGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTA
CGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCAGAAGGAG
TGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACT
AAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGC
CCCTTACTTGAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCA
TCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGAC
TCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTA
CTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAA
GGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTT
GGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACA
ACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGC
TCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTT
TCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGC
ATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCGTGATGTTATTTCTTCATTTGGAGGTAAAAC
CTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTT
GATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGC
CGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGAC
TAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTG
TATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGT
TTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGA
AGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGAT
TATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTT
ATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAAC
GCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGC
TTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGT
TCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTA
TATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTG
TCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGC
CTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTG
AATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCCCGTACGC
CGGGCAATAATGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGT
TTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTG
CTATTGCTGGCGGTATTGCTTCTGCTCTTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAA
AGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCT
GGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTG
GTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGA
TAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTAT
CTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGG
TTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGA
GATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGAC
CAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTA
TGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCA
AACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGAC
TTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTT
CTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGA
TACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCG
TCAGGATTGACACCCTCCCAATTGTATGTTTTCATGCCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTT
CTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTAT
TGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGC
ATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATG
TTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGA
ATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGG
GACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCC
CTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATT
GCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTACTATTCAGCGTTTGATGAATGCAATGCGACAG
GCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTT
ATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCG
CAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGC
CGTCTTCATTTCCATGCGGTGCATTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTC
GTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCAT
CGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAG
CCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATA
TGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACT
TCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTG
TCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGC
AGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACC
TGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCA

Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#This file lists the locations and dbkeys of all the fasta files
#under the "genome" directory (a directory that contains a directory
#for each build). The script extract_fasta.py will generate the file
#all_fasta.loc. This file has the format (white space characters are
#TAB characters):
#
#<unique_build_id> <dbkey> <display_name> <file_path>
#
#So, all_fasta.loc could look something like this:
#
#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa
#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa
#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa
#
#Your all_fasta.loc file should contain an entry for each individual
#fasta file. So there will be multiple fasta files for each build,
#such as with hg19 above.
#
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#This is a sample file distributed with Galaxy that enables tools
#to use a directory of BWA indexed sequences data files. You will need
#to create these data files and then create a bwa_index.loc file
#similar to this one (store it in this directory) that points to
#the directories in which those files are stored. The bwa_index.loc
#file has this format (longer white space characters are TAB characters):
#
#<unique_build_id> <dbkey> <display_name> <file_path>
#
#So, for example, if you had phiX indexed stored in
#/depot/data2/galaxy/phiX/base/,
#then the bwa_index.loc entry would look like this:
#
#phiX174 phiX phiX Pretty /depot/data2/galaxy/phiX/base/phiX.fa
#
#and your /depot/data2/galaxy/phiX/base/ directory
#would contain phiX.fa.* files:
#
#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 phiX.fa.amb
#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 phiX.fa.ann
#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 phiX.fa.bwt
#...etc...
#
#Your bwa_index.loc file should include an entry per line for each
#index set you have stored. The "file" in the path does not actually
#exist, but it is the prefix for the actual index files. For example:
#
#phiX174 phiX phiX174 /depot/data2/galaxy/phiX/base/phiX.fa
#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/hg18/base/hg18canon.fa
#hg18full hg18 hg18 Full /depot/data2/galaxy/hg18/base/hg18full.fa
#/orig/path/hg19.fa hg19 hg19 /depot/data2/galaxy/hg19/base/hg19.fa
#...etc...
#
#Note that for backwards compatibility with workflows, the unique ID of
#an entry must be the path that was in the original loc file, because that
#is the value stored in the workflow for that parameter. That is why the
#hg19 entry above looks odd. New genomes can be better-looking.
#
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
<tables>
<!-- Locations of indexes in the BWA-MEM2 mapper format-->
<table name="bwa_mem2_indexes" comment_char="#">
<columns>value, dbkey, name, path</columns>
<file path="tool-data/bwa_mem2_index.loc" />
</table>
</tables>
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<tables>
<!-- Locations of all fasta files under genome directory -->
<table name="all_fasta" comment_char="#">
<columns>value, dbkey, name, path</columns>
<file path="${__HERE__}/test-data/all_fasta.loc" />
</table>
<!-- Locations of indexes in the BWA-MEM2 mapper format-->
<table name="bwa_mem2_indexes" comment_char="#">
<columns>value, dbkey, name, path</columns>
<file path="${__HERE__}/test-data/bwa_mem2_index.loc" />
</table>
</tables>

0 comments on commit cb911d8

Please sign in to comment.