From 509e7a83df1caf1d942e83fce1583c53ce75c690 Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Fri, 22 May 2015 09:19:56 +0100 Subject: [PATCH 1/2] Add support for the mouse library, GRCm38 --- docs/MANUAL.html | 1 + docs/MANUAL.markdown | 1 + scripts/download_genomic_library.sh | 32 +++++++++++++++++++++++++++-- scripts/kraken-build | 4 ++-- 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/docs/MANUAL.html b/docs/MANUAL.html index 6829d86..f5a7e98 100644 --- a/docs/MANUAL.html +++ b/docs/MANUAL.html @@ -154,6 +154,7 @@

Custom Databases

  • plasmids: RefSeq plasmid sequences
  • viruses: RefSeq complete viral genomes
  • human: GRCh38 human genome
  • + mouse: GRCm38 mouse genome
  • To download and install any one of these, use the --download-library switch, e.g.:

    kraken-build --download-library bacteria --db $DBNAME
    diff --git a/docs/MANUAL.markdown b/docs/MANUAL.markdown index 903a1ea..69cabf8 100644 --- a/docs/MANUAL.markdown +++ b/docs/MANUAL.markdown @@ -365,6 +365,7 @@ To build a custom database: - plasmids: RefSeq plasmid sequences - viruses: RefSeq complete viral genomes - human: GRCh38 human genome + - mouse: GRCm38 mouse genome To download and install any one of these, use the `--download-library` switch, e.g.: diff --git a/scripts/download_genomic_library.sh b/scripts/download_genomic_library.sh index b1a7f13..82fc425 100755 --- a/scripts/download_genomic_library.sh +++ b/scripts/download_genomic_library.sh @@ -23,6 +23,7 @@ # plasmids - NCBI RefSeq plasmid sequences # viruses - NCBI RefSeq complete viral DNA and RNA genomes # human - NCBI RefSeq GRCh38 human reference genome +# mouse - NCBI RefSeq GRCm38 mouse reference genome set -u # Protect against uninitialized vars. set -e # Stop on error @@ -101,7 +102,7 @@ case "$1" in do wget --spider --no-remove-listing $FTP_SERVER/genomes/H_sapiens/$directory/ file=$(perl -nle '/^-/ and /\b(hs_ref_GRCh\S+\.fa\.gz)\s*$/ and print $1' .listing) - [ -z "$file" ] && exit 1 + [ -z $file ] && exit 1 rm .listing wget $FTP_SERVER/genomes/H_sapiens/$directory/$file gunzip "$file" @@ -112,8 +113,35 @@ case "$1" in echo "Skipping download of human genome, already downloaded here." fi ;; + "mouse") + mkdir -p $LIBRARY_DIR/Mouse + cd $LIBRARY_DIR/Mouse + if [ ! 
-e "lib.complete" ] + then + # get list of CHR_* directories + wget --spider --no-remove-listing $FTP_SERVER/genomes/M_musculus/ + directories=$(perl -nle '/^d/ and /(CHR_\w+)\s*$/ and print $1' .listing) + rm .listing + + # For each CHR_* directory, get GRCm* fasta gzip file name, d/l, unzip, and add + for directory in $directories + do + wget --spider --no-remove-listing $FTP_SERVER/genomes/M_musculus/$directory/ + file=$(perl -nle '/^-/ and /\b(mm_ref_GRCm\S+\.fa\.gz)\s*$/ and print $1' .listing) + [ -z $file ] && exit 1 + rm .listing + wget $FTP_SERVER/genomes/M_musculus/$directory/$file + gunzip "$file" + done + + touch "lib.complete" + else + echo "Skipping download of mouse genome, already downloaded here." + fi + ;; *) echo "Unsupported library. Valid options are: " - echo " bacteria plasmids virus human" + echo " bacteria plasmids virus human mouse" ;; + esac diff --git a/scripts/kraken-build b/scripts/kraken-build index bf36ae6..fd219f9 100755 --- a/scripts/kraken-build +++ b/scripts/kraken-build @@ -40,7 +40,7 @@ my $DEF_MINIMIZER_LEN = 15; my $DEF_KMER_LEN = 31; my $DEF_THREAD_CT = 1; -my @VALID_LIBRARY_TYPES = qw/bacteria plasmids viruses human/; +my @VALID_LIBRARY_TYPES = qw/bacteria plasmids viruses human mouse/; # Option/task option variables my ( @@ -200,7 +200,7 @@ Task options (exactly one must be selected): --download-taxonomy Download NCBI taxonomic information --download-library TYPE Download partial library (TYPE = one of "bacteria", "plasmids", - "viruses", "human") + "viruses", "human","mouse") --add-to-library FILE Add FILE to library --build Create DB from library (requires taxonomy d/l'ed and at least one file From dec6bce7f7235b3645fd8a3a4c72b9208d5d665a Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Fri, 22 May 2015 09:21:38 +0100 Subject: [PATCH 2/2] fix indent --- docs/MANUAL.markdown | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/MANUAL.markdown b/docs/MANUAL.markdown index 69cabf8..b274643 100644 --- 
a/docs/MANUAL.markdown +++ b/docs/MANUAL.markdown @@ -365,7 +365,7 @@ To build a custom database: - plasmids: RefSeq plasmid sequences - viruses: RefSeq complete viral genomes - human: GRCh38 human genome - - mouse: GRCm38 mouse genome + - mouse: GRCm38 mouse genome To download and install any one of these, use the `--download-library` switch, e.g.: