Fixed human genome downloading and added auto-masking feature using dustmasker #57

Open
wants to merge 8 commits into master
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
src/classify
src/db_shrink
src/db_sort
src/*.o
src/make_seqid_to_taxid_map
src/set_lcas
14 changes: 13 additions & 1 deletion docs/MANUAL.markdown
@@ -65,6 +65,11 @@ read the paragraph about MiniKraken, below.
sort of development package installed will have all of the above
listed programs and libraries available.

Kraken will use the program [dustmasker], when it is installed, to mask
low-complexity regions of library sequences, reducing the number of
false-positive hits. This is highly recommended. [dustmasker] is
bundled with NCBI BLAST.

Finally, if you want to build your own database, you will need to
install the [Jellyfish] $k$-mer counter. Note that Kraken only
supports use of Jellyfish version 1. Jellyfish version 2 is not
@@ -86,7 +91,7 @@ read the paragraph about MiniKraken, below.
required for this database is also only 4 GB.

[Jellyfish]: http://www.cbcb.umd.edu/software/jellyfish/

[dustmasker]: https://www.ncbi.nlm.nih.gov/books/NBK279681/

Installation
============
@@ -459,6 +464,13 @@ To build a custom database:
size (in GB) for the database. This allows you to create a MiniKraken
database without having to create a full Kraken database first.

5) Turning off low-complexity masking: by default, Kraken will use the
program `dustmasker` (bundled with NCBI BLAST), if it is installed and
in the system path, to mask low-complexity regions of library sequences,
which are a primary source of false-positive hits. If you do not want
the library sequences to be masked, provide the `--no-mask` option to
disable masking.

A full list of options for `kraken-build` can be obtained using
`kraken-build --help`.

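The manual text above assumes `dustmasker` is discoverable on the PATH. A minimal sketch, not part of the patch, for checking that before a build; it mirrors the `which dustmasker` test used in build_kraken_db.sh below:

    # Report whether dustmasker is available; if it is not, the build will
    # still run, but library sequences will not be masked.
    if command -v dustmasker > /dev/null
    then
        echo "dustmasker found at $(command -v dustmasker)"
    else
        echo "dustmasker not found; low-complexity regions will not be masked" >&2
    fi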
25 changes: 21 additions & 4 deletions scripts/build_kraken_db.sh
@@ -76,11 +76,28 @@ else
echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'"
fi

find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \
xargs -0 cat | \
jellyfish count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \
-o database /dev/fd/0
if [ -z "$KRAKEN_NO_MASK" ] && which dustmasker > /dev/null
then

find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \
xargs -0 cat \
| dustmasker -outfmt fasta \
| sed -e '/>/!s/a\|c\|g\|t/N/g' \
| jellyfish count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \
-o database /dev/fd/0

else

[ -z "$KRAKEN_NO_MASK" ] \
&& echo "WARNING: dustmasker not found, database will not be masked for low-complexity regions."

find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \
xargs -0 cat \
| jellyfish count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \
-o database /dev/fd/0

fi

# Merge only if necessary
if [ -e "database_1" ]
then
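To make the masking branch above concrete, here is a standalone sketch (not part of the patch; the file name example.fa and its contents are invented). `dustmasker -outfmt fasta` writes low-complexity stretches in lowercase, and the `sed` expression rewrites lowercase bases on non-header lines to N, so Jellyfish never counts k-mers spanning them:

    # Hypothetical FASTA with an obvious low-complexity (poly-A) stretch.
    cat > example.fa <<'EOF'
    >example_seq
    ACGTACGTGGCTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCTGATCGATCG
    EOF

    # Masked regions come back in lowercase; sed converts them to N.
    dustmasker -in example.fa -outfmt fasta \
        | sed -e '/>/!s/a\|c\|g\|t/N/g'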
68 changes: 36 additions & 32 deletions scripts/download_genomic_library.sh
@@ -28,22 +28,22 @@ set -u # Protect against uninitialized vars.
set -e # Stop on error

LIBRARY_DIR="$KRAKEN_DB_NAME/library"
NCBI_SERVER="ftp.ncbi.nlm.nih.gov"
NCBI_SERVER="ftp.ncbi.nih.gov"
FTP_SERVER="ftp://$NCBI_SERVER"
RSYNC_SERVER="rsync://$NCBI_SERVER"
THIS_DIR=$PWD
EXTENSION=genomic.fna.gz

case "$1" in
"bacteria")
mkdir -p $LIBRARY_DIR/Bacteria
cd $LIBRARY_DIR/Bacteria
if [ ! -e "lib.complete" ]
then
rm -f all.fna.tar.gz
wget $FTP_SERVER/genomes/archive/old_refseq/Bacteria/all.fna.tar.gz
wget -N $FTP_SERVER/refseq/release/bacteria/*.$EXTENSION
echo -n "Unpacking..."
tar zxf all.fna.tar.gz
rm all.fna.tar.gz
gunzip *.$EXTENSION

Review comment (@SheaML):

The default behavior of gunzip is to remove the compressed file; this causes an error when rm is called afterwards.

Author reply (@taltman):

Hi @SheaML, thanks for your comment.

In the code, I removed the call to 'tar' because we are no longer downloading the all.fna.tar.gz file. Instead, we are downloading a set of gzipped files, as that is how RefSeq now represents the data. Thus there is no call to 'rm' after a (sub-process) call to 'gunzip'. I think the pull diff shows this pretty clearly in terms of the red lines, which have been removed.

I'm new to GitHub pull requests, so please excuse me if I've confused something.

Reply (@slambert-git, Mar 10, 2017):

Hi @taltman, the next line you added, line 46, "rm *.$EXTENSION", is the one causing the error for me. Sorry, I'm also a GitHub neophyte.

rm *.$EXTENSION
echo " complete."
touch "lib.complete"
else
@@ -55,11 +55,10 @@ case "$1" in
cd $LIBRARY_DIR/Plasmids
if [ ! -e "lib.complete" ]
then
rm -f plasmids.all.fna.tar.gz
wget $FTP_SERVER/genomes/Plasmids/plasmids.all.fna.tar.gz
wget -N $FTP_SERVER/refseq/release/plasmid/*.$EXTENSION
echo -n "Unpacking..."
tar zxf plasmids.all.fna.tar.gz
rm plasmids.all.fna.tar.gz
gunzip *.$EXTENSION
rm *.$EXTENSION
echo " complete."
touch "lib.complete"
else
@@ -71,15 +70,10 @@ case "$1" in
cd $LIBRARY_DIR/Viruses
if [ ! -e "lib.complete" ]
then
rm -f all.fna.tar.gz
rm -f all.ffn.tar.gz
wget $FTP_SERVER/genomes/Viruses/all.fna.tar.gz
wget $FTP_SERVER/genomes/Viruses/all.ffn.tar.gz
wget -N $FTP_SERVER/refseq/release/viral/*.$EXTENSION
echo -n "Unpacking..."
tar zxf all.fna.tar.gz
tar zxf all.ffn.tar.gz
rm all.fna.tar.gz
rm all.ffn.tar.gz
gunzip *.$EXTENSION
rm *.$EXTENSION
echo " complete."
touch "lib.complete"
else
@@ -91,23 +85,33 @@ case "$1" in
cd $LIBRARY_DIR/Human
if [ ! -e "lib.complete" ]
then
# get list of CHR_* directories
wget --spider --no-remove-listing $FTP_SERVER/genomes/H_sapiens/
directories=$(perl -nle '/^d/ and /(CHR_\w+)\s*$/ and print $1' .listing)
rm .listing

# For each CHR_* directory, get GRCh* fasta gzip file name, d/l, unzip, and add
for directory in $directories
do
wget --spider --no-remove-listing $FTP_SERVER/genomes/H_sapiens/$directory/
file=$(perl -nle '/^-/ and /\b(hs_ref_GRCh\S+\.fa\.gz)\s*$/ and print $1' .listing)
[ -z "$file" ] && exit 1
rm .listing
wget $FTP_SERVER/genomes/H_sapiens/$directory/$file
gunzip "$file"
done
## Download all files in a single invocation of wget:
wget \
--no-directories \
--recursive \
--level=2 \
--accept "hs_ref_GRCh*.fa.gz" \
$FTP_SERVER/genomes/Homo_sapiens/

touch "lib.complete"
# # get list of CHR_* directories
# wget --spider --no-remove-listing $FTP_SERVER/genomes/Homo_sapiens/
# directories=$(perl -nle '/^d/ and /(CHR_\w+)\s*$/ and print $1' .listing)
# rm .listing

# # For each CHR_* directory, get GRCh* fasta gzip file name, d/l, unzip, and add
# for directory in $directories
# do
# wget --spider --no-remove-listing $FTP_SERVER/genomes/Homo_sapiens/$directory/
# file=$(perl -nle '/^-/ and /\b(hs_ref_GRCh\w+\.fa\.gz)\s*$/ and print $1' .listing)
# [ -z "$file" ] && exit 1
# rm .listing
# wget $FTP_SERVER/genomes/H_sapiens/$directory/$file
# gunzip *.gz
#done

gunzip *.gz
touch "lib.complete"
else
echo "Skipping download of human genome, already downloaded here."
fi
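The review thread above notes that `gunzip` deletes the `.gz` files it decompresses, so the `rm *.$EXTENSION` that follows matches nothing and, under `set -e`, aborts the script. A standalone sketch of that interaction (the file name demo.genomic.fna is invented; `gunzip -k` is mentioned only as one possible workaround, not as the patch's fix):

    set -e
    EXTENSION=genomic.fna.gz

    # Simulate one downloaded library file.
    printf '>demo\nACGT\n' > demo.genomic.fna
    gzip demo.genomic.fna              # creates demo.genomic.fna.gz

    gunzip *.$EXTENSION                # decompresses AND deletes the .gz file
    rm *.$EXTENSION                    # glob matches nothing, rm fails, set -e exits

    # Keeping the compressed copy, e.g. with `gunzip -k *.$EXTENSION`, would let
    # the later rm succeed, at the cost of briefly holding both copies on disk.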
20 changes: 13 additions & 7 deletions scripts/kraken-build
@@ -53,6 +53,7 @@ my (
$max_db_size,
$work_on_disk,
$shrink_block_offset,
$no_mask,

$dl_taxonomy,
$dl_library,
@@ -98,6 +99,7 @@ GetOptions(
"max-db-size=s", \$max_db_size,
"work-on-disk", \$work_on_disk,
"shrink-block-offset=i", \$shrink_block_offset,
"no-mask", \$no_mask,

"download-taxonomy" => \$dl_taxonomy,
"download-library=s" => \$dl_library,
@@ -152,13 +154,14 @@ if ($max_db_size !~ /^$/ && $max_db_size <= 0) {
die "Can't have negative max database size.\n";
}

$ENV{"KRAKEN_DB_NAME"} = $db;
$ENV{"KRAKEN_THREAD_CT"} = $threads;
$ENV{"KRAKEN_DB_NAME"} = $db;
$ENV{"KRAKEN_THREAD_CT"} = $threads;
$ENV{"KRAKEN_MINIMIZER_LEN"} = $minimizer_len;
$ENV{"KRAKEN_KMER_LEN"} = $kmer_len;
$ENV{"KRAKEN_HASH_SIZE"} = $hash_size;
$ENV{"KRAKEN_MAX_DB_SIZE"} = $max_db_size;
$ENV{"KRAKEN_WORK_ON_DISK"} = $work_on_disk;
$ENV{"KRAKEN_KMER_LEN"} = $kmer_len;
$ENV{"KRAKEN_HASH_SIZE"} = $hash_size;
$ENV{"KRAKEN_MAX_DB_SIZE"} = $max_db_size;
$ENV{"KRAKEN_WORK_ON_DISK"} = $work_on_disk;
$ENV{"KRAKEN_NO_MASK"} = $no_mask;

if ($dl_taxonomy) {
download_taxonomy();
@@ -235,10 +238,13 @@ Options:
(default: 1)
--work-on-disk Perform most operations on disk rather than in
RAM (will slow down build in most cases)
--no-mask Do not use "dustmasker" to mask low-complexity
regions in library sequences.

EOF
exit $exit_code;
}

##'
sub display_help {
usage(0);
}
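For completeness, a hypothetical invocation exercising the new flag alongside existing `kraken-build` options (the database name custom_db is illustrative):

    # Build a custom database without low-complexity masking; omit --no-mask
    # to keep the default dustmasker behavior added by this pull request.
    kraken-build --build --db custom_db --threads 8 --no-mask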