Skip to content

Commit

Permalink
all the things
Browse files Browse the repository at this point in the history
  • Loading branch information
DerrickWood committed Dec 12, 2013
1 parent 3b6d687 commit fcc19fc
Show file tree
Hide file tree
Showing 29 changed files with 4,170 additions and 0 deletions.
52 changes: 52 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
v0.10.3-beta:
* remove Fatal.pm use in kraken-report
* fixed false success message on make failure in installer
* explicitly require g++ as C++ compiler in Makefile
* change to quickfile.cpp to do proper syncing on close
* fixed kraken-build bug w/ --work-on-disk (cause of some major build stalls)
* changed hash size calculation to use Perl
* close input files explicitly in db_sort/db_shrink to reduce reported memory
* allow db_shrink to work in RAM
* updates to README file

v0.10.2-beta:
* fixed kraken-report bug w/ --show-zeros
* fixed kraken-report installation bug
* updates to README file

v0.10.1-beta:
* fixed 2nd bug in build_kraken.sh in calculating hash size (thanks T. Antao)
* fixed bug in add_to_library.sh for some bash versions (thanks T. Antao)
* fixed issue where search window wasn't cached until a failure (query speedup)
* added $KRAKEN_DIR fallback for kraken/kraken-build (thanks S. Koren)

v0.10.0-beta:
* added CHANGELOG
* fixed quick mode hit list output
* updated README citation
* changed minimizer sort order (query speedup), changes database structure
* use linear search with small windows (query speedup)
* changed query procedure (query speedup); search w/o 1st calculating minimizer
* changed readlink in installer to perl Cwd::abs_path (portability)
* removed MAP_POPULATE for preloading, uses read loop instead (bugfix/port.)
* added --work-on-disk switch to kraken-build
* added kraken-report script
* fixed bug in build_kraken.sh in calculating hash size (thanks T. Antao)

v0.9.1b:
* fixed bug to allow kraken-build --shrink

v0.9.0b:
* full rewrite
* minimizers used to speed queries, prefix index removed

v0.3:
* DB build parallelized, Jellyfish removed from LCA assignment

v0.2:
* full rewrite, most progs. changed to C++
* Jellyfish removed from classification step
* prefix index used to speed queries

v0.1:
* initial version, mostly Perl
674 changes: 674 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

61 changes: 61 additions & 0 deletions install_kraken.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/bin/bash

# Copyright 2013, Derrick Wood <[email protected]>
#
# This file is part of the Kraken taxonomic classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken. If not, see <http://www.gnu.org/licenses/>.

# Installer for Kraken: builds the programs in src/, fills in the
# "#####=NAME=#####" placeholders of the scripts/*.broken templates, and
# copies everything into the directory given as the only argument.

# Abort on the first failing command so that a failed build can never reach
# the "installation complete" message at the end.
set -e

VERSION="0.10.3-beta"

# Exactly one argument (the installation directory) is accepted.
if [ -z "$1" ] || [ -n "$2" ]
then
  echo "Usage: $(basename "$0") KRAKEN_DIR"
  exit 64
fi

# Catch a usage line that was pasted verbatim.
if [ "$1" = "KRAKEN_DIR" ]
then
  echo "Please replace \"KRAKEN_DIR\" with the name of the directory"
  echo "that you want to install Kraken in."
  exit 1
fi

# Perl cmd used to canonicalize dirname - "readlink -f" doesn't work
# on OS X.
# Assignment and export are separate statements so that a failing perl
# command is not hidden behind the exit status of "export".
KRAKEN_DIR=$(perl -MCwd=abs_path -le 'print abs_path(shift)' "$1")
export KRAKEN_DIR

# An empty result means the path could not be canonicalized; stop here
# instead of letting "mkdir -p" fail on an empty name.
if [ -z "$KRAKEN_DIR" ]
then
  echo "Can't determine absolute path of \"$1\""
  exit 1
fi

(cd src && make)
mkdir -p "$KRAKEN_DIR"

# Each template is written to KRAKEN_DIR without its ".broken" suffix. The
# BEGIN block turns the trailing NAME=value arguments into a lookup table
# (%H) that the substitution then uses for every placeholder.
for file in scripts/*.broken
do
  perl -pl \
    -e 'BEGIN { while (@ARGV) { $_ = shift; ($k,$v) = split /=/, $_, 2; $H{$k} = $v } }' \
    -e 's/#####=(\w+)=#####/$H{$1}/g' \
    "KRAKEN_DIR=$KRAKEN_DIR" "VERSION=$VERSION" \
    < "$file" > "$KRAKEN_DIR/$(basename "$file" .broken)"
done
cp scripts/*.sh "$KRAKEN_DIR"
make -C src install
chmod -R +x "$KRAKEN_DIR"

echo
echo "Kraken installation complete."
echo
echo "To make things easier for you, you may want to copy/symlink the following"
echo "files into a directory in your PATH:"
echo " $KRAKEN_DIR/kraken"
echo " $KRAKEN_DIR/kraken-build"
echo " $KRAKEN_DIR/kraken-report"
63 changes: 63 additions & 0 deletions scripts/add_to_library.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/bin/bash

# Copyright 2013, Derrick Wood <[email protected]>
#
# This file is part of the Kraken taxonomic sequence classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken. If not, see <http://www.gnu.org/licenses/>.

# Copy specified file into a Kraken library

# Usage: add_to_library.sh FASTA_FILE
# Requires KRAKEN_DB_NAME in the environment. The file must be a regular
# file holding exactly one sequence whose header starts with ">gi|<digits>|".
# It is copied to $KRAKEN_DB_NAME/library/added/ under the first unused
# 15-digit, zero-padded number with a ".fna" suffix.

set -u  # Protect against uninitialized vars.
set -e  # Stop on error

LIBRARY_DIR="$KRAKEN_DB_NAME/library"

if [ ! -e "$1" ]
then
  echo "Can't add \"$1\": file does not exist"
  exit 1
fi
if [ ! -f "$1" ]
then
  echo "Can't add \"$1\": not a regular file"
  exit 1
fi

# Inspect the first line directly. An empty file yields an empty string here
# and is therefore rejected as well (a "perl -n" filter never runs its body
# on empty input and would report success).
first_line=""
IFS= read -r first_line < "$1" || true  # non-zero at EOF is expected
gi_header_re='^>gi\|[0-9]+\|'
if ! [[ "$first_line" =~ $gi_header_re ]]
then
  echo "Can't add \"$1\": could not find GI number"
  exit 1
fi

# Only one sequence per file is supported; -m2 stops grep as soon as a
# second header line proves there are several.
seq_ct=$(grep -m2 '^>' "$1" | wc -l)
if (( seq_ct > 1 ))
then
  echo "Can't add \"$1\": multiple sequences found"
  exit 1
fi

mkdir -p "$LIBRARY_DIR/added"

# Probe 000000000000000.fna, 000000000000001.fna, ... for the first free
# name. %015d keeps the name purely numeric for every counter value; a %g
# format would switch to exponent notation ("1e+06") from 1000000 onwards
# and map several counters to the same name.
ct=0
freefile=""
while [ -z "$freefile" ]
do
  printf -v freefile '%015d.fna' "$ct"
  if [ -e "$LIBRARY_DIR/added/$freefile" ]
  then
    ct=$((ct + 1))
    freefile=""
  fi
done
cp "$1" "$LIBRARY_DIR/added/$freefile"
echo "Added \"$1\" to library ($KRAKEN_DB_NAME)"
186 changes: 186 additions & 0 deletions scripts/build_kraken_db.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
#!/bin/bash

# Copyright 2013, Derrick Wood <[email protected]>
#
# This file is part of the Kraken taxonomic sequence classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken. If not, see <http://www.gnu.org/licenses/>.

# Build a Kraken database
# Designed to be called by kraken-build

# Strict mode:
#   -u          using an uninitialized variable is an error
#   -e          stop at the first failing command
#   -o pipefail a failure in any stage of a pipeline fails the whole pipeline
set -euo pipefail

function report_time_elapsed() {
curr_time=$(date "+%s.%N")
perl -e '$time = $ARGV[1] - $ARGV[0];' \
-e '$sec = int($time); $nsec = $time - $sec;' \
-e '$min = int($sec/60); $sec %= 60;' \
-e '$hr = int($min/60); $min %= 60;' \
-e 'print "${hr}h" if $hr;' \
-e 'print "${min}m" if $min || $hr;' \
-e 'printf "%.3fs", $sec + $nsec;' \
$1 $curr_time
}

# Timestamp used for the total build time reported at the end of the script.
start_time=$(date "+%s.%N")

DATABASE_DIR="$KRAKEN_DB_NAME"

if [ ! -d "$DATABASE_DIR" ]
then
  echo "Can't find Kraken DB directory \"$KRAKEN_DB_NAME\""
  exit 1
fi
# All later steps use paths relative to the database directory.
cd "$DATABASE_DIR"

# MEMFLAG is handed to db_shrink, db_sort and set_lcas. KRAKEN_WORK_ON_DISK
# is optional: the ":-" default lets an unset variable count as "empty"
# instead of aborting the script under "set -u".
MEMFLAG=""
if [ -z "${KRAKEN_WORK_ON_DISK:-}" ]
then
  MEMFLAG="-M"
  echo "Kraken build set to minimize disk writes."
else
  echo "Kraken build set to minimize RAM usage."
fi

# Step 1: count the k-mers of all library FASTA files with jellyfish and
# store the result as database.jdb. The presence of database.jdb marks the
# step as done, so the file is only moved into place at the very end.
if [ -e "database.jdb" ]
then
  echo "Skipping step 1, k-mer set already exists."
else
  echo "Creating k-mer set (step 1 of 6)..."
  start_time1=$(date "+%s.%N")

  # Estimate hash size as 1.15 * chars in library FASTA files
  # (KRAKEN_HASH_SIZE is optional; ":-" keeps "set -u" from aborting when
  # it is not set at all.)
  if [ -z "${KRAKEN_HASH_SIZE:-}" ]
  then
    KRAKEN_HASH_SIZE=$(find library/ -name '*.fna' -printf '%s\n' | perl -nle '$sum += $_; END {printf "%d\n", 1.15 * $sum}')
    echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'"
  fi

  # All library sequences are streamed to jellyfish through stdin.
  find library/ -name '*.fna' -print0 | xargs -0 cat | \
    jellyfish count -m "$KRAKEN_KMER_LEN" -s "$KRAKEN_HASH_SIZE" -C \
      -t "$KRAKEN_THREAD_CT" -o database /dev/fd/0

  # Merge only if necessary
  if [ -e "database_1" ]
  then
    jellyfish merge -o database.jdb.tmp database_*
  else
    mv database_0 database.jdb.tmp
  fi

  # Once here, DB is finalized, can put file in place.
  mv database.jdb.tmp database.jdb

  echo "K-mer set created. [$(report_time_elapsed "$start_time1")]"
fi

# Step 2 (optional): shrink database.jdb so that it, together with the
# minimizer index, fits into KRAKEN_MAX_DB_SIZE (compared against a size in
# units of 2^30 bytes). The original file is kept as database.jdb.big, whose
# presence marks the step as done.
# KRAKEN_MAX_DB_SIZE is optional; ":-" keeps "set -u" from aborting when it
# is not set at all.
if [ -z "${KRAKEN_MAX_DB_SIZE:-}" ]
then
  echo "Skipping step 2, no database reduction requested."
else
  if [ -e "database.jdb.big" ]
  then
    echo "Skipping step 2, database reduction already done."
  else
    start_time1=$(date "+%s.%N")
    kdb_size=$(stat -c '%s' database.jdb)
    idx_size=$(echo "8 * (4 ^ $KRAKEN_MINIMIZER_LEN + 2)" | bc)
    # bc prints 1 when the combined size exceeds the limit, 0 otherwise.
    resize_needed=$(echo "scale = 10; ($kdb_size+$idx_size)/(2^30) > $KRAKEN_MAX_DB_SIZE" | bc)
    if (( resize_needed == 0 ))
    then
      echo "Skipping step 2, database reduction unnecessary."
    else
      echo "Reducing database size (step 2 of 6)..."
      max_kdb_size=$(echo "$KRAKEN_MAX_DB_SIZE*2^30 - $idx_size" | bc)
      if (( $(echo "$max_kdb_size < 0" | bc) == 1 ))
      then
        echo "Maximum database size too small, aborting reduction."
        exit 1
      fi
      # Key ct is 8 byte int stored 48 bytes from start of file
      # (an unreadable file now makes perl die, which stops the script)
      key_ct=$(perl -MFcntl -le 'open(F, "database.jdb") or die "Cannot open database.jdb: $!\n"; seek F, 48, SEEK_SET; read F, $b, 8; $a = unpack("Q", $b); print $a')
      # Number of keys to drop: the excess bytes divided by 12, rounded up.
      overage=$(echo "($kdb_size - $max_kdb_size + 11) / 12" | bc)
      percentage=$(echo "100 * ($key_ct - $overage) / $key_ct" | bc)
      echo "Using $percentage percent of original database."
      # MEMFLAG stays unquoted on purpose: when empty it must vanish
      # instead of becoming an empty argument.
      db_shrink $MEMFLAG -d database.jdb -o database.jdb.small -p "$percentage"
      mv database.jdb database.jdb.big.tmp
      mv database.jdb.small database.jdb
      mv database.jdb.big.tmp database.jdb.big
      echo "Database reduced. [$(report_time_elapsed "$start_time1")]"
    fi
  fi
fi

# Step 3: run db_sort on database.jdb to produce database.kdb and
# database.idx. Skipped when database.kdb is already present.
if [[ ! -e database.kdb ]]; then
  echo "Sorting k-mer set (step 3 of 6)..."
  step3_began=$(date "+%s.%N")
  db_sort -z $MEMFLAG -t $KRAKEN_THREAD_CT -n $KRAKEN_MINIMIZER_LEN \
    -d database.jdb -o database.kdb.tmp -i database.idx

  # The output only receives its final name after db_sort has succeeded,
  # so an interrupted run is never mistaken for a finished one.
  mv database.kdb.tmp database.kdb

  echo "K-mer set sorted. [$(report_time_elapsed $step3_began)]"
else
  echo "Skipping step 3, k-mer set already sorted."
fi

# Step 4: write gi2file.map with one "<GI number>|<file name>" line per
# library file, taken from the first header line of each *.fna file.
# Skipped when gi2file.map is already present.
if [[ ! -e gi2file.map ]]; then
  echo "Creating GI number to file map (step 4 of 6)..."
  step4_began=$(date "+%s.%N")
  # grep -H prints "<file>:>gi|<number>|..."; awk splits on "|", strips the
  # ":>gi" tail from the first field and swaps the first two fields.
  find library/ -name '*.fna' -print0 \
    | xargs -0 grep -m1 -H '^>' \
    | awk -F '\\|' '{ sub(/:>gi$/, "", $1); print $2 "|" $1 }' \
    > gi2file.map.tmp
  mv gi2file.map.tmp gi2file.map

  echo "GI number to file map created. [$(report_time_elapsed $step4_began)]"
else
  echo "Skipping step 4, GI number to file map already complete."
fi

# Step 5: run make_file_to_taxon_map on taxonomy/gi_taxid_nucl.dmp and
# gi2file.map, storing its output as file2taxon.map. Skipped when
# file2taxon.map is already present.
if [[ ! -e file2taxon.map ]]; then
  echo "Creating file to taxon map (step 5 of 6)..."
  step5_began=$(date "+%s.%N")
  make_file_to_taxon_map taxonomy/gi_taxid_nucl.dmp gi2file.map \
    > file2taxon.map.tmp
  mv file2taxon.map.tmp file2taxon.map
  # awk drops any padding that wc puts around the count.
  mapped_ct=$(wc -l < file2taxon.map | awk '{print $1}')

  echo "$mapped_ct files mapped to taxa. [$(report_time_elapsed $step5_began)]"
else
  echo "Skipping step 5, file to taxon map already complete."
fi

# Step 6: run set_lcas on database.kdb / database.idx using
# taxonomy/nodes.dmp and file2taxon.map. The empty marker file lca.complete
# is created afterwards; its presence makes later runs skip this step.
if [[ ! -e lca.complete ]]; then
  echo "Setting LCAs in database (step 6 of 6)..."
  step6_began=$(date "+%s.%N")
  set_lcas $MEMFLAG -x -d database.kdb -i database.idx \
    -n taxonomy/nodes.dmp -t $KRAKEN_THREAD_CT -f file2taxon.map
  touch lca.complete

  echo "Database LCAs set. [$(report_time_elapsed $step6_began)]"
else
  echo "Skipping step 6, LCAs already set."
fi

# Total time is measured from the timestamp taken at the start of the script.
echo "Database construction complete. [Total: $(report_time_elapsed $start_time)]"
Loading

0 comments on commit fcc19fc

Please sign in to comment.