-
Notifications
You must be signed in to change notification settings - Fork 103
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
3b6d687
commit fcc19fc
Showing
29 changed files
with
4,170 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
v0.10.3-beta: | ||
* remove Fatal.pm use in kraken-report | ||
* fixed false success message on make failure in installer | ||
* explicitly require g++ as C++ compiler in Makefile | ||
* change to quickfile.cpp to do proper syncing on close | ||
* fixed kraken-build bug w/ --work-on-disk (cause of some major build stalls) | ||
* changed hash size calculation to use Perl | ||
* close input files explicitly in db_sort/db_shrink to reduce reported memory | ||
* allow db_shrink to work in RAM | ||
* updates to README file | ||
|
||
v0.10.2-beta: | ||
* fixed kraken-report bug w/ --show-zeros | ||
* fixed kraken-report installation bug | ||
* updates to README file | ||
|
||
v0.10.1-beta: | ||
* fixed 2nd bug in build_kraken.sh in calculating hash size (thanks T. Antao) | ||
* fixed bug in add_to_library.sh for some bash versions (thanks T. Antao) | ||
* fixed issue where search window wasn't cached until a failure (query speedup) | ||
* added $KRAKEN_DIR fallback for kraken/kraken-build (thanks S. Koren) | ||
|
||
v0.10.0-beta: | ||
* added CHANGELOG | ||
* fixed quick mode hit list output | ||
* updated README citation | ||
* changed minimizer sort order (query speedup), changes database structure | ||
* use linear search with small windows (query speedup) | ||
* changed query procedure (query speedup); search w/o 1st calculating minimizer | ||
* changed readlink in installer to perl Cwd::abs_path (portability) | ||
* removed MAP_POPULATE for preloading, uses read loop instead (bugfix/port.) | ||
* added --work-on-disk switch to kraken-build | ||
* added kraken-report script | ||
* fixed bug in build_kraken.sh in calculating hash size (thanks T. Antao) | ||
|
||
v0.9.1b: | ||
* fixed bug to allow kraken-build --shrink | ||
|
||
v0.9.0b: | ||
* full rewrite | ||
* minimizers used to speed queries, prefix index removed | ||
|
||
v0.3: | ||
* DB build parallelized, Jellyfish removed from LCA assignment | ||
|
||
v0.2: | ||
* full rewrite, most progs. changed to C++ | ||
* Jellyfish removed from classification step | ||
* prefix index used to speed queries | ||
|
||
v0.1: | ||
* initial version, mostly Perl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
#!/bin/bash | ||
|
||
# Copyright 2013, Derrick Wood <[email protected]> | ||
# | ||
# This file is part of the Kraken taxonomic sequence classification system.
# | ||
# Kraken is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# Kraken is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with Kraken. If not, see <http://www.gnu.org/licenses/>. | ||
|
||
# Abort as soon as any command fails, so that a failed build cannot run on
# to the "installation complete" message.
set -e

# Version string substituted into the installed scripts.
VERSION="0.10.3-beta"

# Require a non-empty first argument and no non-empty second argument.
# Exit status 64 is EX_USAGE from sysexits.h.
if [ -z "$1" ] || [ -n "$2" ]; then
  # "$0" is quoted so a script path containing whitespace is not word-split
  # into several basename operands.
  echo "Usage: $(basename "$0") KRAKEN_DIR"
  exit 64
fi

# Catch the placeholder from the usage line being passed literally.
if [ "$1" = "KRAKEN_DIR" ]; then
  echo "Please replace \"KRAKEN_DIR\" with the name of the directory"
  echo "that you want to install Kraken in."
  exit 1
fi
|
||
# Canonicalize the installation directory.  Perl's Cwd::abs_path is used
# because "readlink -f" doesn't work on OS X.  The assignment is kept apart
# from the export: "export VAR=$(cmd)" reports export's own (successful)
# status, which would hide a failing perl from "set -e".
KRAKEN_DIR=$(perl -MCwd=abs_path -le 'print abs_path(shift)' "$1")
if [ -z "$KRAKEN_DIR" ]; then
  # abs_path could not resolve the path; stop before building anything.
  echo "Can't resolve installation directory \"$1\"" >&2
  exit 1
fi
export KRAKEN_DIR

# Build in a subshell so the working directory of this script is unchanged.
(cd src && make)

mkdir -p "$KRAKEN_DIR"

# Each scripts/*.broken template contains #####=NAME=##### placeholders.
# The BEGIN block consumes the NAME=value arguments into %H (leaving perl to
# read the template from stdin), and every placeholder is replaced by its
# value.  The result is written without the ".broken" suffix.
for file in scripts/*.broken; do
  perl -pl -e 'BEGIN { while (@ARGV) { $_ = shift; ($k,$v) = split /=/, $_, 2; $H{$k} = $v } }' \
    -e 's/#####=(\w+)=#####/$H{$1}/g' \
    "KRAKEN_DIR=$KRAKEN_DIR" "VERSION=$VERSION" \
    < "$file" > "$KRAKEN_DIR/$(basename "$file" .broken)"
done

cp scripts/*.sh "$KRAKEN_DIR"
make -C src install
chmod -R +x "$KRAKEN_DIR"

echo
echo "Kraken installation complete."
echo
echo "To make things easier for you, you may want to copy/symlink the following"
echo "files into a directory in your PATH:"
echo " $KRAKEN_DIR/kraken"
echo " $KRAKEN_DIR/kraken-build"
echo " $KRAKEN_DIR/kraken-report"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#!/bin/bash | ||
|
||
# Copyright 2013, Derrick Wood <[email protected]> | ||
# | ||
# This file is part of the Kraken taxonomic sequence classification system. | ||
# | ||
# Kraken is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# Kraken is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with Kraken. If not, see <http://www.gnu.org/licenses/>. | ||
|
||
# Copy specified file into a Kraken library | ||
|
||
# -e: stop on the first failing command.
# -u: treat expansion of an unset variable as an error.
set -eu
|
||
# Library directory of the database named by KRAKEN_DB_NAME.
LIBRARY_DIR="$KRAKEN_DB_NAME/library"

# $1 is the file to add; it must exist and be a regular file.
if [ ! -e "$1" ]; then
  echo "Can't add \"$1\": file does not exist"
  exit 1
fi
if [ ! -f "$1" ]; then
  echo "Can't add \"$1\": not a regular file"
  exit 1
fi

# The first line must be a header of the form ">gi|<digits>|...".
# Perl reads that line itself and exits non-zero when there is no line at
# all, so an empty file is rejected here rather than being accepted because
# no line ever failed the match.
if ! perl -e '$_ = <STDIN>; exit((defined($_) && /^>gi\|(\d+)\|/) ? 0 : 1)' < "$1"; then
  echo "Can't add \"$1\": could not find GI number"
  exit 1
fi

# Only one sequence per file is accepted; -m2 lets grep stop reading as soon
# as a second header line is seen.
seq_ct=$(grep -m2 '^>' "$1" | wc -l)
if (( seq_ct > 1 )); then
  echo "Can't add \"$1\": multiple sequences found"
  exit 1
fi
|
||
mkdir -p "$LIBRARY_DIR/added"

# Find the lowest-numbered unused name of the form <15-digit counter>.fna.
# The counter is formatted with the integer conversion %015d: the previous
# floating-point %g conversion keeps only 6 significant digits, so counters
# of 1000000 and above turned into exponent notation and distinct counters
# mapped to the same file name.  Names for smaller counters are unchanged.
ct=0
freefile=""
while [ -z "$freefile" ]; do
  freefile=$(printf '%015d' "$ct").fna
  if [ -e "$LIBRARY_DIR/added/$freefile" ]; then
    # Name already taken: try the next counter value.
    ct=$((ct + 1))
    freefile=""
  fi
done

cp "$1" "$LIBRARY_DIR/added/$freefile"
echo "Added \"$1\" to library ($KRAKEN_DB_NAME)"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
#!/bin/bash | ||
|
||
# Copyright 2013, Derrick Wood <[email protected]> | ||
# | ||
# This file is part of the Kraken taxonomic sequence classification system. | ||
# | ||
# Kraken is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# Kraken is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with Kraken. If not, see <http://www.gnu.org/licenses/>. | ||
|
||
# Build a Kraken database | ||
# Designed to be called by kraken-build
|
||
# -e: stop on the first failing command.
# -u: treat expansion of an unset variable as an error.
# -o pipefail: a pipeline fails when any of its commands fails, not only
#              the last one.
set -euo pipefail
|
||
function report_time_elapsed() { | ||
curr_time=$(date "+%s.%N") | ||
perl -e '$time = $ARGV[1] - $ARGV[0];' \ | ||
-e '$sec = int($time); $nsec = $time - $sec;' \ | ||
-e '$min = int($sec/60); $sec %= 60;' \ | ||
-e '$hr = int($min/60); $min %= 60;' \ | ||
-e 'print "${hr}h" if $hr;' \ | ||
-e 'print "${min}m" if $min || $hr;' \ | ||
-e 'printf "%.3fs", $sec + $nsec;' \ | ||
$1 $curr_time | ||
} | ||
|
||
# Timestamp of the whole run, reported in the final message.
start_time=$(date "+%s.%N")
DATABASE_DIR="$KRAKEN_DB_NAME"

# All later steps use paths relative to the database directory.
[ -d "$DATABASE_DIR" ] || {
  echo "Can't find Kraken DB directory \"$KRAKEN_DB_NAME\""
  exit 1
}
cd "$DATABASE_DIR"

# MEMFLAG is passed to db_shrink, db_sort and set_lcas.  It is "-M" unless
# KRAKEN_WORK_ON_DISK is non-empty, in which case it stays empty.
if [ -n "$KRAKEN_WORK_ON_DISK" ]; then
  MEMFLAG=""
  echo "Kraken build set to minimize RAM usage."
else
  MEMFLAG="-M"
  echo "Kraken build set to minimize disk writes."
fi
|
||
# Step 1: count k-mers of all library sequences into database.jdb.
# The step is skipped when database.jdb is already present.
if [ ! -e "database.jdb" ]; then
  echo "Creating k-mer set (step 1 of 6)..."
  start_time1=$(date "+%s.%N")

  # Without an explicit hash size, use 1.15 times the summed sizes (in
  # bytes) of the library's .fna files.
  if [ -z "$KRAKEN_HASH_SIZE" ]; then
    KRAKEN_HASH_SIZE=$(find library/ -name '*.fna' -printf '%s\n' | perl -nle '$sum += $_; END {printf "%d\n", 1.15 * $sum}')
    echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'"
  fi

  # Stream every .fna file into jellyfish through its standard input.
  find library/ -name '*.fna' -print0 \
    | xargs -0 cat \
    | jellyfish count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \
        -o database /dev/fd/0

  # The existence of database_1 means the count was written as several
  # pieces, which must be merged; otherwise database_0 is the whole result.
  if [ -e "database_1" ]; then
    jellyfish merge -o database.jdb.tmp database_*
  else
    mv database_0 database.jdb.tmp
  fi

  # The final name appears only once the file is complete, so an
  # interrupted run does not leave something that looks finished.
  mv database.jdb.tmp database.jdb

  echo "K-mer set created. [$(report_time_elapsed $start_time1)]"
else
  echo "Skipping step 1, k-mer set already exists."
fi
|
||
# Step 2: optionally shrink database.jdb so that database plus index fit
# into KRAKEN_MAX_DB_SIZE, which is compared in units of 2^30 bytes.
if [ -z "$KRAKEN_MAX_DB_SIZE" ]; then
  echo "Skipping step 2, no database reduction requested."
elif [ -e "database.jdb.big" ]; then
  # database.jdb.big is the pre-reduction file kept by an earlier run.
  echo "Skipping step 2, database reduction already done."
else
  start_time1=$(date "+%s.%N")
  kdb_size=$(stat -c '%s' database.jdb)
  # Size the index is computed to need for the configured minimizer length.
  idx_size=$(echo "8 * (4 ^ $KRAKEN_MINIMIZER_LEN + 2)" | bc)
  # bc prints 1 when database + index exceed the limit, 0 otherwise.
  resize_needed=$(echo "scale = 10; ($kdb_size+$idx_size)/(2^30) > $KRAKEN_MAX_DB_SIZE" | bc)
  if (( resize_needed != 0 )); then
    echo "Reducing database size (step 2 of 6)..."
    max_kdb_size=$(echo "$KRAKEN_MAX_DB_SIZE*2^30 - $idx_size" | bc)
    # The index alone is larger than the requested limit.
    if (( $(echo "$max_kdb_size < 0" | bc) == 1 )); then
      echo "Maximum database size too small, aborting reduction."
      exit 1
    fi
    # Key ct is 8 byte int stored 48 bytes from start of file
    key_ct=$(perl -MFcntl -le 'open F, "database.jdb"; seek F, 48, SEEK_SET; read F, $b, 8; $a = unpack("Q", $b); print $a')
    # Number of keys to drop: excess bytes divided by 12, rounded up
    # (presumably 12 bytes per stored key -- confirm against db_shrink).
    overage=$(echo "($kdb_size - $max_kdb_size + 11) / 12" | bc)
    percentage=$(echo "100 * ($key_ct - $overage) / $key_ct" | bc)
    echo "Using $percentage percent of original database."
    db_shrink $MEMFLAG -d database.jdb -o database.jdb.small -p $percentage
    # Swap the files through a temporary name; database.jdb.big gets its
    # final name last because its presence marks this step as done.
    mv database.jdb database.jdb.big.tmp
    mv database.jdb.small database.jdb
    mv database.jdb.big.tmp database.jdb.big
    echo "Database reduced. [$(report_time_elapsed $start_time1)]"
  else
    echo "Skipping step 2, database reduction unnecessary."
  fi
fi
|
||
# Step 3: sort the k-mer set into database.kdb and write database.idx.
# The step is skipped when database.kdb is already present.
if [ ! -e "database.kdb" ]; then
  echo "Sorting k-mer set (step 3 of 6)..."
  start_time1=$(date "+%s.%N")
  db_sort -z $MEMFLAG -t $KRAKEN_THREAD_CT -n $KRAKEN_MINIMIZER_LEN \
    -d database.jdb -o database.kdb.tmp \
    -i database.idx

  # The final name appears only after db_sort has succeeded.
  mv database.kdb.tmp database.kdb

  echo "K-mer set sorted. [$(report_time_elapsed $start_time1)]"
else
  echo "Skipping step 3, k-mer set already sorted."
fi
|
||
# Step 4: build gi2file.map with one "<GI number>|<file>" line per library
# file.  The step is skipped when gi2file.map is already present.
if [ ! -e "gi2file.map" ]; then
  echo "Creating GI number to file map (step 4 of 6)..."
  start_time1=$(date "+%s.%N")
  # grep -m1 -H prints "<file>:<first header line>" for each file.  awk
  # splits that on "|", so field 1 is "<file>:>gi" and field 2 is the GI
  # number; the ":>gi" suffix is stripped to leave the file name.
  find library/ -name '*.fna' -print0 \
    | xargs -0 grep -m1 -H '^>' \
    | awk -F '\\|' '{ sub(/:>gi$/, "", $1); print $2 "|" $1 }' \
    > gi2file.map.tmp
  mv gi2file.map.tmp gi2file.map

  echo "GI number to file map created. [$(report_time_elapsed $start_time1)]"
else
  echo "Skipping step 4, GI number to file map already complete."
fi
|
||
# Step 5: build file2taxon.map from the GI-to-taxid dump and gi2file.map.
# The step is skipped when file2taxon.map is already present.
if [ ! -e "file2taxon.map" ]; then
  echo "Creating file to taxon map (step 5 of 6)..."
  start_time1=$(date "+%s.%N")
  make_file_to_taxon_map taxonomy/gi_taxid_nucl.dmp gi2file.map \
    > file2taxon.map.tmp
  mv file2taxon.map.tmp file2taxon.map
  # Number of lines in the finished map, used in the message below.
  line_ct=$(wc -l file2taxon.map | awk '{print $1}')

  echo "$line_ct files mapped to taxa. [$(report_time_elapsed $start_time1)]"
else
  echo "Skipping step 5, file to taxon map already complete."
fi
|
||
# Step 6: run set_lcas on database.kdb / database.idx.  The empty marker
# file lca.complete records that the step has finished, since it leaves no
# new output file whose existence could be checked instead.
if [ ! -e "lca.complete" ]; then
  echo "Setting LCAs in database (step 6 of 6)..."
  start_time1=$(date "+%s.%N")
  set_lcas $MEMFLAG -x -d database.kdb -i database.idx \
    -n taxonomy/nodes.dmp -t $KRAKEN_THREAD_CT -f file2taxon.map
  touch "lca.complete"

  echo "Database LCAs set. [$(report_time_elapsed $start_time1)]"
else
  echo "Skipping step 6, LCAs already set."
fi

# Total is measured from the start of this script.
echo "Database construction complete. [Total: $(report_time_elapsed $start_time)]"
Oops, something went wrong.