From 4487ec115903ef9e1e30cc775536dc296a18ba0b Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 10 Jul 2015 10:41:05 -0400 Subject: [PATCH 001/105] Added .gitignore files --- .gitignore | 1 + src/.gitignore | 6 ++++++ 2 files changed, 7 insertions(+) create mode 100644 .gitignore create mode 100644 src/.gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..500b4a0 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/install/ diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..f30e916 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,6 @@ +*.o +/db_sort +/classify +/db_shrink +/set_lcas +/make_seqid_to_taxid_map From a07b9a2917689c9145785cc6a64d33c009a41255 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 10 Jul 2015 10:55:34 -0400 Subject: [PATCH 002/105] Skip empty FASTA sequences instead of exiting the program --- src/seqreader.cpp | 4 +--- src/set_lcas.cpp | 8 ++++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/seqreader.cpp b/src/seqreader.cpp index 78c1442..6903c0d 100644 --- a/src/seqreader.cpp +++ b/src/seqreader.cpp @@ -72,9 +72,7 @@ namespace kraken { dna.seq = seq_ss.str(); if (dna.seq.empty()) { - warnx("malformed fasta file - zero-length record (%s)", dna.id.c_str()); - valid = false; - return dna; + valid = true; // set_lcas handles empty sequences } return dna; diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 46b2e09..c02307d 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -109,11 +109,19 @@ void process_single_file() { FastaReader reader(Multi_fasta_filename); DNASequence dna; uint32_t seqs_processed = 0; + uint32_t seqs_skipped = 0; + uint32_t seqs_no_taxid = 0; while (reader.is_valid()) { dna = reader.next_sequence(); if (! reader.is_valid()) break; + + if ( dna.seq.empty() ) { + ++seq_skipped; + continue; + } + uint32_t taxid = ID_to_taxon_map[dna.id]; if (taxid) { #pragma omp parallel for schedule(dynamic) From d6071dabe604f9faa98f29534dd7fa9caef9a379 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 10 Jul 2015 11:06:05 -0400 Subject: [PATCH 003/105] Added options -T to force taxid of the sequences, and -v for verbose output When -T is set, for each observed k-mer the taxid of the sequence is set - instead of the lowest common ancestor of the sequence taxid and the currently set taxid. This is useful for setting the taxid of contaminant sequences, which may also be observed in database genomes, to the contaminant taxid.
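The two update rules are easiest to see side by side. Below is a minimal standalone sketch, assuming a toy parent_map and illustrative taxids; the lca() here mirrors the path-walk added to src/krakenutil.cpp in a later patch, and is not the actual library code:

#include <cstdint>
#include <iostream>
#include <map>
#include <set>

// Walk a up to the root, collecting the path; then walk b up until it
// first lands on that path. Mirrors the logic in src/krakenutil.cpp.
uint32_t lca(std::map<uint32_t, uint32_t> &parent_map, uint32_t a, uint32_t b) {
  if (a == 0 || b == 0)
    return a ? a : b;
  std::set<uint32_t> a_path;               // path from a up to the root
  while (a > 0) { a_path.insert(a); a = parent_map[a]; }
  while (b > 0) {                          // first ancestor of b on that path
    if (a_path.count(b) > 0) return b;
    b = parent_map[b];
  }
  return 1;                                // fell off the tree: return the root
}

int main() {
  // Toy taxonomy: 1 is the root, 2 and 3 are children of 1, 4 and 5 of 2.
  std::map<uint32_t, uint32_t> parent_map{{2, 1}, {3, 1}, {4, 2}, {5, 2}};
  uint32_t stored_taxid = 4;               // value already in the DB for this k-mer
  uint32_t seq_taxid = 5;                  // taxid of the sequence being processed
  bool force_taxid = false;                // true when -T is given
  uint32_t new_val = force_taxid ? seq_taxid
                                 : lca(parent_map, seq_taxid, stored_taxid);
  std::cout << new_val << "\n";            // prints 2 (the LCA); with -T it would print 5
  return 0;
}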
-v gives more verbose output --- src/set_lcas.cpp | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index c02307d..6d12533 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -39,7 +39,10 @@ int Num_threads = 1; string DB_filename, Index_filename, Nodes_filename, File_to_taxon_map_filename, ID_to_taxon_map_filename, Multi_fasta_filename; +bool force_taxid = false; + bool Allow_extra_kmers = false; +bool verbose = false; bool Operate_in_RAM = false; bool One_FASTA_file = false; map Parent_map; @@ -52,11 +55,12 @@ int main(int argc, char **argv) { #endif parse_command_line(argc, argv); - Parent_map = build_parent_map(Nodes_filename); + + if (!force_taxid) { + Parent_map = build_parent_map(Nodes_filename); + } QuickFile db_file(DB_filename, "rw"); - Database = KrakenDB(db_file.ptr()); - KmerScanner::set_k(Database.get_k()); char *temp_ptr = NULL; size_t db_file_size = db_file.size(); @@ -67,8 +71,12 @@ int main(int argc, char **argv) { ifs.read(temp_ptr, db_file_size); ifs.close(); Database = KrakenDB(temp_ptr); + } else { + Database = KrakenDB(db_file.ptr()); } + KmerScanner::set_k(Database.get_k()); + QuickFile idx_file(Index_filename); KrakenDBIndex db_index(idx_file.ptr()); Database.set_index(&db_index); @@ -127,11 +135,18 @@ void process_single_file() { #pragma omp parallel for schedule(dynamic) for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN) set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1); + + ++seqs_processed; + } else { + if (verbose) + cerr << "Skipping sequence with header [" << dna.header_line << "] - no taxid" << endl; + + ++seqs_no_taxid } - cerr << "\rProcessed " << ++seqs_processed << " sequences"; + cerr << "\rProcessed " << seqs_processed << " sequences"; } - cerr << "\r "; - cerr << "\rFinished processing " << seqs_processed << " sequences" << endl; + cerr << "\r "; + cerr << "\rFinished processing " << seqs_processed << " sequences (skipping "<< skipped_seqs <<" empty sequences, and " << seqs_no_taxid<<" sequences with no taxonomy mapping)" << endl; } void process_files() { @@ -186,9 +201,13 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { if (! 
Allow_extra_kmers) errx(EX_DATAERR, "kmer found in sequence that is not in database"); else + cerr << "kmer found in sequence w/ taxid " << taxid << " that is not in database" << endl; continue; } - *val_ptr = lca(Parent_map, taxid, *val_ptr); + if (!force_taxid) + *val_ptr = lca(Parent_map, taxid, *val_ptr); + else + *val_ptr = taxid; } } @@ -198,7 +217,7 @@ void parse_command_line(int argc, char **argv) { if (argc > 1 && strcmp(argv[1], "-h") == 0) usage(0); - while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xM")) != -1) { + while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTv")) != -1) { switch (opt) { case 'f' : File_to_taxon_map_filename = optarg; @@ -226,9 +245,15 @@ void parse_command_line(int argc, char **argv) { omp_set_num_threads(Num_threads); #endif break; + case 'T' : + force_taxid = true; + break; case 'n' : Nodes_filename = optarg; break; + case 'v' : + verbose = true; + break; case 'x' : Allow_extra_kmers = true; break; @@ -267,6 +292,8 @@ void usage(int exit_code) { << " -f filename File to taxon map" << endl << " -F filename Multi-FASTA file with sequence data" << endl << " -m filename Sequence ID to taxon map" << endl + << " -T Do not set LCA as taxid for kmers, but the taxid of the sequence" << endl + << " -v Verbose output" << endl << " -h Print this message" << endl << endl << "-F and -m must be specified together. If -f is given, " From 14b74e2a1380d70582423f6d46c6493baf5d72b6 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 10 Jul 2015 11:06:37 -0400 Subject: [PATCH 004/105] added comments --- src/krakenutil.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index a00e6bb..0c424c4 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -53,11 +53,14 @@ namespace kraken { if (a == 0 || b == 0) return a ? a : b; + // create a path from a to the root set a_path; while (a > 0) { a_path.insert(a); a = parent_map[a]; } + + // search for b in the path from a to the root while (b > 0) { if (a_path.count(b) > 0) return b; From 567ae7bd2bc710f483614e5a34b604e6a411e3b5 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 17 Sep 2015 13:02:21 -0400 Subject: [PATCH 005/105] update --- scripts/build_kraken_db.sh | 9 +++++---- src/set_lcas.cpp | 10 +++++++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/scripts/build_kraken_db.sh b/scripts/build_kraken_db.sh index 7df4d0b..9d090e0 100755 --- a/scripts/build_kraken_db.sh +++ b/scripts/build_kraken_db.sh @@ -39,6 +39,7 @@ function report_time_elapsed() { start_time=$(date "+%s.%N") DATABASE_DIR="$KRAKEN_DB_NAME" +FIND_OPTS=-L if [ ! -d "$DATABASE_DIR" ] then @@ -72,11 +73,11 @@ else # Estimate hash size as 1.15 * chars in library FASTA files if [ -z "$KRAKEN_HASH_SIZE" ] then - KRAKEN_HASH_SIZE=$(find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -printf '%s\n' | perl -nle '$sum += $_; END {print int(1.15 * $sum)}') + KRAKEN_HASH_SIZE=$(find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -printf '%s\n' | perl -nle '$sum += $_; END {print int(1.15 * $sum)}') echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'" fi - find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ + find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ xargs -0 cat | \ jellyfish count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \ -o database /dev/fd/0 @@ -160,7 +161,7 @@ then else echo "Creating GI number to seqID map (step 4 of 6)..." 
start_time1=$(date "+%s.%N") - find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ + find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ xargs -0 cat | report_gi_numbers.pl > gi2seqid.map.tmp mv gi2seqid.map.tmp gi2seqid.map @@ -187,7 +188,7 @@ then else echo "Setting LCAs in database (step 6 of 6)..." start_time1=$(date "+%s.%N") - find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ + find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ xargs -0 cat | \ set_lcas $MEMFLAG -x -d database.kdb -i database.idx \ -n taxonomy/nodes.dmp -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 6d12533..1fc333e 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -65,12 +65,14 @@ int main(int argc, char **argv) { char *temp_ptr = NULL; size_t db_file_size = db_file.size(); if (Operate_in_RAM) { + cerr << "Getting " << DB_filename << " into memory ... "; db_file.close_file(); temp_ptr = new char[ db_file_size ]; ifstream ifs(DB_filename.c_str(), ifstream::binary); ifs.read(temp_ptr, db_file_size); ifs.close(); Database = KrakenDB(temp_ptr); + cerr << "done" << endl; } else { Database = KrakenDB(db_file.ptr()); } @@ -97,6 +99,7 @@ int main(int argc, char **argv) { } void process_single_file() { + cerr << "Processing multiple FASTA files" << endl; ifstream map_file(ID_to_taxon_map_filename.c_str()); if (map_file.rdstate() & ifstream::failbit) { err(EX_NOINPUT, "can't open %s", ID_to_taxon_map_filename.c_str()); @@ -126,7 +129,7 @@ void process_single_file() { break; if ( dna.seq.empty() ) { - ++seq_skipped; + ++seqs_skipped; continue; } @@ -141,15 +144,16 @@ void process_single_file() { if (verbose) cerr << "Skipping sequence with header [" << dna.header_line << "] - no taxid" << endl; - ++seqs_no_taxid + ++seqs_no_taxid; } cerr << "\rProcessed " << seqs_processed << " sequences"; } cerr << "\r "; - cerr << "\rFinished processing " << seqs_processed << " sequences (skipping "<< skipped_seqs <<" empty sequences, and " << seqs_no_taxid<<" sequences with no taxonomy mapping)" << endl; + cerr << "\rFinished processing " << seqs_processed << " sequences (skipping "<< seqs_skipped <<" empty sequences, and " << seqs_no_taxid<<" sequences with no taxonomy mapping)" << endl; } void process_files() { + cerr << "Processing files in " << File_to_taxon_map_filename.c_str() << endl; ifstream map_file(File_to_taxon_map_filename.c_str()); if (map_file.rdstate() & ifstream::failbit) { err(EX_NOINPUT, "can't open %s", File_to_taxon_map_filename.c_str()); From dfecb3138dcdb49d7e838ab98921a80f56371781 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 4 Dec 2015 10:32:09 -0500 Subject: [PATCH 006/105] Added '>kraken:taxid|' header parsing to set_lcas - makes it possible to run set_lcas on sequences that were not in the DB build originally --- src/set_lcas.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 1fc333e..e769132 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -133,7 +133,15 @@ void process_single_file() { continue; } - uint32_t taxid = ID_to_taxon_map[dna.id]; + // Get the taxid. 
If the header specifies kraken:taxid, use that + uint32_t taxid; + string prefix = "kraken:taxid|"; + if (dna.id.substr(0,prefix.size()) == prefix) { + taxid = std::atoi(dna.id.substr(prefix.size()).c_str()); + } else { + taxid = ID_to_taxon_map[dna.id]; + } + if (taxid) { #pragma omp parallel for schedule(dynamic) for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN) From 9fc61e36ec05539cd6d22fad17d714dccb083de2 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 20 Dec 2015 12:53:59 -0500 Subject: [PATCH 007/105] Only report missing kmers when verbose --- src/set_lcas.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index e769132..a0d601a 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -210,11 +210,13 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { Database.canonical_representation(*kmer_ptr) ); if (val_ptr == NULL) { - if (! Allow_extra_kmers) + if (! Allow_extra_kmers) { errx(EX_DATAERR, "kmer found in sequence that is not in database"); - else + } + else if (verbose) { cerr << "kmer found in sequence w/ taxid " << taxid << " that is not in database" << endl; - continue; + } + continue; } if (!force_taxid) *val_ptr = lca(Parent_map, taxid, *val_ptr); From c71ffbc5a0c49f1b9f9425473710d46440c752e3 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 9 Dec 2016 16:11:25 -0500 Subject: [PATCH 008/105] Use 'find ... -exec cat' instead 'find .. -print0 | xargs -0 cat' --- scripts/build_kraken_db.sh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/build_kraken_db.sh b/scripts/build_kraken_db.sh index 9d090e0..d0b49a3 100755 --- a/scripts/build_kraken_db.sh +++ b/scripts/build_kraken_db.sh @@ -77,8 +77,7 @@ else echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'" fi - find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ - xargs -0 cat | \ + find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \ jellyfish count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \ -o database /dev/fd/0 @@ -114,9 +113,10 @@ else else echo "Reducing database size (step 2 of 6)..." max_kdb_size=$(echo "$KRAKEN_MAX_DB_SIZE*2^30 - $idx_size" | bc) + idx_size_gb=$(printf %.2f $(echo "$idx_size/2^30" | bc) ) if (( $(echo "$max_kdb_size < 0" | bc) == 1 )) then - echo "Maximum database size too small, aborting reduction." + echo "Maximum database size too small - index alone needs $idx_size_gb GB. Aborting reduction." exit 1 fi # Key ct is 8 byte int stored 48 bytes from start of file @@ -161,8 +161,8 @@ then else echo "Creating GI number to seqID map (step 4 of 6)..." start_time1=$(date "+%s.%N") - find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ - xargs -0 cat | report_gi_numbers.pl > gi2seqid.map.tmp + find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \ + report_gi_numbers.pl > gi2seqid.map.tmp mv gi2seqid.map.tmp gi2seqid.map echo "GI number to seqID map created. [$(report_time_elapsed $start_time1)]" @@ -188,8 +188,7 @@ then else echo "Setting LCAs in database (step 6 of 6)..." 
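Stepping back from the build script for a moment, the '>kraken:taxid|' convention introduced in patch 006 can be made concrete with a small standalone sketch (the main() driver and example IDs are illustrative, not set_lcas code; note that atoi stopping at the first non-digit is what lets a trailing '|sequence_name' be ignored):

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>

// Return the taxid embedded in a "kraken:taxid|NNN..." sequence ID,
// or 0 if the ID does not carry one (caller falls back to the seqid map).
uint32_t taxid_from_header(const std::string &id) {
  const std::string prefix = "kraken:taxid|";
  if (id.compare(0, prefix.size(), prefix) == 0)
    return std::atoi(id.substr(prefix.size()).c_str()); // atoi stops at the next '|'
  return 0;
}

int main() {
  std::cout << taxid_from_header("kraken:taxid|562|some_contig") << "\n"; // 562
  std::cout << taxid_from_header("gi|49175990|ref|NC_000913.2|") << "\n"; // 0
  return 0;
}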
start_time1=$(date "+%s.%N") - find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ - xargs -0 cat | \ + find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \ set_lcas $MEMFLAG -x -d database.kdb -i database.idx \ -n taxonomy/nodes.dmp -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 touch "lca.complete" From 8259c6af049d5f368ded746cd5f6e891f5d45897 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 9 Dec 2016 16:12:28 -0500 Subject: [PATCH 009/105] Allow multiple --db arguments --- scripts/kraken | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/scripts/kraken b/scripts/kraken index 57cc717..c81ed38 100755 --- a/scripts/kraken +++ b/scripts/kraken @@ -45,7 +45,7 @@ my $quick = 0; my $min_hits = 1; my $fasta_input = 0; my $fastq_input = 0; -my $db_prefix; +my @db_prefix; my $threads; my $preload = 0; my $gunzip = 0; @@ -56,11 +56,12 @@ my $only_classified_output = 0; my $unclassified_out; my $classified_out; my $outfile; +my $report_file; GetOptions( "help" => \&display_help, "version" => \&display_version, - "db=s" => \$db_prefix, + "db=s" => \@db_prefix, "threads=i" => \$threads, "fasta-input" => \$fasta_input, "fastq-input" => \$fastq_input, @@ -69,6 +70,7 @@ GetOptions( "unclassified-out=s" => \$unclassified_out, "classified-out=s" => \$classified_out, "output=s" => \$outfile, + "report-file=s" => \$report_file, "preload" => \$preload, "paired" => \$paired, "check-names" => \$check_names, @@ -85,23 +87,23 @@ if (! @ARGV) { print STDERR "Need to specify input filenames!\n"; usage(); } -eval { $db_prefix = krakenlib::find_db($db_prefix); }; + +eval { @db_prefix = map { krakenlib::find_db($_) } @db_prefix }; if ($@) { die "$PROG: $@"; } -my $taxonomy = "$db_prefix/taxonomy/nodes.dmp"; +my $taxonomy = $db_prefix[0]."/taxonomy/nodes.dmp"; if ($quick) { undef $taxonomy; # Skip loading nodes file, not needed in quick mode } -my $kdb_file = "$db_prefix/database.kdb"; -my $idx_file = "$db_prefix/database.idx"; -if (! -e $kdb_file) { - die "$PROG: $kdb_file does not exist!\n"; -} -if (! -e $idx_file) { - die "$PROG: $idx_file does not exist!\n"; + +my @kdb_files = map { "$_/database.kdb" } @db_prefix; +my @idx_files = map { "$_/database.idx" } @db_prefix; + +foreach my $file (@kdb_files,@idx_files) { + die "$PROG: $file does not exist!\n" if (! -e $file); } if ($min_hits > 1 && ! 
$quick) { @@ -133,8 +135,8 @@ if ($auto_detect) { # set flags for classifier my @flags; -push @flags, "-d", $kdb_file; -push @flags, "-i", $idx_file; +push @flags, map { ("-d", $_) } @kdb_files; +push @flags, map { ("-i", $_) } @idx_files; push @flags, "-t", $threads if $threads > 1; push @flags, "-n", $taxonomy if defined $taxonomy; push @flags, "-q" if $quick; @@ -193,6 +195,7 @@ if (@pipe_argv) { } } +print STDERR "$CLASSIFY, @flags, @ARGV\n"; exec $CLASSIFY, @flags, @ARGV; die "$PROG: exec error: $!\n"; From 7c678d699c3b52c7f120785bb4ec2c7c04740fc6 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 9 Feb 2017 11:12:52 -0500 Subject: [PATCH 010/105] Count unique k-mers for each taxid --- scripts/kraken-report | 18 +- scripts/read_merger.pl | 21 +- scripts/report_gi_numbers.pl | 8 +- src/Makefile | 6 +- src/assert_helpers.h | 283 +++++++++++++++ src/classify.cpp | 127 +++++-- src/get_kmers.cpp | 309 ++++++++++++++++ src/hyperloglogbias.h | 133 +++++++ src/hyperloglogplus.h | 623 ++++++++++++++++++++++++++++++++ src/make_seqid_to_taxid_map.cpp | 16 +- src/third_party/MurmurHash3.cpp | 335 +++++++++++++++++ src/third_party/MurmurHash3.h | 37 ++ 12 files changed, 1869 insertions(+), 47 deletions(-) create mode 100644 src/assert_helpers.h create mode 100644 src/get_kmers.cpp create mode 100644 src/hyperloglogbias.h create mode 100644 src/hyperloglogplus.h create mode 100644 src/third_party/MurmurHash3.cpp create mode 100644 src/third_party/MurmurHash3.h diff --git a/scripts/kraken-report b/scripts/kraken-report index 8351593..99cab1b 100755 --- a/scripts/kraken-report +++ b/scripts/kraken-report @@ -37,11 +37,13 @@ require "$KRAKEN_DIR/krakenlib.pm"; my $show_zeros = 0; my $db_prefix; +my $is_cnts_table = 0; GetOptions( "help" => \&display_help, "version" => \&display_version, "show-zeros" => \$show_zeros, + "cnts-table" => \$is_cnts_table, "db=s" => \$db_prefix, ); @@ -77,10 +79,18 @@ load_taxonomy($db_prefix); my %taxo_counts; my $seq_count = 0; $taxo_counts{0} = 0; -while (<>) { - my @fields = split; - $taxo_counts{$fields[2]}++; - $seq_count++; +if ($is_cnts_table) { + while (<>) { + my ($taxid,$count) = split; + $taxo_counts{$taxid} = $count; + $seq_count += $count; + } +} else { + while (<>) { + my (undef,$taxid) = split; + $taxo_counts{$taxid}++; + $seq_count++; + } } my $classified_count = $seq_count - $taxo_counts{0}; diff --git a/scripts/read_merger.pl b/scripts/read_merger.pl index 2d32477..6e97099 100755 --- a/scripts/read_merger.pl +++ b/scripts/read_merger.pl @@ -88,7 +88,12 @@ while (defined($seq1 = read_sequence($fh1))) { $seq2 = read_sequence($fh2); if (! defined $seq2) { - die "$PROG: mismatched sequence counts\n"; + print STDERR "$PROG: mismatched sequence counts - file 1 has more reads\n + Outputting the further reads unpaired\n"; + print_sequence($seq1); + while (defined($seq1 = read_sequence($fh1))) { + print_sequence($seq1); + } } if ($check_names && $seq1->{id} ne $seq2->{id}) { die "$PROG: mismatched mate pair names ('$seq1->{id}' & '$seq2->{id}')\n"; @@ -96,7 +101,13 @@ print_merged_sequence($seq1, $seq2); } if (defined($seq2 = read_sequence($fh2))) { - die "$PROG: mismatched sequence counts\n"; + print STDERR "$PROG: mismatched sequence counts - file 2 has more reads\n + Outputting the further reads unpaired\n"; + print_sequence($seq2); + while (defined($seq2 = read_sequence($fh2))) { + print_sequence($seq2); + } + } close $fh1; close $fh2; @@ -162,3 +173,9 @@ sub print_merged_sequence { print ">" . $seq1->{id} . "\n"; print $seq1->{seq} . "N" . 
$seq2->{seq} . "\n"; } + +sub print_sequence { + my ($seq1) = @_; + print ">" . $seq1->{id} . "\n"; + print $seq1->{seq} . "\n"; +} diff --git a/scripts/report_gi_numbers.pl b/scripts/report_gi_numbers.pl index ce6a0bc..88a24f0 100755 --- a/scripts/report_gi_numbers.pl +++ b/scripts/report_gi_numbers.pl @@ -19,11 +19,11 @@ # Reads multi-FASTA input and for each sequence ID reports a # tab-delimited line: -# +# # # or in the case of a sequence with Kraken taxid information: # -# TAXID +# TAXID # # Assumes all sequence IDs actually have GI numbers or Kraken # taxid information. @@ -38,12 +38,12 @@ next unless /^>(\S+)/; my $seq_id = $1; if ($seq_id =~ /(^|\|)kraken:taxid\|(\d+)/) { - print "TAXID\t$2\t$seq_id\n"; + print "TAXID\t$2\t$seq_id\t$_\n"; next; } if ($seq_id !~ /(^|\|)gi\|(\d+)/) { die "$PROG: sequence ID $seq_id lacks GI number, aborting.\n"; } - print "$2\t$seq_id\n"; + print "$2\t$seq_id\t$_\n"; } diff --git a/src/Makefile b/src/Makefile index 2f927f6..6e2c938 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,6 @@ CXX = g++ -CXXFLAGS = -Wall -fopenmp -O3 -PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink +CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 -fsyntax-only +PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink get_kmers .PHONY: all install clean @@ -18,6 +18,8 @@ db_sort: krakendb.o quickfile.o set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o +get_kmers: krakendb.o quickfile.o krakenutil.o seqreader.o + classify: krakendb.o quickfile.o krakenutil.o seqreader.o make_seqid_to_taxid_map: quickfile.o diff --git a/src/assert_helpers.h b/src/assert_helpers.h new file mode 100644 index 0000000..6a2fe97 --- /dev/null +++ b/src/assert_helpers.h @@ -0,0 +1,283 @@ +/* + * Copyright 2011, Ben Langmead + * + * This file is part of Bowtie 2. + * + * Bowtie 2 is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Bowtie 2 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Bowtie 2. If not, see . + */ + +#ifndef ASSERT_HELPERS_H_ +#define ASSERT_HELPERS_H_ + +#include +#include +#include +#include + +/** + * Assertion for release-enabled assertions + */ +class ReleaseAssertException : public std::runtime_error { +public: + ReleaseAssertException(const std::string& msg = "") : std::runtime_error(msg) {} +}; + +/** + * Macros for release-enabled assertions, and helper macros to make + * all assertion error messages more helpful. + */ +#ifndef NDEBUG +#define ASSERT_ONLY(...) __VA_ARGS__ +#else +#define ASSERT_ONLY(...) 
+#endif + +#define rt_assert(b) \ + if(!(b)) { \ + std::cerr << "rt_assert at " << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(); \ + } +#define rt_assert_msg(b,msg) \ + if(!(b)) { \ + std::cerr << msg << " at " << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(msg); \ + } + +#define rt_assert_eq(ex,ac) \ + if(!((ex) == (ac))) { \ + std::cerr << "rt_assert_eq: expected (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(); \ + } +#define rt_assert_eq_msg(ex,ac,msg) \ + if(!((ex) == (ac))) { \ + std::cerr << "rt_assert_eq: " << msg << ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(msg); \ + } + +#ifndef NDEBUG +#define assert_eq(ex,ac) \ + if(!((ex) == (ac))) { \ + std::cerr << "assert_eq: expected (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#define assert_eq_msg(ex,ac,msg) \ + if(!((ex) == (ac))) { \ + std::cerr << "assert_eq: " << msg << ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#else +#define assert_eq(ex,ac) +#define assert_eq_msg(ex,ac,msg) +#endif + +#define rt_assert_neq(ex,ac) \ + if(!((ex) != (ac))) { \ + std::cerr << "rt_assert_neq: expected not (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(); \ + } +#define rt_assert_neq_msg(ex,ac,msg) \ + if(!((ex) != (ac))) { \ + std::cerr << "rt_assert_neq: " << msg << ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(msg); \ + } + +#ifndef NDEBUG +#define assert_neq(ex,ac) \ + if(!((ex) != (ac))) { \ + std::cerr << "assert_neq: expected not (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#define assert_neq_msg(ex,ac,msg) \ + if(!((ex) != (ac))) { \ + std::cerr << "assert_neq: " << msg << ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#else +#define assert_neq(ex,ac) +#define assert_neq_msg(ex,ac,msg) +#endif + +#define rt_assert_gt(a,b) \ + if(!((a) > (b))) { \ + std::cerr << "rt_assert_gt: expected (" << (a) << ") > (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(); \ + } +#define rt_assert_gt_msg(a,b,msg) \ + if(!((a) > (b))) { \ + std::cerr << "rt_assert_gt: " << msg << ": (" << (a) << ") > (" << (b) << ")" << 
std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(msg); \ + } + +#ifndef NDEBUG +#define assert_gt(a,b) \ + if(!((a) > (b))) { \ + std::cerr << "assert_gt: expected (" << (a) << ") > (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#define assert_gt_msg(a,b,msg) \ + if(!((a) > (b))) { \ + std::cerr << "assert_gt: " << msg << ": (" << (a) << ") > (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#else +#define assert_gt(a,b) +#define assert_gt_msg(a,b,msg) +#endif + +#define rt_assert_geq(a,b) \ + if(!((a) >= (b))) { \ + std::cerr << "rt_assert_geq: expected (" << (a) << ") >= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(); \ + } +#define rt_assert_geq_msg(a,b,msg) \ + if(!((a) >= (b))) { \ + std::cerr << "rt_assert_geq: " << msg << ": (" << (a) << ") >= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(msg); \ + } + +#ifndef NDEBUG +#define assert_geq(a,b) \ + if(!((a) >= (b))) { \ + std::cerr << "assert_geq: expected (" << (a) << ") >= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#define assert_geq_msg(a,b,msg) \ + if(!((a) >= (b))) { \ + std::cerr << "assert_geq: " << msg << ": (" << (a) << ") >= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#else +#define assert_geq(a,b) +#define assert_geq_msg(a,b,msg) +#endif + +#define rt_assert_lt(a,b) \ + if(!(a < b)) { \ + std::cerr << "rt_assert_lt: expected (" << a << ") < (" << b << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(); \ + } +#define rt_assert_lt_msg(a,b,msg) \ + if(!(a < b)) { \ + std::cerr << "rt_assert_lt: " << msg << ": (" << a << ") < (" << b << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(msg); \ + } + +#ifndef NDEBUG +#define assert_lt(a,b) \ + if(!(a < b)) { \ + std::cerr << "assert_lt: expected (" << a << ") < (" << b << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#define assert_lt_msg(a,b,msg) \ + if(!(a < b)) { \ + std::cerr << "assert_lt: " << msg << ": (" << a << ") < (" << b << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#else +#define assert_lt(a,b) +#define assert_lt_msg(a,b,msg) +#endif + +#define rt_assert_leq(a,b) \ + if(!((a) <= (b))) { \ + std::cerr << "rt_assert_leq: expected (" << (a) << ") <= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(); \ + } +#define rt_assert_leq_msg(a,b,msg) \ + if(!((a) <= (b))) { \ + std::cerr << "rt_assert_leq: " << msg << ": (" << (a) << ") <= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(msg); \ + } + +#ifndef NDEBUG +#define assert_leq(a,b) \ + if(!((a) <= (b))) { \ + std::cerr << "assert_leq: expected (" << (a) << ") <= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#define assert_leq_msg(a,b,msg) \ + if(!((a) <= (b))) { \ + std::cerr << "assert_leq: " << msg << ": (" << (a) 
<< ") <= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#else +#define assert_leq(a,b) +#define assert_leq_msg(a,b,msg) +#endif + +#ifndef NDEBUG +#define assert_in(c, s) assert_in2(c, s, __FILE__, __LINE__) +static inline void assert_in2(char c, const char *str, const char *file, int line) { + const char *s = str; + while(*s != '\0') { + if(c == *s) return; + s++; + } + std::cerr << "assert_in: (" << c << ") not in (" << str << ")" << std::endl; + std::cerr << file << ":" << line << std::endl; + assert(0); +} +#else +#define assert_in(c, s) +#endif + +#ifndef NDEBUG +#define assert_range(b, e, v) assert_range_helper(b, e, v, __FILE__, __LINE__) +template +inline static void assert_range_helper(const T& begin, + const T& end, + const T& val, + const char *file, + int line) +{ + if(val < begin || val > end) { + std::cerr << "assert_range: (" << val << ") not in [" + << begin << ", " << end << "]" << std::endl; + std::cerr << file << ":" << line << std::endl; + assert(0); + } +} +#else +#define assert_range(b, e, v) +#endif + +// define a macro to indicate variables that are only required for asserts +// used to make production build happy, i.e. disable "warning: variable ‘x’ set but not used [-Wunused-but-set-variable]" +#define _unused(x) ((void)x) + +#endif /*ASSERT_HELPERS_H_*/ diff --git a/src/classify.cpp b/src/classify.cpp index 3fb9416..5909a85 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -22,6 +22,7 @@ #include "krakenutil.hpp" #include "quickfile.hpp" #include "seqreader.hpp" +#include "hyperloglogplus.h" const size_t DEF_WORK_UNIT_SIZE = 500000; @@ -37,8 +38,18 @@ string hitlist_string(vector &taxa, vector &ambig); set get_ancestry(uint32_t taxon); void report_stats(struct timeval time1, struct timeval time2); +struct ReadCounts { + uint32_t n_reads; + uint32_t n_kmers; + HyperLogLogPlusMinus kmers; // unique k-mer count per taxon +}; + +map taxon_counts; // stats per taxon + int Num_threads = 1; -string DB_filename, Index_filename, Nodes_filename; +vector DB_filenames; +vector Index_filenames; +string Nodes_filename; bool Quick_mode = false; bool Fastq_input = false; bool Print_classified = false; @@ -46,9 +57,10 @@ bool Print_unclassified = false; bool Print_kraken = true; bool Populate_memory = false; bool Only_classified_kraken_output = false; +bool Print_sequence = true; uint32_t Minimum_hit_count = 1; map Parent_map; -KrakenDB Database; +vector KrakenDatabases; string Classified_output_file, Unclassified_output_file, Kraken_output_file; ostream *Classified_output; ostream *Unclassified_output; @@ -59,34 +71,62 @@ uint64_t total_classified = 0; uint64_t total_sequences = 0; uint64_t total_bases = 0; +void loadKrakenDB(KrakenDB& database, string DB_filename, string Index_filename) { + QuickFile db_file; + db_file.open_file(DB_filename); + if (Populate_memory) { + db_file.load_file(); + } + database = KrakenDB(db_file.ptr()); + QuickFile idx_file; + idx_file.open_file(Index_filename); + if (Populate_memory) + idx_file.load_file(); + + KrakenDBIndex db_index(idx_file.ptr()); + database.set_index(&db_index); +} + int main(int argc, char **argv) { #ifdef _OPENMP omp_set_num_threads(1); #endif parse_command_line(argc, argv); - if (! Nodes_filename.empty()) + if (! Nodes_filename.empty()) { + cerr << "Building parent node map " << endl; Parent_map = build_parent_map(Nodes_filename); + } if (Populate_memory) - cerr << "Loading database... 
"; - - QuickFile db_file; - db_file.open_file(DB_filename); - if (Populate_memory) - db_file.load_file(); - Database = KrakenDB(db_file.ptr()); - KmerScanner::set_k(Database.get_k()); + cerr << "Loading database(s)... " << endl; + + // TODO: Check DB_filenames and Index_filesnames have the same length + for (size_t i=0; i < DB_filenames.size(); ++i) { + cerr << "\t " << DB_filenames[i] << endl; + static QuickFile db_file; + db_file.open_file(DB_filenames[i]); + if (Populate_memory) + db_file.load_file(); + static KrakenDB Database = KrakenDB(db_file.ptr()); + KmerScanner::set_k(Database.get_k()); + + static QuickFile idx_file; + idx_file.open_file(Index_filenames[i]); + if (Populate_memory) + idx_file.load_file(); + static KrakenDBIndex db_index(idx_file.ptr()); + Database.set_index(&db_index); + + + KrakenDatabases.push_back(&Database); + } - QuickFile idx_file; - idx_file.open_file(Index_filename); - if (Populate_memory) - idx_file.load_file(); - KrakenDBIndex db_index(idx_file.ptr()); - Database.set_index(&db_index); + // TODO: Check all databases have the same k + KmerScanner::set_k(KrakenDatabases[0]->get_k()); if (Populate_memory) - cerr << "complete." << endl; + cerr << "\ncomplete." << endl; if (Print_classified) { if (Classified_output_file == "-") @@ -147,6 +187,7 @@ void report_stats(struct timeval time1, struct timeval time2) { } void process_file(char *filename) { + cerr << "k: " << uint32_t(KrakenDatabases[0]->get_k()) << endl; string file_str(filename); DNASequenceReader *reader; DNASequence dna; @@ -199,9 +240,26 @@ void process_file(char *filename) { } } // end parallel section + // Write out report - print k-mers and read numbers + for (auto& elem : taxon_counts) { + //elem.first gives you the key (int) + //elem.second gives you the mapped element (vector) + cerr << elem.first << "\t" << elem.second.n_reads << "\t" << + elem.second.n_kmers << "\t" << elem.second.kmers.cardinality() << "\n"; + } + delete reader; } +uint32_t get_taxon_for_kmer(KrakenDB& database, uint64_t* kmer_ptr, uint64_t& current_bin_key, + int64_t& current_min_pos, int64_t& current_max_pos) { + uint32_t* val_ptr = database.kmer_query( + database.canonical_representation(*kmer_ptr), ¤t_bin_key, + ¤t_min_pos, ¤t_max_pos); + uint32_t taxon = val_ptr ? *val_ptr : 0; + return taxon; +} + void classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss) { vector taxa; @@ -211,11 +269,9 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, uint32_t taxon = 0; uint32_t hits = 0; // only maintained if in quick mode - uint64_t current_bin_key; - int64_t current_min_pos = 1; - int64_t current_max_pos = 0; + uint64_t current_bin_key; int64_t current_min_pos = 1; int64_t current_max_pos = 0; - if (dna.seq.size() >= Database.get_k()) { + if (dna.seq.size() >= KrakenDatabases[0]->get_k()) { KmerScanner scanner(dna.seq); while ((kmer_ptr = scanner.next_kmer()) != NULL) { taxon = 0; @@ -224,13 +280,15 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, } else { ambig_list.push_back(0); - uint32_t *val_ptr = Database.kmer_query( - Database.canonical_representation(*kmer_ptr), - ¤t_bin_key, - ¤t_min_pos, ¤t_max_pos - ); - taxon = val_ptr ? 
*val_ptr : 0; + + for (auto& db : KrakenDatabases) { + taxon = get_taxon_for_kmer(*db, kmer_ptr, current_bin_key, current_min_pos, current_max_pos); + if (taxon) break; + } + if (taxon) { + taxon_counts[taxon].kmers.add(*kmer_ptr); + ++taxon_counts[taxon].n_kmers; hit_counts[taxon]++; if (Quick_mode && ++hits >= Minimum_hit_count) break; @@ -249,6 +307,7 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, if (call) #pragma omp atomic total_classified++; + ++(taxon_counts[call].n_reads); if (Print_unclassified || Print_classified) { ostringstream *oss_ptr = call ? &coss : &uoss; @@ -290,6 +349,9 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, koss << hitlist_string(taxa, ambig_list); } + if (Print_sequence) + koss << "\t" << dna.seq; + koss << endl; } @@ -349,10 +411,10 @@ void parse_command_line(int argc, char **argv) { while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:M")) != -1) { switch (opt) { case 'd' : - DB_filename = optarg; + DB_filenames.push_back(optarg); break; case 'i' : - Index_filename = optarg; + Index_filenames.push_back(optarg); break; case 't' : sig = atoll(optarg); @@ -409,11 +471,11 @@ void parse_command_line(int argc, char **argv) { } } - if (DB_filename.empty()) { + if (DB_filenames.empty()) { cerr << "Missing mandatory option -d" << endl; usage(); } - if (Index_filename.empty()) { + if (Index_filenames.empty()) { cerr << "Missing mandatory option -i" << endl; usage(); } @@ -443,6 +505,7 @@ void usage(int exit_code) { << " -f Input is in FASTQ format" << endl << " -c Only include classified reads in output" << endl << " -M Preload database files" << endl + << " -s Print sequence in Kraken output" << endl << " -h Print this message" << endl << endl << "At least one FASTA or FASTQ file must be specified." << endl diff --git a/src/get_kmers.cpp b/src/get_kmers.cpp new file mode 100644 index 0000000..9288078 --- /dev/null +++ b/src/get_kmers.cpp @@ -0,0 +1,309 @@ +/* + * Copyright 2013-2015, Derrick Wood + * + * This file is part of the Kraken taxonomic sequence classification system. + * + * Kraken is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Kraken is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Kraken. If not, see . 
+ */ + +#include "kraken_headers.hpp" +#include "quickfile.hpp" +#include "krakendb.hpp" +#include "krakenutil.hpp" +#include "seqreader.hpp" +#include + +#define SKIP_LEN 50000 + +using namespace std; +using namespace kraken; + +void parse_command_line(int argc, char **argv); +void usage(int exit_code=EX_USAGE); +void process_files(); +void process_single_file(); +void process_file(string filename, uint32_t taxid); +void get_kmers(uint32_t taxid, string &seq, size_t start, size_t finish); + +int Num_threads = 1; +string DB_filename, Index_filename, Nodes_filename, + File_to_taxon_map_filename, + ID_to_taxon_map_filename, Multi_fasta_filename; +bool force_taxid = false; + +bool Allow_extra_kmers = false; +bool verbose = false; +bool Operate_in_RAM = false; +bool One_FASTA_file = false; +map Parent_map; +map ID_to_taxon_map; +set All_taxon_ids; +unordered_multimap Kmer_taxa_map; +map, uint32_t > TaxidPair_counts; +KrakenDB Database; + +int main(int argc, char **argv) { + #ifdef _OPENMP + omp_set_num_threads(1); + #endif + + parse_command_line(argc, argv); + + if (!force_taxid) { + Parent_map = build_parent_map(Nodes_filename); + } + + QuickFile db_file(DB_filename, "rw"); + + char *temp_ptr = NULL; + size_t db_file_size = db_file.size(); + if (Operate_in_RAM) { + cerr << "Getting " << DB_filename << " into memory ... "; + db_file.close_file(); + temp_ptr = new char[ db_file_size ]; + ifstream ifs(DB_filename.c_str(), ifstream::binary); + ifs.read(temp_ptr, db_file_size); + ifs.close(); + Database = KrakenDB(temp_ptr); + cerr << "done" << endl; + } else { + Database = KrakenDB(db_file.ptr()); + } + + KmerScanner::set_k(Database.get_k()); + + QuickFile idx_file(Index_filename); + KrakenDBIndex db_index(idx_file.ptr()); + Database.set_index(&db_index); + + if (One_FASTA_file) + process_single_file(); + else + process_files(); + + + + if (Operate_in_RAM) { + ofstream ofs(DB_filename.c_str(), ofstream::binary); + ofs.write(temp_ptr, db_file_size); + ofs.close(); + delete temp_ptr; + } + + return 0; +} + +void process_single_file() { + cerr << "Processing multiple FASTA files" << endl; + ifstream map_file(ID_to_taxon_map_filename.c_str()); + if (map_file.rdstate() & ifstream::failbit) { + err(EX_NOINPUT, "can't open %s", ID_to_taxon_map_filename.c_str()); + } + string line; + while (map_file.good()) { + getline(map_file, line); + if (line.empty()) + break; + string seq_id; + uint32_t taxid; + istringstream iss(line); + iss >> seq_id; + iss >> taxid; + ID_to_taxon_map[seq_id] = taxid; + } + + FastaReader reader(Multi_fasta_filename); + DNASequence dna; + uint32_t seqs_processed = 0; + uint32_t seqs_skipped = 0; + uint32_t seqs_no_taxid = 0; + + while (reader.is_valid()) { + dna = reader.next_sequence(); + if (! reader.is_valid()) + break; + + if ( dna.seq.empty() ) { + ++seqs_skipped; + continue; + } + + // Get the taxid. 
If the header specifies kraken:taxid, use that + uint32_t taxid; + string prefix = "kraken:taxid|"; + if (dna.id.substr(0,prefix.size()) == prefix) { + taxid = std::atoi(dna.id.substr(prefix.size()).c_str()); + } else { + taxid = ID_to_taxon_map[dna.id]; + } + + if (taxid) { + #pragma omp parallel for schedule(dynamic) + for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN) + get_kmers(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1); + + ++seqs_processed; + } else { + if (verbose) + cerr << "Skipping sequence with header [" << dna.header_line << "] - no taxid" << endl; + + ++seqs_no_taxid; + } + cerr << "\rProcessed " << seqs_processed << " sequences"; + } + cerr << "\r "; + cerr << "\rFinished processing " << seqs_processed << " sequences (skipping "<< seqs_skipped <<" empty sequences, and " << seqs_no_taxid<<" sequences with no taxonomy mapping)" << endl; +} + +void process_files() { + cerr << "Processing files in " << File_to_taxon_map_filename.c_str() << endl; + ifstream map_file(File_to_taxon_map_filename.c_str()); + if (map_file.rdstate() & ifstream::failbit) { + err(EX_NOINPUT, "can't open %s", File_to_taxon_map_filename.c_str()); + } + string line; + uint32_t seqs_processed = 0; + + while (map_file.good()) { + getline(map_file, line); + if (line.empty()) + break; + string filename; + uint32_t taxid; + istringstream iss(line); + iss >> filename; + iss >> taxid; + process_file(filename, taxid); + cerr << "\rProcessed " << ++seqs_processed << " sequences"; + } + cerr << "\r "; + cerr << "\rFinished processing " << seqs_processed << " sequences" << endl; +} + +void process_file(string filename, uint32_t taxid) { + FastaReader reader(filename); + DNASequence dna; + + // For the purposes of this program, we assume these files are + // single-fasta files. 
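+ // Note: each chunk handed to a thread below is extended by k-1 bases
+ // (finish = i + SKIP_LEN + Database.get_k() - 1), so k-mers spanning a
+ // chunk boundary are still scanned, and each k-mer start position falls
+ // in exactly one chunk.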
+ dna = reader.next_sequence(); + + #pragma omp parallel for schedule(dynamic) + for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN) + get_kmers(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1); +} + +void get_kmers(uint32_t taxid, string &seq, size_t start, size_t finish) { + + All_taxon_ids.insert(taxid); + KmerScanner scanner(seq, start, finish); + uint64_t *kmer_ptr; + + while ((kmer_ptr = scanner.next_kmer()) != NULL) { + if (scanner.ambig_kmer()) + continue; + + Kmer_taxa_map.insert({*kmer_ptr, taxid}); + } +} + +void parse_command_line(int argc, char **argv) { + int opt; + long long sig; + + if (argc > 1 && strcmp(argv[1], "-h") == 0) + usage(0); + while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTv")) != -1) { + switch (opt) { + case 'f' : + File_to_taxon_map_filename = optarg; + break; + case 'd' : + DB_filename = optarg; + break; + case 'i' : + Index_filename = optarg; + break; + case 'F' : + Multi_fasta_filename = optarg; + break; + case 'm' : + ID_to_taxon_map_filename = optarg; + break; + case 't' : + sig = atoll(optarg); + if (sig <= 0) + errx(EX_USAGE, "can't use nonpositive thread count"); + #ifdef _OPENMP + if (sig > omp_get_num_procs()) + errx(EX_USAGE, "thread count exceeds number of processors"); + Num_threads = sig; + omp_set_num_threads(Num_threads); + #endif + break; + case 'T' : + force_taxid = true; + break; + case 'n' : + Nodes_filename = optarg; + break; + case 'v' : + verbose = true; + break; + case 'x' : + Allow_extra_kmers = true; + break; + case 'M' : + Operate_in_RAM = true; + break; + default: + usage(); + break; + } + } + + if (DB_filename.empty() || Index_filename.empty() || + Nodes_filename.empty()) + usage(); + if (File_to_taxon_map_filename.empty() && + (Multi_fasta_filename.empty() || ID_to_taxon_map_filename.empty())) + usage(); + + if (! File_to_taxon_map_filename.empty()) + One_FASTA_file = false; + else + One_FASTA_file = true; +} + +void usage(int exit_code) { + cerr << "Usage: get_kmers [options]" << endl + << endl + << "Options: (*mandatory)" << endl + << "* -d filename Kraken DB filename" << endl + << "* -i filename Kraken DB index filename" << endl + << "* -n filename NCBI Taxonomy nodes file" << endl + << " -t # Number of threads" << endl + << " -M Copy DB to RAM during operation" << endl + << " -x K-mers not found in DB do not cause errors" << endl + << " -f filename File to taxon map" << endl + << " -F filename Multi-FASTA file with sequence data" << endl + << " -m filename Sequence ID to taxon map" << endl + << " -T Do not set LCA as taxid for kmers, but the taxid of the sequence" << endl + << " -v Verbose output" << endl + << " -h Print this message" << endl + << endl + << "-F and -m must be specified together. If -f is given, " + << "-F/-m are ignored." 
<< endl; + exit(exit_code); +} diff --git a/src/hyperloglogbias.h b/src/hyperloglogbias.h new file mode 100644 index 0000000..013bd5b --- /dev/null +++ b/src/hyperloglogbias.h @@ -0,0 +1,133 @@ +/* + * hyperloglogbias.h + * + * Created on: Apr 25, 2015 + * Author: fbreitwieser + */ + +#ifndef HYPERLOGLOGBIAS_H_ +#define HYPERLOGLOGBIAS_H_ + +const double rawEstimateData_precision4[] = { + 11, 11.717, 12.207, 12.7896, 13.2882, 13.8204, 14.3772, 14.9342, 15.5202, 16.161, 16.7722, 17.4636, 18.0396, 18.6766, 19.3566, 20.0454, 20.7936, 21.4856, 22.2666, 22.9946, 23.766, 24.4692, 25.3638, 26.0764, 26.7864, 27.7602, 28.4814, 29.433, 30.2926, 31.0664, 31.9996, 32.7956, 33.5366, 34.5894, 35.5738, 36.2698, 37.3682, 38.0544, 39.2342, 40.0108, 40.7966, 41.9298, 42.8704, 43.6358, 44.5194, 45.773, 46.6772, 47.6174, 48.4888, 49.3304, 50.2506, 51.4996, 52.3824, 53.3078, 54.3984, 55.5838, 56.6618, 57.2174, 58.3514, 59.0802, 60.1482, 61.0376, 62.3598, 62.8078, 63.9744, 64.914, 65.781, 67.1806, 68.0594, 68.8446, 69.7928, 70.8248, 71.8324, 72.8598, 73.6246, 74.7014, 75.393, 76.6708, 77.2394 +}; + +const double rawEstimateData_precision5[] = { + 23, 23.1194, 23.8208, 24.2318, 24.77, 25.2436, 25.7774, 26.2848, 26.8224, 27.3742, 27.9336, 28.503, 29.0494, 29.6292, 30.2124, 30.798, 31.367, 31.9728, 32.5944, 33.217, 33.8438, 34.3696, 35.0956, 35.7044, 36.324, 37.0668, 37.6698, 38.3644, 39.049, 39.6918, 40.4146, 41.082, 41.687, 42.5398, 43.2462, 43.857, 44.6606, 45.4168, 46.1248, 46.9222, 47.6804, 48.447, 49.3454, 49.9594, 50.7636, 51.5776, 52.331, 53.19, 53.9676, 54.7564, 55.5314, 56.4442, 57.3708, 57.9774, 58.9624, 59.8796, 60.755, 61.472, 62.2076, 63.1024, 63.8908, 64.7338, 65.7728, 66.629, 67.413, 68.3266, 69.1524, 70.2642, 71.1806, 72.0566, 72.9192, 73.7598, 74.3516, 75.5802, 76.4386, 77.4916, 78.1524, 79.1892, 79.8414, 80.8798, 81.8376, 82.4698, 83.7656, 84.331, 85.5914, 86.6012, 87.7016, 88.5582, 89.3394, 90.3544, 91.4912, 92.308, 93.3552, 93.9746, 95.2052, 95.727, 97.1322, 98.3944, 98.7588, 100.242, 101.1914, 102.2538, 102.8776, 103.6292, 105.1932, 105.9152, 107.0868, 107.6728, 108.7144, 110.3114, 110.8716, 111.245, 112.7908, 113.7064, 114.636, 115.7464, 116.1788, 117.7464, 118.4896, 119.6166, 120.5082, 121.7798, 122.9028, 123.4426, 124.8854, 125.705, 126.4652, 128.3464, 128.3462, 130.0398, 131.0342, 131.0042, 132.4766, 133.511, 134.7252, 135.425, 136.5172, 138.0572, 138.6694, 139.3712, 140.8598, 141.4594, 142.554, 143.4006, 144.7374, 146.1634, 146.8994, 147.605, 147.9304, 149.1636, 150.2468, 151.5876, 152.2096, 153.7032, 154.7146, 155.807, 156.9228, 157.0372, 158.5852 +}; + +const double rawEstimateData_precision6[] = { + 46, 46.1902, 47.271, 47.8358, 48.8142, 49.2854, 50.317, 51.354, 51.8924, 52.9436, 53.4596, 54.5262, 55.6248, 56.1574, 57.2822, 57.837, 58.9636, 60.074, 60.7042, 61.7976, 62.4772, 63.6564, 64.7942, 65.5004, 66.686, 67.291, 68.5672, 69.8556, 70.4982, 71.8204, 72.4252, 73.7744, 75.0786, 75.8344, 77.0294, 77.8098, 79.0794, 80.5732, 81.1878, 82.5648, 83.2902, 84.6784, 85.3352, 86.8946, 88.3712, 89.0852, 90.499, 91.2686, 92.6844, 94.2234, 94.9732, 96.3356, 97.2286, 98.7262, 100.3284, 101.1048, 102.5962, 103.3562, 105.1272, 106.4184, 107.4974, 109.0822, 109.856, 111.48, 113.2834, 114.0208, 115.637, 116.5174, 118.0576, 119.7476, 120.427, 122.1326, 123.2372, 125.2788, 126.6776, 127.7926, 129.1952, 129.9564, 131.6454, 133.87, 134.5428, 136.2, 137.0294, 138.6278, 139.6782, 141.792, 143.3516, 144.2832, 146.0394, 147.0748, 148.4912, 150.849, 151.696, 153.5404, 154.073, 156.3714, 157.7216, 158.7328, 
160.4208, 161.4184, 163.9424, 165.2772, 166.411, 168.1308, 168.769, 170.9258, 172.6828, 173.7502, 175.706, 176.3886, 179.0186, 180.4518, 181.927, 183.4172, 184.4114, 186.033, 188.5124, 189.5564, 191.6008, 192.4172, 193.8044, 194.997, 197.4548, 198.8948, 200.2346, 202.3086, 203.1548, 204.8842, 206.6508, 206.6772, 209.7254, 210.4752, 212.7228, 214.6614, 215.1676, 217.793, 218.0006, 219.9052, 221.66, 223.5588, 225.1636, 225.6882, 227.7126, 229.4502, 231.1978, 232.9756, 233.1654, 236.727, 238.1974, 237.7474, 241.1346, 242.3048, 244.1948, 245.3134, 246.879, 249.1204, 249.853, 252.6792, 253.857, 254.4486, 257.2362, 257.9534, 260.0286, 260.5632, 262.663, 264.723, 265.7566, 267.2566, 267.1624, 270.62, 272.8216, 273.2166, 275.2056, 276.2202, 278.3726, 280.3344, 281.9284, 283.9728, 284.1924, 286.4872, 287.587, 289.807, 291.1206, 292.769, 294.8708, 296.665, 297.1182, 299.4012, 300.6352, 302.1354, 304.1756, 306.1606, 307.3462, 308.5214, 309.4134, 310.8352, 313.9684, 315.837, 316.7796, 318.9858 +}; + +const double rawEstimateData_precision7[] = { + 92, 93.4934, 94.9758, 96.4574, 97.9718, 99.4954, 101.5302, 103.0756, 104.6374, 106.1782, 107.7888, 109.9522, 111.592, 113.2532, 114.9086, 116.5938, 118.9474, 120.6796, 122.4394, 124.2176, 125.9768, 128.4214, 130.2528, 132.0102, 133.8658, 135.7278, 138.3044, 140.1316, 142.093, 144.0032, 145.9092, 148.6306, 150.5294, 152.5756, 154.6508, 156.662, 159.552, 161.3724, 163.617, 165.5754, 167.7872, 169.8444, 172.7988, 174.8606, 177.2118, 179.3566, 181.4476, 184.5882, 186.6816, 189.0824, 191.0258, 193.6048, 196.4436, 198.7274, 200.957, 203.147, 205.4364, 208.7592, 211.3386, 213.781, 215.8028, 218.656, 221.6544, 223.996, 226.4718, 229.1544, 231.6098, 234.5956, 237.0616, 239.5758, 242.4878, 244.5244, 248.2146, 250.724, 252.8722, 255.5198, 258.0414, 261.941, 264.9048, 266.87, 269.4304, 272.028, 274.4708, 278.37, 281.0624, 283.4668, 286.5532, 289.4352, 293.2564, 295.2744, 298.2118, 300.7472, 304.1456, 307.2928, 309.7504, 312.5528, 315.979, 318.2102, 322.1834, 324.3494, 327.325, 330.6614, 332.903, 337.2544, 339.9042, 343.215, 345.2864, 348.0814, 352.6764, 355.301, 357.139, 360.658, 363.1732, 366.5902, 369.9538, 373.0828, 375.922, 378.9902, 382.7328, 386.4538, 388.1136, 391.2234, 394.0878, 396.708, 401.1556, 404.1852, 406.6372, 409.6822, 412.7796, 416.6078, 418.4916, 422.131, 424.5376, 428.1988, 432.211, 434.4502, 438.5282, 440.912, 444.0448, 447.7432, 450.8524, 453.7988, 456.7858, 458.8868, 463.9886, 466.5064, 468.9124, 472.6616, 475.4682, 478.582, 481.304, 485.2738, 488.6894, 490.329, 496.106, 497.6908, 501.1374, 504.5322, 506.8848, 510.3324, 513.4512, 516.179, 520.4412, 522.6066, 526.167, 528.7794, 533.379, 536.067, 538.46, 542.9116, 545.692, 547.9546, 552.493, 555.2722, 557.335, 562.449, 564.2014, 569.0738, 571.0974, 574.8564, 578.2996, 581.409, 583.9704, 585.8098, 589.6528, 594.5998, 595.958, 600.068, 603.3278, 608.2016, 609.9632, 612.864, 615.43, 620.7794, 621.272, 625.8644, 629.206, 633.219, 634.5154, 638.6102 +}; + +const double rawEstimateData_precision8[] = { + 184.2152, 187.2454, 190.2096, 193.6652, 196.6312, 199.6822, 203.249, 206.3296, 210.0038, 213.2074, 216.4612, 220.27, 223.5178, 227.4412, 230.8032, 234.1634, 238.1688, 241.6074, 245.6946, 249.2664, 252.8228, 257.0432, 260.6824, 264.9464, 268.6268, 272.2626, 276.8376, 280.4034, 284.8956, 288.8522, 292.7638, 297.3552, 301.3556, 305.7526, 309.9292, 313.8954, 318.8198, 322.7668, 327.298, 331.6688, 335.9466, 340.9746, 345.1672, 349.3474, 354.3028, 358.8912, 364.114, 368.4646, 372.9744, 378.4092, 382.6022, 
387.843, 392.5684, 397.1652, 402.5426, 407.4152, 412.5388, 417.3592, 422.1366, 427.486, 432.3918, 437.5076, 442.509, 447.3834, 453.3498, 458.0668, 463.7346, 469.1228, 473.4528, 479.7, 484.644, 491.0518, 495.5774, 500.9068, 506.432, 512.1666, 517.434, 522.6644, 527.4894, 533.6312, 538.3804, 544.292, 550.5496, 556.0234, 562.8206, 566.6146, 572.4188, 579.117, 583.6762, 590.6576, 595.7864, 601.509, 607.5334, 612.9204, 619.772, 624.2924, 630.8654, 636.1836, 642.745, 649.1316, 655.0386, 660.0136, 666.6342, 671.6196, 678.1866, 684.4282, 689.3324, 695.4794, 702.5038, 708.129, 713.528, 720.3204, 726.463, 732.7928, 739.123, 744.7418, 751.2192, 756.5102, 762.6066, 769.0184, 775.2224, 781.4014, 787.7618, 794.1436, 798.6506, 805.6378, 811.766, 819.7514, 824.5776, 828.7322, 837.8048, 843.6302, 849.9336, 854.4798, 861.3388, 867.9894, 873.8196, 880.3136, 886.2308, 892.4588, 899.0816, 905.4076, 912.0064, 917.3878, 923.619, 929.998, 937.3482, 943.9506, 947.991, 955.1144, 962.203, 968.8222, 975.7324, 981.7826, 988.7666, 994.2648, 1000.3128, 1007.4082, 1013.7536, 1020.3376, 1026.7156, 1031.7478, 1037.4292, 1045.393, 1051.2278, 1058.3434, 1062.8726, 1071.884, 1076.806, 1082.9176, 1089.1678, 1095.5032, 1102.525, 1107.2264, 1115.315, 1120.93, 1127.252, 1134.1496, 1139.0408, 1147.5448, 1153.3296, 1158.1974, 1166.5262, 1174.3328, 1175.657, 1184.4222, 1190.9172, 1197.1292, 1204.4606, 1210.4578, 1218.8728, 1225.3336, 1226.6592, 1236.5768, 1241.363, 1249.4074, 1254.6566, 1260.8014, 1266.5454, 1274.5192 +}; + +const double rawEstimateData_precision9[] = { + 369, 374.8294, 381.2452, 387.6698, 394.1464, 400.2024, 406.8782, 413.6598, 420.462, 427.2826, 433.7102, 440.7416, 447.9366, 455.1046, 462.285, 469.0668, 476.306, 483.8448, 491.301, 498.9886, 506.2422, 513.8138, 521.7074, 529.7428, 537.8402, 545.1664, 553.3534, 561.594, 569.6886, 577.7876, 585.65, 594.228, 602.8036, 611.1666, 620.0818, 628.0824, 637.2574, 646.302, 655.1644, 664.0056, 672.3802, 681.7192, 690.5234, 700.2084, 708.831, 718.485, 728.1112, 737.4764, 746.76, 756.3368, 766.5538, 775.5058, 785.2646, 795.5902, 804.3818, 814.8998, 824.9532, 835.2062, 845.2798, 854.4728, 864.9582, 875.3292, 886.171, 896.781, 906.5716, 916.7048, 927.5322, 937.875, 949.3972, 958.3464, 969.7274, 980.2834, 992.1444, 1003.4264, 1013.0166, 1024.018, 1035.0438, 1046.34, 1057.6856, 1068.9836, 1079.0312, 1091.677, 1102.3188, 1113.4846, 1124.4424, 1135.739, 1147.1488, 1158.9202, 1169.406, 1181.5342, 1193.2834, 1203.8954, 1216.3286, 1226.2146, 1239.6684, 1251.9946, 1262.123, 1275.4338, 1285.7378, 1296.076, 1308.9692, 1320.4964, 1333.0998, 1343.9864, 1357.7754, 1368.3208, 1380.4838, 1392.7388, 1406.0758, 1416.9098, 1428.9728, 1440.9228, 1453.9292, 1462.617, 1476.05, 1490.2996, 1500.6128, 1513.7392, 1524.5174, 1536.6322, 1548.2584, 1562.3766, 1572.423, 1587.1232, 1596.5164, 1610.5938, 1622.5972, 1633.1222, 1647.7674, 1658.5044, 1671.57, 1683.7044, 1695.4142, 1708.7102, 1720.6094, 1732.6522, 1747.841, 1756.4072, 1769.9786, 1782.3276, 1797.5216, 1808.3186, 1819.0694, 1834.354, 1844.575, 1856.2808, 1871.1288, 1880.7852, 1893.9622, 1906.3418, 1920.6548, 1932.9302, 1945.8584, 1955.473, 1968.8248, 1980.6446, 1995.9598, 2008.349, 2019.8556, 2033.0334, 2044.0206, 2059.3956, 2069.9174, 2082.6084, 2093.7036, 2106.6108, 2118.9124, 2132.301, 2144.7628, 2159.8422, 2171.0212, 2183.101, 2193.5112, 2208.052, 2221.3194, 2233.3282, 2247.295, 2257.7222, 2273.342, 2286.5638, 2299.6786, 2310.8114, 2322.3312, 2335.516, 2349.874, 2363.5968, 2373.865, 2387.1918, 2401.8328, 2414.8496, 2424.544, 2436.7592, 
2447.1682, 2464.1958, 2474.3438, 2489.0006, 2497.4526, 2513.6586, 2527.19, 2540.7028, 2553.768 +}; + +const double rawEstimateData_precision10[] = { + 738.1256, 750.4234, 763.1064, 775.4732, 788.4636, 801.0644, 814.488, 827.9654, 841.0832, 854.7864, 868.1992, 882.2176, 896.5228, 910.1716, 924.7752, 938.899, 953.6126, 968.6492, 982.9474, 998.5214, 1013.1064, 1028.6364, 1044.2468, 1059.4588, 1075.3832, 1091.0584, 1106.8606, 1123.3868, 1139.5062, 1156.1862, 1172.463, 1189.339, 1206.1936, 1223.1292, 1240.1854, 1257.2908, 1275.3324, 1292.8518, 1310.5204, 1328.4854, 1345.9318, 1364.552, 1381.4658, 1400.4256, 1419.849, 1438.152, 1456.8956, 1474.8792, 1494.118, 1513.62, 1532.5132, 1551.9322, 1570.7726, 1590.6086, 1610.5332, 1630.5918, 1650.4294, 1669.7662, 1690.4106, 1710.7338, 1730.9012, 1750.4486, 1770.1556, 1791.6338, 1812.7312, 1833.6264, 1853.9526, 1874.8742, 1896.8326, 1918.1966, 1939.5594, 1961.07, 1983.037, 2003.1804, 2026.071, 2047.4884, 2070.0848, 2091.2944, 2114.333, 2135.9626, 2158.2902, 2181.0814, 2202.0334, 2224.4832, 2246.39, 2269.7202, 2292.1714, 2314.2358, 2338.9346, 2360.891, 2384.0264, 2408.3834, 2430.1544, 2454.8684, 2476.9896, 2501.4368, 2522.8702, 2548.0408, 2570.6738, 2593.5208, 2617.0158, 2640.2302, 2664.0962, 2687.4986, 2714.2588, 2735.3914, 2759.6244, 2781.8378, 2808.0072, 2830.6516, 2856.2454, 2877.2136, 2903.4546, 2926.785, 2951.2294, 2976.468, 3000.867, 3023.6508, 3049.91, 3073.5984, 3098.162, 3121.5564, 3146.2328, 3170.9484, 3195.5902, 3221.3346, 3242.7032, 3271.6112, 3296.5546, 3317.7376, 3345.072, 3369.9518, 3394.326, 3418.1818, 3444.6926, 3469.086, 3494.2754, 3517.8698, 3544.248, 3565.3768, 3588.7234, 3616.979, 3643.7504, 3668.6812, 3695.72, 3719.7392, 3742.6224, 3770.4456, 3795.6602, 3819.9058, 3844.002, 3869.517, 3895.6824, 3920.8622, 3947.1364, 3973.985, 3995.4772, 4021.62, 4046.628, 4074.65, 4096.2256, 4121.831, 4146.6406, 4173.276, 4195.0744, 4223.9696, 4251.3708, 4272.9966, 4300.8046, 4326.302, 4353.1248, 4374.312, 4403.0322, 4426.819, 4450.0598, 4478.5206, 4504.8116, 4528.8928, 4553.9584, 4578.8712, 4603.8384, 4632.3872, 4655.5128, 4675.821, 4704.6222, 4731.9862, 4755.4174, 4781.2628, 4804.332, 4832.3048, 4862.8752, 4883.4148, 4906.9544, 4935.3516, 4954.3532, 4984.0248, 5011.217, 5035.3258, 5057.3672, 5084.1828 +}; + +const double rawEstimateData_precision11[] = { + 1477, 1501.6014, 1526.5802, 1551.7942, 1577.3042, 1603.2062, 1629.8402, 1656.2292, 1682.9462, 1709.9926, 1737.3026, 1765.4252, 1793.0578, 1821.6092, 1849.626, 1878.5568, 1908.527, 1937.5154, 1967.1874, 1997.3878, 2027.37, 2058.1972, 2089.5728, 2120.1012, 2151.9668, 2183.292, 2216.0772, 2247.8578, 2280.6562, 2313.041, 2345.714, 2380.3112, 2414.1806, 2447.9854, 2481.656, 2516.346, 2551.5154, 2586.8378, 2621.7448, 2656.6722, 2693.5722, 2729.1462, 2765.4124, 2802.8728, 2838.898, 2876.408, 2913.4926, 2951.4938, 2989.6776, 3026.282, 3065.7704, 3104.1012, 3143.7388, 3181.6876, 3221.1872, 3261.5048, 3300.0214, 3339.806, 3381.409, 3421.4144, 3461.4294, 3502.2286, 3544.651, 3586.6156, 3627.337, 3670.083, 3711.1538, 3753.5094, 3797.01, 3838.6686, 3882.1678, 3922.8116, 3967.9978, 4009.9204, 4054.3286, 4097.5706, 4140.6014, 4185.544, 4229.5976, 4274.583, 4316.9438, 4361.672, 4406.2786, 4451.8628, 4496.1834, 4543.505, 4589.1816, 4632.5188, 4678.2294, 4724.8908, 4769.0194, 4817.052, 4861.4588, 4910.1596, 4956.4344, 5002.5238, 5048.13, 5093.6374, 5142.8162, 5187.7894, 5237.3984, 5285.6078, 5331.0858, 5379.1036, 5428.6258, 5474.6018, 5522.7618, 5571.5822, 5618.59, 5667.9992, 5714.88, 5763.454, 5808.6982, 
5860.3644, 5910.2914, 5953.571, 6005.9232, 6055.1914, 6104.5882, 6154.5702, 6199.7036, 6251.1764, 6298.7596, 6350.0302, 6398.061, 6448.4694, 6495.933, 6548.0474, 6597.7166, 6646.9416, 6695.9208, 6742.6328, 6793.5276, 6842.1934, 6894.2372, 6945.3864, 6996.9228, 7044.2372, 7094.1374, 7142.2272, 7192.2942, 7238.8338, 7288.9006, 7344.0908, 7394.8544, 7443.5176, 7490.4148, 7542.9314, 7595.6738, 7641.9878, 7694.3688, 7743.0448, 7797.522, 7845.53, 7899.594, 7950.3132, 7996.455, 8050.9442, 8092.9114, 8153.1374, 8197.4472, 8252.8278, 8301.8728, 8348.6776, 8401.4698, 8453.551, 8504.6598, 8553.8944, 8604.1276, 8657.6514, 8710.3062, 8758.908, 8807.8706, 8862.1702, 8910.4668, 8960.77, 9007.2766, 9063.164, 9121.0534, 9164.1354, 9218.1594, 9267.767, 9319.0594, 9372.155, 9419.7126, 9474.3722, 9520.1338, 9572.368, 9622.7702, 9675.8448, 9726.5396, 9778.7378, 9827.6554, 9878.1922, 9928.7782, 9978.3984, 10026.578, 10076.5626, 10137.1618, 10177.5244, 10229.9176 +}; + +const double rawEstimateData_precision12[] = { + 2954, 3003.4782, 3053.3568, 3104.3666, 3155.324, 3206.9598, 3259.648, 3312.539, 3366.1474, 3420.2576, 3474.8376, 3530.6076, 3586.451, 3643.38, 3700.4104, 3757.5638, 3815.9676, 3875.193, 3934.838, 3994.8548, 4055.018, 4117.1742, 4178.4482, 4241.1294, 4304.4776, 4367.4044, 4431.8724, 4496.3732, 4561.4304, 4627.5326, 4693.949, 4761.5532, 4828.7256, 4897.6182, 4965.5186, 5034.4528, 5104.865, 5174.7164, 5244.6828, 5316.6708, 5387.8312, 5459.9036, 5532.476, 5604.8652, 5679.6718, 5753.757, 5830.2072, 5905.2828, 5980.0434, 6056.6264, 6134.3192, 6211.5746, 6290.0816, 6367.1176, 6447.9796, 6526.5576, 6606.1858, 6686.9144, 6766.1142, 6847.0818, 6927.9664, 7010.9096, 7091.0816, 7175.3962, 7260.3454, 7344.018, 7426.4214, 7511.3106, 7596.0686, 7679.8094, 7765.818, 7852.4248, 7936.834, 8022.363, 8109.5066, 8200.4554, 8288.5832, 8373.366, 8463.4808, 8549.7682, 8642.0522, 8728.3288, 8820.9528, 8907.727, 9001.0794, 9091.2522, 9179.988, 9269.852, 9362.6394, 9453.642, 9546.9024, 9640.6616, 9732.6622, 9824.3254, 9917.7484, 10007.9392, 10106.7508, 10196.2152, 10289.8114, 10383.5494, 10482.3064, 10576.8734, 10668.7872, 10764.7156, 10862.0196, 10952.793, 11049.9748, 11146.0702, 11241.4492, 11339.2772, 11434.2336, 11530.741, 11627.6136, 11726.311, 11821.5964, 11918.837, 12015.3724, 12113.0162, 12213.0424, 12306.9804, 12408.4518, 12504.8968, 12604.586, 12700.9332, 12798.705, 12898.5142, 12997.0488, 13094.788, 13198.475, 13292.7764, 13392.9698, 13486.8574, 13590.1616, 13686.5838, 13783.6264, 13887.2638, 13992.0978, 14081.0844, 14189.9956, 14280.0912, 14382.4956, 14486.4384, 14588.1082, 14686.2392, 14782.276, 14888.0284, 14985.1864, 15088.8596, 15187.0998, 15285.027, 15383.6694, 15495.8266, 15591.3736, 15694.2008, 15790.3246, 15898.4116, 15997.4522, 16095.5014, 16198.8514, 16291.7492, 16402.6424, 16499.1266, 16606.2436, 16697.7186, 16796.3946, 16902.3376, 17005.7672, 17100.814, 17206.8282, 17305.8262, 17416.0744, 17508.4092, 17617.0178, 17715.4554, 17816.758, 17920.1748, 18012.9236, 18119.7984, 18223.2248, 18324.2482, 18426.6276, 18525.0932, 18629.8976, 18733.2588, 18831.0466, 18940.1366, 19032.2696, 19131.729, 19243.4864, 19349.6932, 19442.866, 19547.9448, 19653.2798, 19754.4034, 19854.0692, 19965.1224, 20065.1774, 20158.2212, 20253.353, 20366.3264, 20463.22 +}; + +const double rawEstimateData_precision13[] = { + 5908.5052, 6007.2672, 6107.347, 6208.5794, 6311.2622, 6414.5514, 6519.3376, 6625.6952, 6732.5988, 6841.3552, 6950.5972, 7061.3082, 7173.5646, 7287.109, 7401.8216, 7516.4344, 7633.3802, 7751.2962, 7870.3784, 
7990.292, 8110.79, 8233.4574, 8356.6036, 8482.2712, 8607.7708, 8735.099, 8863.1858, 8993.4746, 9123.8496, 9255.6794, 9388.5448, 9522.7516, 9657.3106, 9792.6094, 9930.5642, 10068.794, 10206.7256, 10347.81, 10490.3196, 10632.0778, 10775.9916, 10920.4662, 11066.124, 11213.073, 11358.0362, 11508.1006, 11659.1716, 11808.7514, 11959.4884, 12112.1314, 12265.037, 12420.3756, 12578.933, 12734.311, 12890.0006, 13047.2144, 13207.3096, 13368.5144, 13528.024, 13689.847, 13852.7528, 14018.3168, 14180.5372, 14346.9668, 14513.5074, 14677.867, 14846.2186, 15017.4186, 15184.9716, 15356.339, 15529.2972, 15697.3578, 15871.8686, 16042.187, 16216.4094, 16389.4188, 16565.9126, 16742.3272, 16919.0042, 17094.7592, 17273.965, 17451.8342, 17634.4254, 17810.5984, 17988.9242, 18171.051, 18354.7938, 18539.466, 18721.0408, 18904.9972, 19081.867, 19271.9118, 19451.8694, 19637.9816, 19821.2922, 20013.1292, 20199.3858, 20387.8726, 20572.9514, 20770.7764, 20955.1714, 21144.751, 21329.9952, 21520.709, 21712.7016, 21906.3868, 22096.2626, 22286.0524, 22475.051, 22665.5098, 22862.8492, 23055.5294, 23249.6138, 23437.848, 23636.273, 23826.093, 24020.3296, 24213.3896, 24411.7392, 24602.9614, 24805.7952, 24998.1552, 25193.9588, 25389.0166, 25585.8392, 25780.6976, 25981.2728, 26175.977, 26376.5252, 26570.1964, 26773.387, 26962.9812, 27163.0586, 27368.164, 27565.0534, 27758.7428, 27961.1276, 28163.2324, 28362.3816, 28565.7668, 28758.644, 28956.9768, 29163.4722, 29354.7026, 29561.1186, 29767.9948, 29959.9986, 30164.0492, 30366.9818, 30562.5338, 30762.9928, 30976.1592, 31166.274, 31376.722, 31570.3734, 31770.809, 31974.8934, 32179.5286, 32387.5442, 32582.3504, 32794.076, 32989.9528, 33191.842, 33392.4684, 33595.659, 33801.8672, 34000.3414, 34200.0922, 34402.6792, 34610.0638, 34804.0084, 35011.13, 35218.669, 35418.6634, 35619.0792, 35830.6534, 36028.4966, 36229.7902, 36438.6422, 36630.7764, 36833.3102, 37048.6728, 37247.3916, 37453.5904, 37669.3614, 37854.5526, 38059.305, 38268.0936, 38470.2516, 38674.7064, 38876.167, 39068.3794, 39281.9144, 39492.8566, 39684.8628, 39898.4108, 40093.1836, 40297.6858, 40489.7086, 40717.2424 +}; + +const double rawEstimateData_precision14[] = { + 11817.475, 12015.0046, 12215.3792, 12417.7504, 12623.1814, 12830.0086, 13040.0072, 13252.503, 13466.178, 13683.2738, 13902.0344, 14123.9798, 14347.394, 14573.7784, 14802.6894, 15033.6824, 15266.9134, 15502.8624, 15741.4944, 15980.7956, 16223.8916, 16468.6316, 16715.733, 16965.5726, 17217.204, 17470.666, 17727.8516, 17986.7886, 18247.6902, 18510.9632, 18775.304, 19044.7486, 19314.4408, 19587.202, 19862.2576, 20135.924, 20417.0324, 20697.9788, 20979.6112, 21265.0274, 21550.723, 21841.6906, 22132.162, 22428.1406, 22722.127, 23020.5606, 23319.7394, 23620.4014, 23925.2728, 24226.9224, 24535.581, 24845.505, 25155.9618, 25470.3828, 25785.9702, 26103.7764, 26420.4132, 26742.0186, 27062.8852, 27388.415, 27714.6024, 28042.296, 28365.4494, 28701.1526, 29031.8008, 29364.2156, 29704.497, 30037.1458, 30380.111, 30723.8168, 31059.5114, 31404.9498, 31751.6752, 32095.2686, 32444.7792, 32794.767, 33145.204, 33498.4226, 33847.6502, 34209.006, 34560.849, 34919.4838, 35274.9778, 35635.1322, 35996.3266, 36359.1394, 36722.8266, 37082.8516, 37447.7354, 37815.9606, 38191.0692, 38559.4106, 38924.8112, 39294.6726, 39663.973, 40042.261, 40416.2036, 40779.2036, 41161.6436, 41540.9014, 41921.1998, 42294.7698, 42678.5264, 43061.3464, 43432.375, 43818.432, 44198.6598, 44583.0138, 44970.4794, 45353.924, 45729.858, 46118.2224, 46511.5724, 46900.7386, 47280.6964, 47668.1472, 48055.6796, 
48446.9436, 48838.7146, 49217.7296, 49613.7796, 50010.7508, 50410.0208, 50793.7886, 51190.2456, 51583.1882, 51971.0796, 52376.5338, 52763.319, 53165.5534, 53556.5594, 53948.2702, 54346.352, 54748.7914, 55138.577, 55543.4824, 55941.1748, 56333.7746, 56745.1552, 57142.7944, 57545.2236, 57935.9956, 58348.5268, 58737.5474, 59158.5962, 59542.6896, 59958.8004, 60349.3788, 60755.0212, 61147.6144, 61548.194, 61946.0696, 62348.6042, 62763.603, 63162.781, 63560.635, 63974.3482, 64366.4908, 64771.5876, 65176.7346, 65597.3916, 65995.915, 66394.0384, 66822.9396, 67203.6336, 67612.2032, 68019.0078, 68420.0388, 68821.22, 69235.8388, 69640.0724, 70055.155, 70466.357, 70863.4266, 71276.2482, 71677.0306, 72080.2006, 72493.0214, 72893.5952, 73314.5856, 73714.9852, 74125.3022, 74521.2122, 74933.6814, 75341.5904, 75743.0244, 76166.0278, 76572.1322, 76973.1028, 77381.6284, 77800.6092, 78189.328, 78607.0962, 79012.2508, 79407.8358, 79825.725, 80238.701, 80646.891, 81035.6436, 81460.0448, 81876.3884 +}; + +const double rawEstimateData_precision15[] = { + 23635.0036, 24030.8034, 24431.4744, 24837.1524, 25246.7928, 25661.326, 26081.3532, 26505.2806, 26933.9892, 27367.7098, 27805.318, 28248.799, 28696.4382, 29148.8244, 29605.5138, 30066.8668, 30534.2344, 31006.32, 31480.778, 31962.2418, 32447.3324, 32938.0232, 33432.731, 33930.728, 34433.9896, 34944.1402, 35457.5588, 35974.5958, 36497.3296, 37021.9096, 37554.326, 38088.0826, 38628.8816, 39171.3192, 39723.2326, 40274.5554, 40832.3142, 41390.613, 41959.5908, 42532.5466, 43102.0344, 43683.5072, 44266.694, 44851.2822, 45440.7862, 46038.0586, 46640.3164, 47241.064, 47846.155, 48454.7396, 49076.9168, 49692.542, 50317.4778, 50939.65, 51572.5596, 52210.2906, 52843.7396, 53481.3996, 54127.236, 54770.406, 55422.6598, 56078.7958, 56736.7174, 57397.6784, 58064.5784, 58730.308, 59404.9784, 60077.0864, 60751.9158, 61444.1386, 62115.817, 62808.7742, 63501.4774, 64187.5454, 64883.6622, 65582.7468, 66274.5318, 66976.9276, 67688.7764, 68402.138, 69109.6274, 69822.9706, 70543.6108, 71265.5202, 71983.3848, 72708.4656, 73433.384, 74158.4664, 74896.4868, 75620.9564, 76362.1434, 77098.3204, 77835.7662, 78582.6114, 79323.9902, 80067.8658, 80814.9246, 81567.0136, 82310.8536, 83061.9952, 83821.4096, 84580.8608, 85335.547, 86092.5802, 86851.6506, 87612.311, 88381.2016, 89146.3296, 89907.8974, 90676.846, 91451.4152, 92224.5518, 92995.8686, 93763.5066, 94551.2796, 95315.1944, 96096.1806, 96881.0918, 97665.679, 98442.68, 99229.3002, 100011.0994, 100790.6386, 101580.1564, 102377.7484, 103152.1392, 103944.2712, 104730.216, 105528.6336, 106324.9398, 107117.6706, 107890.3988, 108695.2266, 109485.238, 110294.7876, 111075.0958, 111878.0496, 112695.2864, 113464.5486, 114270.0474, 115068.608, 115884.3626, 116673.2588, 117483.3716, 118275.097, 119085.4092, 119879.2808, 120687.5868, 121499.9944, 122284.916, 123095.9254, 123912.5038, 124709.0454, 125503.7182, 126323.259, 127138.9412, 127943.8294, 128755.646, 129556.5354, 130375.3298, 131161.4734, 131971.1962, 132787.5458, 133588.1056, 134431.351, 135220.2906, 136023.398, 136846.6558, 137667.0004, 138463.663, 139283.7154, 140074.6146, 140901.3072, 141721.8548, 142543.2322, 143356.1096, 144173.7412, 144973.0948, 145794.3162, 146609.5714, 147420.003, 148237.9784, 149050.5696, 149854.761, 150663.1966, 151494.0754, 152313.1416, 153112.6902, 153935.7206, 154746.9262, 155559.547, 156401.9746, 157228.7036, 158008.7254, 158820.75, 159646.9184, 160470.4458, 161279.5348, 162093.3114, 162918.542, 163729.2842 +}; + +const double rawEstimateData_precision16[] = { + 
47271, 48062.3584, 48862.7074, 49673.152, 50492.8416, 51322.9514, 52161.03, 53009.407, 53867.6348, 54734.206, 55610.5144, 56496.2096, 57390.795, 58297.268, 59210.6448, 60134.665, 61068.0248, 62010.4472, 62962.5204, 63923.5742, 64895.0194, 65876.4182, 66862.6136, 67862.6968, 68868.8908, 69882.8544, 70911.271, 71944.0924, 72990.0326, 74040.692, 75100.6336, 76174.7826, 77252.5998, 78340.2974, 79438.2572, 80545.4976, 81657.2796, 82784.6336, 83915.515, 85059.7362, 86205.9368, 87364.4424, 88530.3358, 89707.3744, 90885.9638, 92080.197, 93275.5738, 94479.391, 95695.918, 96919.2236, 98148.4602, 99382.3474, 100625.6974, 101878.0284, 103141.6278, 104409.4588, 105686.2882, 106967.5402, 108261.6032, 109548.1578, 110852.0728, 112162.231, 113479.0072, 114806.2626, 116137.9072, 117469.5048, 118813.5186, 120165.4876, 121516.2556, 122875.766, 124250.5444, 125621.2222, 127003.2352, 128387.848, 129775.2644, 131181.7776, 132577.3086, 133979.9458, 135394.1132, 136800.9078, 138233.217, 139668.5308, 141085.212, 142535.2122, 143969.0684, 145420.2872, 146878.1542, 148332.7572, 149800.3202, 151269.66, 152743.6104, 154213.0948, 155690.288, 157169.4246, 158672.1756, 160160.059, 161650.6854, 163145.7772, 164645.6726, 166159.1952, 167682.1578, 169177.3328, 170700.0118, 172228.8964, 173732.6664, 175265.5556, 176787.799, 178317.111, 179856.6914, 181400.865, 182943.4612, 184486.742, 186033.4698, 187583.7886, 189148.1868, 190688.4526, 192250.1926, 193810.9042, 195354.2972, 196938.7682, 198493.5898, 200079.2824, 201618.912, 203205.5492, 204765.5798, 206356.1124, 207929.3064, 209498.7196, 211086.229, 212675.1324, 214256.7892, 215826.2392, 217412.8474, 218995.6724, 220618.6038, 222207.1166, 223781.0364, 225387.4332, 227005.7928, 228590.4336, 230217.8738, 231805.1054, 233408.9, 234995.3432, 236601.4956, 238190.7904, 239817.2548, 241411.2832, 243002.4066, 244640.1884, 246255.3128, 247849.3508, 249479.9734, 251106.8822, 252705.027, 254332.9242, 255935.129, 257526.9014, 259154.772, 260777.625, 262390.253, 264004.4906, 265643.59, 267255.4076, 268873.426, 270470.7252, 272106.4804, 273722.4456, 275337.794, 276945.7038, 278592.9154, 280204.3726, 281841.1606, 283489.171, 285130.1716, 286735.3362, 288364.7164, 289961.1814, 291595.5524, 293285.683, 294899.6668, 296499.3434, 298128.0462, 299761.8946, 301394.2424, 302997.6748, 304615.1478, 306269.7724, 307886.114, 309543.1028, 311153.2862, 312782.8546, 314421.2008, 316033.2438, 317692.9636, 319305.2648, 320948.7406, 322566.3364, 324228.4224, 325847.1542 +}; + +const double rawEstimateData_precision17[] = { + 94542, 96125.811, 97728.019, 99348.558, 100987.9705, 102646.7565, 104324.5125, 106021.7435, 107736.7865, 109469.272, 111223.9465, 112995.219, 114787.432, 116593.152, 118422.71, 120267.2345, 122134.6765, 124020.937, 125927.2705, 127851.255, 129788.9485, 131751.016, 133726.8225, 135722.592, 137736.789, 139770.568, 141821.518, 143891.343, 145982.1415, 148095.387, 150207.526, 152355.649, 154515.6415, 156696.05, 158887.7575, 161098.159, 163329.852, 165569.053, 167837.4005, 170121.6165, 172420.4595, 174732.6265, 177062.77, 179412.502, 181774.035, 184151.939, 186551.6895, 188965.691, 191402.8095, 193857.949, 196305.0775, 198774.6715, 201271.2585, 203764.78, 206299.3695, 208818.1365, 211373.115, 213946.7465, 216532.076, 219105.541, 221714.5375, 224337.5135, 226977.5125, 229613.0655, 232270.2685, 234952.2065, 237645.3555, 240331.1925, 243034.517, 245756.0725, 248517.6865, 251232.737, 254011.3955, 256785.995, 259556.44, 262368.335, 265156.911, 267965.266, 270785.583, 273616.0495, 276487.4835, 
279346.639, 282202.509, 285074.3885, 287942.2855, 290856.018, 293774.0345, 296678.5145, 299603.6355, 302552.6575, 305492.9785, 308466.8605, 311392.581, 314347.538, 317319.4295, 320285.9785, 323301.7325, 326298.3235, 329301.3105, 332301.987, 335309.791, 338370.762, 341382.923, 344431.1265, 347464.1545, 350507.28, 353619.2345, 356631.2005, 359685.203, 362776.7845, 365886.488, 368958.2255, 372060.6825, 375165.4335, 378237.935, 381328.311, 384430.5225, 387576.425, 390683.242, 393839.648, 396977.8425, 400101.9805, 403271.296, 406409.8425, 409529.5485, 412678.7, 415847.423, 419020.8035, 422157.081, 425337.749, 428479.6165, 431700.902, 434893.1915, 438049.582, 441210.5415, 444379.2545, 447577.356, 450741.931, 453959.548, 457137.0935, 460329.846, 463537.4815, 466732.3345, 469960.5615, 473164.681, 476347.6345, 479496.173, 482813.1645, 486025.6995, 489249.4885, 492460.1945, 495675.8805, 498908.0075, 502131.802, 505374.3855, 508550.9915, 511806.7305, 515026.776, 518217.0005, 521523.9855, 524705.9855, 527950.997, 531210.0265, 534472.497, 537750.7315, 540926.922, 544207.094, 547429.4345, 550666.3745, 553975.3475, 557150.7185, 560399.6165, 563662.697, 566916.7395, 570146.1215, 573447.425, 576689.6245, 579874.5745, 583202.337, 586503.0255, 589715.635, 592910.161, 596214.3885, 599488.035, 602740.92, 605983.0685, 609248.67, 612491.3605, 615787.912, 619107.5245, 622307.9555, 625577.333, 628840.4385, 632085.2155, 635317.6135, 638691.7195, 641887.467, 645139.9405, 648441.546, 651666.252, 654941.845 +}; + +const double rawEstimateData_precision18[] = { + 189084, 192250.913, 195456.774, 198696.946, 201977.762, 205294.444, 208651.754, 212042.099, 215472.269, 218941.91, 222443.912, 225996.845, 229568.199, 233193.568, 236844.457, 240543.233, 244279.475, 248044.27, 251854.588, 255693.2, 259583.619, 263494.621, 267445.385, 271454.061, 275468.769, 279549.456, 283646.446, 287788.198, 291966.099, 296181.164, 300431.469, 304718.618, 309024.004, 313393.508, 317760.803, 322209.731, 326675.061, 331160.627, 335654.47, 340241.442, 344841.833, 349467.132, 354130.629, 358819.432, 363574.626, 368296.587, 373118.482, 377914.93, 382782.301, 387680.669, 392601.981, 397544.323, 402529.115, 407546.018, 412593.658, 417638.657, 422762.865, 427886.169, 433017.167, 438213.273, 443441.254, 448692.421, 453937.533, 459239.049, 464529.569, 469910.083, 475274.03, 480684.473, 486070.26, 491515.237, 496995.651, 502476.617, 507973.609, 513497.19, 519083.233, 524726.509, 530305.505, 535945.728, 541584.404, 547274.055, 552967.236, 558667.862, 564360.216, 570128.148, 575965.08, 581701.952, 587532.523, 593361.144, 599246.128, 605033.418, 610958.779, 616837.117, 622772.818, 628672.04, 634675.369, 640574.831, 646585.739, 652574.547, 658611.217, 664642.684, 670713.914, 676737.681, 682797.313, 688837.897, 694917.874, 701009.882, 707173.648, 713257.254, 719415.392, 725636.761, 731710.697, 737906.209, 744103.074, 750313.39, 756504.185, 762712.579, 768876.985, 775167.859, 781359, 787615.959, 793863.597, 800245.477, 806464.582, 812785.294, 819005.925, 825403.057, 831676.197, 837936.284, 844266.968, 850642.711, 856959.756, 863322.774, 869699.931, 876102.478, 882355.787, 888694.463, 895159.952, 901536.143, 907872.631, 914293.672, 920615.14, 927130.974, 933409.404, 939922.178, 946331.47, 952745.93, 959209.264, 965590.224, 972077.284, 978501.961, 984953.19, 991413.271, 997817.479, 1004222.658, 1010725.676, 1017177.138, 1023612.529, 1030098.236, 1036493.719, 1043112.207, 1049537.036, 1056008.096, 1062476.184, 1068942.337, 1075524.95, 1081932.864, 1088426.025, 
1094776.005, 1101327.448, 1107901.673, 1114423.639, 1120884.602, 1127324.923, 1133794.24, 1140328.886, 1146849.376, 1153346.682, 1159836.502, 1166478.703, 1172953.304, 1179391.502, 1185950.982, 1192544.052, 1198913.41, 1205430.994, 1212015.525, 1218674.042, 1225121.683, 1231551.101, 1238126.379, 1244673.795, 1251260.649, 1257697.86, 1264320.983, 1270736.319, 1277274.694, 1283804.95, 1290211.514, 1296858.568, 1303455.691 +}; + + +const double biasData_precision4[] = { + 10, 9.717, 9.207, 8.7896, 8.2882, 7.8204, 7.3772, 6.9342, 6.5202, 6.161, 5.7722, 5.4636, 5.0396, 4.6766, 4.3566, 4.0454, 3.7936, 3.4856, 3.2666, 2.9946, 2.766, 2.4692, 2.3638, 2.0764, 1.7864, 1.7602, 1.4814, 1.433, 1.2926, 1.0664, 0.999600000000001, 0.7956, 0.5366, 0.589399999999998, 0.573799999999999, 0.269799999999996, 0.368200000000002, 0.0544000000000011, 0.234200000000001, 0.0108000000000033, -0.203400000000002, -0.0701999999999998, -0.129600000000003, -0.364199999999997, -0.480600000000003, -0.226999999999997, -0.322800000000001, -0.382599999999996, -0.511200000000002, -0.669600000000003, -0.749400000000001, -0.500399999999999, -0.617600000000003, -0.6922, -0.601599999999998, -0.416200000000003, -0.338200000000001, -0.782600000000002, -0.648600000000002, -0.919800000000002, -0.851799999999997, -0.962400000000002, -0.6402, -1.1922, -1.0256, -1.086, -1.21899999999999, -0.819400000000002, -0.940600000000003, -1.1554, -1.2072, -1.1752, -1.16759999999999, -1.14019999999999, -1.3754, -1.29859999999999, -1.607, -1.3292, -1.7606 +}; + +const double biasData_precision5[] = { + 22, 21.1194, 20.8208, 20.2318, 19.77, 19.2436, 18.7774, 18.2848, 17.8224, 17.3742, 16.9336, 16.503, 16.0494, 15.6292, 15.2124, 14.798, 14.367, 13.9728, 13.5944, 13.217, 12.8438, 12.3696, 12.0956, 11.7044, 11.324, 11.0668, 10.6698, 10.3644, 10.049, 9.6918, 9.4146, 9.082, 8.687, 8.5398, 8.2462, 7.857, 7.6606, 7.4168, 7.1248, 6.9222, 6.6804, 6.447, 6.3454, 5.9594, 5.7636, 5.5776, 5.331, 5.19, 4.9676, 4.7564, 4.5314, 4.4442, 4.3708, 3.9774, 3.9624, 3.8796, 3.755, 3.472, 3.2076, 3.1024, 2.8908, 2.7338, 2.7728, 2.629, 2.413, 2.3266, 2.1524, 2.2642, 2.1806, 2.0566, 1.9192, 1.7598, 1.3516, 1.5802, 1.43859999999999, 1.49160000000001, 1.1524, 1.1892, 0.841399999999993, 0.879800000000003, 0.837599999999995, 0.469800000000006, 0.765600000000006, 0.331000000000003, 0.591399999999993, 0.601200000000006, 0.701599999999999, 0.558199999999999, 0.339399999999998, 0.354399999999998, 0.491200000000006, 0.308000000000007, 0.355199999999996, -0.0254000000000048, 0.205200000000005, -0.272999999999996, 0.132199999999997, 0.394400000000005, -0.241200000000006, 0.242000000000004, 0.191400000000002, 0.253799999999998, -0.122399999999999, -0.370800000000003, 0.193200000000004, -0.0848000000000013, 0.0867999999999967, -0.327200000000005, -0.285600000000002, 0.311400000000006, -0.128399999999999, -0.754999999999995, -0.209199999999996, -0.293599999999998, -0.364000000000004, -0.253600000000006, -0.821200000000005, -0.253600000000006, -0.510400000000004, -0.383399999999995, -0.491799999999998, -0.220200000000006, -0.0972000000000008, -0.557400000000001, -0.114599999999996, -0.295000000000002, -0.534800000000004, 0.346399999999988, -0.65379999999999, 0.0398000000000138, 0.0341999999999985, -0.995800000000003, -0.523400000000009, -0.489000000000004, -0.274799999999999, -0.574999999999989, -0.482799999999997, 0.0571999999999946, -0.330600000000004, -0.628800000000012, -0.140199999999993, -0.540600000000012, -0.445999999999998, -0.599400000000003, -0.262599999999992, 0.163399999999996, 
-0.100599999999986, -0.39500000000001, -1.06960000000001, -0.836399999999998, -0.753199999999993, -0.412399999999991, -0.790400000000005, -0.29679999999999, -0.28540000000001, -0.193000000000012, -0.0772000000000048, -0.962799999999987, -0.414800000000014 +}; + +const double biasData_precision6[] = { + 45, 44.1902, 43.271, 42.8358, 41.8142, 41.2854, 40.317, 39.354, 38.8924, 37.9436, 37.4596, 36.5262, 35.6248, 35.1574, 34.2822, 33.837, 32.9636, 32.074, 31.7042, 30.7976, 30.4772, 29.6564, 28.7942, 28.5004, 27.686, 27.291, 26.5672, 25.8556, 25.4982, 24.8204, 24.4252, 23.7744, 23.0786, 22.8344, 22.0294, 21.8098, 21.0794, 20.5732, 20.1878, 19.5648, 19.2902, 18.6784, 18.3352, 17.8946, 17.3712, 17.0852, 16.499, 16.2686, 15.6844, 15.2234, 14.9732, 14.3356, 14.2286, 13.7262, 13.3284, 13.1048, 12.5962, 12.3562, 12.1272, 11.4184, 11.4974, 11.0822, 10.856, 10.48, 10.2834, 10.0208, 9.637, 9.51739999999999, 9.05759999999999, 8.74760000000001, 8.42700000000001, 8.1326, 8.2372, 8.2788, 7.6776, 7.79259999999999, 7.1952, 6.9564, 6.6454, 6.87, 6.5428, 6.19999999999999, 6.02940000000001, 5.62780000000001, 5.6782, 5.792, 5.35159999999999, 5.28319999999999, 5.0394, 5.07480000000001, 4.49119999999999, 4.84899999999999, 4.696, 4.54040000000001, 4.07300000000001, 4.37139999999999, 3.7216, 3.7328, 3.42080000000001, 3.41839999999999, 3.94239999999999, 3.27719999999999, 3.411, 3.13079999999999, 2.76900000000001, 2.92580000000001, 2.68279999999999, 2.75020000000001, 2.70599999999999, 2.3886, 3.01859999999999, 2.45179999999999, 2.92699999999999, 2.41720000000001, 2.41139999999999, 2.03299999999999, 2.51240000000001, 2.5564, 2.60079999999999, 2.41720000000001, 1.80439999999999, 1.99700000000001, 2.45480000000001, 1.8948, 2.2346, 2.30860000000001, 2.15479999999999, 1.88419999999999, 1.6508, 0.677199999999999, 1.72540000000001, 1.4752, 1.72280000000001, 1.66139999999999, 1.16759999999999, 1.79300000000001, 1.00059999999999, 0.905200000000008, 0.659999999999997, 1.55879999999999, 1.1636, 0.688199999999995, 0.712600000000009, 0.450199999999995, 1.1978, 0.975599999999986, 0.165400000000005, 1.727, 1.19739999999999, -0.252600000000001, 1.13460000000001, 1.3048, 1.19479999999999, 0.313400000000001, 0.878999999999991, 1.12039999999999, 0.853000000000009, 1.67920000000001, 0.856999999999999, 0.448599999999999, 1.2362, 0.953399999999988, 1.02859999999998, 0.563199999999995, 0.663000000000011, 0.723000000000013, 0.756599999999992, 0.256599999999992, -0.837600000000009, 0.620000000000005, 0.821599999999989, 0.216600000000028, 0.205600000000004, 0.220199999999977, 0.372599999999977, 0.334400000000016, 0.928400000000011, 0.972800000000007, 0.192400000000021, 0.487199999999973, -0.413000000000011, 0.807000000000016, 0.120600000000024, 0.769000000000005, 0.870799999999974, 0.66500000000002, 0.118200000000002, 0.401200000000017, 0.635199999999998, 0.135400000000004, 0.175599999999974, 1.16059999999999, 0.34620000000001, 0.521400000000028, -0.586599999999976, -1.16480000000001, 0.968399999999974, 0.836999999999989, 0.779600000000016, 0.985799999999983 +}; + +const double biasData_precision7[] = { + 91, 89.4934, 87.9758, 86.4574, 84.9718, 83.4954, 81.5302, 80.0756, 78.6374, 77.1782, 75.7888, 73.9522, 72.592, 71.2532, 69.9086, 68.5938, 66.9474, 65.6796, 64.4394, 63.2176, 61.9768, 60.4214, 59.2528, 58.0102, 56.8658, 55.7278, 54.3044, 53.1316, 52.093, 51.0032, 49.9092, 48.6306, 47.5294, 46.5756, 45.6508, 44.662, 43.552, 42.3724, 41.617, 40.5754, 39.7872, 38.8444, 37.7988, 36.8606, 36.2118, 35.3566, 34.4476, 33.5882, 32.6816, 32.0824, 
31.0258, 30.6048, 29.4436, 28.7274, 27.957, 27.147, 26.4364, 25.7592, 25.3386, 24.781, 23.8028, 23.656, 22.6544, 21.996, 21.4718, 21.1544, 20.6098, 19.5956, 19.0616, 18.5758, 18.4878, 17.5244, 17.2146, 16.724, 15.8722, 15.5198, 15.0414, 14.941, 14.9048, 13.87, 13.4304, 13.028, 12.4708, 12.37, 12.0624, 11.4668, 11.5532, 11.4352, 11.2564, 10.2744, 10.2118, 9.74720000000002, 10.1456, 9.2928, 8.75040000000001, 8.55279999999999, 8.97899999999998, 8.21019999999999, 8.18340000000001, 7.3494, 7.32499999999999, 7.66140000000001, 6.90300000000002, 7.25439999999998, 6.9042, 7.21499999999997, 6.28640000000001, 6.08139999999997, 6.6764, 6.30099999999999, 5.13900000000001, 5.65800000000002, 5.17320000000001, 4.59019999999998, 4.9538, 5.08280000000002, 4.92200000000003, 4.99020000000002, 4.7328, 5.4538, 4.11360000000002, 4.22340000000003, 4.08780000000002, 3.70800000000003, 4.15559999999999, 4.18520000000001, 3.63720000000001, 3.68220000000002, 3.77960000000002, 3.6078, 2.49160000000001, 3.13099999999997, 2.5376, 3.19880000000001, 3.21100000000001, 2.4502, 3.52820000000003, 2.91199999999998, 3.04480000000001, 2.7432, 2.85239999999999, 2.79880000000003, 2.78579999999999, 1.88679999999999, 2.98860000000002, 2.50639999999999, 1.91239999999999, 2.66160000000002, 2.46820000000002, 1.58199999999999, 1.30399999999997, 2.27379999999999, 2.68939999999998, 1.32900000000001, 3.10599999999999, 1.69080000000002, 2.13740000000001, 2.53219999999999, 1.88479999999998, 1.33240000000001, 1.45119999999997, 1.17899999999997, 2.44119999999998, 1.60659999999996, 2.16700000000003, 0.77940000000001, 2.37900000000002, 2.06700000000001, 1.46000000000004, 2.91160000000002, 1.69200000000001, 0.954600000000028, 2.49300000000005, 2.2722, 1.33500000000004, 2.44899999999996, 1.20140000000004, 3.07380000000001, 2.09739999999999, 2.85640000000001, 2.29960000000005, 2.40899999999999, 1.97040000000004, 0.809799999999996, 1.65279999999996, 2.59979999999996, 0.95799999999997, 2.06799999999998, 2.32780000000002, 4.20159999999998, 1.96320000000003, 1.86400000000003, 1.42999999999995, 3.77940000000001, 1.27200000000005, 1.86440000000005, 2.20600000000002, 3.21900000000005, 1.5154, 2.61019999999996 +}; + +const double biasData_precision8[] = { + 183.2152, 180.2454, 177.2096, 173.6652, 170.6312, 167.6822, 164.249, 161.3296, 158.0038, 155.2074, 152.4612, 149.27, 146.5178, 143.4412, 140.8032, 138.1634, 135.1688, 132.6074, 129.6946, 127.2664, 124.8228, 122.0432, 119.6824, 116.9464, 114.6268, 112.2626, 109.8376, 107.4034, 104.8956, 102.8522, 100.7638, 98.3552, 96.3556, 93.7526, 91.9292, 89.8954, 87.8198, 85.7668, 83.298, 81.6688, 79.9466, 77.9746, 76.1672, 74.3474, 72.3028, 70.8912, 69.114, 67.4646, 65.9744, 64.4092, 62.6022, 60.843, 59.5684, 58.1652, 56.5426, 55.4152, 53.5388, 52.3592, 51.1366, 49.486, 48.3918, 46.5076, 45.509, 44.3834, 43.3498, 42.0668, 40.7346, 40.1228, 38.4528, 37.7, 36.644, 36.0518, 34.5774, 33.9068, 32.432, 32.1666, 30.434, 29.6644, 28.4894, 27.6312, 26.3804, 26.292, 25.5496000000001, 25.0234, 24.8206, 22.6146, 22.4188, 22.117, 20.6762, 20.6576, 19.7864, 19.509, 18.5334, 17.9204, 17.772, 16.2924, 16.8654, 15.1836, 15.745, 15.1316, 15.0386, 14.0136, 13.6342, 12.6196, 12.1866, 12.4281999999999, 11.3324, 10.4794000000001, 11.5038, 10.129, 9.52800000000002, 10.3203999999999, 9.46299999999997, 9.79280000000006, 9.12300000000005, 8.74180000000001, 9.2192, 7.51020000000005, 7.60659999999996, 7.01840000000004, 7.22239999999999, 7.40139999999997, 6.76179999999999, 7.14359999999999, 5.65060000000005, 5.63779999999997, 5.76599999999996, 
6.75139999999999, 5.57759999999996, 3.73220000000003, 5.8048, 5.63019999999995, 4.93359999999996, 3.47979999999995, 4.33879999999999, 3.98940000000005, 3.81960000000004, 3.31359999999995, 3.23080000000004, 3.4588, 3.08159999999998, 3.4076, 3.00639999999999, 2.38779999999997, 2.61900000000003, 1.99800000000005, 3.34820000000002, 2.95060000000001, 0.990999999999985, 2.11440000000005, 2.20299999999997, 2.82219999999995, 2.73239999999998, 2.7826, 3.76660000000004, 2.26480000000004, 2.31280000000004, 2.40819999999997, 2.75360000000001, 3.33759999999995, 2.71559999999999, 1.7478000000001, 1.42920000000004, 2.39300000000003, 2.22779999999989, 2.34339999999997, 0.87259999999992, 3.88400000000001, 1.80600000000004, 1.91759999999999, 1.16779999999994, 1.50320000000011, 2.52500000000009, 0.226400000000012, 2.31500000000005, 0.930000000000064, 1.25199999999995, 2.14959999999996, 0.0407999999999902, 2.5447999999999, 1.32960000000003, 0.197400000000016, 2.52620000000002, 3.33279999999991, -1.34300000000007, 0.422199999999975, 0.917200000000093, 1.12920000000008, 1.46060000000011, 1.45779999999991, 2.8728000000001, 3.33359999999993, -1.34079999999994, 1.57680000000005, 0.363000000000056, 1.40740000000005, 0.656600000000026, 0.801400000000058, -0.454600000000028, 1.51919999999996 +}; + +const double biasData_precision9[] = { + 368, 361.8294, 355.2452, 348.6698, 342.1464, 336.2024, 329.8782, 323.6598, 317.462, 311.2826, 305.7102, 299.7416, 293.9366, 288.1046, 282.285, 277.0668, 271.306, 265.8448, 260.301, 254.9886, 250.2422, 244.8138, 239.7074, 234.7428, 229.8402, 225.1664, 220.3534, 215.594, 210.6886, 205.7876, 201.65, 197.228, 192.8036, 188.1666, 184.0818, 180.0824, 176.2574, 172.302, 168.1644, 164.0056, 160.3802, 156.7192, 152.5234, 149.2084, 145.831, 142.485, 139.1112, 135.4764, 131.76, 129.3368, 126.5538, 122.5058, 119.2646, 116.5902, 113.3818, 110.8998, 107.9532, 105.2062, 102.2798, 99.4728, 96.9582, 94.3292, 92.171, 89.7809999999999, 87.5716, 84.7048, 82.5322, 79.875, 78.3972, 75.3464, 73.7274, 71.2834, 70.1444, 68.4263999999999, 66.0166, 64.018, 62.0437999999999, 60.3399999999999, 58.6856, 57.9836, 55.0311999999999, 54.6769999999999, 52.3188, 51.4846, 49.4423999999999, 47.739, 46.1487999999999, 44.9202, 43.4059999999999, 42.5342000000001, 41.2834, 38.8954000000001, 38.3286000000001, 36.2146, 36.6684, 35.9946, 33.123, 33.4338, 31.7378000000001, 29.076, 28.9692, 27.4964, 27.0998, 25.9864, 26.7754, 24.3208, 23.4838, 22.7388000000001, 24.0758000000001, 21.9097999999999, 20.9728, 19.9228000000001, 19.9292, 16.617, 17.05, 18.2996000000001, 15.6128000000001, 15.7392, 14.5174, 13.6322, 12.2583999999999, 13.3766000000001, 11.423, 13.1232, 9.51639999999998, 10.5938000000001, 9.59719999999993, 8.12220000000002, 9.76739999999995, 7.50440000000003, 7.56999999999994, 6.70440000000008, 6.41419999999994, 6.71019999999999, 5.60940000000005, 4.65219999999999, 6.84099999999989, 3.4072000000001, 3.97859999999991, 3.32760000000007, 5.52160000000003, 3.31860000000006, 2.06940000000009, 4.35400000000004, 1.57500000000005, 0.280799999999999, 2.12879999999996, -0.214799999999968, -0.0378000000000611, -0.658200000000079, 0.654800000000023, -0.0697999999999865, 0.858400000000074, -2.52700000000004, -2.1751999999999, -3.35539999999992, -1.04019999999991, -0.651000000000067, -2.14439999999991, -1.96659999999997, -3.97939999999994, -0.604400000000169, -3.08260000000018, -3.39159999999993, -5.29640000000018, -5.38920000000007, -5.08759999999984, -4.69900000000007, -5.23720000000003, -3.15779999999995, -4.97879999999986, 
-4.89899999999989, -7.48880000000008, -5.94799999999987, -5.68060000000014, -6.67180000000008, -4.70499999999993, -7.27779999999984, -4.6579999999999, -4.4362000000001, -4.32139999999981, -5.18859999999995, -6.66879999999992, -6.48399999999992, -5.1260000000002, -4.4032000000002, -6.13500000000022, -5.80819999999994, -4.16719999999987, -4.15039999999999, -7.45600000000013, -7.24080000000004, -9.83179999999993, -5.80420000000004, -8.6561999999999, -6.99940000000015, -10.5473999999999, -7.34139999999979, -6.80999999999995, -6.29719999999998, -6.23199999999997 +}; + +const double biasData_precision10[] = { + 737.1256, 724.4234, 711.1064, 698.4732, 685.4636, 673.0644, 660.488, 647.9654, 636.0832, 623.7864, 612.1992, 600.2176, 588.5228, 577.1716, 565.7752, 554.899, 543.6126, 532.6492, 521.9474, 511.5214, 501.1064, 490.6364, 480.2468, 470.4588, 460.3832, 451.0584, 440.8606, 431.3868, 422.5062, 413.1862, 404.463, 395.339, 386.1936, 378.1292, 369.1854, 361.2908, 353.3324, 344.8518, 337.5204, 329.4854, 321.9318, 314.552, 306.4658, 299.4256, 292.849, 286.152, 278.8956, 271.8792, 265.118, 258.62, 252.5132, 245.9322, 239.7726, 233.6086, 227.5332, 222.5918, 216.4294, 210.7662, 205.4106, 199.7338, 194.9012, 188.4486, 183.1556, 178.6338, 173.7312, 169.6264, 163.9526, 159.8742, 155.8326, 151.1966, 147.5594, 143.07, 140.037, 134.1804, 131.071, 127.4884, 124.0848, 120.2944, 117.333, 112.9626, 110.2902, 107.0814, 103.0334, 99.4832000000001, 96.3899999999999, 93.7202000000002, 90.1714000000002, 87.2357999999999, 85.9346, 82.8910000000001, 80.0264000000002, 78.3834000000002, 75.1543999999999, 73.8683999999998, 70.9895999999999, 69.4367999999999, 64.8701999999998, 65.0408000000002, 61.6738, 59.5207999999998, 57.0158000000001, 54.2302, 53.0962, 50.4985999999999, 52.2588000000001, 47.3914, 45.6244000000002, 42.8377999999998, 43.0072, 40.6516000000001, 40.2453999999998, 35.2136, 36.4546, 33.7849999999999, 33.2294000000002, 32.4679999999998, 30.8670000000002, 28.6507999999999, 28.9099999999999, 27.5983999999999, 26.1619999999998, 24.5563999999999, 23.2328000000002, 21.9484000000002, 21.5902000000001, 21.3346000000001, 17.7031999999999, 20.6111999999998, 19.5545999999999, 15.7375999999999, 17.0720000000001, 16.9517999999998, 15.326, 13.1817999999998, 14.6925999999999, 13.0859999999998, 13.2754, 10.8697999999999, 11.248, 7.3768, 4.72339999999986, 7.97899999999981, 8.7503999999999, 7.68119999999999, 9.7199999999998, 7.73919999999998, 5.6224000000002, 7.44560000000001, 6.6601999999998, 5.9058, 4.00199999999995, 4.51699999999983, 4.68240000000014, 3.86220000000003, 5.13639999999987, 5.98500000000013, 2.47719999999981, 2.61999999999989, 1.62800000000016, 4.65000000000009, 0.225599999999758, 0.831000000000131, -0.359400000000278, 1.27599999999984, -2.92559999999958, -0.0303999999996449, 2.37079999999969, -2.0033999999996, 0.804600000000391, 0.30199999999968, 1.1247999999996, -2.6880000000001, 0.0321999999996478, -1.18099999999959, -3.9402, -1.47940000000017, -0.188400000000001, -2.10720000000038, -2.04159999999956, -3.12880000000041, -4.16160000000036, -0.612799999999879, -3.48719999999958, -8.17900000000009, -5.37780000000021, -4.01379999999972, -5.58259999999973, -5.73719999999958, -7.66799999999967, -5.69520000000011, -1.1247999999996, -5.58520000000044, -8.04560000000038, -4.64840000000004, -11.6468000000004, -7.97519999999986, -5.78300000000036, -7.67420000000038, -10.6328000000003, -9.81720000000041 +}; + +const double biasData_precision11[] = { + 1476, 1449.6014, 1423.5802, 1397.7942, 1372.3042, 1347.2062, 
1321.8402, 1297.2292, 1272.9462, 1248.9926, 1225.3026, 1201.4252, 1178.0578, 1155.6092, 1132.626, 1110.5568, 1088.527, 1066.5154, 1045.1874, 1024.3878, 1003.37, 982.1972, 962.5728, 942.1012, 922.9668, 903.292, 884.0772, 864.8578, 846.6562, 828.041, 809.714, 792.3112, 775.1806, 757.9854, 740.656, 724.346, 707.5154, 691.8378, 675.7448, 659.6722, 645.5722, 630.1462, 614.4124, 600.8728, 585.898, 572.408, 558.4926, 544.4938, 531.6776, 517.282, 505.7704, 493.1012, 480.7388, 467.6876, 456.1872, 445.5048, 433.0214, 420.806, 411.409, 400.4144, 389.4294, 379.2286, 369.651, 360.6156, 350.337, 342.083, 332.1538, 322.5094, 315.01, 305.6686, 298.1678, 287.8116, 280.9978, 271.9204, 265.3286, 257.5706, 249.6014, 242.544, 235.5976, 229.583, 220.9438, 214.672, 208.2786, 201.8628, 195.1834, 191.505, 186.1816, 178.5188, 172.2294, 167.8908, 161.0194, 158.052, 151.4588, 148.1596, 143.4344, 138.5238, 133.13, 127.6374, 124.8162, 118.7894, 117.3984, 114.6078, 109.0858, 105.1036, 103.6258, 98.6018000000004, 95.7618000000002, 93.5821999999998, 88.5900000000001, 86.9992000000002, 82.8800000000001, 80.4539999999997, 74.6981999999998, 74.3644000000004, 73.2914000000001, 65.5709999999999, 66.9232000000002, 65.1913999999997, 62.5882000000001, 61.5702000000001, 55.7035999999998, 56.1764000000003, 52.7596000000003, 53.0302000000001, 49.0609999999997, 48.4694, 44.933, 46.0474000000004, 44.7165999999997, 41.9416000000001, 39.9207999999999, 35.6328000000003, 35.5276000000003, 33.1934000000001, 33.2371999999996, 33.3864000000003, 33.9228000000003, 30.2371999999996, 29.1373999999996, 25.2272000000003, 24.2942000000003, 19.8338000000003, 18.9005999999999, 23.0907999999999, 21.8544000000002, 19.5176000000001, 15.4147999999996, 16.9314000000004, 18.6737999999996, 12.9877999999999, 14.3688000000002, 12.0447999999997, 15.5219999999999, 12.5299999999997, 14.5940000000001, 14.3131999999996, 9.45499999999993, 12.9441999999999, 3.91139999999996, 13.1373999999996, 5.44720000000052, 9.82779999999912, 7.87279999999919, 3.67760000000089, 5.46980000000076, 5.55099999999948, 5.65979999999945, 3.89439999999922, 3.1275999999998, 5.65140000000065, 6.3062000000009, 3.90799999999945, 1.87060000000019, 5.17020000000048, 2.46680000000015, 0.770000000000437, -3.72340000000077, 1.16400000000067, 8.05340000000069, 0.135399999999208, 2.15940000000046, 0.766999999999825, 1.0594000000001, 3.15500000000065, -0.287399999999252, 2.37219999999979, -2.86620000000039, -1.63199999999961, -2.22979999999916, -0.15519999999924, -1.46039999999994, -0.262199999999211, -2.34460000000036, -2.8078000000005, -3.22179999999935, -5.60159999999996, -8.42200000000048, -9.43740000000071, 0.161799999999857, -10.4755999999998, -10.0823999999993 +}; + +const double biasData_precision12[] = { + 2953, 2900.4782, 2848.3568, 2796.3666, 2745.324, 2694.9598, 2644.648, 2595.539, 2546.1474, 2498.2576, 2450.8376, 2403.6076, 2357.451, 2311.38, 2266.4104, 2221.5638, 2176.9676, 2134.193, 2090.838, 2048.8548, 2007.018, 1966.1742, 1925.4482, 1885.1294, 1846.4776, 1807.4044, 1768.8724, 1731.3732, 1693.4304, 1657.5326, 1621.949, 1586.5532, 1551.7256, 1517.6182, 1483.5186, 1450.4528, 1417.865, 1385.7164, 1352.6828, 1322.6708, 1291.8312, 1260.9036, 1231.476, 1201.8652, 1173.6718, 1145.757, 1119.2072, 1092.2828, 1065.0434, 1038.6264, 1014.3192, 988.5746, 965.0816, 940.1176, 917.9796, 894.5576, 871.1858, 849.9144, 827.1142, 805.0818, 783.9664, 763.9096, 742.0816, 724.3962, 706.3454, 688.018, 667.4214, 650.3106, 633.0686, 613.8094, 597.818, 581.4248, 563.834, 547.363, 531.5066, 520.455400000001, 
505.583199999999, 488.366, 476.480799999999, 459.7682, 450.0522, 434.328799999999, 423.952799999999, 408.727000000001, 399.079400000001, 387.252200000001, 373.987999999999, 360.852000000001, 351.6394, 339.642, 330.902400000001, 322.661599999999, 311.662200000001, 301.3254, 291.7484, 279.939200000001, 276.7508, 263.215200000001, 254.811400000001, 245.5494, 242.306399999999, 234.8734, 223.787200000001, 217.7156, 212.0196, 200.793, 195.9748, 189.0702, 182.449199999999, 177.2772, 170.2336, 164.741, 158.613600000001, 155.311, 147.5964, 142.837, 137.3724, 132.0162, 130.0424, 121.9804, 120.451800000001, 114.8968, 111.585999999999, 105.933199999999, 101.705, 98.5141999999996, 95.0488000000005, 89.7880000000005, 91.4750000000004, 83.7764000000006, 80.9698000000008, 72.8574000000008, 73.1615999999995, 67.5838000000003, 62.6263999999992, 63.2638000000006, 66.0977999999996, 52.0843999999997, 58.9956000000002, 47.0912000000008, 46.4956000000002, 48.4383999999991, 47.1082000000006, 43.2392, 37.2759999999998, 40.0283999999992, 35.1864000000005, 35.8595999999998, 32.0998, 28.027, 23.6694000000007, 33.8266000000003, 26.3736000000008, 27.2008000000005, 21.3245999999999, 26.4115999999995, 23.4521999999997, 19.5013999999992, 19.8513999999996, 10.7492000000002, 18.6424000000006, 13.1265999999996, 18.2436000000016, 6.71860000000015, 3.39459999999963, 6.33759999999893, 7.76719999999841, 0.813999999998487, 3.82819999999992, 0.826199999999517, 8.07440000000133, -1.59080000000176, 5.01780000000144, 0.455399999998917, -0.24199999999837, 0.174800000000687, -9.07640000000174, -4.20160000000033, -3.77520000000004, -4.75179999999818, -5.3724000000002, -8.90680000000066, -6.10239999999976, -5.74120000000039, -9.95339999999851, -3.86339999999836, -13.7304000000004, -16.2710000000006, -7.51359999999841, -3.30679999999847, -13.1339999999982, -10.0551999999989, -6.72019999999975, -8.59660000000076, -10.9307999999983, -1.8775999999998, -4.82259999999951, -13.7788, -21.6470000000008, -10.6735999999983, -15.7799999999988 +}; + +const double biasData_precision13[] = { + 5907.5052, 5802.2672, 5697.347, 5593.5794, 5491.2622, 5390.5514, 5290.3376, 5191.6952, 5093.5988, 4997.3552, 4902.5972, 4808.3082, 4715.5646, 4624.109, 4533.8216, 4444.4344, 4356.3802, 4269.2962, 4183.3784, 4098.292, 4014.79, 3932.4574, 3850.6036, 3771.2712, 3691.7708, 3615.099, 3538.1858, 3463.4746, 3388.8496, 3315.6794, 3244.5448, 3173.7516, 3103.3106, 3033.6094, 2966.5642, 2900.794, 2833.7256, 2769.81, 2707.3196, 2644.0778, 2583.9916, 2523.4662, 2464.124, 2406.073, 2347.0362, 2292.1006, 2238.1716, 2182.7514, 2128.4884, 2077.1314, 2025.037, 1975.3756, 1928.933, 1879.311, 1831.0006, 1783.2144, 1738.3096, 1694.5144, 1649.024, 1606.847, 1564.7528, 1525.3168, 1482.5372, 1443.9668, 1406.5074, 1365.867, 1329.2186, 1295.4186, 1257.9716, 1225.339, 1193.2972, 1156.3578, 1125.8686, 1091.187, 1061.4094, 1029.4188, 1000.9126, 972.3272, 944.004199999999, 915.7592, 889.965, 862.834200000001, 840.4254, 812.598399999999, 785.924200000001, 763.050999999999, 741.793799999999, 721.466, 699.040799999999, 677.997200000002, 649.866999999998, 634.911800000002, 609.8694, 591.981599999999, 570.2922, 557.129199999999, 538.3858, 521.872599999999, 502.951400000002, 495.776399999999, 475.171399999999, 459.751, 439.995200000001, 426.708999999999, 413.7016, 402.3868, 387.262599999998, 372.0524, 357.050999999999, 342.5098, 334.849200000001, 322.529399999999, 311.613799999999, 295.848000000002, 289.273000000001, 274.093000000001, 263.329600000001, 251.389599999999, 245.7392, 231.9614, 
229.7952, 217.155200000001, 208.9588, 199.016599999999, 190.839199999999, 180.6976, 176.272799999999, 166.976999999999, 162.5252, 151.196400000001, 149.386999999999, 133.981199999998, 130.0586, 130.164000000001, 122.053400000001, 110.7428, 108.1276, 106.232400000001, 100.381600000001, 98.7668000000012, 86.6440000000002, 79.9768000000004, 82.4722000000002, 68.7026000000005, 70.1186000000016, 71.9948000000004, 58.998599999999, 59.0492000000013, 56.9818000000014, 47.5338000000011, 42.9928, 51.1591999999982, 37.2740000000013, 42.7220000000016, 31.3734000000004, 26.8090000000011, 25.8934000000008, 26.5286000000015, 29.5442000000003, 19.3503999999994, 26.0760000000009, 17.9527999999991, 14.8419999999969, 10.4683999999979, 8.65899999999965, 9.86720000000059, 4.34139999999752, -0.907800000000861, -3.32080000000133, -0.936199999996461, -11.9916000000012, -8.87000000000262, -6.33099999999831, -11.3366000000024, -15.9207999999999, -9.34659999999712, -15.5034000000014, -19.2097999999969, -15.357799999998, -28.2235999999975, -30.6898000000001, -19.3271999999997, -25.6083999999973, -24.409599999999, -13.6385999999984, -33.4473999999973, -32.6949999999997, -28.9063999999998, -31.7483999999968, -32.2935999999972, -35.8329999999987, -47.620600000002, -39.0855999999985, -33.1434000000008, -46.1371999999974, -37.5892000000022, -46.8164000000033, -47.3142000000007, -60.2914000000019, -37.7575999999972 +}; + +const double biasData_precision14[] = { + 11816.475, 11605.0046, 11395.3792, 11188.7504, 10984.1814, 10782.0086, 10582.0072, 10384.503, 10189.178, 9996.2738, 9806.0344, 9617.9798, 9431.394, 9248.7784, 9067.6894, 8889.6824, 8712.9134, 8538.8624, 8368.4944, 8197.7956, 8031.8916, 7866.6316, 7703.733, 7544.5726, 7386.204, 7230.666, 7077.8516, 6926.7886, 6778.6902, 6631.9632, 6487.304, 6346.7486, 6206.4408, 6070.202, 5935.2576, 5799.924, 5671.0324, 5541.9788, 5414.6112, 5290.0274, 5166.723, 5047.6906, 4929.162, 4815.1406, 4699.127, 4588.5606, 4477.7394, 4369.4014, 4264.2728, 4155.9224, 4055.581, 3955.505, 3856.9618, 3761.3828, 3666.9702, 3575.7764, 3482.4132, 3395.0186, 3305.8852, 3221.415, 3138.6024, 3056.296, 2970.4494, 2896.1526, 2816.8008, 2740.2156, 2670.497, 2594.1458, 2527.111, 2460.8168, 2387.5114, 2322.9498, 2260.6752, 2194.2686, 2133.7792, 2074.767, 2015.204, 1959.4226, 1898.6502, 1850.006, 1792.849, 1741.4838, 1687.9778, 1638.1322, 1589.3266, 1543.1394, 1496.8266, 1447.8516, 1402.7354, 1361.9606, 1327.0692, 1285.4106, 1241.8112, 1201.6726, 1161.973, 1130.261, 1094.2036, 1048.2036, 1020.6436, 990.901400000002, 961.199800000002, 924.769800000002, 899.526400000002, 872.346400000002, 834.375, 810.432000000001, 780.659800000001, 756.013800000001, 733.479399999997, 707.923999999999, 673.858, 652.222399999999, 636.572399999997, 615.738599999997, 586.696400000001, 564.147199999999, 541.679600000003, 523.943599999999, 505.714599999999, 475.729599999999, 461.779600000002, 449.750800000002, 439.020799999998, 412.7886, 400.245600000002, 383.188199999997, 362.079599999997, 357.533799999997, 334.319000000003, 327.553399999997, 308.559399999998, 291.270199999999, 279.351999999999, 271.791400000002, 252.576999999997, 247.482400000001, 236.174800000001, 218.774599999997, 220.155200000001, 208.794399999999, 201.223599999998, 182.995600000002, 185.5268, 164.547400000003, 176.5962, 150.689599999998, 157.8004, 138.378799999999, 134.021200000003, 117.614399999999, 108.194000000003, 97.0696000000025, 89.6042000000016, 95.6030000000028, 84.7810000000027, 72.635000000002, 77.3482000000004, 59.4907999999996, 
55.5875999999989, 50.7346000000034, 61.3916000000027, 50.9149999999936, 39.0384000000049, 58.9395999999979, 29.633600000001, 28.2032000000036, 26.0078000000067, 17.0387999999948, 9.22000000000116, 13.8387999999977, 8.07240000000456, 14.1549999999988, 15.3570000000036, 3.42660000000615, 6.24820000000182, -2.96940000000177, -8.79940000000352, -5.97860000000219, -14.4048000000039, -3.4143999999942, -13.0148000000045, -11.6977999999945, -25.7878000000055, -22.3185999999987, -24.409599999999, -31.9756000000052, -18.9722000000038, -22.8678000000073, -30.8972000000067, -32.3715999999986, -22.3907999999938, -43.6720000000059, -35.9038, -39.7492000000057, -54.1641999999993, -45.2749999999942, -42.2989999999991, -44.1089999999967, -64.3564000000042, -49.9551999999967, -42.6116000000038 +}; + +const double biasData_precision15[] = { + 23634.0036, 23210.8034, 22792.4744, 22379.1524, 21969.7928, 21565.326, 21165.3532, 20770.2806, 20379.9892, 19994.7098, 19613.318, 19236.799, 18865.4382, 18498.8244, 18136.5138, 17778.8668, 17426.2344, 17079.32, 16734.778, 16397.2418, 16063.3324, 15734.0232, 15409.731, 15088.728, 14772.9896, 14464.1402, 14157.5588, 13855.5958, 13559.3296, 13264.9096, 12978.326, 12692.0826, 12413.8816, 12137.3192, 11870.2326, 11602.5554, 11340.3142, 11079.613, 10829.5908, 10583.5466, 10334.0344, 10095.5072, 9859.694, 9625.2822, 9395.7862, 9174.0586, 8957.3164, 8738.064, 8524.155, 8313.7396, 8116.9168, 7913.542, 7718.4778, 7521.65, 7335.5596, 7154.2906, 6968.7396, 6786.3996, 6613.236, 6437.406, 6270.6598, 6107.7958, 5945.7174, 5787.6784, 5635.5784, 5482.308, 5337.9784, 5190.0864, 5045.9158, 4919.1386, 4771.817, 4645.7742, 4518.4774, 4385.5454, 4262.6622, 4142.74679999999, 4015.5318, 3897.9276, 3790.7764, 3685.13800000001, 3573.6274, 3467.9706, 3368.61079999999, 3271.5202, 3170.3848, 3076.4656, 2982.38400000001, 2888.4664, 2806.4868, 2711.9564, 2634.1434, 2551.3204, 2469.7662, 2396.61139999999, 2318.9902, 2243.8658, 2171.9246, 2105.01360000001, 2028.8536, 1960.9952, 1901.4096, 1841.86079999999, 1777.54700000001, 1714.5802, 1654.65059999999, 1596.311, 1546.2016, 1492.3296, 1433.8974, 1383.84600000001, 1339.4152, 1293.5518, 1245.8686, 1193.50659999999, 1162.27959999999, 1107.19439999999, 1069.18060000001, 1035.09179999999, 999.679000000004, 957.679999999993, 925.300199999998, 888.099400000006, 848.638600000006, 818.156400000007, 796.748399999997, 752.139200000005, 725.271200000003, 692.216, 671.633600000001, 647.939799999993, 621.670599999998, 575.398799999995, 561.226599999995, 532.237999999998, 521.787599999996, 483.095799999996, 467.049599999998, 465.286399999997, 415.548599999995, 401.047399999996, 380.607999999993, 377.362599999993, 347.258799999996, 338.371599999999, 310.096999999994, 301.409199999995, 276.280799999993, 265.586800000005, 258.994399999996, 223.915999999997, 215.925399999993, 213.503800000006, 191.045400000003, 166.718200000003, 166.259000000005, 162.941200000001, 148.829400000002, 141.645999999993, 123.535399999993, 122.329800000007, 89.473399999988, 80.1962000000058, 77.5457999999926, 59.1056000000099, 83.3509999999951, 52.2906000000075, 36.3979999999865, 40.6558000000077, 42.0003999999899, 19.6630000000005, 19.7153999999864, -8.38539999999921, -0.692799999989802, 0.854800000000978, 3.23219999999856, -3.89040000000386, -5.25880000001052, -24.9052000000083, -22.6837999999989, -26.4286000000138, -34.997000000003, -37.0216000000073, -43.430400000012, -58.2390000000014, -68.8034000000043, -56.9245999999985, -57.8583999999973, -77.3097999999882, -73.2793999999994, 
-81.0738000000129, -87.4530000000086, -65.0254000000132, -57.296399999992, -96.2746000000043, -103.25, -96.081600000005, -91.5542000000132, -102.465200000006, -107.688599999994, -101.458000000013, -109.715800000005 +}; + +const double biasData_precision16[] = { + 47270, 46423.3584, 45585.7074, 44757.152, 43938.8416, 43130.9514, 42330.03, 41540.407, 40759.6348, 39988.206, 39226.5144, 38473.2096, 37729.795, 36997.268, 36272.6448, 35558.665, 34853.0248, 34157.4472, 33470.5204, 32793.5742, 32127.0194, 31469.4182, 30817.6136, 30178.6968, 29546.8908, 28922.8544, 28312.271, 27707.0924, 27114.0326, 26526.692, 25948.6336, 25383.7826, 24823.5998, 24272.2974, 23732.2572, 23201.4976, 22674.2796, 22163.6336, 21656.515, 21161.7362, 20669.9368, 20189.4424, 19717.3358, 19256.3744, 18795.9638, 18352.197, 17908.5738, 17474.391, 17052.918, 16637.2236, 16228.4602, 15823.3474, 15428.6974, 15043.0284, 14667.6278, 14297.4588, 13935.2882, 13578.5402, 13234.6032, 12882.1578, 12548.0728, 12219.231, 11898.0072, 11587.2626, 11279.9072, 10973.5048, 10678.5186, 10392.4876, 10105.2556, 9825.766, 9562.5444, 9294.2222, 9038.2352, 8784.848, 8533.2644, 8301.7776, 8058.30859999999, 7822.94579999999, 7599.11319999999, 7366.90779999999, 7161.217, 6957.53080000001, 6736.212, 6548.21220000001, 6343.06839999999, 6156.28719999999, 5975.15419999999, 5791.75719999999, 5621.32019999999, 5451.66, 5287.61040000001, 5118.09479999999, 4957.288, 4798.4246, 4662.17559999999, 4512.05900000001, 4364.68539999999, 4220.77720000001, 4082.67259999999, 3957.19519999999, 3842.15779999999, 3699.3328, 3583.01180000001, 3473.8964, 3338.66639999999, 3233.55559999999, 3117.799, 3008.111, 2909.69140000001, 2814.86499999999, 2719.46119999999, 2624.742, 2532.46979999999, 2444.7886, 2370.1868, 2272.45259999999, 2196.19260000001, 2117.90419999999, 2023.2972, 1969.76819999999, 1885.58979999999, 1833.2824, 1733.91200000001, 1682.54920000001, 1604.57980000001, 1556.11240000001, 1491.3064, 1421.71960000001, 1371.22899999999, 1322.1324, 1264.7892, 1196.23920000001, 1143.8474, 1088.67240000001, 1073.60380000001, 1023.11660000001, 959.036400000012, 927.433199999999, 906.792799999996, 853.433599999989, 841.873800000001, 791.1054, 756.899999999994, 704.343200000003, 672.495599999995, 622.790399999998, 611.254799999995, 567.283200000005, 519.406599999988, 519.188400000014, 495.312800000014, 451.350799999986, 443.973399999988, 431.882199999993, 392.027000000002, 380.924200000009, 345.128999999986, 298.901400000002, 287.771999999997, 272.625, 247.253000000026, 222.490600000019, 223.590000000026, 196.407599999977, 176.425999999978, 134.725199999986, 132.4804, 110.445599999977, 86.7939999999944, 56.7038000000175, 64.915399999998, 38.3726000000024, 37.1606000000029, 46.170999999973, 49.1716000000015, 15.3362000000197, 6.71639999997569, -34.8185999999987, -39.4476000000141, 12.6830000000191, -12.3331999999937, -50.6565999999875, -59.9538000000175, -65.1054000000004, -70.7576000000117, -106.325200000021, -126.852200000023, -110.227599999984, -132.885999999999, -113.897200000007, -142.713800000027, -151.145399999979, -150.799200000009, -177.756200000003, -156.036399999983, -182.735199999996, -177.259399999981, -198.663600000029, -174.577600000019, -193.84580000001 +}; + +const double biasData_precision17[] = { + 94541, 92848.811, 91174.019, 89517.558, 87879.9705, 86262.7565, 84663.5125, 83083.7435, 81521.7865, 79977.272, 78455.9465, 76950.219, 75465.432, 73994.152, 72546.71, 71115.2345, 69705.6765, 68314.937, 66944.2705, 65591.255, 64252.9485, 62938.016, 61636.8225, 
60355.592, 59092.789, 57850.568, 56624.518, 55417.343, 54231.1415, 53067.387, 51903.526, 50774.649, 49657.6415, 48561.05, 47475.7575, 46410.159, 45364.852, 44327.053, 43318.4005, 42325.6165, 41348.4595, 40383.6265, 39436.77, 38509.502, 37594.035, 36695.939, 35818.6895, 34955.691, 34115.8095, 33293.949, 32465.0775, 31657.6715, 30877.2585, 30093.78, 29351.3695, 28594.1365, 27872.115, 27168.7465, 26477.076, 25774.541, 25106.5375, 24452.5135, 23815.5125, 23174.0655, 22555.2685, 21960.2065, 21376.3555, 20785.1925, 20211.517, 19657.0725, 19141.6865, 18579.737, 18081.3955, 17578.995, 17073.44, 16608.335, 16119.911, 15651.266, 15194.583, 14749.0495, 14343.4835, 13925.639, 13504.509, 13099.3885, 12691.2855, 12328.018, 11969.0345, 11596.5145, 11245.6355, 10917.6575, 10580.9785, 10277.8605, 9926.58100000001, 9605.538, 9300.42950000003, 8989.97850000003, 8728.73249999998, 8448.3235, 8175.31050000002, 7898.98700000002, 7629.79100000003, 7413.76199999999, 7149.92300000001, 6921.12650000001, 6677.1545, 6443.28000000003, 6278.23450000002, 6014.20049999998, 5791.20299999998, 5605.78450000001, 5438.48800000001, 5234.2255, 5059.6825, 4887.43349999998, 4682.935, 4496.31099999999, 4322.52250000002, 4191.42499999999, 4021.24200000003, 3900.64799999999, 3762.84250000003, 3609.98050000001, 3502.29599999997, 3363.84250000003, 3206.54849999998, 3079.70000000001, 2971.42300000001, 2867.80349999998, 2727.08100000001, 2630.74900000001, 2496.6165, 2440.902, 2356.19150000002, 2235.58199999999, 2120.54149999999, 2012.25449999998, 1933.35600000003, 1820.93099999998, 1761.54800000001, 1663.09350000002, 1578.84600000002, 1509.48149999999, 1427.3345, 1379.56150000001, 1306.68099999998, 1212.63449999999, 1084.17300000001, 1124.16450000001, 1060.69949999999, 1007.48849999998, 941.194499999983, 879.880500000028, 836.007500000007, 782.802000000025, 748.385499999975, 647.991500000004, 626.730500000005, 570.776000000013, 484.000500000024, 513.98550000001, 418.985499999952, 386.996999999974, 370.026500000036, 355.496999999974, 356.731499999994, 255.92200000002, 259.094000000041, 205.434499999974, 165.374500000034, 197.347500000033, 95.718499999959, 67.6165000000037, 54.6970000000438, 31.7395000000251, -15.8784999999916, 8.42500000004657, -26.3754999999655, -118.425500000012, -66.6629999999423, -42.9745000000112, -107.364999999991, -189.839000000036, -162.611499999999, -164.964999999967, -189.079999999958, -223.931499999948, -235.329999999958, -269.639500000048, -249.087999999989, -206.475499999942, -283.04449999996, -290.667000000016, -304.561499999953, -336.784499999951, -380.386500000022, -283.280499999993, -364.533000000054, -389.059499999974, -364.454000000027, -415.748000000021, -417.155000000028 +}; + +const double biasData_precision18[] = { + 189083, 185696.913, 182348.774, 179035.946, 175762.762, 172526.444, 169329.754, 166166.099, 163043.269, 159958.91, 156907.912, 153906.845, 150924.199, 147996.568, 145093.457, 142239.233, 139421.475, 136632.27, 133889.588, 131174.2, 128511.619, 125868.621, 123265.385, 120721.061, 118181.769, 115709.456, 113252.446, 110840.198, 108465.099, 106126.164, 103823.469, 101556.618, 99308.004, 97124.508, 94937.803, 92833.731, 90745.061, 88677.627, 86617.47, 84650.442, 82697.833, 80769.132, 78879.629, 77014.432, 75215.626, 73384.587, 71652.482, 69895.93, 68209.301, 66553.669, 64921.981, 63310.323, 61742.115, 60205.018, 58698.658, 57190.657, 55760.865, 54331.169, 52908.167, 51550.273, 50225.254, 48922.421, 47614.533, 46362.049, 45098.569, 43926.083, 42736.03, 41593.473, 40425.26, 39316.237, 
38243.651, 37170.617, 36114.609, 35084.19, 34117.233, 33206.509, 32231.505, 31318.728, 30403.404, 29540.0550000001, 28679.236, 27825.862, 26965.216, 26179.148, 25462.08, 24645.952, 23922.523, 23198.144, 22529.128, 21762.4179999999, 21134.779, 20459.117, 19840.818, 19187.04, 18636.3689999999, 17982.831, 17439.7389999999, 16874.547, 16358.2169999999, 15835.684, 15352.914, 14823.681, 14329.313, 13816.897, 13342.874, 12880.882, 12491.648, 12021.254, 11625.392, 11293.7610000001, 10813.697, 10456.209, 10099.074, 9755.39000000001, 9393.18500000006, 9047.57900000003, 8657.98499999999, 8395.85900000005, 8033, 7736.95900000003, 7430.59699999995, 7258.47699999996, 6924.58200000005, 6691.29399999999, 6357.92500000005, 6202.05700000003, 5921.19700000004, 5628.28399999999, 5404.96799999999, 5226.71100000001, 4990.75600000005, 4799.77399999998, 4622.93099999998, 4472.478, 4171.78700000001, 3957.46299999999, 3868.95200000005, 3691.14300000004, 3474.63100000005, 3341.67200000002, 3109.14000000001, 3071.97400000005, 2796.40399999998, 2756.17799999996, 2611.46999999997, 2471.93000000005, 2382.26399999997, 2209.22400000005, 2142.28399999999, 2013.96100000001, 1911.18999999994, 1818.27099999995, 1668.47900000005, 1519.65800000005, 1469.67599999998, 1367.13800000004, 1248.52899999998, 1181.23600000003, 1022.71900000004, 1088.20700000005, 959.03600000008, 876.095999999903, 791.183999999892, 703.337000000058, 731.949999999953, 586.86400000006, 526.024999999907, 323.004999999888, 320.448000000091, 340.672999999952, 309.638999999966, 216.601999999955, 102.922999999952, 19.2399999999907, -0.114000000059605, -32.6240000000689, -89.3179999999702, -153.497999999905, -64.2970000000205, -143.695999999996, -259.497999999905, -253.017999999924, -213.948000000091, -397.590000000084, -434.006000000052, -403.475000000093, -297.958000000101, -404.317000000039, -528.898999999976, -506.621000000043, -513.205000000075, -479.351000000024, -596.139999999898, -527.016999999993, -664.681000000099, -680.306000000099, -704.050000000047, -850.486000000034, -757.43200000003, -713.308999999892 +}; + + +#endif /* HYPERLOGLOGBIAS_H_ */ diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h new file mode 100644 index 0000000..33f5dc1 --- /dev/null +++ b/src/hyperloglogplus.h @@ -0,0 +1,623 @@ +/* + * hyperloglogplus.h + * + * Implementation of HyperLogLog++ algorithm described by Stefan Heule et al. + * + * Created on: Apr 25, 2015 + * Author: fbreitwieser + */ + +#ifndef HYPERLOGLOGPLUS_H_ +#define HYPERLOGLOGPLUS_H_ + +#include +#include +#include +#include +#include +#include //log +#include //vector.count +#include + +#include "hyperloglogbias.h" +#include "third_party/MurmurHash3.cpp" +#include "assert_helpers.h" + +using namespace std; + +//#define HLL_DEBUG +//#define NDEBUG +//#define NDEBUG2 +#define arr_len(a) (a + sizeof a / sizeof a[0]) + +// experimentally determined threshold values for p - 4 +static const uint32_t threshold[] = {10, 20, 40, 80, 220, 400, 900, 1800, 3100, + 6500, 11500, 20000, 50000, 120000, 350000}; + + +/////////////////////// + +// +/** + * gives the estimated cardinality for m bins, v of which are non-zero + * @param m number of bins in the matrix + * @param v number of non-zero bins + * @return + */ +double linearCounting(uint32_t m, uint32_t v) { + if (v > m) { + throw std::invalid_argument("number of v should not be greater than m"); + } + double fm = double(m); + return fm * log(fm/double(v)); +} + +/** + * from Numerical Recipes, 3rd Edition, p 352 + * Returns hash of u as a 64-bit integer. 
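+ * Structure: a single 64-bit LCG step (v = u*a + c, with the two large
+ * constants below) followed by two xorshift rounds. Kept here for
+ * reference; note that add() below mixes items with murmurhash3_finalizer.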
+ * +*/ +inline uint64_t ranhash (uint64_t u) { + uint64_t v = u * 3935559000370003845 + 2691343689449507681; + + v ^= v >> 21; v ^= v << 37; v ^= v >> 4; + + v *= 4768777513237032717; + + v ^= v << 20; v ^= v >> 41; v ^= v << 5; + + return v; +} + +inline uint64_t murmurhash3_finalizer (uint64_t key) { + key += 1; // murmurhash returns a hash value of 0 for the key 0 - avoid that. + key ^= key >> 33; + key *= 0xff51afd7ed558ccd; + key ^= key >> 33; + key *= 0xc4ceb9fe1a85ec53; + key ^= key >> 33; + return key; +} + +/** + * Bias correction factors for specific m's + * @param m + * @return + */ +double alpha(uint32_t m) { + switch (m) { + case 16: return 0.673; + case 32: return 0.697; + case 64: return 0.709; + } + + // m >= 128 + return 0.7213 / (1 + 1.079/double(m)); +} + +/** + * calculate the raw estimate as harmonic mean of the ranks in the register + * @param array + * @return + */ +double calculateEstimate(vector array) { + double inverseSum = 0.0; + for (size_t i = 0; i < array.size(); ++i) { + // TODO: pre-calculate the power calculation + inverseSum += pow(2,-array[i]); + } + return alpha(array.size()) * double(array.size() * array.size()) * 1 / inverseSum; +} + +uint32_t countZeros(vector s) { + return (uint32_t)count(s.begin(), s.end(), 0); +} + +/** + * Extract bits (from uint32_t or uint64_t) using LSB 0 numbering from hi to lo, including lo + * @param bits + * @param hi + * @param lo + * @return + */ +template +T extractBits(T value, uint8_t hi, uint8_t lo, bool shift_left = false) { + + // create a bitmask: + // (T(1) << (hi - lo) a 1 at the position (hi - lo) + // ((T(1) << (hi - lo) - 1) 1's from position 0 to position (hi-lo-1) + // (((T(1) << (hi - lo)) - 1) << lo) 1's from position lo to position hi + + // The T(1) is required to not cause overflow on 32bit machines + // TODO: consider creating a bitmask only once in the beginning + T bitmask = (((T(1) << (hi - lo)) - 1) << lo); + T result = value & bitmask; + + if (!shift_left) { + // shift resulting bits to the right + result = result >> lo; + } else { + // shift resulting bits to the left + result = result << (sizeof(T)*8 - hi); + } + return result; +} + +template +T extractBits(T bits, uint8_t hi) { + // create a bitmask for first hi bits (LSB 0 numbering) + T bitmask = T(-1) << (sizeof(T)*8 - hi); + + return (bits & bitmask); +} + +// functions for counting the number of leading 0-bits (clz) +// and counting the number of trailing 0-bits (ctz) +//#ifdef __GNUC__ + +// TODO: switch between builtin clz and 64_clz based on architecture +//#define clz(x) __builtin_clz(x) +#if 0 +static int clz_manual(uint64_t x) +{ + // This uses a binary search (counting down) algorithm from Hacker's Delight. + uint64_t y; + int n = 64; + y = x >>32; if (y != 0) {n -= 32; x = y;} + y = x >>16; if (y != 0) {n -= 16; x = y;} + y = x >> 8; if (y != 0) {n -= 8; x = y;} + y = x >> 4; if (y != 0) {n -= 4; x = y;} + y = x >> 2; if (y != 0) {n -= 2; x = y;} + y = x >> 1; if (y != 0) return n - 2; + return n - x; +} +#endif + +inline uint32_t clz(const uint32_t x) { + return __builtin_clz(x); +} + +inline uint32_t clz(const uint64_t x) { + uint32_t u32 = (x >> 32); + uint32_t result = u32 ? __builtin_clz(u32) : 32; + if (result == 32) { + u32 = x & 0xFFFFFFFFUL; + result += (u32 ? 
__builtin_clz(u32) : 32); + } + return result; +} +//#else + +uint32_t clz_log2(const uint64_t w) { + return 63 - floor(log2(w)); +} +//#endif + + +// TODO: the sparse list may be encoded with variable length encoding +// see Heule et al., section 5.3.2 +// Also, using sets might give a larger overhead as each insertion costs more +// consider using vector and sort/unique when merging. +typedef set SparseListType; +typedef uint64_t HashSize; + +/** + * HyperLogLogPlusMinus class + * typename T corresponds to the hash size - usually either uint32_t or uint64_t (implemented for uint64_t) + */ + +typedef uint64_t T_KEY; +template +class HyperLogLogPlusMinus { + +private: + + vector M; // registers (M) of size m + uint8_t p; // precision + uint32_t m; // number of registers + bool sparse; // sparse representation of the data? + SparseListType sparseList; // TODO: use a compressed list instead + + // vectors containing data for bias correction + vector > rawEstimateData; // TODO: make this static + vector > biasData; + + // sparse versions of p and m + static const uint8_t pPrime = 25; // precision when using a sparse representation + // fixed to 25, because 25 + 6 bits for rank + 1 flag bit = 32 + static const uint32_t mPrime = 1 << (pPrime -1); // 2^pPrime + + +public: + + ~HyperLogLogPlusMinus() {}; + + /** + * Create new HyperLogLogPlusMinus counter + * @param precision + * @param sparse + */ + HyperLogLogPlusMinus(uint8_t precision=10, bool sparse=true):p(precision),sparse(sparse) { + if (precision > 18 || precision < 4) { + throw std::invalid_argument("precision (number of register = 2^precision) must be between 4 and 18"); + } + + this->m = 1 << precision; + + if (sparse) { + this->sparseList = SparseListType(); // TODO: if SparseListType is changed, initialize with appropriate size + } else { + this->M = vector(m); + } + } + + /** + * Add a new item to the counter. + * @param item + */ + void add(T_KEY item) { + add(item, sizeof(T_KEY)); + } + + /** + * Add a new item to the counter. + * @param item + * @param size size of item + */ + void add(T_KEY item, size_t size) { + + // compute hash for item + HashSize hash_value = murmurhash3_finalizer(item); + +#ifdef HLL_DEBUG + cerr << "Value: " << item << "; hash(value): " << hash_value << endl; + cerr << bitset<64>(hash_value) << endl; +#endif + + if (sparse) { + // sparse mode: put the encoded hash into sparse list + uint32_t encoded_hash_value = encodeHashIn32Bit(hash_value); + this->sparseList.insert(encoded_hash_value); + +#ifdef HLL_DEBUG + idx_n_rank ir = getIndexAndRankFromEncodedHash(encoded_hash_value); + assert_eq(ir.idx,get_index(hash_value, p)); + assert_eq(ir.rank, get_rank(hash_value, p)); +#endif + + // if the sparseList is too large, switch to normal (register) representation + if (this->sparseList.size() > this->m) { // TODO: is the size of m correct? + switchToNormalRepresentation(); + } + } else { + // normal mode + // take first p bits as index {x63,...,x64-p} + uint32_t idx = get_index(hash_value, p); + // shift those p values off, and count leading zeros of the remaining string {x63-p,...,x0} + uint8_t rank = get_rank(hash_value, p); + + // update the register if current rank is bigger + if (rank > this->M[idx]) { + this->M[idx] = rank; + } + } + } + + void add(vector words) { + for(size_t i = 0; i < words.size(); ++i) { + this->add(words[i]); + } + } + + /** + * Reset to its initial state. 
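+ * Clears both the sparse list and the dense registers and switches the
+ * counter back to sparse mode, so the object behaves like a freshly
+ * constructed one. A minimal usage sketch (hypothetical values):
+ *   HyperLogLogPlusMinus<uint64_t> hll(14);   // 2^14 registers
+ *   for (uint64_t i = 0; i < 1000000; ++i) hll.add(i);
+ *   uint64_t est = hll.cardinality(); // std. error ~1.04/sqrt(2^14), i.e. <1%
+ *   hll.reset();                      // empty again, back in sparse mode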
+ */ + void reset() { + this->sparse = true; + this->sparseList.clear(); // + this->M.clear(); + } + + /** + * Convert from sparse representation (using tmpSet and sparseList) to normal (using register) + */ + void switchToNormalRepresentation() { +#ifdef HLL_DEBUG + cerr << "switching to normal representation" << endl; + cerr << " est before: " << cardinality(true) << endl; +#endif + this->sparse = false; + this->M = vector(this->m); + if (sparseList.size() > 0) { //TDOD: do I need to check this, here? + addToRegisters(this->sparseList); + this->sparseList.clear(); + } +#ifdef HLL_DEBUG + cerr << " est after: " << cardinality(true) << endl; +#endif + } + + /** + * add sparseList to the registers of M + */ + void addToRegisters(const SparseListType &sparseList) { + if (sparseList.size() == 0) { + return; + } + for (SparseListType::const_iterator encoded_hash_value_ptr = sparseList.begin(); encoded_hash_value_ptr != sparseList.end(); ++encoded_hash_value_ptr) { + + idx_n_rank ir = getIndexAndRankFromEncodedHash(*encoded_hash_value_ptr); + + assert_lt(ir.idx,M.size()); + if (ir.rank > this->M[ir.idx]) { + this->M[ir.idx] = ir.rank; + } + } + } + + /** + * Merge another HyperLogLogPlusMinus into this. Converts to normal representation + * @param other + */ + void merge(const HyperLogLogPlusMinus* other) { + if (this->p != other->p) { + throw std::invalid_argument("precisions must be equal"); + } + + if (this->sparse && other->sparse) { + if (this->sparseList.size()+other->sparseList.size() > this->m) { + switchToNormalRepresentation(); + addToRegisters(other->sparseList); + } else { + this->sparseList.insert(other->sparseList.begin(),other->sparseList.end()); + } + } else if (other->sparse) { + // other is sparse, but this is not + addToRegisters(other->sparseList); + } else { + if (this->sparse) { + switchToNormalRepresentation(); + } + + // merge registers + for (size_t i = 0; i < other->M.size(); ++i) { + if (other->M[i] > this->M[i]) { + this->M[i] = other->M[i]; + } + } + } + } + + /** + * + * @return cardinality estimate + */ + uint64_t cardinality(bool verbose=true) { + if (sparse) { + // if we are still 'sparse', then use linear counting, which is more + // accurate for low cardinalities, and use increased precision pPrime + return uint64_t(linearCounting(mPrime, mPrime-uint32_t(sparseList.size()))); + } + + // initialize bias correction data + if (rawEstimateData.empty()) { initRawEstimateData(); } + if (biasData.empty()) { initBiasData(); } + + // calculate raw estimate on registers + //double est = alpha(m) * harmonicMean(M, m); + double est = calculateEstimate(M); + + // correct for biases if estimate is smaller than 5m + if (est <= double(m)*5.0) { + est -= getEstimateBias(est); + } + + uint32_t v = countZeros(M); + if (v > 2) { + // calculate linear counting (lc) estimate if there are more than 2 zeros in the matrix + double lc_estimate = linearCounting(m, v); + + // check if the lc estimate is below the threshold + if (lc_estimate <= double(threshold[p-4])) { + if (lc_estimate < 0) { throw; } + // return lc estimate of cardinality + return lc_estimate; + } + return lc_estimate; // always use lc_estimate when available + } + + // return bias-corrected hyperloglog estimate of cardinality + return uint64_t(est); + } + +private: + + uint8_t rank(HashSize x, uint8_t b) { + uint8_t v = 1; + while (v <= b && !(x & 0x80000000)) { + v++; + x <<= 1; + } + return v; + } + + template inline uint32_t get_index(const T hash_value, const uint8_t p, const uint8_t size) const { + // take first 
p bits as index {x63,...,x64-p} + assert_lt(p,size); + uint32_t idx = hash_value >> (size - p); + return idx; + } + + inline uint32_t get_index(const uint64_t hash_value, const uint8_t p) const { + return get_index(hash_value, p, 64); + } + + inline uint32_t get_index(const uint32_t hash_value, const uint8_t p) const { + return get_index(hash_value, p, 32); + } + + template inline + T get_trailing_ones(const uint8_t p) const { + return (T(1) << p ) - 1; + } + + template inline + uint8_t get_rank(const T hash_value, const uint8_t p) const { + // shift p values off, and count leading zeros of the remaining string {x63-p,...,x0} + T_KEY rank_bits = (hash_value << p | get_trailing_ones(p)); +#ifdef HLL_DEBUG + cerr << "rank bits: " << bitset<32>(rank_bits) << endl; +#endif + + uint8_t rank_val = (uint8_t) (clz(rank_bits)) + 1; + assert_leq(rank_val,64-p+1); + return rank_val; + } + + void initRawEstimateData() { + rawEstimateData = vector >(); + + rawEstimateData.push_back(vector(rawEstimateData_precision4,arr_len(rawEstimateData_precision4))); + rawEstimateData.push_back(vector(rawEstimateData_precision5,arr_len(rawEstimateData_precision5))); + rawEstimateData.push_back(vector(rawEstimateData_precision6,arr_len(rawEstimateData_precision6))); + rawEstimateData.push_back(vector(rawEstimateData_precision7,arr_len(rawEstimateData_precision7))); + rawEstimateData.push_back(vector(rawEstimateData_precision8,arr_len(rawEstimateData_precision8))); + rawEstimateData.push_back(vector(rawEstimateData_precision9,arr_len(rawEstimateData_precision9))); + rawEstimateData.push_back(vector(rawEstimateData_precision10,arr_len(rawEstimateData_precision10))); + rawEstimateData.push_back(vector(rawEstimateData_precision11,arr_len(rawEstimateData_precision11))); + rawEstimateData.push_back(vector(rawEstimateData_precision12,arr_len(rawEstimateData_precision12))); + rawEstimateData.push_back(vector(rawEstimateData_precision13,arr_len(rawEstimateData_precision13))); + rawEstimateData.push_back(vector(rawEstimateData_precision14,arr_len(rawEstimateData_precision14))); + rawEstimateData.push_back(vector(rawEstimateData_precision15,arr_len(rawEstimateData_precision15))); + rawEstimateData.push_back(vector(rawEstimateData_precision16,arr_len(rawEstimateData_precision16))); + rawEstimateData.push_back(vector(rawEstimateData_precision17,arr_len(rawEstimateData_precision17))); + rawEstimateData.push_back(vector(rawEstimateData_precision18,arr_len(rawEstimateData_precision18))); + + } + + void initBiasData() { + biasData = vector >(); + + biasData.push_back(vector(biasData_precision4,arr_len(biasData_precision4))); + biasData.push_back(vector(biasData_precision5,arr_len(biasData_precision5))); + biasData.push_back(vector(biasData_precision6,arr_len(biasData_precision6))); + biasData.push_back(vector(biasData_precision7,arr_len(biasData_precision7))); + biasData.push_back(vector(biasData_precision8,arr_len(biasData_precision8))); + biasData.push_back(vector(biasData_precision9,arr_len(biasData_precision9))); + biasData.push_back(vector(biasData_precision10,arr_len(biasData_precision10))); + biasData.push_back(vector(biasData_precision11,arr_len(biasData_precision11))); + biasData.push_back(vector(biasData_precision12,arr_len(biasData_precision12))); + biasData.push_back(vector(biasData_precision13,arr_len(biasData_precision13))); + biasData.push_back(vector(biasData_precision14,arr_len(biasData_precision14))); + biasData.push_back(vector(biasData_precision15,arr_len(biasData_precision15))); + 
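+ // order matters: getEstimateBias() below indexes both tables by (p - 4),
+ // so the push_back sequence must stay aligned with precisions 4..18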
biasData.push_back(vector(biasData_precision16,arr_len(biasData_precision16))); + biasData.push_back(vector(biasData_precision17,arr_len(biasData_precision17))); + biasData.push_back(vector(biasData_precision18,arr_len(biasData_precision18))); + } + + /** + * Estimate the bias using empirically determined values. + * Uses weighted average of the two cells between which the estimate falls. + * TODO: Check if nearest neighbor average gives better values, as proposed in the paper + * @param est + * @return correction value for + */ + double getEstimateBias(double estimate) { + vector rawEstimateTable = rawEstimateData[p-4]; + vector biasTable = biasData[p-4]; + + // check if estimate is lower than first entry, or larger than last + if (rawEstimateTable.front() >= estimate) { return rawEstimateTable.front() - biasTable.front(); } + if (rawEstimateTable.back() <= estimate) { return rawEstimateTable.back() - biasTable.back(); } + + // get iterator to first element that is not smaller than estimate + vector::const_iterator it = lower_bound(rawEstimateTable.begin(),rawEstimateTable.end(),estimate); + size_t pos = it - rawEstimateTable.begin(); + + double e1 = rawEstimateTable[pos-1]; + double e2 = rawEstimateTable[pos]; + + double c = (estimate - e1) / (e2 - e1); + + return biasTable[pos-1]*(1-c) + biasTable[pos]*c; + } + + + /** + * Encode the 64-bit hash code x as an 32-bit integer, to be used in the sparse representation. + * + * Difference from the algorithm described in the paper: + * The index always is in the p most significant bits + * + * see section 5.3 in Heule et al. + * @param x the hash bits + * @return encoded hash value + */ + uint32_t encodeHashIn32Bit(uint64_t hash_value) { + // extract first pPrime bits, and shift them onto a 32-bit integer + uint32_t idx = (uint32_t)(extractBits(hash_value,pPrime) >> 32); + +#ifdef HLL_DEBUG + cerr << "value: " << bitset<64>(hash_value) << endl; + cerr << "index: " << std::bitset<32>(idx) << " ( bits from 64 to " << 64-pPrime << "; " << idx << ")" << endl; +#endif + + // are the bits {63-p, ..., 63-p'} all 0? + if (extractBits(hash_value, 64-this->p, 64-pPrime) == 0) { + // compute the additional rank (minimum rank is already p'-p) + // the maximal size will be below 2^6=64. We thus combine the 25 bits of the index with 6 bits for the rank, and one bit as flag + uint8_t additional_rank = get_rank(hash_value, pPrime); // this is rank - (p'-p), as we know that positions p'...p are 0 + return idx | uint32_t(additional_rank<<1) | 1; + } else { + // else, return the idx, only - it has enough length to calculate the rank (left-shifted, last bit = 0) + assert_eq((idx & 1),0); + return idx; + } + } + + + /** + * struct holding the index and rank/rho of an entry + */ + struct idx_n_rank { + uint32_t idx; + uint8_t rank; + idx_n_rank(uint32_t _idx, uint8_t _rank) : idx(_idx), rank(_rank) {} + }; + + // + // + /** + * Decode a hash from the sparse representation. 
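+ * Encoded layout (cf. encodeHashIn32Bit above): if the flag bit (LSB) is 1,
+ * bits 1..6 carry the additional rank beyond the implied (pPrime - p);
+ * if it is 0, the rank is recomputed from the stored pPrime-bit index.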
+ * Returns the index and number of leading zeros (nlz) with precision p stored in k + * @param k the hash bits + * @return index and rank in non-sparse format + */ + idx_n_rank getIndexAndRankFromEncodedHash(const uint32_t encoded_hash_value) const { + + // difference to paper: Index can be recovered in the same way for pPrime and normally encoded hashes + uint32_t idx = get_index(encoded_hash_value, p); + uint8_t rank_val; + + // check if the last bit is 1 + if ( (encoded_hash_value & 1) == 1) { + // if yes: the hash was stored with higher precision, bits p to pPrime were 0 + uint8_t additional_rank = pPrime - p; + rank_val = additional_rank + extractBits(encoded_hash_value, 7, 1); + } else { + rank_val = get_rank(encoded_hash_value,p); + + // clz counts 64 bit only, it seems + if (rank_val > 32) + rank_val -= 32; + } + + return(idx_n_rank(idx,rank_val)); + } + +}; + + + + +#endif /* HYPERLOGLOGPLUS_H_ */ diff --git a/src/make_seqid_to_taxid_map.cpp b/src/make_seqid_to_taxid_map.cpp index 30b3091..8b968aa 100644 --- a/src/make_seqid_to_taxid_map.cpp +++ b/src/make_seqid_to_taxid_map.cpp @@ -41,12 +41,22 @@ void report_taxo_numbers(char *filename); int main(int argc, char **argv) { if (argc < 3) { - cerr << "Usage: make_seqid_to_taxid_map " + cerr << "Usage: make_seqid_to_taxid_map [ ]\n" + << " If nodes.dmp and names.dmp files are provided, then each sequence header is added with a further link\n" + << " to the taxonomy." << endl; return 1; } char *map_filename = argv[1]; char *list_filename = argv[2]; + + char *nodes_filename; + char *names_filename; + if (argc == 5) { + nodes_filename = argv[3]; + names_filename = argv[4]; + } + fill_request_map(list_filename); report_taxo_numbers(map_filename); @@ -96,8 +106,8 @@ void fill_request_map(char *filename) { fptr_start = fptr = file.ptr(); size_t file_size = file.size(); - // Line format: - // OR: TAXID (user spec'ed) + // Line format: + // OR: TAXID (user spec'ed) while ((size_t)(fptr - fptr_start) < file_size) { char *nl_ptr = strchr(fptr, '\n'); char *sep_ptr = strchr(fptr, '\t'); diff --git a/src/third_party/MurmurHash3.cpp b/src/third_party/MurmurHash3.cpp new file mode 100644 index 0000000..aa7982d --- /dev/null +++ b/src/third_party/MurmurHash3.cpp @@ -0,0 +1,335 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. 
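+// Example call for the x64 128-bit variant (hypothetical buffer and seed;
+// the declarations live in MurmurHash3.h):
+//   uint64_t out[2];
+//   MurmurHash3_x64_128(data_ptr, data_len, /*seed=*/42, out);
+//   // out[0] and out[1] together form the 128-bit hash of the input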
+ +#include "MurmurHash3.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define FORCE_INLINE inline __attribute__((always_inline)) + +inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) +{ + return p[i]; +} + +FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) +{ + return p[i]; +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix32 ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix64 ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock32(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + const uint32_t c1 = 0x239b961b; + const uint32_t c2 = 0xab0e9789; + const uint32_t c3 = 0x38b34ae5; + const uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock32(blocks,i*4+0); + uint32_t k2 = getblock32(blocks,i*4+1); + uint32_t k3 = getblock32(blocks,i*4+2); + uint32_t k4 = getblock32(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = 
ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix32(h1); + h2 = fmix32(h2); + h3 = fmix32(h3); + h4 = fmix32(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(int i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock64(blocks,i*2+0); + uint64_t k2 = getblock64(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= ((uint64_t)tail[14]) << 48; + case 14: k2 ^= ((uint64_t)tail[13]) << 40; + case 13: k2 ^= ((uint64_t)tail[12]) << 32; + case 12: k2 ^= ((uint64_t)tail[11]) << 24; + case 11: k2 ^= ((uint64_t)tail[10]) << 16; + case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; + case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; + case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; + case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; + case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; + case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; + case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; + case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; + case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + 
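+  // finalization: avalanche each 64-bit half with fmix64, then cross-add
+  // once more so both output words depend on every input bit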
+ h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + +//----------------------------------------------------------------------------- + diff --git a/src/third_party/MurmurHash3.h b/src/third_party/MurmurHash3.h new file mode 100644 index 0000000..e1c6d34 --- /dev/null +++ b/src/third_party/MurmurHash3.h @@ -0,0 +1,37 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) && (_MSC_VER < 1600) + +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned __int64 uint64_t; + +// Other compilers + +#else // defined(_MSC_VER) + +#include + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); + +//----------------------------------------------------------------------------- + +#endif // _MURMURHASH3_H_ From 3e5a0090b2ff4997b8b869be537924bb68d1fe13 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 12 Feb 2017 15:19:13 -0500 Subject: [PATCH 011/105] Skip two gi2seqid and seqid2taxid map creation for now --- scripts/build_kraken_db.sh | 43 +++++++++++++----------------------- scripts/report_gi_numbers.pl | 2 ++ 2 files changed, 17 insertions(+), 28 deletions(-) diff --git a/scripts/build_kraken_db.sh b/scripts/build_kraken_db.sh index d0b49a3..5d42fca 100755 --- a/scripts/build_kraken_db.sh +++ b/scripts/build_kraken_db.sh @@ -66,7 +66,7 @@ if [ -e "database.jdb" ] then echo "Skipping step 1, k-mer set already exists." else - echo "Creating k-mer set (step 1 of 6)..." + echo "Creating k-mer set (step 1 of 5)..." start_time1=$(date "+%s.%N") check_for_jellyfish.sh @@ -111,7 +111,7 @@ else then echo "Skipping step 2, database reduction unnecessary." else - echo "Reducing database size (step 2 of 6)..." + echo "Reducing database size (step 2 of 5)..." max_kdb_size=$(echo "$KRAKEN_MAX_DB_SIZE*2^30 - $idx_size" | bc) idx_size_gb=$(printf %.2f $(echo "$idx_size/2^30" | bc) ) if (( $(echo "$max_kdb_size < 0" | bc) == 1 )) @@ -143,7 +143,7 @@ if [ -e "database.kdb" ] then echo "Skipping step 3, k-mer set already sorted." else - echo "Sorting k-mer set (step 3 of 6)..." + echo "Sorting k-mer set (step 3 of 5)..." start_time1=$(date "+%s.%N") db_sort -z $MEMFLAG -t $KRAKEN_THREAD_CT -n $KRAKEN_MINIMIZER_LEN \ -d database.jdb -o database.kdb.tmp \ @@ -155,41 +155,28 @@ else echo "K-mer set sorted. [$(report_time_elapsed $start_time1)]" fi -if [ -e "gi2seqid.map" ] -then - echo "Skipping step 4, GI number to seqID map already complete." -else - echo "Creating GI number to seqID map (step 4 of 6)..." - start_time1=$(date "+%s.%N") - find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \ - report_gi_numbers.pl > gi2seqid.map.tmp - mv gi2seqid.map.tmp gi2seqid.map - - echo "GI number to seqID map created. 
[$(report_time_elapsed $start_time1)]" -fi - if [ -e "seqid2taxid.map" ] then - echo "Skipping step 5, seqID to taxID map already complete." + echo "Skipping step 4, seqID to taxID map already complete." else - echo "Creating seqID to taxID map (step 5 of 6)..." - start_time1=$(date "+%s.%N") - make_seqid_to_taxid_map taxonomy/gi_taxid_nucl.dmp gi2seqid.map \ - > seqid2taxid.map.tmp - mv seqid2taxid.map.tmp seqid2taxid.map - line_ct=$(wc -l seqid2taxid.map | awk '{print $1}') - - echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" + echo "Creating seqID to taxID map (step 4 of 5)... [blu]" +# start_time1=$(date "+%s.%N") +# make_seqid_to_taxid_map taxonomy/gi_taxid_nucl.dmp gi2seqid.map \ +# > seqid2taxid.map.tmp +# mv seqid2taxid.map.tmp seqid2taxid.map +# line_ct=$(wc -l seqid2taxid.map | awk '{print $1}') + +# echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" fi if [ -e "lca.complete" ] then - echo "Skipping step 6, LCAs already set." + echo "Skipping step 5, LCAs already set." else - echo "Setting LCAs in database (step 6 of 6)..." + echo "Setting LCAs in database (step 5 of 5)..." start_time1=$(date "+%s.%N") find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \ - set_lcas $MEMFLAG -x -d database.kdb -i database.idx \ + set_lcas $MEMFLAG -x -d database.kdb -i database.idx -v \ -n taxonomy/nodes.dmp -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 touch "lca.complete" diff --git a/scripts/report_gi_numbers.pl b/scripts/report_gi_numbers.pl index 88a24f0..0d07b85 100755 --- a/scripts/report_gi_numbers.pl +++ b/scripts/report_gi_numbers.pl @@ -38,6 +38,7 @@ next unless /^>(\S+)/; my $seq_id = $1; if ($seq_id =~ /(^|\|)kraken:taxid\|(\d+)/) { + print "TAXID\t$2\t$seq_id\t$_\n"; next; } @@ -45,5 +46,6 @@ if ($seq_id !~ /(^|\|)gi\|(\d+)/) { die "$PROG: sequence ID $seq_id lacks GI number, aborting.\n"; } + print "$2\t$seq_id\t$_\n"; } From 93904c7397b9baac187d346ff477f59e38b3ddad Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 14 Feb 2017 20:54:57 -0500 Subject: [PATCH 012/105] Don't display counts --- src/classify.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/classify.cpp b/src/classify.cpp index 5909a85..9dbe48b 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -58,6 +58,7 @@ bool Print_kraken = true; bool Populate_memory = false; bool Only_classified_kraken_output = false; bool Print_sequence = true; +bool Print_Progress = false; uint32_t Minimum_hit_count = 1; map Parent_map; vector KrakenDatabases; @@ -235,7 +236,8 @@ void process_file(char *filename) { (*Unclassified_output) << unclassified_output_ss.str(); total_sequences += work_unit.size(); total_bases += total_nt; - cerr << "\rProcessed " << total_sequences << " sequences (" << total_bases << " bp) ..."; + if (Print_Progress) + cerr << "\rProcessed " << total_sequences << " sequences (" << total_bases << " bp) ..."; } } } // end parallel section From 905fa088a2355b50491016b5c63291f5a935e533 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 14 Feb 2017 20:55:10 -0500 Subject: [PATCH 013/105] Fix Makefile --- src/Makefile | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/Makefile b/src/Makefile index 6e2c938..48debd2 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,6 @@ CXX = g++ -CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 -fsyntax-only -PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink get_kmers +CXXFLAGS = -Wall 
-std=c++11 -fopenmp -O3 +PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink .PHONY: all install clean @@ -18,8 +18,6 @@ db_sort: krakendb.o quickfile.o set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o -get_kmers: krakendb.o quickfile.o krakenutil.o seqreader.o - classify: krakendb.o quickfile.o krakenutil.o seqreader.o make_seqid_to_taxid_map: quickfile.o From 7b4530441a4a1b7591f76ff0a9ee4a70089b9bba Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 15 Feb 2017 12:50:36 -0500 Subject: [PATCH 014/105] Added taxdb from k-SLAM for writing report after classification --- src/build_taxdb.cpp | 33 ++++ src/taxdb.h | 372 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 405 insertions(+) create mode 100644 src/build_taxdb.cpp create mode 100644 src/taxdb.h diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp new file mode 100644 index 0000000..a802aa2 --- /dev/null +++ b/src/build_taxdb.cpp @@ -0,0 +1,33 @@ +/* + * Copyright 2017, Florian Breitwieser + * + * This file is part of the Kraken taxonomic sequence classification system. + * + * Kraken is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Kraken is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Kraken. If not, see . + */ + +#include "taxdb.h" + +#include +using namespace std; + +int main(int argc, char **argv) { + std::string database_dir = argv[0]; + TaxonomyDB taxdb; + taxdb.writeTaxonomyIndex( + std::cout, + database_dir + "/taxonomy/nodes.dmp", + database_dir + "/taxonomy/names.dmp"); + +} diff --git a/src/taxdb.h b/src/taxdb.h new file mode 100644 index 0000000..da975e3 --- /dev/null +++ b/src/taxdb.h @@ -0,0 +1,372 @@ +/* Original work Copyright 2013 David Ainsworth + * Modified work copyright 2017 Florian Breitwieser + * + * The original file is part of SLAM + * + * SLAM is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * SLAM is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + + * You should have received a copy of the GNU Affero General Public License + * along with SLAM. If not, see . + */ + +#ifndef TAXD_DB_H_ +#define TAXD_DB_H_ + +#include +#include +#include +#include +#include +#include +#include + +void log (const std::string& s) { + std::cerr << s << "\n"; +} + +std::vector tokenise(const std::string &line, const std::string& delimiters) { + std::vector tokens; + // Skip delimiters at beginning. + std::string::size_type lastPos = line.find_first_not_of(delimiters, 0); + std::string::size_type pos = line.find_first_of(delimiters, lastPos); + while (std::string::npos != pos || std::string::npos != lastPos) { + tokens.push_back(line.substr(lastPos, pos - lastPos)); + // Skip delimiters. 
Note the "not_of" + lastPos = line.find_first_not_of(delimiters, pos); + pos = line.find_first_of(delimiters, lastPos); + } + return tokens; +} + +class TaxonomyEntry { + public: + uint32_t taxonomyID = 0; + uint32_t parentTaxonomyID = 0; + std::string rank; + std::string scientificName; + + TaxonomyEntry() {} + TaxonomyEntry(uint32_t taxonomyID_, uint32_t parentTaxonomyID_, std::string rank_, std::string scientificName_) : + taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_), scientificName(scientificName_) {} + + inline bool operator==(const TaxonomyEntry& other) const { + return this->taxonomyID == other.taxonomyID && + this->parentTaxonomyID == other.parentTaxonomyID && + this->scientificName == other.scientificName; + } + TaxonomyEntry* parent = nullptr; + std::vector children; + unsigned numReadsAligned = 0; + unsigned numReadsAlignedToChildren = 0; + bool used = false; + uint64_t genomeSize = 0; + uint64_t genomeSizeOfChildren = 0; + uint64_t numBelow = 0; +}; + +class TaxonomyDB { + public: + TaxonomyDB(const std::string inFileName); + TaxonomyDB() {}; + std::unordered_map taxIDsAndEntries; + void parseNamesDump(const std::string namesDumpFileName); + void parseNodesDump(const std::string nodesDumpFileName); + uint32_t getTaxIDAtRank(const uint32_t taxID, const std::string& rank) const; + std::string getScientificName(const uint32_t taxID) const; + std::string getRank(const uint32_t taxID) const; + uint32_t getLowestCommonAncestor(const std::vector& taxIDs) const; + uint32_t getParentTaxID(const uint32_t taxID) const; + std::string getLineage(uint32_t taxonomyID) const; + std::string getMetaPhlAnLineage(uint32_t taxonomyID) const; + char* getIndexFileName(const uint32_t hostTaxID) const; + void readTaxonomyIndex(const std::string inFileName); + void writeTaxonomyIndex(std::ostream & outs, + const std::string namesDumpFileName, + const std::string nodesDumpFileName); + bool isSubSpecies(uint32_t taxonomyID) const; + int isBelowInTree(uint32_t upper, uint32_t lower) const; + void createPointers(); +}; + + +void TaxonomyDB::createPointers() { + for (auto& tax : taxIDsAndEntries) { + auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); + if (parentIt != taxIDsAndEntries.end()) { + tax.second.parent = &(parentIt->second); + parentIt->second.children.push_back(&tax.second); + } + } +} +TaxonomyDB::TaxonomyDB(const std::string inFileName) { + log("Building taxonomy index"); + readTaxonomyIndex(inFileName); + createPointers(); + log("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + + " nodes"); +} + +void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { + std::ifstream nodesDumpFile(nodesDumpFileName); + if (!nodesDumpFile.is_open()) + throw std::runtime_error("unable to open nodes file"); + std::string line; + while (nodesDumpFile.good()) { + getline(nodesDumpFile, line); + std::vector tokens = tokenise(line, "\t|"); + if (tokens.size() > 2) { + TaxonomyEntry newEntry; + newEntry.taxonomyID = stoi(tokens[0]); + newEntry.parentTaxonomyID = stoi(tokens[1]); + newEntry.rank = tokens[2]; + auto entryIt = taxIDsAndEntries.insert({ + newEntry.taxonomyID, newEntry + }); + if (!entryIt.second) { + entryIt.first->second.taxonomyID = newEntry.taxonomyID; + newEntry.parentTaxonomyID = stoi(tokens[1]); + } + } + } +} + +void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { + std::ifstream namesDumpFile(namesDumpFileName); + if (!namesDumpFile.is_open()) + throw std::runtime_error("unable to open names 
file"); + std::string line; + while (namesDumpFile.good()) { + getline(namesDumpFile, line); + std::vector tokens = tokenise(line, "|"); + for (auto& token : tokens) { + if (token.size() > 1) { + if (token[0] == '\t') token.erase(0, 1); + if (token[token.size() - 1] == '\t') token.erase(token.size() - 1, 1); + } + } + if (tokens.size() > 3) { + TaxonomyEntry newEntry; + newEntry.taxonomyID = stoi(tokens[0]); + // for(auto & token : tokens) + // std::cout<second.scientificName = newEntry.scientificName; + } + } + } +} + +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, + const std::string namesDumpFileName, + const std::string nodesDumpFileName) { + parseNodesDump(nodesDumpFileName); + parseNamesDump(namesDumpFileName); + for (auto& entry : taxIDsAndEntries) { + outs << entry.first << "\t" << entry.second.parentTaxonomyID << "\t" + << entry.second.scientificName << "\t" << entry.second.rank << "\n"; + } +} + +void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { + std::ifstream inFile(inFileName); + if (!inFile.is_open()) + throw std::runtime_error("unable to open taxonomy index file"); + + uint32_t taxonomyID, parentTaxonomyID; + std::string scientificName, rank; + + while (inFile >> taxonomyID >> parentTaxonomyID >> rank >> scientificName) { + TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); + + taxIDsAndEntries.insert({ + taxonomyID, newEntry + }); + } +} + +uint32_t TaxonomyDB::getLowestCommonAncestor( + const std::vector& taxIDs) const { + if (taxIDs.size() == 0) { + return 0; + } + std::vector > paths; + for (auto& taxID : taxIDs) { + bool good = true; + std::vector path; + uint32_t tempTaxID = taxID; + while (tempTaxID != 0) { + path.push_back(tempTaxID); + tempTaxID = getParentTaxID(tempTaxID); + } + if (good) paths.push_back(path); + } + if (paths.size() == 0) { + return 0; + } + for (auto& path : paths) + std::reverse(path.begin(), path.end()); + std::sort(paths.begin(), paths.end(), + [](std::vector i, std::vector j) { + return i.size() < j.size(); + }); + uint32_t consensus = 0; + for (unsigned i = 0; i < paths[0].size(); i++) { + uint32_t temp = 0; + for (auto& path : paths) { + if (temp == 0) + temp = path[i]; + else if (temp != path[i]) { + return consensus; + } + } + consensus = temp; + } + return consensus; +} + +uint32_t TaxonomyDB::getParentTaxID(const uint32_t taxID) const { + auto entry = taxIDsAndEntries.find(taxID); + if (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) + return entry->second.parentTaxonomyID; + else + return 0; +} + +std::string TaxonomyDB::getScientificName(const uint32_t taxID) const { + auto entry = taxIDsAndEntries.find(taxID); + if (entry != taxIDsAndEntries.end()) { + return entry->second.scientificName; + } else + return std::string(); +} + +std::string TaxonomyDB::getRank(const uint32_t taxID) const { + auto entry = taxIDsAndEntries.find(taxID); + if (entry != taxIDsAndEntries.end()) { + return entry->second.rank; + } else + return std::string(); +} + +std::string TaxonomyDB::getLineage(uint32_t taxonomyID) const { + std::string lineage; + while (true) { + // 131567 = Cellular organisms + if (taxonomyID != 131567) { + if (lineage.size()) lineage.insert(0, "; "); + lineage.insert(0, getScientificName(taxonomyID)); + if (getRank(taxonomyID) == "species") lineage.clear(); + } + taxonomyID = getParentTaxID(taxonomyID); + if (taxonomyID == 0) { + if (lineage.size()) lineage.append("."); + break; + } + } + return lineage; +} +std::string TaxonomyDB::getMetaPhlAnLineage(uint32_t 
taxonomyID) const { + std::string rank = getRank(taxonomyID); + if (rank == "superphylum") return std::string(); + std::string lineage; + while (true) { + // 131567 = Cellular organisms + if (taxonomyID != 131567) { + std::string rank = getRank(taxonomyID); + if (rank == "species") { + lineage.insert(0, "|s__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "genus") { + lineage.insert(0, "|g__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "family") { + lineage.insert(0, "|f__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "order") { + lineage.insert(0, "|o__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "class") { + lineage.insert(0, "|c__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "phylum") { + lineage.insert(0, "|p__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "superkingdom") { + lineage.insert(0, "k__"); + lineage.insert(3, getScientificName(taxonomyID)); + } + } + taxonomyID = getParentTaxID(taxonomyID); + if (taxonomyID == 0) { + break; + } + } + std::replace(lineage.begin(), lineage.end(), ' ', '_'); + return lineage; +} + +uint32_t TaxonomyDB::getTaxIDAtRank(const uint32_t taxID, + const std::string& rank) const { + auto entry = taxIDsAndEntries.find(taxID); + while (entry != taxIDsAndEntries.end() && + entry->second.parentTaxonomyID != 1) { + if (entry->second.rank == rank) { + return entry->second.taxonomyID; + } else + entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + } + return 0; +} +int TaxonomyDB::isBelowInTree(uint32_t upper, uint32_t lower) const { + auto entry = taxIDsAndEntries.find(lower); + unsigned level = 0; + while (entry != taxIDsAndEntries.end() && + entry->second.parentTaxonomyID != 1) { + if (entry->first == upper) { + return level; + } else { + entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + level++; + } + } + return -1; +} +bool TaxonomyDB::isSubSpecies(uint32_t taxonomyID) const { + bool isSubSpecies = false; + auto entry = taxIDsAndEntries.find(taxonomyID); + int numLevels = 0; + while (entry != taxIDsAndEntries.end() && + entry->second.parentTaxonomyID != 1) { + if (entry->second.rank == "species") { + if (numLevels > 0) { + isSubSpecies = true; + } + break; + } else + entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + numLevels++; + } + return isSubSpecies; +} + + + + +#endif /* TAXD_DB_H_ */ From 330e186976fe66a55bdfd6e4591c62ca2c8ef45e Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 16 Feb 2017 13:22:13 -0500 Subject: [PATCH 015/105] Build report and taxDB in Kraken --- scripts/build_kraken_db.sh | 10 ++++ scripts/kraken | 2 + src/Makefile | 7 ++- src/build_taxdb.cpp | 9 +-- src/classify.cpp | 47 ++++++++++----- src/hyperloglogplus.h | 10 ++++ src/report-cols.h | 53 +++++++++++++++++ src/taxdb.h | 118 +++++++++++++++++++++++++++++++++++++ 8 files changed, 235 insertions(+), 21 deletions(-) create mode 100644 src/report-cols.h diff --git a/scripts/build_kraken_db.sh b/scripts/build_kraken_db.sh index 5d42fca..4f64c14 100755 --- a/scripts/build_kraken_db.sh +++ b/scripts/build_kraken_db.sh @@ -169,6 +169,16 @@ else # echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" fi +if [ -e "taxDB" ] +then + echo "Skipping step 4.5, taxDB exists." +else + echo "Creating taxDB (step 4.5 of 5)... 
" + build_taxdb taxonomy/nodes.dmp taxonomy/names.dmp > taxDB +fi + + + if [ -e "lca.complete" ] then echo "Skipping step 5, LCAs already set." diff --git a/scripts/kraken b/scripts/kraken index c81ed38..1119868 100755 --- a/scripts/kraken +++ b/scripts/kraken @@ -147,6 +147,8 @@ push @flags, "-C", $classified_out if defined $classified_out; push @flags, "-o", $outfile if defined $outfile; push @flags, "-c", if $only_classified_output; push @flags, "-M" if $preload; +push @flags, "-r", $report_file; +push @flags, "-a", $db_prefix[0]."/taxDB"; # handle piping for decompression/merging my @pipe_argv; diff --git a/src/Makefile b/src/Makefile index 48debd2..98bdd00 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,6 @@ CXX = g++ -CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 -PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink +CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 -fmax-errors=3 +PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb .PHONY: all install clean @@ -22,6 +22,9 @@ classify: krakendb.o quickfile.o krakenutil.o seqreader.o make_seqid_to_taxid_map: quickfile.o +build_taxdb: taxdb.h + $(CXX) $(CXXFLAGS) -o build_taxdb build_taxdb.cpp + krakenutil.o: krakenutil.cpp krakenutil.hpp $(CXX) $(CXXFLAGS) -c krakenutil.cpp diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index a802aa2..08e649a 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -23,11 +23,12 @@ using namespace std; int main(int argc, char **argv) { - std::string database_dir = argv[0]; + if (argc != 3) { + std::cout << "Provide names.dmp and nodes.dmp\n"; + return 1; + } TaxonomyDB taxdb; taxdb.writeTaxonomyIndex( - std::cout, - database_dir + "/taxonomy/nodes.dmp", - database_dir + "/taxonomy/names.dmp"); + std::cout, argv[1], argv[2]); } diff --git a/src/classify.cpp b/src/classify.cpp index 9dbe48b..2f09ff7 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -23,6 +23,7 @@ #include "quickfile.hpp" #include "seqreader.hpp" #include "hyperloglogplus.h" +#include "taxdb.h" const size_t DEF_WORK_UNIT_SIZE = 500000; @@ -38,13 +39,7 @@ string hitlist_string(vector &taxa, vector &ambig); set get_ancestry(uint32_t taxon); void report_stats(struct timeval time1, struct timeval time2); -struct ReadCounts { - uint32_t n_reads; - uint32_t n_kmers; - HyperLogLogPlusMinus kmers; // unique k-mer count per taxon -}; - -map taxon_counts; // stats per taxon +unordered_map taxon_counts; // stats per taxon int Num_threads = 1; vector DB_filenames; @@ -55,6 +50,7 @@ bool Fastq_input = false; bool Print_classified = false; bool Print_unclassified = false; bool Print_kraken = true; +bool Print_kraken_report = true; bool Populate_memory = false; bool Only_classified_kraken_output = false; bool Print_sequence = true; @@ -62,11 +58,13 @@ bool Print_Progress = false; uint32_t Minimum_hit_count = 1; map Parent_map; vector KrakenDatabases; -string Classified_output_file, Unclassified_output_file, Kraken_output_file; +string Classified_output_file, Unclassified_output_file, Kraken_output_file, Report_output_file, TaxDB_file; ostream *Classified_output; ostream *Unclassified_output; ostream *Kraken_output; +ostream *Report_output; size_t Work_unit_size = DEF_WORK_UNIT_SIZE; +TaxonomyDB taxdb; uint64_t total_classified = 0; uint64_t total_sequences = 0; @@ -152,6 +150,18 @@ int main(int argc, char **argv) { else Kraken_output = &cout; + if (Report_output_file.empty() || Report_output_file == "-") { + Print_kraken_report = false; + } else { + Report_output = new 
ofstream(Report_output_file.c_str()); + } + + if (!TaxDB_file.empty() && Print_kraken_report) { + taxdb.readTaxonomyIndex(TaxDB_file); + } else { + Print_kraken_report = false; + } + struct timeval tv1, tv2; gettimeofday(&tv1, NULL); for (int i = optind; i < argc; i++) @@ -242,12 +252,11 @@ void process_file(char *filename) { } } // end parallel section - // Write out report - print k-mers and read numbers - for (auto& elem : taxon_counts) { - //elem.first gives you the key (int) - //elem.second gives you the mapped element (vector) - cerr << elem.first << "\t" << elem.second.n_reads << "\t" << - elem.second.n_kmers << "\t" << elem.second.kmers.cardinality() << "\n"; + if (Print_kraken_report) { + // Fill TaxDB with counts + taxdb.fillCounts(taxon_counts); + TaxReport rep = TaxReport(*Report_output, taxdb, false); + rep.printReport("kraken","blu"); } delete reader; @@ -410,7 +419,7 @@ void parse_command_line(int argc, char **argv) { if (argc > 1 && strcmp(argv[1], "-h") == 0) usage(0); - while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:M")) != -1) { + while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:Ma:r:")) != -1) { switch (opt) { case 'd' : DB_filenames.push_back(optarg); @@ -458,6 +467,12 @@ void parse_command_line(int argc, char **argv) { case 'o' : Kraken_output_file = optarg; break; + case 'r' : + Report_output_file = optarg; + break; + case 'a' : + TaxDB_file = optarg; + break; case 'u' : sig = atoll(optarg); if (sig <= 0) @@ -498,6 +513,8 @@ void usage(int exit_code) { << "* -i filename Kraken DB index filename" << endl << " -n filename NCBI Taxonomy nodes file" << endl << " -o filename Output file for Kraken output" << endl + << " -r filename Output file for Kraken report output" << endl + << " -a filename TaxDB" << endl << " -t # Number of threads" << endl << " -u # Thread work unit size (in bp)" << endl << " -q Quick operation" << endl diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h index 33f5dc1..8cd2bdc 100644 --- a/src/hyperloglogplus.h +++ b/src/hyperloglogplus.h @@ -388,6 +388,16 @@ class HyperLogLogPlusMinus { } } + HyperLogLogPlusMinus & operator+=(const HyperLogLogPlusMinus* other) { + merge(other); + return *this; + } + + HyperLogLogPlusMinus & operator+=(const HyperLogLogPlusMinus& other) { + merge(&other); + return *this; + } + /** * * @return cardinality estimate diff --git a/src/report-cols.h b/src/report-cols.h new file mode 100644 index 0000000..7087a82 --- /dev/null +++ b/src/report-cols.h @@ -0,0 +1,53 @@ +/* + * report-cols.h + * Copyright (C) 2017 fbreitwieser + * + * Distributed under terms of the MIT license. 
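+ * Declares the REPORTCOLS column identifiers and the name-to-column
+ * mapping used by the report writer to choose its output fields.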
+ */ + +#ifndef REPORT_COLS_H +#define REPORT_COLS_H + +#include + +enum class REPORTCOLS : uint8_t { + SPACED_NAME, + NAME, + TAX_ID, + TAX_RANK, + DEPTH, + GENOME_SIZE, + NUM_READS, + NUM_READS_CLADE, + NUM_UNIQUE_KMERS, + TOTAL_SCORE, + TOTAL_HIT_LENGTH, + ABUNDANCE, + ABUNDANCE_LEN, + PERCENTAGE +}; + + +static const std::map report_col_name_map = { + {"name", REPORTCOLS::NAME}, + {"spaced_name", REPORTCOLS::SPACED_NAME}, + {"taxID", REPORTCOLS::TAX_ID}, + {"taxRank", REPORTCOLS::TAX_RANK}, + {"depth", REPORTCOLS::DEPTH}, + {"genomeSize", REPORTCOLS::GENOME_SIZE}, + {"numReads", REPORTCOLS::NUM_READS}, + {"numReadsClade", REPORTCOLS::NUM_READS_CLADE}, + {"numUniqueKmers", REPORTCOLS::NUM_UNIQUE_KMERS}, + {"totalHitLen", REPORTCOLS::TOTAL_HIT_LENGTH}, + {"totalScore", REPORTCOLS::TOTAL_SCORE}, + {"abundance", REPORTCOLS::ABUNDANCE}, + {"abundance_len", REPORTCOLS::ABUNDANCE_LEN}, + + {"percent", REPORTCOLS::PERCENTAGE}, + {"taxId", REPORTCOLS::TAX_ID}, + {"reads_clade", REPORTCOLS::NUM_READS_CLADE}, // Change to clade reads! + {"reads_stay", REPORTCOLS::NUM_READS}, // Change to clade reads! + +}; + +#endif /* !REPORT_COLS_H */ diff --git a/src/taxdb.h b/src/taxdb.h index da975e3..da11c96 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -2,6 +2,7 @@ * Modified work copyright 2017 Florian Breitwieser * * The original file is part of SLAM + * The modified file is part of a modified Kraken version * * SLAM is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -27,6 +28,17 @@ #include #include #include +#include "hyperloglogplus.h" +#include "report-cols.h" + +typedef uint32_t TaxId; + +struct ReadCounts { + uint32_t n_reads; + uint32_t n_kmers; + HyperLogLogPlusMinus kmers; // unique k-mer count per taxon +}; + void log (const std::string& s) { std::cerr << s << "\n"; @@ -64,12 +76,15 @@ class TaxonomyEntry { } TaxonomyEntry* parent = nullptr; std::vector children; + unsigned numReadsAligned = 0; unsigned numReadsAlignedToChildren = 0; bool used = false; uint64_t genomeSize = 0; uint64_t genomeSizeOfChildren = 0; uint64_t numBelow = 0; + uint64_t numKmers; + HyperLogLogPlusMinus kmers; }; class TaxonomyDB { @@ -93,7 +108,9 @@ class TaxonomyDB { const std::string nodesDumpFileName); bool isSubSpecies(uint32_t taxonomyID) const; int isBelowInTree(uint32_t upper, uint32_t lower) const; + void fillCounts(const unordered_map& taxon_counts); void createPointers(); + void printReport(); }; @@ -366,6 +383,107 @@ bool TaxonomyDB::isSubSpecies(uint32_t taxonomyID) const { return isSubSpecies; } +void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { + for (auto& elem : taxon_counts) { + TaxonomyEntry* tax = &taxIDsAndEntries.at(elem.first); + tax->numReadsAligned += elem.second.n_reads; + tax->numKmers += elem.second.n_kmers; + tax->kmers += elem.second.kmers; + + while (tax->parent != nullptr) { + tax = tax->parent; + tax->numReadsAlignedToChildren += elem.second.n_reads; + tax->numKmers += elem.second.n_kmers; + tax->kmers += elem.second.kmers; + } + } +} + + +class TaxReport { +private: + std::ostream& _reportOfb; + TaxonomyDB & _taxdb; + std::vector _report_cols; + uint64_t _total_n_reads; + bool _show_zeros; + + void printLine(TaxonomyEntry& tax, unsigned depth); + +public: + TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); + + void printReport(std::string format, std::string rank); + void printReport(TaxonomyEntry& tax, unsigned depth); +}; + +TaxReport::TaxReport(std::ostream& 
reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { + _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; +} + +void TaxReport::printReport(std::string format, std::string rank) { + _total_n_reads = + _taxdb.taxIDsAndEntries.at(0).numReadsAligned + + _taxdb.taxIDsAndEntries.at(0).numReadsAlignedToChildren + + _taxdb.taxIDsAndEntries.at(1).numReadsAligned + + _taxdb.taxIDsAndEntries.at(1).numReadsAlignedToChildren + + _taxdb.taxIDsAndEntries.at(-1).numReadsAligned + + _taxdb.taxIDsAndEntries.at(-1).numReadsAlignedToChildren; // -1 is a magic number in centrifuge for reads not matched to the taxonomy tree + + if (format == "kraken") { + // A: print number of unidentified reads + printReport(_taxdb.taxIDsAndEntries.at(0),0u); + // B: print normal results + printReport(_taxdb.taxIDsAndEntries.at(1),0u); + // C: Print Unclassified stuff + printReport(_taxdb.taxIDsAndEntries.at(-1),0u); + } else { + // print stuff at a certain level .. + //_uid_abundance; + //_taxinfo + + } +} + +void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { + + if (_show_zeros || (tax.numReadsAligned+tax.numReadsAlignedToChildren) > 0) { + printLine(tax, depth); + + for (auto child : tax.children) { + printReport(*child, depth+1); + } + } + +} + +void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { + for (auto& col : _report_cols) { + switch (col) { + case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; + case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; + case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; + case REPORTCOLS::DEPTH: _reportOfb << depth; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << 100*(tax.numReadsAligned + tax.numReadsAlignedToChildren)/_total_n_reads; break; + //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; + //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; + case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (tax.numReadsAligned + tax.numReadsAlignedToChildren); break; + case REPORTCOLS::NUM_READS: _reportOfb << tax.numReadsAligned; break; + case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break; + //case REPORTCOLS::GENOME_SIZE: ; break; + //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; + //case REPORTCOLS::SUM_SCORE: ; break; + case REPORTCOLS::TAX_RANK: _reportOfb << tax.rank; break; + default: _reportOfb << "NA"; + } + if (&col == &_report_cols.back()) { + _reportOfb << '\n'; + } else { + _reportOfb << '\t'; + } + } +} + From c60b8a2d4e9d628802a2377332896fbd700b88db Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sat, 18 Feb 2017 10:19:03 -0500 Subject: [PATCH 016/105] Allow compressed output, and fix report file generation --- install_kraken.sh | 1 + scripts/kraken | 9 +- src/Makefile | 6 +- src/classify.cpp | 136 ++++++--- src/gzstream/.Makefile.swp | Bin 0 -> 12288 bytes src/gzstream/COPYING.LIB | 504 ++++++++++++++++++++++++++++++++ src/gzstream/Makefile | 88 ++++++ src/gzstream/README | 6 + src/gzstream/gzstream.C | 165 +++++++++++ src/gzstream/gzstream.h | 121 ++++++++ src/gzstream/index.html | 145 +++++++++ src/gzstream/libgzstream.a | Bin 0 -> 14254 bytes src/gzstream/logo.gif | Bin 0 -> 1651 bytes src/gzstream/test_gunzip.C | 78 +++++ src/gzstream/test_gzip.C | 78 +++++ src/gzstream/version | 1 + src/krakenutil.cpp | 14 +- src/krakenutil.hpp | 9 +- src/make_seqid_to_taxid_map.cpp | 12 +- src/report-cols.h | 1 + src/set_lcas.cpp | 5 +- src/taxdb.h | 52 +++- 22 files changed, 1352 insertions(+), 79 deletions(-) create mode 100644 src/gzstream/.Makefile.swp create mode 100644 src/gzstream/COPYING.LIB create mode 100644 src/gzstream/Makefile create mode 100644 src/gzstream/README create mode 100644 src/gzstream/gzstream.C create mode 100644 src/gzstream/gzstream.h create mode 100644 src/gzstream/index.html create mode 100644 src/gzstream/libgzstream.a create mode 100644 src/gzstream/logo.gif create mode 100644 src/gzstream/test_gunzip.C create mode 100644 src/gzstream/test_gzip.C create mode 100644 src/gzstream/version diff --git a/install_kraken.sh b/install_kraken.sh index f0673a2..e7af3d7 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -39,6 +39,7 @@ fi export KRAKEN_DIR=$(perl -MCwd=abs_path -le 'print abs_path(shift)' "$1") mkdir -p "$KRAKEN_DIR" +make -C src clean make -C src install for file in scripts/* do diff --git a/scripts/kraken b/scripts/kraken index 1119868..29cce0d 100755 --- a/scripts/kraken +++ b/scripts/kraken @@ -57,6 +57,7 @@ my $unclassified_out; my $classified_out; my $outfile; my $report_file; +my $print_sequence = 0; GetOptions( "help" => \&display_help, @@ -69,6 +70,7 @@ GetOptions( "min-hits=i" => \$min_hits, "unclassified-out=s" => \$unclassified_out, "classified-out=s" => \$classified_out, + "print-sequence=s" => \$print_sequence, "output=s" => \$outfile, "report-file=s" => \$report_file, "preload" => \$preload, @@ -77,7 +79,7 @@ GetOptions( "gzip-compressed" => \$gunzip, "bzip2-compressed" => \$bunzip2, "only-classified-output" => \$only_classified_output, -); +) or die $!; if (! 
defined $threads) { $threads = $ENV{"KRAKEN_NUM_THREADS"} || 1; @@ -147,8 +149,9 @@ push @flags, "-C", $classified_out if defined $classified_out; push @flags, "-o", $outfile if defined $outfile; push @flags, "-c", if $only_classified_output; push @flags, "-M" if $preload; -push @flags, "-r", $report_file; +push @flags, "-r", $report_file if defined $report_file; push @flags, "-a", $db_prefix[0]."/taxDB"; +push @flags, "-s" if $print_sequence; # handle piping for decompression/merging my @pipe_argv; @@ -197,7 +200,7 @@ if (@pipe_argv) { } } -print STDERR "$CLASSIFY, @flags, @ARGV\n"; +print STDERR "$CLASSIFY @flags @ARGV\n"; exec $CLASSIFY, @flags, @ARGV; die "$PROG: exec error: $!\n"; diff --git a/src/Makefile b/src/Makefile index 98bdd00..73b6b9c 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,7 @@ CXX = g++ -CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 -fmax-errors=3 +CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 -fmax-errors=3 -g PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb +LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream .PHONY: all install clean @@ -18,7 +19,8 @@ db_sort: krakendb.o quickfile.o set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o -classify: krakendb.o quickfile.o krakenutil.o seqreader.o +classify: krakendb.o quickfile.o krakenutil.o seqreader.o taxdb.h + $(CXX) $(CXXFLAGS) -o classify classify.cpp $^ $(LIBFLAGS) make_seqid_to_taxid_map: quickfile.o diff --git a/src/classify.cpp b/src/classify.cpp index 2f09ff7..9f7933e 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -24,6 +24,7 @@ #include "seqreader.hpp" #include "hyperloglogplus.h" #include "taxdb.h" +#include "gzstream.h" const size_t DEF_WORK_UNIT_SIZE = 500000; @@ -38,7 +39,6 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, string hitlist_string(vector &taxa, vector &ambig); set get_ancestry(uint32_t taxon); void report_stats(struct timeval time1, struct timeval time2); - unordered_map taxon_counts; // stats per taxon int Num_threads = 1; @@ -53,16 +53,18 @@ bool Print_kraken = true; bool Print_kraken_report = true; bool Populate_memory = false; bool Only_classified_kraken_output = false; -bool Print_sequence = true; -bool Print_Progress = false; +bool Print_sequence = false; +bool Print_Progress = true; uint32_t Minimum_hit_count = 1; -map Parent_map; +unordered_map Parent_map; vector KrakenDatabases; string Classified_output_file, Unclassified_output_file, Kraken_output_file, Report_output_file, TaxDB_file; ostream *Classified_output; ostream *Unclassified_output; ostream *Kraken_output; ostream *Report_output; +vector Open_fstreams; +vector Open_gzstreams; size_t Work_unit_size = DEF_WORK_UNIT_SIZE; TaxonomyDB taxdb; @@ -70,6 +72,29 @@ uint64_t total_classified = 0; uint64_t total_sequences = 0; uint64_t total_bases = 0; +inline bool ends_with(std::string const & value, std::string const & ending) +{ + if (ending.size() > value.size()) return false; + return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); +} + +ostream* cout_or_file(string file) { + if (file == "-") + return &cout; + + if (ends_with(file, ".gz")) { + ogzstream* ogzs = new ogzstream(file.c_str()); + Open_gzstreams.push_back(ogzs); + return ogzs; + } else { + ofstream* ofs = new ofstream(file.c_str()); + Open_fstreams.push_back(ofs); + return ofs; + } +} + + + void loadKrakenDB(KrakenDB& database, string DB_filename, string Index_filename) { QuickFile db_file; db_file.open_file(DB_filename); @@ -92,9 +117,21 @@ int main(int argc, char **argv) { #endif 
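  // Illustrative sketch, not part of this patch: the code just below derives
  // Parent_map from the taxDB file instead of NCBI nodes.dmp. A parent map
  // like this is what the LCA helpers in krakenutil consume; a minimal,
  // hedged version of such a lookup (semantics per krakenutil.hpp:
  // LCA(0,x) = x, default ancestor is the root, taxon 1) could look like:
  //
  //   #include <cstdint>
  //   #include <set>
  //   #include <unordered_map>
  //
  //   uint32_t lca_sketch(const std::unordered_map<uint32_t, uint32_t>& parents,
  //                       uint32_t a, uint32_t b) {
  //     if (a == 0 || b == 0)
  //       return a ? a : b;           // LCA(0,x) = LCA(x,0) = x
  //     std::set<uint32_t> a_path;    // ancestors of a, up to the root
  //     while (a > 1) {
  //       a_path.insert(a);
  //       a = parents.at(a);          // assumes every taxon has a parent entry
  //     }
  //     while (b > 1) {               // first ancestor of b on a's path is the LCA
  //       if (a_path.count(b)) return b;
  //       b = parents.at(b);
  //     }
  //     return 1;                     // fall back to the root
  //   }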
parse_command_line(argc, argv); - if (! Nodes_filename.empty()) { - cerr << "Building parent node map " << endl; - Parent_map = build_parent_map(Nodes_filename); + //if (! Nodes_filename.empty()) { + // cerr << "Building parent node map " << endl; + // Parent_map = build_parent_map(Nodes_filename); + //} + + if (!TaxDB_file.empty()) { + taxdb = TaxonomyDB(TaxDB_file); + for (const auto & tax : taxdb.taxIDsAndEntries) { + if (tax.first != 0) + Parent_map[tax.first] = tax.second.parentTaxonomyID; + } + Parent_map[1] = 0; + } else { + cerr << "TaxDB argument is required!" << endl; + return 1; } if (Populate_memory) @@ -102,7 +139,7 @@ int main(int argc, char **argv) { // TODO: Check DB_filenames and Index_filesnames have the same length for (size_t i=0; i < DB_filenames.size(); ++i) { - cerr << "\t " << DB_filenames[i] << endl; + //cerr << "\t " << DB_filenames[i] << endl; static QuickFile db_file; db_file.open_file(DB_filenames[i]); if (Populate_memory) @@ -128,46 +165,54 @@ int main(int argc, char **argv) { cerr << "\ncomplete." << endl; if (Print_classified) { - if (Classified_output_file == "-") - Classified_output = &cout; - else - Classified_output = new ofstream(Classified_output_file.c_str()); + Classified_output = cout_or_file(Classified_output_file); } if (Print_unclassified) { - if (Unclassified_output_file == "-") - Unclassified_output = &cout; - else - Unclassified_output = new ofstream(Unclassified_output_file.c_str()); + Unclassified_output = cout_or_file(Unclassified_output_file); } if (! Kraken_output_file.empty()) { - if (Kraken_output_file == "-") + if (Kraken_output_file == "off") Print_kraken = false; - else - Kraken_output = new ofstream(Kraken_output_file.c_str()); - } - else - Kraken_output = &cout; - - if (Report_output_file.empty() || Report_output_file == "-") { - Print_kraken_report = false; + else { + cerr << "Writing Kraken output to " << Kraken_output_file << endl; + Kraken_output = cout_or_file(Kraken_output_file); + } } else { - Report_output = new ofstream(Report_output_file.c_str()); + Kraken_output = &cout; } - if (!TaxDB_file.empty() && Print_kraken_report) { - taxdb.readTaxonomyIndex(TaxDB_file); - } else { - Print_kraken_report = false; + if (!Report_output_file.empty()) { + Print_kraken_report = true; + cerr << "Writing Kraken report output to " << Report_output_file << endl; + Report_output = cout_or_file(Report_output_file); } + cerr << "Print_kraken: " << Print_kraken << "; Print_kraken_report: " << Print_kraken_report << "; k: " << uint32_t(KrakenDatabases[0]->get_k()) << endl; + struct timeval tv1, tv2; gettimeofday(&tv1, NULL); for (int i = optind; i < argc; i++) process_file(argv[i]); gettimeofday(&tv2, NULL); + std::cerr << "Finishing up ..\n"; + + if (Print_kraken_report) { + taxdb.fillCounts(taxon_counts); + TaxReport rep = TaxReport(*Report_output, taxdb, false); + rep.printReport("kraken","blu"); + } + + for (ofstream* ofs : Open_fstreams) { + ofs->close(); + } + for (ogzstream* ogzs : Open_gzstreams) { + ogzs->close(); + } + + report_stats(tv1, tv2); return 0; @@ -198,7 +243,6 @@ void report_stats(struct timeval time1, struct timeval time2) { } void process_file(char *filename) { - cerr << "k: " << uint32_t(KrakenDatabases[0]->get_k()) << endl; string file_str(filename); DNASequenceReader *reader; DNASequence dna; @@ -246,19 +290,12 @@ void process_file(char *filename) { (*Unclassified_output) << unclassified_output_ss.str(); total_sequences += work_unit.size(); total_bases += total_nt; - if (Print_Progress) + if (Print_Progress && 
total_sequences % 100000 < work_unit.size()) cerr << "\rProcessed " << total_sequences << " sequences (" << total_bases << " bp) ..."; } } } // end parallel section - if (Print_kraken_report) { - // Fill TaxDB with counts - taxdb.fillCounts(taxon_counts); - TaxReport rep = TaxReport(*Report_output, taxdb, false); - rep.printReport("kraken","blu"); - } - delete reader; } @@ -275,7 +312,7 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss) { vector taxa; vector ambig_list; - map hit_counts; + unordered_map hit_counts; uint64_t *kmer_ptr; uint32_t taxon = 0; uint32_t hits = 0; // only maintained if in quick mode @@ -297,9 +334,13 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, if (taxon) break; } + #pragma omp critical + { + taxon_counts[taxon].kmers.add(*kmer_ptr); + ++taxon_counts[taxon].n_kmers; + } + if (taxon) { - taxon_counts[taxon].kmers.add(*kmer_ptr); - ++taxon_counts[taxon].n_kmers; hit_counts[taxon]++; if (Quick_mode && ++hits >= Minimum_hit_count) break; @@ -318,7 +359,9 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, if (call) #pragma omp atomic total_classified++; - ++(taxon_counts[call].n_reads); + + #pragma omp critical + ++(taxon_counts[call].n_reads); if (Print_unclassified || Print_classified) { ostringstream *oss_ptr = call ? &coss : &uoss; @@ -419,7 +462,7 @@ void parse_command_line(int argc, char **argv) { if (argc > 1 && strcmp(argv[1], "-h") == 0) usage(0); - while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:Ma:r:")) != -1) { + while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:Ma:r:s")) != -1) { switch (opt) { case 'd' : DB_filenames.push_back(optarg); @@ -470,6 +513,9 @@ void parse_command_line(int argc, char **argv) { case 'r' : Report_output_file = optarg; break; + case 's' : + Print_sequence = true; + break; case 'a' : TaxDB_file = optarg; break; @@ -524,7 +570,7 @@ void usage(int exit_code) { << " -f Input is in FASTQ format" << endl << " -c Only include classified reads in output" << endl << " -M Preload database files" << endl - << " -s Print sequence in Kraken output" << endl + << " -s Print read sequence in Kraken output" << endl << " -h Print this message" << endl << endl << "At least one FASTA or FASTQ file must be specified." << endl diff --git a/src/gzstream/.Makefile.swp b/src/gzstream/.Makefile.swp new file mode 100644 index 0000000000000000000000000000000000000000..f5e077d91d7cee7748a0835509de1d24b28f23d8 GIT binary patch literal 12288 zcmeHNJ8UDz86M|x$=PudpUb%TZ5`m#nOu^RFXy8m1AElt2vZV4QaQ=ZVYoXa2e;ha z%*=|Sj6;BwhJh3|V7N%<1bG2*nj}b?BzBP^RXUeOiU0{x1Syg--z>S*O1XotP~%`1 z`oPuB%s2o4{g0)f=~uti+@YJ!%M91o8GHAY=U#p5(wkR4{xM^Nz7o<*q|idy;YVG8 zjKW8a<)2ryr?~M()k>bZG8TppMHr2xr^~U7MIb{_(UZ!3#qW!-qLo(}o@k>4A6Itp zXdnZT=Pm9F1quZoM}eo=>PoF@XD+?)JUx45|8W)-a|#6t1quZU1quZU1quZU1quZU z1^!PIF!2-YJMiUmbAJxzpHH0ooc~&U6bcjy6bcjy6bcjy6bcjy6bcjy6bcjy6bcjy zd(MU|JmRFzxWzs5~u=Y;2&RQ>;vH2KnYj|{`xdyKLq+f6(|FL z`wC<40|&rm;1clnrx^Pq@E+g;9Jm5J1N`Ah#=Z}{4qO5Le3`L#fK6Zn`1h9?`z`Pr z;77olz!W$JmVtkLiLpNc-vthUXMv9{G4>(wGvKGdTYv=a0JP4@1 zST}o?KDhcu-@_Bu~}Y8dVwMr$Wz zq_ZjiOX+qVob!4iK*Y{W2UTupVGgWnw`bl^**oVrtjc-lH1hspEHf_k-)Wrkk51iv z6#vYjMTi#Mv*#JJ{q1piPrW-eq``7ea{`>>T%zmOuhX9MnnOJ#4Gm-{^+@=n6VDS` z50W65I(AZ;YQq!9jALhbf{pAQyX?!sfEs)4!{+X`(`w#c3^eh&q7)M7! 
z!yU5g#MKI5G!2)qR~s+{u{cGFh?(Q_fIKivuo0(sCRJf>dI+GSSRy>qP2#f*y40 zT^*|~P~$RVZYNcRXq~tsPqBH2vJc8Fhsr@#2e>_9S;^bcCPw!dL))8CtkdwNmvFm8 zgGf0pKEjus3VPCkKB+(Q#1)F<0T&yB#F>*@wR>$Fm> z)+)7XrTQ|}Zq%!{>MO6%)!g)1O<;PqzD_k~HJ!3$b6|1RX>ph~HSz+#->k!)ttbg$ zIUFcaGxVGi-MEE!RoZjfHq<5RCP-OYU9FbrcBGA+x#JR5S8BCdxwcxpNwnW}!PSzU z2$2Fjh>*JzX-2a|ocG=}k^>4OQ=*Boj7TP116xW6>P7=I;YxtzFiJ>|q9pKX#E;PV zgeOp2K5-OAJf&!mF5BMSrxrS_P_!)&3La29>7!aeVQqQX(85C7fyR*%ZZ0~p4d)-Zks0-t*7cd7L z{lw7SX7A44evjPUL%QpBI-opUOVx-lDvm^$F0gG~!Yg>KcxbF&An4h`o$E!o^(9VJ zOVDB_3f}HCiekGZ+NMM`OLPrXcA^EoOO(KXpRDbzIJHczd1y-3t7}-AR%R62xG$qq zO({Uus%)i|V-ZYEvDG#G+R~qzuxdc1nhT6T#VcwI>( lygkv5w{6<&kh + Copyright (C) + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + , 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! + + diff --git a/src/gzstream/Makefile b/src/gzstream/Makefile new file mode 100644 index 0000000..9884a9e --- /dev/null +++ b/src/gzstream/Makefile @@ -0,0 +1,88 @@ +# ============================================================================ +# gzstream, C++ iostream classes wrapping the zlib compression library. +# Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# ============================================================================ +# +# File : Makefile +# Revision : $Revision: 1.3 $ +# Revision_date : $Date: 2001/10/04 15:09:28 $ +# Author(s) : Deepak Bandyopadhyay, Lutz Kettner +# +# ============================================================================ + +# ---------------------------------------------------------------------------- +# adapt these settings to your need: +# add '-DGZSTREAM_NAMESPACE=name' to CPPFLAGS to place the classes +# in its own namespace. 
Note, this macro needs to be set while creating +# the library as well while compiling applications based on it. +# As an alternative, gzstream.C and gzstream.h can be edited. +# ---------------------------------------------------------------------------- + +# CXX = CC -n32 -LANG:std # for SGI Irix 6.5, MIPSpro CC version 7.30 +CXX = g++ # for Linux RedHat 6.1, g++ version 2.95.2 + +CPPFLAGS = -I. -O +LDFLAGS = -L. -lgzstream -lz +AR = ar cr + +# ---------------------------------------------------------------------------- +# plain simple rules to make and cleanup the library: +# make default; compiles the library +# make test; compiles and executes test. O.K. message marks success. +# make clean; removes temporary files +# make cleanall; removes temporary files, the library, and programs +# ---------------------------------------------------------------------------- + +default: libgzstream.a + +test: test_gzip test_gunzip + ./test_gzip COPYING.LIB gz.tmp.gz + gunzip gz.tmp.gz + diff COPYING.LIB gz.tmp + gzip gz.tmp + ./test_gunzip gz.tmp.gz gz.tmp + diff COPYING.LIB gz.tmp + rm gz.tmp.gz gz.tmp + # *** O.K. Test finished successfully. *** + +gzstream.o : gzstream.C gzstream.h + ${CXX} ${CPPFLAGS} -c -o gzstream.o gzstream.C + +test_gzip.o : test_gzip.C gzstream.h + ${CXX} ${CPPFLAGS} -c -o test_gzip.o test_gzip.C + +test_gunzip.o : test_gunzip.C gzstream.h + ${CXX} ${CPPFLAGS} -c -o test_gunzip.o test_gunzip.C + +libgzstream.a : gzstream.o + ${AR} libgzstream.a gzstream.o + +test_gzip : test_gzip.o libgzstream.a + ${CXX} -o test_gzip test_gzip.o ${LDFLAGS} + +test_gunzip : test_gunzip.o libgzstream.a + ${CXX} -o test_gunzip test_gunzip.o ${LDFLAGS} + +clean : + rm *.o + +cleanall : + rm *.o libgzstream.a test_gzip test_gunzip + +# ============================================================================ +# EOF + diff --git a/src/gzstream/README b/src/gzstream/README new file mode 100644 index 0000000..5fb78b2 --- /dev/null +++ b/src/gzstream/README @@ -0,0 +1,6 @@ + + gzstream + C++ iostream classes wrapping the zlib compression library. +=========================================================================== + + See index.html for documentation and installation instructions. diff --git a/src/gzstream/gzstream.C b/src/gzstream/gzstream.C new file mode 100644 index 0000000..8cb4590 --- /dev/null +++ b/src/gzstream/gzstream.C @@ -0,0 +1,165 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.C +// Revision : $Revision: 1.7 $ +// Revision_date : $Date: 2003/01/08 14:41:27 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#include +#include +#include // for memcpy + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See header file for user classes. +// ---------------------------------------------------------------------------- + +// -------------------------------------- +// class gzstreambuf: +// -------------------------------------- + +gzstreambuf* gzstreambuf::open( const char* name, int open_mode) { + if ( is_open()) + return (gzstreambuf*)0; + mode = open_mode; + // no append nor read/write mode + if ((mode & std::ios::ate) || (mode & std::ios::app) + || ((mode & std::ios::in) && (mode & std::ios::out))) + return (gzstreambuf*)0; + char fmode[10]; + char* fmodeptr = fmode; + if ( mode & std::ios::in) + *fmodeptr++ = 'r'; + else if ( mode & std::ios::out) + *fmodeptr++ = 'w'; + *fmodeptr++ = 'b'; + *fmodeptr = '\0'; + file = gzopen( name, fmode); + if (file == 0) + return (gzstreambuf*)0; + opened = 1; + return this; +} + +gzstreambuf * gzstreambuf::close() { + if ( is_open()) { + sync(); + opened = 0; + if ( gzclose( file) == Z_OK) + return this; + } + return (gzstreambuf*)0; +} + +int gzstreambuf::underflow() { // used for input buffer only + if ( gptr() && ( gptr() < egptr())) + return * reinterpret_cast( gptr()); + + if ( ! (mode & std::ios::in) || ! opened) + return EOF; + // Josuttis' implementation of inbuf + int n_putback = gptr() - eback(); + if ( n_putback > 4) + n_putback = 4; + memcpy( buffer + (4 - n_putback), gptr() - n_putback, n_putback); + + int num = gzread( file, buffer+4, bufferSize-4); + if (num <= 0) // ERROR or EOF + return EOF; + + // reset buffer pointers + setg( buffer + (4 - n_putback), // beginning of putback area + buffer + 4, // read position + buffer + 4 + num); // end of buffer + + // return next character + return * reinterpret_cast( gptr()); +} + +int gzstreambuf::flush_buffer() { + // Separate the writing of the buffer from overflow() and + // sync() operation. + int w = pptr() - pbase(); + if ( gzwrite( file, pbase(), w) != w) + return EOF; + pbump( -w); + return w; +} + +int gzstreambuf::overflow( int c) { // used for output buffer only + if ( ! ( mode & std::ios::out) || ! opened) + return EOF; + if (c != EOF) { + *pptr() = c; + pbump(1); + } + if ( flush_buffer() == EOF) + return EOF; + return c; +} + +int gzstreambuf::sync() { + // Changed to use flush_buffer() instead of overflow( EOF) + // which caused improper behavior with std::endl and flush(), + // bug reported by Vincent Ricard. 
+ if ( pptr() && pptr() > pbase()) { + if ( flush_buffer() == EOF) + return -1; + } + return 0; +} + +// -------------------------------------- +// class gzstreambase: +// -------------------------------------- + +gzstreambase::gzstreambase( const char* name, int mode) { + init( &buf); + open( name, mode); +} + +gzstreambase::~gzstreambase() { + buf.close(); +} + +void gzstreambase::open( const char* name, int open_mode) { + if ( ! buf.open( name, open_mode)) + clear( rdstate() | std::ios::badbit); +} + +void gzstreambase::close() { + if ( buf.is_open()) + if ( ! buf.close()) + clear( rdstate() | std::ios::badbit); +} + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +// ============================================================================ +// EOF // diff --git a/src/gzstream/gzstream.h b/src/gzstream/gzstream.h new file mode 100644 index 0000000..861653f --- /dev/null +++ b/src/gzstream/gzstream.h @@ -0,0 +1,121 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.h +// Revision : $Revision: 1.5 $ +// Revision_date : $Date: 2002/04/26 23:30:15 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#ifndef GZSTREAM_H +#define GZSTREAM_H 1 + +// standard C++ with new header file names and std:: namespace +#include +#include +#include + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See below for user classes. +// ---------------------------------------------------------------------------- + +class gzstreambuf : public std::streambuf { +private: + static const int bufferSize = 47+256; // size of data buff + // totals 512 bytes under g++ for igzstream at the end. 
+ + gzFile file; // file handle for compressed file + char buffer[bufferSize]; // data buffer + char opened; // open/close state of stream + int mode; // I/O mode + + int flush_buffer(); +public: + gzstreambuf() : opened(0) { + setp( buffer, buffer + (bufferSize-1)); + setg( buffer + 4, // beginning of putback area + buffer + 4, // read position + buffer + 4); // end position + // ASSERT: both input & output capabilities will not be used together + } + int is_open() { return opened; } + gzstreambuf* open( const char* name, int open_mode); + gzstreambuf* close(); + ~gzstreambuf() { close(); } + + virtual int overflow( int c = EOF); + virtual int underflow(); + virtual int sync(); +}; + +class gzstreambase : virtual public std::ios { +protected: + gzstreambuf buf; +public: + gzstreambase() { init(&buf); } + gzstreambase( const char* name, int open_mode); + ~gzstreambase(); + void open( const char* name, int open_mode); + void close(); + gzstreambuf* rdbuf() { return &buf; } +}; + +// ---------------------------------------------------------------------------- +// User classes. Use igzstream and ogzstream analogously to ifstream and +// ofstream respectively. They read and write files based on the gz* +// function interface of the zlib. Files are compatible with gzip compression. +// ---------------------------------------------------------------------------- + +class igzstream : public gzstreambase, public std::istream { +public: + igzstream() : std::istream( &buf) {} + igzstream( const char* name, int open_mode = std::ios::in) + : gzstreambase( name, open_mode), std::istream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::in) { + gzstreambase::open( name, open_mode); + } +}; + +class ogzstream : public gzstreambase, public std::ostream { +public: + ogzstream() : std::ostream( &buf) {} + ogzstream( const char* name, int mode = std::ios::out) + : gzstreambase( name, mode), std::ostream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::out) { + gzstreambase::open( name, open_mode); + } +}; + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +#endif // GZSTREAM_H +// ============================================================================ +// EOF // + diff --git a/src/gzstream/index.html b/src/gzstream/index.html new file mode 100644 index 0000000..8a9ef8e --- /dev/null +++ b/src/gzstream/index.html @@ -0,0 +1,145 @@ + +Gzstream Library Home Page + + + +

+
+Introduction
+
+Gzstream is a small C++ library, basically just a wrapper, that provides the
+functionality of the zlib C-library in a C++ iostream. It is freely available
+under the LGPL license.
+
+Gzstream has been written by Deepak Bandyopadhyay and Lutz Kettner at the
+Computational Geometry Group at UNC Chapel Hill.
+
+Supported Systems
+
+Gzstream requires a standard compliant C++ compiler (we use the new header
+file conventions and the new iostream in the std:: name space) and, of
+course, zlib. We used zlib 1.1.3 so far, but see the zlib home page for why
+you should upgrade to zlib 1.1.4. So, in theory, the provided sources could
+run on many platforms. However, we used only the following few platforms.
+
+  • PC Linux, RedHat 6.1, g++ version 2.95.2
+  • PC Linux, Debian, g++ version 2.95.2 and 3.1
+  • SGI Irix 6.5, MIPSpro CC version 7.30
+
+Installation
+
+Either compile gzstream.C by hand, place it in some library, and move
+gzstream.h into the include search path of your compiler. Or use the provided
+Makefile, adapt its variables, and follow the remarks in the Makefile. Two
+test programs are provided, test_gzip.C and test_gunzip.C. The Makefile
+contains a rule that performs a small test with these programs.
+
+Documentation
+
+The library provides two classes, igzstream and ogzstream, that can be used
+analogously to ifstream and ofstream respectively.
+
+The classes are by default in the global name space. This can be changed by
+setting the macro GZSTREAM_NAMESPACE to the desired name space, e.g., by
+setting the option -DGZSTREAM_NAMESPACE=gz in the Makefile. However, this
+needs to be consistent for both the library compilation and the application
+that uses the library.
+
+What's Missing
+
+  • Seek. The zlib library provides the necessary functionality, but we have
+    not realized that in the wrapper (yet? ;-).
+  • Both streams are based on the same streambuffer. So, they cannot be used
+    to derive an iogzstream class that would allow simultaneous reading and
+    writing to the same file.
+
+Download and Release Notes
+
+  • Gzstream library 1.5 (08 Apr 2003): gzstream.tgz
+    Fixed bug that did not set the state correctly on failure to open or
+    close a file. Fixed bug in the indexing of the write buffer that caused
+    the write buffer to shrink continuously and finally caused wrong results
+    when writing compressed files (only observed on some platforms).
+  • Gzstream library 1.4 (27 Apr 2002):
+    Fixed a bug that stopped stream output after calling flush() or using
+    std::endl.
+  • Gzstream library 1.3 (06 Nov 2001):
+    Fixed unsigned char -- signed char bug. Increased buffer size for better
+    performance.
+  • Gzstream library 1.2 (04 Oct 2001):
+    Initial release as gzstream, renamed from zipstream.
+  • Zipstream library 1.1 (09 Sep 2001):
+    Initial release.
+
+Acknowledgements
+
+Credits for finding bugs and improving this software go to:
+Vincent Ricard, Peter Milley, Peter J. Torelli, and Ares Lagae.
+
+Links
+ The Computational Geometry Group at UNC Chapel Hill, Jan. 08, 2003. +
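The GZSTREAM_NAMESPACE mechanism described in the Documentation section above
can be exercised as follows; a hedged sketch, assuming the library itself was
also compiled with -DGZSTREAM_NAMESPACE=gz:

    #define GZSTREAM_NAMESPACE gz  // must match the flag used to build libgzstream.a
    #include "gzstream.h"

    int main() {
        gz::ogzstream out("namespaced.txt.gz");  // illustrative file name
        out << "42\n";
        out.close();
        return 0;
    }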
+ + + diff --git a/src/gzstream/libgzstream.a b/src/gzstream/libgzstream.a new file mode 100644 index 0000000000000000000000000000000000000000..92861086535726f3d31314ecb9e4bfe8ba8724b8 GIT binary patch literal 14254 zcmdT~4{RJ&dY|>!#3AW=L%8B2Z8e4f4hhb#V;hHmU1#lhOqO6`CqyK<$$GtxSFyd8 zcN{xTOI-F67#X8TTwB!AJFcn(=+s_0wTQwAd@RIwRRuY@F}58rp~R->F^E$w%1YiW-~TMZ*B zs`%Y7+P1flz~O5(P5ZK@RekzJU)CeLW3eMMd3z$08XcG%+CDipn3)(F$xU?~)6~kA z+<0cJt9MsATT#`T9?9i1J~Exn0_@*cW~M^EGv+57C9v7^FqO{`D`+f(L1q83s5REy zV(l)k@pPB*ce=`hWTa^xoLKYF$Yg%lf;bIjCVU)j&mE)SRYs^Ke|#+M+a5PUiqlH0 z%Un-1mpY-mA{J1=x~IQS?TP-L3Z$<*f*9MLN~m9fG}%o}Id|;y5g&_2w;2Z9W8&V{ z27^7etz}!Y81sj(J=vAqSrrPwz;`L6-KUjG64kUwxRj5JY+T!<-70v04a#+JpnQ*0l-@uoOfA84Bc;DV>2E=5z}3;ex6gDwGv}rv=BZ0{Up)<&dG}O(-D7i*Tbh}D zZ3<1;2TkWgb918hZ>phGxLjTL;P=e*xlM5UL2`a@jcIOrFX{a21*p`_`J+j5?nM06 z%^+zy{~^Fr3y&H}CtYhgMuJleKPABNDUlR^&htgC~mQj@@9+ty=Dlx;iwB1GV23s_QHsD4vl~UtA>uw)prH zU{GY|a}c8cbp2UO)Cx>3&y!Abu~3qG4YNXSDPcNqn^IUZFNJ%Yx1{Q%^V0xf$suB$ zoV%e%pSkdF4?qt<+?~%{znmY7^gxm7mp*_oRQNeKj^oHM=W>narccbdlX2&w;CvPP zcl+CuYn=~Vt7UtwIoI77eHGwyJ%A$e zOY0#DNjZiIbfeI5u;Wm%V;{B9`K-tJq{Dg7oc}hCWiN1xg~L_or>31P3}fzWA&Uu& z;Bl&e>~twRTgpD+Wpz}*27PiK79%-H784Y)1Vt=ixa>HxD?3>Ms;pBD;Ov%!4!U^#P zwoUZStC)KoZzW7j53Mvm+%-zDn~QD~=TEe|UAsXH{&G+@jRyMia;P>^bDMT=4Om%` zRxcVW*1lEi0_|XFol!HXMQbc=Ka~F@T zxXWnl-o1B|(bCl3)M_+GW3AD)=50pfp3INQMP7EUs)QUGzpGQ&8MrfutDLhyuW{A%H*oZPfJ#gVG70RDRUkW-Ji3?A_P9s$H zVm;79aQX0>l6MQ#{|+(&Jr9@QRonQDQX6s2+d>0xEeZi)yWpKj(0aU!c0DB8h5QGp zZnfahG;GH>VfP|d)(%^`rWsK64DrPwuicOF%!X@=J2A7f2o|v4zDqnU$YX^g{eWW( z`U#(fyiW|Kq#?0>)1og1gYf@|@E_OiXGza_sR!aSU$rA#|0Irq)72f}#zIYJI9l~? zxDf#85MY22aS!mnPqOzwDr5T;%FgRM!p4&*d%C7094%aHr!(WW1^PxZ0c@Y620s>8pvISt{L7AIZ7Ve~kva}Q4!s9A)NeJn zDWz)V!>vrP?8A_ZaLLcFS3cKO!|IoxTCaYaH>Go%E>gyzU&=t|ADKJpZ2kjYm&4p(;)k^ zB+vUV<|A(ahD*~maB+H)3O6Vf(yrv)02vh%b&W2^1$ryO{=;=&#zU;NP`(u|v@f1= zkV^V9#jeCvTU($4bq2|!ZceIv8&rz+(;6yyrF|SToD?>V$CUG3xWg^IKp?gs_g&QX z<8vA9S1F4Bo!T$^4_E}L?fG3m{rvxw@*8{KIFs>i;u< z`T4(Vl`D89|MA;F$h+*`{b7>t_7FFtw*NhVajQ%1|K~_v`YY9_((Kk-2|G1UsC%-`Q(vH-DeZc9LQ1J7AnEW@18U4q7J=OmW z0ruY~`?ycYN%jBk0C{PskqblKP|9r}@8|zt`uLCgkgER&0_?|$aaOTsDCKyK_~HA%YJVoc{sJ+o{iT*C1MDBhg#{?;Cc-}MyQ=p8D8T-CV(cY(nHVxd>F)>F zKSTD<5FfVRfJ?Ri*JNM%N$gq_7keOD2HgVOfnAVHUnhBjz2p$ddjMsmoW%d0@OfQ(0w$I-BKz3vK*A6pViub`R*g=;z?d9scUO0K|pj(eG0g2 zx_Ba1is<6+gdZNM1Od^-AB@T>T@zoSWfka275M)ldoMu-_dTHGC+{cS+Z8BN1$Oa` zxRP|S()!`}N2QW<4Og~G(!Je)@;Y6^m7tPzF+%+Cn=3&;bTOjKtGYyyx4u-bYd4h& zET?<3nt_njA<}-NLyaHlzB3W^lx?kfqI$(!t6QaO%Ah9+-J3n03hfcdUdP7sOH?%l zso!XOGyp$EdcH&QPf_|Dq<(sy2ORCjDb!*g2(X0!{_g{R6PqtRP)7O@Tz>jL0=$8Z zpIIm)Rf9c0{#XFMBLII0a6f;B1YWZ6kZV>u5uoQWz%dTTAVtEsAmM&bgJy_4-g+tb zEx=3Ky=v`;0XVL|sQ))IuJ>aQPT&BYY>}V!sN^KMZ9*e=Oqv zD)Gn3)f0rjMY!0jg1V;(zfAZn@xMs;G)8O;vh)YA*-yzk@in)z$3Y|0#g~;{Y6&j0Uz2%lGZ;1deaK zF3IQhM!@~z@IU}QBlMKkZTUVtFYwa3jn8MK=K}QnZ2}*G$oou(P zYz{X7E04vr`kXQ;JEJq28BLELhm#j@fM>8mo>@9FgQs^YIJ}b~M`pwko(cx{0FQNK zdfFP#OyqN8sgbOG+&UHo&dO6g{rmdtSnEJ4pG{ldp`xBNfU)#&YQnN7Qdv9SlkVz5 zLw$DJNG_ckp));u$ELDlgVvr*eiHTq)u?&Lc=(YVI>j?M((4{qD%;ikoRdP}=62W1 zY!2KB^sgPNEnv*Zb4>r&M~sflOig6%%4mFHM~$u+7#J~`)I=AAtl8>;=)h&C$iuSm zvtvqLMct@Rm|J?lsOzFN zn9o_msj8UOX#2F7y6SGdI5fovtC%snV^Is_rc(piW3gt)=d)IN zB$dxw_VMwIl^q+(VdMIH{G0AOk-&0U@ofBMfgr@UHGB%vdyt)Xr zPL|GjuGG}_@d5a4GnC4XbcrL5E0t@T&W}%`i@Su6{pt@SO?%LN%@AK!4Uqq(`x|_D zkl{Bmd=CZP9Kfbfd=l}Bo z{NHB$w=@1%82@IbzsUIUT~q3Rm+^07{Qu1Ox&EIrey;x`hT|TVY?u6`#rSZ$Yw2AU z6K?lx4 z|2D?Y<9QF`=ke?b;NKI#{|Li*+%gR3{W8jM&i{K1zZu#g?%xq@JS;=k3ZfoVV-i4Bx@@TqGR*;q&(s1&>2L z5*}hgQrmkM;c9z#Fn&HxPB8vB=#%Yo7|z@K2Mp)!{U+1T=husjpUch9zs&}tY-L5mqpyegkzk$;gb5- 
z5|28r))9{Uw=jOWPOJUc!1y;aevCPi%8&b?NT{Fl-zQKP=W+g`{gnG1=nr2H<$js! zKlhXSFJq2+xc}5Fv4e?qa$lI^_d`KGU*!HfJ|0ZT#Oe{>#c-KLd^cQvdXka}QrVBl zFQ4^T&F%IeKqU1czwA@2;(D5h7ZZc>$lptOOyvFr;&RUp@hIMa5Z@zm?hT(oNQn1| zoO_E?9`U;nD#t;&Eb*P>TEyP#%+d$|Ejgh>(FdAjIVu#&SE}fK)wkfx5WdD@2ILQ_fk$4ZH#2 zSjJ0|9Agsy4pod$D)Bfv0Q9t*h;^)y@|e6pdE}=hP#`Z6j_V#RX#)8N!jT`gggnB1 zYb4~Cu@wN>m;T6o;njpA|7yYL-l7V>+~@67aJkQWSi$8!udU#6pI4swlm5v4+y#YS z?&r#Lev)7A=l%=vOaH$_ZKy^@AjvQHc?|`Z`?;G4mwM#-cu?V&`?eW{Uyie96@IyI L`-+0g_3*y|@rP<5 literal 0 HcmV?d00001 diff --git a/src/gzstream/logo.gif b/src/gzstream/logo.gif new file mode 100644 index 0000000000000000000000000000000000000000..e259089fbb097573bdc3bab9eccc75e548cf7251 GIT binary patch literal 1651 zcmc(e`&Z0)7{}l3jC|8rW;J1BOOZK!%QYqubK-O%w{_WMSe0qa7^NBs*~ZDCEjE{2 zHl>gOFG4Tw%OhPVV@tKAD-u&_wzdM^M3pT{k%k> zsZik)AVhEP=U)C!P_alxGV;{8)O_9@nx=%#X|xC1Ik4l zX~hCW3}#ShHK%|DkAPxOV#W(s2sog`n9}QI)oO&mvM`7_4J>R49&;^F)_@q`a4EG! zVqlE|%PmIojQYx=2{15l(E!*2l1(tNAOsA9bR%L}SQ9)jquCYe02o>z5P+hfo(4lZ z1S65Ghbj7BUPuzb(BSeR=HWpynpq2r`U?zQkmymXhKUB@gQ1X}h|~9E5Qbrb z)Lxx5OZWtY`$1i6qK%>Y1(hs2n2=zY``|hX9Ox4e-HLGnh$-aMS``|>Cn^mWg#aJm z8t|VDVq5M~4&neQ5QEp-!N!2d!KkoN3X>BdQih2)bLH8H#XwLX&v_fUhn0aifr+rP zk^@QQ0C}pQQltb?TC51W?mf>coRiIViN1|XLSSnCaQxmsZ({k{` z!5dKi|I6KiVEF}5W>`^U0!ngjCQ&2YBy}ZcYW5FzS(q76diGZ8sOgGlb*1O-q&dw` z853BRd-tetMS3mou)a@L!>-)msvhn54E9vDL{|dQgTKZFag@2_B+n5zH zl3jbv`Qyyj4*YJv8Yi3i39S`w{&MV1Q{*ci?_S=cd}_aA@SUt?_tid3e`X)7%l2=x8>-l~`3s zc4}CEM|H!v_)~j+wy8Xo^yk!3qxU{KEczpCJH^|2Rl%KQz)#A#R%SXGZ z!hC7%@wWBW+W12UrtIr|9_QXlIm>V9PP!y{+S}!(Jd8HJ3h}P$eJS%#F>Fj+^mSc$ znwR&i@gdX8ww>D$oRR$F3Hx`0U5~vuxaOOk5g98ZG8BGy@)q9K?O1g^XAjIY8P6T^b7OPUS1Z`_CnE~jyv#Tgo1e4K&2%C6 zxWc5lkY~z1R-jclUNjfyRY?oi-K=Wg(V~%_j47YnF*f4W;=qjw%Wh#H{+AWCwVF%%i?!{u9KPFZ;~;r^lM?AYt`F-@Ezu319F((xBU4VT z5zq;xMY=YJD8_EKzPsYLFv+yrLr#n-tR5aWCpvxEFQ+cu@vToNzq{(}=l6QAr>Wz0 zzNw#!?@#XMW!5j4KlEMA; +#include +#include +#include + +int main( int argc, char*argv[]) { + if ( argc != 3) { + std::cerr << "Usage: " << argv[0] <<" \n"; + return EXIT_FAILURE; + } + // check alternate way of opening file + igzstream in2; + in2.open( argv[1]); + if ( ! in2.good()) { + std::cerr << "ERROR: Opening file `" << argv[1] << "' failed.\n"; + return EXIT_FAILURE; + } + in2.close(); + if ( ! in2.good()) { + std::cerr << "ERROR: Closing file `" << argv[1] << "' failed.\n"; + return EXIT_FAILURE; + } + // now use the shorter way with the constructor to open the same file + igzstream in( argv[1]); + if ( ! in.good()) { + std::cerr << "ERROR: Opening file `" << argv[1] << "' failed.\n"; + return EXIT_FAILURE; + } + std::ofstream out( argv[2]); + if ( ! out.good()) { + std::cerr << "ERROR: Opening file `" << argv[2] << "' failed.\n"; + return EXIT_FAILURE; + } + char c; + while ( in.get(c)) + out << c; + in.close(); + out.close(); + if ( ! in.eof()) { + std::cerr << "ERROR: Reading file `" << argv[1] << "' failed.\n"; + return EXIT_FAILURE; + } + if ( ! out.good()) { + std::cerr << "ERROR: Writing file `" << argv[2] << "' failed.\n"; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} + +// ============================================================================ +// EOF diff --git a/src/gzstream/test_gzip.C b/src/gzstream/test_gzip.C new file mode 100644 index 0000000..0c691ae --- /dev/null +++ b/src/gzstream/test_gzip.C @@ -0,0 +1,78 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. 
+// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : test_gzip.C +// Revision : $Revision: 1.3 $ +// Revision_date : $Date: 2001/10/04 15:09:28 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Short test program reading a file, compressing it, and writing it. +// ============================================================================ + +#include +#include +#include +#include + +int main( int argc, char*argv[]) { + if ( argc != 3) { + std::cerr << "Usage: " << argv[0] <<" \n"; + return EXIT_FAILURE; + } + // check alternate way of opening file + ogzstream out2; + out2.open( argv[2]); + if ( ! out2.good()) { + std::cerr << "ERROR: Opening file `" << argv[2] << "' failed.\n"; + return EXIT_FAILURE; + } + out2.close(); + if ( ! out2.good()) { + std::cerr << "ERROR: Closing file `" << argv[2] << "' failed.\n"; + return EXIT_FAILURE; + } + // now use the shorter way with the constructor to open the same file + ogzstream out( argv[2]); + if ( ! out.good()) { + std::cerr << "ERROR: Opening file `" << argv[2] << "' failed.\n"; + return EXIT_FAILURE; + } + std::ifstream in( argv[1]); + if ( ! in.good()) { + std::cerr << "ERROR: Opening file `" << argv[1] << "' failed.\n"; + return EXIT_FAILURE; + } + char c; + while ( in.get(c)) + out << c; + in.close(); + out.close(); + if ( ! in.eof()) { + std::cerr << "ERROR: Reading file `" << argv[1] << "' failed.\n"; + return EXIT_FAILURE; + } + if ( ! 
out.good()) { + std::cerr << "ERROR: Writing file `" << argv[2] << "' failed.\n"; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} + +// ============================================================================ +// EOF diff --git a/src/gzstream/version b/src/gzstream/version new file mode 100644 index 0000000..511137d --- /dev/null +++ b/src/gzstream/version @@ -0,0 +1 @@ +1.5 (08 Jan 2003) diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index 0c424c4..48e54e9 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -23,9 +23,9 @@ using namespace std; namespace kraken { - // Build a node->parent map from NCBI Taxonomy nodes.dmp file - map build_parent_map(string filename) { - map pmap; + // Build a node->parent unordered_map from NCBI Taxonomy nodes.dmp file + unordered_map build_parent_map(string filename) { + unordered_map pmap; uint32_t node_id, parent_id; string line; ifstream ifs(filename.c_str()); @@ -47,7 +47,7 @@ namespace kraken { // Return lowest common ancestor of a and b // LCA(0,x) = LCA(x,0) = x // Default ancestor is 1 (root of tree) - uint32_t lca(map &parent_map, + uint32_t lca(unordered_map &parent_map, uint32_t a, uint32_t b) { if (a == 0 || b == 0) @@ -71,12 +71,12 @@ namespace kraken { // Tree resolution: take all hit taxa (plus ancestors), then // return leaf of highest weighted leaf-to-root path. - uint32_t resolve_tree(map &hit_counts, - map &parent_map) + uint32_t resolve_tree(unordered_map &hit_counts, + unordered_map &parent_map) { set max_taxa; uint32_t max_taxon = 0, max_score = 0; - map::iterator it = hit_counts.begin(); + unordered_map::iterator it = hit_counts.begin(); // Sum each taxon's LTR path while (it != hit_counts.end()) { diff --git a/src/krakenutil.hpp b/src/krakenutil.hpp index 196ee1d..30eb67d 100644 --- a/src/krakenutil.hpp +++ b/src/krakenutil.hpp @@ -21,19 +21,20 @@ #define KRAKENUTIL_HPP #include "kraken_headers.hpp" +#include namespace kraken { // Build a map of node to parent from an NCBI taxonomy nodes.dmp file - std::map build_parent_map(std::string filename); + std::unordered_map build_parent_map(std::string filename); // Return the lowest common ancestor of a and b, according to parent_map // NOTE: LCA(0,x) = LCA(x,0) = x - uint32_t lca(std::map &parent_map, + uint32_t lca(std::unordered_map &parent_map, uint32_t a, uint32_t b); // Resolve classification tree - uint32_t resolve_tree(std::map &hit_counts, - std::map &parent_map); + uint32_t resolve_tree(std::unordered_map &hit_counts, + std::unordered_map &parent_map); class KmerScanner { public: diff --git a/src/make_seqid_to_taxid_map.cpp b/src/make_seqid_to_taxid_map.cpp index 8b968aa..c8a30ed 100644 --- a/src/make_seqid_to_taxid_map.cpp +++ b/src/make_seqid_to_taxid_map.cpp @@ -50,12 +50,12 @@ int main(int argc, char **argv) { char *map_filename = argv[1]; char *list_filename = argv[2]; - char *nodes_filename; - char *names_filename; - if (argc == 5) { - nodes_filename = argv[3]; - names_filename = argv[4]; - } + //char *nodes_filename; + //char *names_filename; + //if (argc == 5) { + // nodes_filename = argv[3]; + // names_filename = argv[4]; + //} fill_request_map(list_filename); report_taxo_numbers(map_filename); diff --git a/src/report-cols.h b/src/report-cols.h index 7087a82..a34a755 100644 --- a/src/report-cols.h +++ b/src/report-cols.h @@ -19,6 +19,7 @@ enum class REPORTCOLS : uint8_t { GENOME_SIZE, NUM_READS, NUM_READS_CLADE, + NUM_KMERS, NUM_UNIQUE_KMERS, TOTAL_SCORE, TOTAL_HIT_LENGTH, diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index a0d601a..4e9d40d 
100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -22,6 +22,7 @@ #include "krakendb.hpp" #include "krakenutil.hpp" #include "seqreader.hpp" +#include #define SKIP_LEN 50000 @@ -45,8 +46,8 @@ bool Allow_extra_kmers = false; bool verbose = false; bool Operate_in_RAM = false; bool One_FASTA_file = false; -map Parent_map; -map ID_to_taxon_map; +unordered_map Parent_map; +unordered_map ID_to_taxon_map; KrakenDB Database; int main(int argc, char **argv) { diff --git a/src/taxdb.h b/src/taxdb.h index da11c96..0d93207 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -34,8 +34,8 @@ typedef uint32_t TaxId; struct ReadCounts { - uint32_t n_reads; - uint32_t n_kmers; + uint64_t n_reads = 0; + uint64_t n_kmers = 0; HyperLogLogPlusMinus kmers; // unique k-mer count per taxon }; @@ -83,10 +83,16 @@ class TaxonomyEntry { uint64_t genomeSize = 0; uint64_t genomeSizeOfChildren = 0; uint64_t numBelow = 0; - uint64_t numKmers; + uint64_t numKmers = 0; HyperLogLogPlusMinus kmers; }; +struct TaxonomyEntryPtr_comp { + bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { + return ((a->numReadsAligned+a->numReadsAlignedToChildren) > (b->numReadsAligned+b->numReadsAlignedToChildren)); + } +}; + class TaxonomyDB { public: TaxonomyDB(const std::string inFileName); @@ -116,12 +122,14 @@ class TaxonomyDB { void TaxonomyDB::createPointers() { for (auto& tax : taxIDsAndEntries) { + if (tax.second.parentTaxonomyID != tax.first) { auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); if (parentIt != taxIDsAndEntries.end()) { tax.second.parent = &(parentIt->second); parentIt->second.children.push_back(&tax.second); } } + } } TaxonomyDB::TaxonomyDB(const std::string inFileName) { log("Building taxonomy index"); @@ -209,13 +217,22 @@ void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { uint32_t taxonomyID, parentTaxonomyID; std::string scientificName, rank; - while (inFile >> taxonomyID >> parentTaxonomyID >> rank >> scientificName) { + std::string line; + while (!inFile.eof()) { + inFile >> taxonomyID >> parentTaxonomyID; + inFile.get(); // read tab + std::getline(inFile, scientificName, '\t'); + std::getline(inFile, rank, '\n'); TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); + //cerr << "inserting " << taxonomyID << ";" << parentTaxonomyID << ";" << rank << ";" << scientificName << endl; taxIDsAndEntries.insert({ taxonomyID, newEntry }); } + taxIDsAndEntries.insert({ + 0, {0, 0, "no rank", "unclassified" } + }); } uint32_t TaxonomyDB::getLowestCommonAncestor( @@ -385,18 +402,28 @@ bool TaxonomyDB::isSubSpecies(uint32_t taxonomyID) const { void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { for (auto& elem : taxon_counts) { + //cerr << "fill: "<< elem.first << endl; TaxonomyEntry* tax = &taxIDsAndEntries.at(elem.first); + //cerr << "fill done: "<< elem.first << endl; tax->numReadsAligned += elem.second.n_reads; tax->numKmers += elem.second.n_kmers; tax->kmers += elem.second.kmers; + //std::cerr << "adding " << elem.second.n_reads << " to " << tax->scientificName << ": "; + while (tax->parent != nullptr) { tax = tax->parent; + //std::cerr << " >> " << tax->scientificName; tax->numReadsAlignedToChildren += elem.second.n_reads; tax->numKmers += elem.second.n_kmers; tax->kmers += elem.second.kmers; } + //std::cerr << endl; } + + for (auto& tax : taxIDsAndEntries) { + std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); + } } @@ -418,7 +445,7 @@ class TaxReport { }; TaxReport::TaxReport(std::ostream& 
reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { - _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; + _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; } void TaxReport::printReport(std::string format, std::string rank) { @@ -426,9 +453,13 @@ void TaxReport::printReport(std::string format, std::string rank) { _taxdb.taxIDsAndEntries.at(0).numReadsAligned + _taxdb.taxIDsAndEntries.at(0).numReadsAlignedToChildren + _taxdb.taxIDsAndEntries.at(1).numReadsAligned + - _taxdb.taxIDsAndEntries.at(1).numReadsAlignedToChildren + - _taxdb.taxIDsAndEntries.at(-1).numReadsAligned + - _taxdb.taxIDsAndEntries.at(-1).numReadsAlignedToChildren; // -1 is a magic number in centrifuge for reads not matched to the taxonomy tree + _taxdb.taxIDsAndEntries.at(1).numReadsAlignedToChildren;// + + //_taxdb.taxIDsAndEntries.at(-1).numReadsAligned + + //_taxdb.taxIDsAndEntries.at(-1).numReadsAlignedToChildren; // -1 is a magic number in centrifuge for reads not matched to the taxonomy tree + if (_total_n_reads == 0) { + std::cerr << "total number of reads is zero - not creating a report!" << endl; + return; + } if (format == "kraken") { // A: print number of unidentified reads @@ -436,7 +467,7 @@ void TaxReport::printReport(std::string format, std::string rank) { // B: print normal results printReport(_taxdb.taxIDsAndEntries.at(1),0u); // C: Print Unclassified stuff - printReport(_taxdb.taxIDsAndEntries.at(-1),0u); + //printReport(_taxdb.taxIDsAndEntries.at(-1),0u); } else { // print stuff at a certain level .. //_uid_abundance; @@ -464,12 +495,13 @@ void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << 100*(tax.numReadsAligned + tax.numReadsAlignedToChildren)/_total_n_reads; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(tax.numReadsAligned + tax.numReadsAlignedToChildren)/_total_n_reads; break; //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (tax.numReadsAligned + tax.numReadsAlignedToChildren); break; case REPORTCOLS::NUM_READS: _reportOfb << tax.numReadsAligned; break; case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break; + case REPORTCOLS::NUM_KMERS: _reportOfb << tax.numKmers; break; //case REPORTCOLS::GENOME_SIZE: ; break; //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; //case REPORTCOLS::SUM_SCORE: ; break; From aa6b1794855939d530a705f6298898ecb040b392 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 20 Feb 2017 13:44:22 -0500 Subject: [PATCH 017/105] Don't use nodes file in classify --- scripts/build_kraken_db.sh | 2 +- scripts/kraken | 7 ------- src/classify.cpp | 13 ------------- src/taxdb.h | 8 ++++++-- 4 files changed, 7 insertions(+), 23 deletions(-) diff --git a/scripts/build_kraken_db.sh b/scripts/build_kraken_db.sh index 4f64c14..a464a73 100755 --- a/scripts/build_kraken_db.sh +++ b/scripts/build_kraken_db.sh @@ -169,7 +169,7 @@ else # echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" fi -if [ -e "taxDB" ] +if [ -s "taxDB" ] then echo "Skipping step 4.5, taxDB exists." else diff --git a/scripts/kraken b/scripts/kraken index 29cce0d..6f2e290 100755 --- a/scripts/kraken +++ b/scripts/kraken @@ -95,12 +95,6 @@ if ($@) { die "$PROG: $@"; } -my $taxonomy = $db_prefix[0]."/taxonomy/nodes.dmp"; -if ($quick) { - undef $taxonomy; # Skip loading nodes file, not needed in quick mode -} - - my @kdb_files = map { "$_/database.kdb" } @db_prefix; my @idx_files = map { "$_/database.idx" } @db_prefix; @@ -140,7 +134,6 @@ my @flags; push @flags, map { ("-d", $_) } @kdb_files; push @flags, map { ("-i", $_) } @idx_files; push @flags, "-t", $threads if $threads > 1; -push @flags, "-n", $taxonomy if defined $taxonomy; push @flags, "-q" if $quick; push @flags, "-m", $min_hits if $min_hits > 1; push @flags, "-f" if $fastq_input && ! $paired; # merger always outputs FASTA diff --git a/src/classify.cpp b/src/classify.cpp index 9f7933e..ef7d616 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -44,7 +44,6 @@ unordered_map taxon_counts; // stats per taxon int Num_threads = 1; vector DB_filenames; vector Index_filenames; -string Nodes_filename; bool Quick_mode = false; bool Fastq_input = false; bool Print_classified = false; @@ -117,10 +116,6 @@ int main(int argc, char **argv) { #endif parse_command_line(argc, argv); - //if (! Nodes_filename.empty()) { - // cerr << "Building parent node map " << endl; - // Parent_map = build_parent_map(Nodes_filename); - //} if (!TaxDB_file.empty()) { taxdb = TaxonomyDB(TaxDB_file); @@ -481,9 +476,6 @@ void parse_command_line(int argc, char **argv) { omp_set_num_threads(Num_threads); #endif break; - case 'n' : - Nodes_filename = optarg; - break; case 'q' : Quick_mode = true; break; @@ -542,10 +534,6 @@ void parse_command_line(int argc, char **argv) { cerr << "Missing mandatory option -i" << endl; usage(); } - if (Nodes_filename.empty() && ! 
Quick_mode) { - cerr << "Must specify one of -q or -n" << endl; - usage(); - } if (optind == argc) { cerr << "No sequence data files specified" << endl; } @@ -557,7 +545,6 @@ void usage(int exit_code) { << "Options: (*mandatory)" << endl << "* -d filename Kraken DB filename" << endl << "* -i filename Kraken DB index filename" << endl - << " -n filename NCBI Taxonomy nodes file" << endl << " -o filename Output file for Kraken output" << endl << " -r filename Output file for Kraken report output" << endl << " -a filename TaxDB" << endl diff --git a/src/taxdb.h b/src/taxdb.h index 0d93207..56bd341 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -402,8 +402,12 @@ bool TaxonomyDB::isSubSpecies(uint32_t taxonomyID) const { void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { for (auto& elem : taxon_counts) { - //cerr << "fill: "<< elem.first << endl; - TaxonomyEntry* tax = &taxIDsAndEntries.at(elem.first); + auto it = taxIDsAndEntries.find(elem.first); + if (it == taxIDsAndEntries.end()) { + cerr << "No taxonomy entry for " << elem.first << "!!" << endl; + continue; + } + TaxonomyEntry* tax = &it->second; //cerr << "fill done: "<< elem.first << endl; tax->numReadsAligned += elem.second.n_reads; tax->numKmers += elem.second.n_kmers; From 0587b5ec5aff3c325c9b3cc158e8127cf6525ed9 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 21 Feb 2017 10:04:55 -0500 Subject: [PATCH 018/105] Added generate-taxonomy-ids-for-sequences and typed TaxonomyDB --- scripts/build_kraken_db.sh | 11 +- scripts/kraken-build | 11 ++ src/build_taxdb.cpp | 2 +- src/classify.cpp | 7 +- src/set_lcas.cpp | 74 ++++++++-- src/taxdb.h | 277 +++++++++++++++++++++++-------------- 6 files changed, 255 insertions(+), 127 deletions(-) diff --git a/scripts/build_kraken_db.sh b/scripts/build_kraken_db.sh index a464a73..6d86bf7 100755 --- a/scripts/build_kraken_db.sh +++ b/scripts/build_kraken_db.sh @@ -57,7 +57,7 @@ else echo "Kraken build set to minimize RAM usage." fi -if [ -n "$KRAKEN_REBUILD_DATABASE" ] +if [ "$KRAKEN_REBUILD_DATABASE" == "1" ] then rm -f database.* *.map lca.complete fi @@ -174,7 +174,7 @@ then echo "Skipping step 4.5, taxDB exists." else echo "Creating taxDB (step 4.5 of 5)... " - build_taxdb taxonomy/nodes.dmp taxonomy/names.dmp > taxDB + build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp > taxDB fi @@ -184,10 +184,15 @@ then echo "Skipping step 5, LCAs already set." else echo "Setting LCAs in database (step 5 of 5)..." + PARAM="" + if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ]]; then + echo " Adding taxonomy IDs for sequences" + PARAM=" -a" + fi start_time1=$(date "+%s.%N") find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \ set_lcas $MEMFLAG -x -d database.kdb -i database.idx -v \ - -n taxonomy/nodes.dmp -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 + -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 touch "lca.complete" echo "Database LCAs set. 
[$(report_time_elapsed $start_time1)]" diff --git a/scripts/kraken-build b/scripts/kraken-build index bf36ae6..7170a67 100755 --- a/scripts/kraken-build +++ b/scripts/kraken-build @@ -63,6 +63,8 @@ my ( $standard, $upgrade, $clean, + + $add_taxonomy_ids_for_seq ); $threads = $DEF_THREAD_CT; @@ -71,6 +73,8 @@ $kmer_len = $DEF_KMER_LEN; $work_on_disk = ""; $hash_size = ""; $max_db_size = ""; +$add_taxonomy_ids_for_seq = 0; +$rebuild = 0; # variables corresponding to task options my @TASK_LIST = ( @@ -108,6 +112,8 @@ GetOptions( "upgrade" => \$upgrade, "standard" => \$standard, "clean" => \$clean, + + "generate-taxonomy-ids-for-sequences" => \$add_taxonomy_ids_for_seq ) or usage(); if (@ARGV) { @@ -235,6 +241,10 @@ Options: (default: 1) --work-on-disk Perform most operations on disk rather than in RAM (will slow down build in most cases) + --generate-taxonomy-ids-for-sequences + Generate taxonomy IDs for sequences, starting with 1000000000. + Can be useful to resolve classifications with multiple genomes + for one taxonomy ID. EOF exit $exit_code; } @@ -284,6 +294,7 @@ sub standard_installation { sub build_database { $ENV{"KRAKEN_REBUILD_DATABASE"} = $rebuild; + $ENV{"KRAKEN_ADD_TAXIDS_FOR_SEQ"} = $add_taxonomy_ids_for_seq; exec "build_kraken_db.sh"; } diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index 08e649a..8e1f11e 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -27,7 +27,7 @@ int main(int argc, char **argv) { std::cout << "Provide names.dmp and nodes.dmp\n"; return 1; } - TaxonomyDB taxdb; + TaxonomyDB taxdb; taxdb.writeTaxonomyIndex( std::cout, argv[1], argv[2]); diff --git a/src/classify.cpp b/src/classify.cpp index ef7d616..611497f 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -27,6 +27,7 @@ #include "gzstream.h" const size_t DEF_WORK_UNIT_SIZE = 500000; +int New_taxid_start = 1000000000; using namespace std; using namespace kraken; @@ -65,7 +66,7 @@ ostream *Report_output; vector Open_fstreams; vector Open_gzstreams; size_t Work_unit_size = DEF_WORK_UNIT_SIZE; -TaxonomyDB taxdb; +TaxonomyDB taxdb; uint64_t total_classified = 0; uint64_t total_sequences = 0; @@ -118,7 +119,7 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); if (!TaxDB_file.empty()) { - taxdb = TaxonomyDB(TaxDB_file); + taxdb = TaxonomyDB(TaxDB_file); for (const auto & tax : taxdb.taxIDsAndEntries) { if (tax.first != 0) Parent_map[tax.first] = tax.second.parentTaxonomyID; @@ -196,7 +197,7 @@ int main(int argc, char **argv) { if (Print_kraken_report) { taxdb.fillCounts(taxon_counts); - TaxReport rep = TaxReport(*Report_output, taxdb, false); + TaxReport rep = TaxReport(*Report_output, taxdb, false); rep.printReport("kraken","blu"); } diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 4e9d40d..73bbf67 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -22,6 +22,7 @@ #include "krakendb.hpp" #include "krakenutil.hpp" #include "seqreader.hpp" +#include "taxdb.h" #include #define SKIP_LEN 50000 @@ -37,18 +38,23 @@ void process_file(string filename, uint32_t taxid); void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish); int Num_threads = 1; -string DB_filename, Index_filename, Nodes_filename, +string DB_filename, Index_filename, TaxDB_filename, File_to_taxon_map_filename, ID_to_taxon_map_filename, Multi_fasta_filename; bool force_taxid = false; +int New_taxid_start = 1000000000; bool Allow_extra_kmers = false; bool verbose = false; bool Operate_in_RAM = false; bool One_FASTA_file = false; +bool Add_taxIds_for_Sequences = false; + unordered_map 
Parent_map;
 unordered_map<string, uint32_t> ID_to_taxon_map;
+unordered_map<string, uint32_t> SeqId_added;
 KrakenDB Database;
+TaxonomyDB<uint32_t> taxdb;
 
 int main(int argc, char **argv) {
   #ifdef _OPENMP
@@ -57,8 +63,16 @@ int main(int argc, char **argv) {
 
   parse_command_line(argc, argv);
 
-  if (!force_taxid) {
-    Parent_map = build_parent_map(Nodes_filename);
+  if (!TaxDB_filename.empty() && !force_taxid) {
+    taxdb = TaxonomyDB<uint32_t>(TaxDB_filename);
+    for (const auto & tax : taxdb.taxIDsAndEntries) {
+      if (tax.first != 0)
+        Parent_map[tax.first] = tax.second.parentTaxonomyID;
+    }
+    Parent_map[1] = 0;
+  } else {
+    cerr << "TaxDB argument is required!" << endl;
+    return 1;
   }
 
   QuickFile db_file(DB_filename, "rw");
@@ -96,25 +110,43 @@ int main(int argc, char **argv) {
     delete temp_ptr;
   }
 
+  if (Add_taxIds_for_Sequences && !TaxDB_filename.empty()) {
+    ofstream ofs(TaxDB_filename.c_str());
+    taxdb.writeTaxonomyIndex(ofs);
+    ofs.close();
+  }
+
   return 0;
 }
 
 void process_single_file() {
-  cerr << "Processing multiple FASTA files" << endl;
+  cerr << "Processing FASTA files" << endl;
   ifstream map_file(ID_to_taxon_map_filename.c_str());
   if (map_file.rdstate() & ifstream::failbit) {
     err(EX_NOINPUT, "can't open %s", ID_to_taxon_map_filename.c_str());
   }
-  string line;
+  string line, seq_id;
+  uint32_t parent_taxid, taxid;
   while (map_file.good()) {
     getline(map_file, line);
     if (line.empty())
       break;
-    string seq_id;
-    uint32_t taxid;
     istringstream iss(line);
     iss >> seq_id;
-    iss >> taxid;
+    if (ID_to_taxon_map.find(seq_id) != ID_to_taxon_map.end())
+      continue;
+
+    if (Add_taxIds_for_Sequences) {
+      iss >> parent_taxid;
+      taxid = ++New_taxid_start;
+      Parent_map[taxid] = parent_taxid;
+      auto itEntry = taxdb.taxIDsAndEntries.insert({taxid, TaxonomyEntry<uint32_t>(taxid, parent_taxid, "sequence")});
+      if (!itEntry.second)
+        cerr << "Taxonomy ID " << taxid << " already in Taxonomy DB? Shouldn't happen - run set_lcas without the -a option." << endl;
+    } else {
+      iss >> taxid;
+    }
     ID_to_taxon_map[seq_id] = taxid;
   }
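A minimal, self-contained sketch of the bookkeeping the hunk above performs: under `-a`, every sequence ID receives a fresh pseudo-taxid counted up from `New_taxid_start` (1000000000 in set_lcas.cpp), and the taxid read from the mapping file becomes its parent node. All names below other than `New_taxid_start`'s value are illustrative stand-ins, not the patch's own identifiers:

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>

int main() {
  uint32_t new_taxid_start = 1000000000;                  // mirrors New_taxid_start
  std::unordered_map<std::string, uint32_t> id_to_taxon;  // seq ID -> pseudo-taxid
  std::unordered_map<uint32_t, uint32_t> parent;          // pseudo-taxid -> parent taxid

  // (sequence ID, taxid from the seqid2taxid map file); example values only.
  const std::pair<std::string, uint32_t> mapping[] = {
      {"KC207814.1", 10376}, {"NC_000913.3", 562}};

  for (const auto &m : mapping) {
    uint32_t taxid = ++new_taxid_start;  // mirrors taxid = ++New_taxid_start
    parent[taxid] = m.second;            // mirrors Parent_map[taxid] = parent_taxid
    id_to_taxon[m.first] = taxid;        // mirrors ID_to_taxon_map[seq_id] = taxid
  }

  for (const auto &e : id_to_taxon)
    std::cout << e.first << " -> " << e.second
              << " (parent " << parent.at(e.second) << ")\n";
}
```

With this scheme every k-mer of a sequence maps to that sequence's own node, so reads can be resolved to an individual genome rather than only to its taxon.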
@@ -142,6 +174,15 @@ void process_single_file() {
     } else {
       taxid = ID_to_taxon_map[dna.id];
     }
+
+    if (Add_taxIds_for_Sequences) {
+      auto entryIt = taxdb.taxIDsAndEntries.find(taxid);
+      if (entryIt == taxdb.taxIDsAndEntries.end()) {
+        cerr << "Error! Didn't find " << taxid << " in TaxonomyDB!!" << endl;
+      } else {
+        entryIt->second.scientificName = dna.header_line;
+      }
+    }
 
     if (taxid) {
       #pragma omp parallel for schedule(dynamic)
@@ -155,6 +196,7 @@ void process_single_file() {
 
       ++seqs_no_taxid;
     }
+
     cerr << "\rProcessed " << seqs_processed << " sequences";
   }
   cerr << "\r                                                       ";
@@ -232,7 +274,7 @@ void parse_command_line(int argc, char **argv) {
   if (argc > 1 && strcmp(argv[1], "-h") == 0)
     usage(0);
 
-  while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTv")) != -1) {
+  while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTvb:a")) != -1) {
     switch (opt) {
       case 'f' :
         File_to_taxon_map_filename = optarg;
@@ -263,18 +305,21 @@ void parse_command_line(int argc, char **argv) {
       case 'T' :
         force_taxid = true;
         break;
-      case 'n' :
-        Nodes_filename = optarg;
-        break;
       case 'v' :
         verbose = true;
         break;
      case 'x' :
        Allow_extra_kmers = true;
        break;
+      case 'a' :
+        Add_taxIds_for_Sequences = true;
+        break;
+      case 'b' :
+        TaxDB_filename = optarg;
+        break;
      case 'M' :
        Operate_in_RAM = true;
        break;
+
      default:
        usage();
        break;
@@ -282,7 +327,7 @@ void parse_command_line(int argc, char **argv) {
   }
   if (DB_filename.empty() || Index_filename.empty() ||
-      Nodes_filename.empty())
+      TaxDB_filename.empty())
     usage();
   if (File_to_taxon_map_filename.empty() &&
     (Multi_fasta_filename.empty() || ID_to_taxon_map_filename.empty()))
     usage();
@@ -300,13 +345,14 @@ void usage(int exit_code) {
        << "Options: (*mandatory)" << endl
        << "* -d filename   Kraken DB filename" << endl
        << "* -i filename   Kraken DB index filename" << endl
-       << "* -n filename   NCBI Taxonomy nodes file" << endl
+       << "* -b filename   Taxonomy DB file" << endl
        << "  -t #          Number of threads" << endl
        << "  -M            Copy DB to RAM during operation" << endl
        << "  -x            K-mers not found in DB do not cause errors" << endl
        << "  -f filename   File to taxon map" << endl
        << "  -F filename   Multi-FASTA file with sequence data" << endl
        << "  -m filename   Sequence ID to taxon map" << endl
+       << "  -a            Add taxonomy IDs (starting with " << New_taxid_start << ") for sequences" << endl;
   exit(exit_code);
 }
diff --git a/src/taxdb.h b/src/taxdb.h
index 56bd341..b4d4093 100644
--- a/src/taxdb.h
+++ b/src/taxdb.h
@@ ... @@
-std::vector<std::string> tokenise(const std::string &line, const std::string& delimiters) {
-  std::vector<std::string> tokens;
-  // Skip delimiters at beginning.
-  std::string::size_type lastPos = line.find_first_not_of(delimiters, 0);
-  std::string::size_type pos = line.find_first_of(delimiters, lastPos);
-  while (std::string::npos != pos || std::string::npos != lastPos) {
-    tokens.push_back(line.substr(lastPos, pos - lastPos));
-    // Skip delimiters.
Note the "not_of" - lastPos = line.find_first_not_of(delimiters, pos); - pos = line.find_first_of(delimiters, lastPos); - } - return tokens; +template +uint64_t string_to_T(string str) { + stringstream stream(str); + T result; + stream >> result; + return result; +} + +std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields, size_t end_chars) { + std::vector tokens(max_fields); + size_t delim_length = delimiter.length(); + size_t last = 0; + size_t i = 0; + + for (size_t next = s.find(delimiter, last); + i < max_fields && next != string::npos; + next = s.find(delimiter, last), ++i) { + tokens[i] = s.substr(last, next-last); + last = next + delim_length; + } + if (i < max_fields) { + tokens[max_fields-1] = s.substr(last, s.length()-last-end_chars); + } + + return tokens; } +template class TaxonomyEntry { public: - uint32_t taxonomyID = 0; - uint32_t parentTaxonomyID = 0; + TAXID taxonomyID = 0; + TAXID parentTaxonomyID = 0; std::string rank; std::string scientificName; TaxonomyEntry() {} - TaxonomyEntry(uint32_t taxonomyID_, uint32_t parentTaxonomyID_, std::string rank_, std::string scientificName_) : + + TaxonomyEntry(TAXID taxonomyID_, std::string scientificName_) : + taxonomyID(taxonomyID_), scientificName(scientificName_) {} + + TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_) : + taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_) {} + + TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_) : taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_), scientificName(scientificName_) {} inline bool operator==(const TaxonomyEntry& other) const { @@ -87,40 +106,45 @@ class TaxonomyEntry { HyperLogLogPlusMinus kmers; }; + +template struct TaxonomyEntryPtr_comp { - bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { + bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { return ((a->numReadsAligned+a->numReadsAlignedToChildren) > (b->numReadsAligned+b->numReadsAlignedToChildren)); } }; +template class TaxonomyDB { public: TaxonomyDB(const std::string inFileName); TaxonomyDB() {}; - std::unordered_map taxIDsAndEntries; + //std::unordered_map seqIDsAndTaxIds; + std::unordered_map > taxIDsAndEntries; void parseNamesDump(const std::string namesDumpFileName); void parseNodesDump(const std::string nodesDumpFileName); - uint32_t getTaxIDAtRank(const uint32_t taxID, const std::string& rank) const; - std::string getScientificName(const uint32_t taxID) const; - std::string getRank(const uint32_t taxID) const; - uint32_t getLowestCommonAncestor(const std::vector& taxIDs) const; - uint32_t getParentTaxID(const uint32_t taxID) const; - std::string getLineage(uint32_t taxonomyID) const; - std::string getMetaPhlAnLineage(uint32_t taxonomyID) const; - char* getIndexFileName(const uint32_t hostTaxID) const; + TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const; + std::string getScientificName(const TAXID taxID) const; + std::string getRank(const TAXID taxID) const; + TAXID getLowestCommonAncestor(const std::vector& taxIDs) const; + TAXID getParentTaxID(const TAXID taxID) const; + std::string getLineage(TAXID taxonomyID) const; + std::string getMetaPhlAnLineage(TAXID taxonomyID) const; + char* getIndexFileName(const TAXID hostTaxID) const; void readTaxonomyIndex(const std::string inFileName); + void writeTaxonomyIndex(std::ostream & outs) const; void writeTaxonomyIndex(std::ostream & outs, const 
std::string namesDumpFileName, const std::string nodesDumpFileName); - bool isSubSpecies(uint32_t taxonomyID) const; - int isBelowInTree(uint32_t upper, uint32_t lower) const; - void fillCounts(const unordered_map& taxon_counts); + bool isSubSpecies(TAXID taxonomyID) const; + int isBelowInTree(TAXID upper, TAXID lower) const; + void fillCounts(const unordered_map& taxon_counts); void createPointers(); void printReport(); }; - -void TaxonomyDB::createPointers() { +template +void TaxonomyDB::createPointers() { for (auto& tax : taxIDsAndEntries) { if (tax.second.parentTaxonomyID != tax.first) { auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); @@ -131,7 +155,9 @@ void TaxonomyDB::createPointers() { } } } -TaxonomyDB::TaxonomyDB(const std::string inFileName) { + +template +TaxonomyDB::TaxonomyDB(const std::string inFileName) { log("Building taxonomy index"); readTaxonomyIndex(inFileName); createPointers(); @@ -139,82 +165,103 @@ TaxonomyDB::TaxonomyDB(const std::string inFileName) { " nodes"); } -void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { +template +void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { std::ifstream nodesDumpFile(nodesDumpFileName); if (!nodesDumpFile.is_open()) throw std::runtime_error("unable to open nodes file"); std::string line; + + TAXID taxonomyID; + TAXID parentTaxonomyID; + std::string rank; + while (nodesDumpFile.good()) { getline(nodesDumpFile, line); - std::vector tokens = tokenise(line, "\t|"); - if (tokens.size() > 2) { - TaxonomyEntry newEntry; - newEntry.taxonomyID = stoi(tokens[0]); - newEntry.parentTaxonomyID = stoi(tokens[1]); - newEntry.rank = tokens[2]; - auto entryIt = taxIDsAndEntries.insert({ - newEntry.taxonomyID, newEntry - }); - if (!entryIt.second) { - entryIt.first->second.taxonomyID = newEntry.taxonomyID; - newEntry.parentTaxonomyID = stoi(tokens[1]); - } + std::vector tokens = tokenise(line, "\t|\t", 3, 2); + if (tokens.size() < 3) { + continue; + } + + taxonomyID = string_to_T(tokens[0]); + parentTaxonomyID = string_to_T(tokens[1]); + rank = tokens[2]; + + auto entryIt = taxIDsAndEntries.find(taxonomyID); + if (entryIt == taxIDsAndEntries.end()) { + taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); + } else { + entryIt->second.parentTaxonomyID = parentTaxonomyID; + entryIt->second.rank = rank; } } } -void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { +template +void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { std::ifstream namesDumpFile(namesDumpFileName); if (!namesDumpFile.is_open()) throw std::runtime_error("unable to open names file"); std::string line; + + TAXID taxonomyID; + std::string scientificName; while (namesDumpFile.good()) { getline(namesDumpFile, line); - std::vector tokens = tokenise(line, "|"); - for (auto& token : tokens) { - if (token.size() > 1) { - if (token[0] == '\t') token.erase(0, 1); - if (token[token.size() - 1] == '\t') token.erase(token.size() - 1, 1); - } - } - if (tokens.size() > 3) { - TaxonomyEntry newEntry; - newEntry.taxonomyID = stoi(tokens[0]); - // for(auto & token : tokens) - // std::cout<second.scientificName = newEntry.scientificName; - } + std::vector tokens = tokenise(line, "\t|\t", 4, 2); + if (tokens.size() < 4 || tokens[3] != "scientific name") { + continue; + } + taxonomyID = string_to_T(tokens[0]); + scientificName = tokens[1]; + + auto entryIt = taxIDsAndEntries.find(taxonomyID); + if (entryIt == taxIDsAndEntries.end()) { + taxIDsAndEntries[taxonomyID] = 
TaxonomyEntry(taxonomyID, scientificName); + } else { + entryIt->second.scientificName = scientificName; } } } -void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, +template +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, const std::string namesDumpFileName, const std::string nodesDumpFileName) { parseNodesDump(nodesDumpFileName); parseNamesDump(namesDumpFileName); - for (auto& entry : taxIDsAndEntries) { - outs << entry.first << "\t" << entry.second.parentTaxonomyID << "\t" - << entry.second.scientificName << "\t" << entry.second.rank << "\n"; + writeTaxonomyIndex(outs); +} + +template +std::vector getSortedKeys(const std::unordered_map& unordered) { + std::vector keys; + keys.reserve (unordered.size()); + for (auto& it : unordered) { + keys.push_back(it.first); } + std::sort (keys.begin(), keys.end()); + return keys; } -void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { +template +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { + for (TAXID& key : getSortedKeys(taxIDsAndEntries)) { + const auto& entry = taxIDsAndEntries.at(key); + outs << key << "\t" << entry.parentTaxonomyID << "\t" + << entry.scientificName << "\t" << entry.rank << "\n"; + } +} + + + +template +void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { std::ifstream inFile(inFileName); if (!inFile.is_open()) throw std::runtime_error("unable to open taxonomy index file"); - uint32_t taxonomyID, parentTaxonomyID; + TAXID taxonomyID, parentTaxonomyID; std::string scientificName, rank; std::string line; @@ -223,7 +270,7 @@ void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { inFile.get(); // read tab std::getline(inFile, scientificName, '\t'); std::getline(inFile, rank, '\n'); - TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); + TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); //cerr << "inserting " << taxonomyID << ";" << parentTaxonomyID << ";" << rank << ";" << scientificName << endl; taxIDsAndEntries.insert({ @@ -235,16 +282,17 @@ void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { }); } -uint32_t TaxonomyDB::getLowestCommonAncestor( - const std::vector& taxIDs) const { +template +TAXID TaxonomyDB::getLowestCommonAncestor( + const std::vector& taxIDs) const { if (taxIDs.size() == 0) { return 0; } - std::vector > paths; + std::vector > paths; for (auto& taxID : taxIDs) { bool good = true; - std::vector path; - uint32_t tempTaxID = taxID; + std::vector path; + TAXID tempTaxID = taxID; while (tempTaxID != 0) { path.push_back(tempTaxID); tempTaxID = getParentTaxID(tempTaxID); @@ -257,12 +305,12 @@ uint32_t TaxonomyDB::getLowestCommonAncestor( for (auto& path : paths) std::reverse(path.begin(), path.end()); std::sort(paths.begin(), paths.end(), - [](std::vector i, std::vector j) { + [](std::vector i, std::vector j) { return i.size() < j.size(); }); - uint32_t consensus = 0; + TAXID consensus = 0; for (unsigned i = 0; i < paths[0].size(); i++) { - uint32_t temp = 0; + TAXID temp = 0; for (auto& path : paths) { if (temp == 0) temp = path[i]; @@ -275,7 +323,8 @@ uint32_t TaxonomyDB::getLowestCommonAncestor( return consensus; } -uint32_t TaxonomyDB::getParentTaxID(const uint32_t taxID) const { +template +TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { auto entry = taxIDsAndEntries.find(taxID); if (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) return entry->second.parentTaxonomyID; @@ -283,7 +332,8 @@ uint32_t TaxonomyDB::getParentTaxID(const uint32_t 
taxID) const { return 0; } -std::string TaxonomyDB::getScientificName(const uint32_t taxID) const { +template +std::string TaxonomyDB::getScientificName(const TAXID taxID) const { auto entry = taxIDsAndEntries.find(taxID); if (entry != taxIDsAndEntries.end()) { return entry->second.scientificName; @@ -291,7 +341,8 @@ std::string TaxonomyDB::getScientificName(const uint32_t taxID) const { return std::string(); } -std::string TaxonomyDB::getRank(const uint32_t taxID) const { +template +std::string TaxonomyDB::getRank(const TAXID taxID) const { auto entry = taxIDsAndEntries.find(taxID); if (entry != taxIDsAndEntries.end()) { return entry->second.rank; @@ -299,7 +350,8 @@ std::string TaxonomyDB::getRank(const uint32_t taxID) const { return std::string(); } -std::string TaxonomyDB::getLineage(uint32_t taxonomyID) const { +template +std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { std::string lineage; while (true) { // 131567 = Cellular organisms @@ -316,7 +368,9 @@ std::string TaxonomyDB::getLineage(uint32_t taxonomyID) const { } return lineage; } -std::string TaxonomyDB::getMetaPhlAnLineage(uint32_t taxonomyID) const { + +template +std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { std::string rank = getRank(taxonomyID); if (rank == "superphylum") return std::string(); std::string lineage; @@ -356,7 +410,8 @@ std::string TaxonomyDB::getMetaPhlAnLineage(uint32_t taxonomyID) const { return lineage; } -uint32_t TaxonomyDB::getTaxIDAtRank(const uint32_t taxID, +template +TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, const std::string& rank) const { auto entry = taxIDsAndEntries.find(taxID); while (entry != taxIDsAndEntries.end() && @@ -368,7 +423,9 @@ uint32_t TaxonomyDB::getTaxIDAtRank(const uint32_t taxID, } return 0; } -int TaxonomyDB::isBelowInTree(uint32_t upper, uint32_t lower) const { + +template +int TaxonomyDB::isBelowInTree(TAXID upper, TAXID lower) const { auto entry = taxIDsAndEntries.find(lower); unsigned level = 0; while (entry != taxIDsAndEntries.end() && @@ -382,7 +439,9 @@ int TaxonomyDB::isBelowInTree(uint32_t upper, uint32_t lower) const { } return -1; } -bool TaxonomyDB::isSubSpecies(uint32_t taxonomyID) const { + +template +bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { bool isSubSpecies = false; auto entry = taxIDsAndEntries.find(taxonomyID); int numLevels = 0; @@ -400,14 +459,15 @@ bool TaxonomyDB::isSubSpecies(uint32_t taxonomyID) const { return isSubSpecies; } -void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { +template +void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { for (auto& elem : taxon_counts) { auto it = taxIDsAndEntries.find(elem.first); if (it == taxIDsAndEntries.end()) { cerr << "No taxonomy entry for " << elem.first << "!!" 
<< endl; continue; } - TaxonomyEntry* tax = &it->second; + TaxonomyEntry* tax = &it->second; //cerr << "fill done: "<< elem.first << endl; tax->numReadsAligned += elem.second.n_reads; tax->numKmers += elem.second.n_kmers; @@ -426,33 +486,36 @@ void TaxonomyDB::fillCounts(const unordered_map& taxon_cou } for (auto& tax : taxIDsAndEntries) { - std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); + std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); } } +template class TaxReport { private: std::ostream& _reportOfb; - TaxonomyDB & _taxdb; + TaxonomyDB & _taxdb; std::vector _report_cols; uint64_t _total_n_reads; bool _show_zeros; - void printLine(TaxonomyEntry& tax, unsigned depth); + void printLine(TaxonomyEntry& tax, unsigned depth); public: - TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); + TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); void printReport(std::string format, std::string rank); - void printReport(TaxonomyEntry& tax, unsigned depth); + void printReport(TaxonomyEntry& tax, unsigned depth); }; -TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { +template +TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; } -void TaxReport::printReport(std::string format, std::string rank) { +template +void TaxReport::printReport(std::string format, std::string rank) { _total_n_reads = _taxdb.taxIDsAndEntries.at(0).numReadsAligned + _taxdb.taxIDsAndEntries.at(0).numReadsAlignedToChildren + @@ -480,7 +543,8 @@ void TaxReport::printReport(std::string format, std::string rank) { } } -void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { +template +void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { if (_show_zeros || (tax.numReadsAligned+tax.numReadsAlignedToChildren) > 0) { printLine(tax, depth); @@ -492,7 +556,8 @@ void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { } -void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { +template +void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { for (auto& col : _report_cols) { switch (col) { case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; From 06e7f7ebe6ac745a27361aa7c35830e6b5b37e21 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 21 Feb 2017 17:39:33 -0500 Subject: [PATCH 019/105] Use less critical pragma - better parallelization --- install_kraken.sh | 2 ++ src/classify.cpp | 46 +++++++++++++++++++++++++--------------------- src/taxdb.h | 10 ++++++++++ 3 files changed, 37 insertions(+), 21 deletions(-) diff --git a/install_kraken.sh b/install_kraken.sh index e7af3d7..803989c 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -62,3 +62,5 @@ for file in $KRAKEN_DIR/kraken* do [ -x "$file" ] && echo " $file" done + +exit 0 diff --git a/src/classify.cpp b/src/classify.cpp index 611497f..f5bf9d6 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -35,8 +35,9 @@ using namespace kraken; void parse_command_line(int argc, char **argv); void usage(int exit_code=EX_USAGE); void process_file(char *filename); -void 
classify_sequence(DNASequence &dna, ostringstream &koss, - ostringstream &coss, ostringstream &uoss); +bool classify_sequence(DNASequence &dna, ostringstream &koss, + ostringstream &coss, ostringstream &uoss, + unordered_map&); string hitlist_string(vector &taxa, vector &ambig); set get_ancestry(uint32_t taxon); void report_stats(struct timeval time1, struct timeval time2); @@ -269,15 +270,24 @@ void process_file(char *filename) { if (total_nt == 0) break; + unordered_map my_taxon_counts; + uint64_t my_total_classified = 0; kraken_output_ss.str(""); classified_output_ss.str(""); unclassified_output_ss.str(""); for (size_t j = 0; j < work_unit.size(); j++) - classify_sequence( work_unit[j], kraken_output_ss, - classified_output_ss, unclassified_output_ss ); + my_total_classified += + classify_sequence( work_unit[j], kraken_output_ss, + classified_output_ss, unclassified_output_ss, + my_taxon_counts); #pragma omp critical(write_output) { + total_classified += my_total_classified; + for (auto &it : my_taxon_counts) { + taxon_counts[it.first] += it.second; + } + if (Print_kraken) (*Kraken_output) << kraken_output_ss.str(); if (Print_classified) @@ -286,8 +296,9 @@ void process_file(char *filename) { (*Unclassified_output) << unclassified_output_ss.str(); total_sequences += work_unit.size(); total_bases += total_nt; + //if (Print_Progress && total_sequences % 100000 < work_unit.size()) if (Print_Progress && total_sequences % 100000 < work_unit.size()) - cerr << "\rProcessed " << total_sequences << " sequences (" << total_bases << " bp) ..."; + cerr << "\rProcessed " << total_sequences << " sequences (" << total_classified << " classified) ..."; } } } // end parallel section @@ -304,8 +315,9 @@ uint32_t get_taxon_for_kmer(KrakenDB& database, uint64_t* kmer_ptr, uint64_t& cu return taxon; } -void classify_sequence(DNASequence &dna, ostringstream &koss, - ostringstream &coss, ostringstream &uoss) { +bool classify_sequence(DNASequence &dna, ostringstream &koss, + ostringstream &coss, ostringstream &uoss, + unordered_map& my_taxon_counts) { vector taxa; vector ambig_list; unordered_map hit_counts; @@ -330,11 +342,7 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, if (taxon) break; } - #pragma omp critical - { - taxon_counts[taxon].kmers.add(*kmer_ptr); - ++taxon_counts[taxon].n_kmers; - } + my_taxon_counts[taxon].add_kmer(*kmer_ptr); if (taxon) { hit_counts[taxon]++; @@ -352,12 +360,7 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, else call = resolve_tree(hit_counts, Parent_map); - if (call) - #pragma omp atomic - total_classified++; - - #pragma omp critical - ++(taxon_counts[call].n_reads); + ++(my_taxon_counts[call].n_reads); if (Print_unclassified || Print_classified) { ostringstream *oss_ptr = call ? &coss : &uoss; @@ -377,14 +380,14 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, } if (! 
Print_kraken)
-    return;
+    return call;
 
   if (call) {
     koss << "C\t";
   }
   else {
     if (Only_classified_kraken_output)
-      return;
+      return false;
     koss << "U\t";
   }
   koss << dna.id << "\t" << call << "\t" << dna.seq.size() << "\t";
@@ -402,7 +405,8 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss,
   if (Print_sequence)
     koss << "\t" << dna.seq;
 
-  koss << endl;
+  koss << "\n";
+  return call;
 }
 
 string hitlist_string(vector<uint32_t> &taxa, vector<uint8_t> &ambig)
diff --git a/src/taxdb.h b/src/taxdb.h
index b4d4093..5fc53c9 100644
--- a/src/taxdb.h
+++ b/src/taxdb.h
@@ -35,6 +35,16 @@ struct ReadCounts {
   uint64_t n_reads = 0;
   uint64_t n_kmers = 0;
   HyperLogLogPlusMinus kmers; // unique k-mer count per taxon
+  void add_kmer(uint64_t kmer) {
+    ++ n_kmers;
+    kmers.add(kmer);
+  }
+  ReadCounts& operator+=(const ReadCounts& b) {
+    n_reads += b.n_reads;
+    n_kmers += b.n_kmers;
+    kmers += b.kmers;
+    return *this;
+  }
 };
 

From ff6394483f00d44b8ea82b6560e2dd16f4ea4f4b Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Wed, 22 Feb 2017 12:59:24 -0500
Subject: [PATCH 020/105] Fix for using multiple databases for search

---
 scripts/kraken-build |  9 ++++++---
 src/classify.cpp     | 47 +++++++++++++++++++++++++------------------
 src/set_lcas.cpp     |  1 +
 src/taxdb.h          |  2 +-
 4 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/scripts/kraken-build b/scripts/kraken-build
index 7170a67..1ddea52 100755
--- a/scripts/kraken-build
+++ b/scripts/kraken-build
@@ -67,6 +67,8 @@ my (
   $add_taxonomy_ids_for_seq
 );
 
+my $verbose = 0;
+
 $threads = $DEF_THREAD_CT;
 $minimizer_len = $DEF_MINIMIZER_LEN;
 $kmer_len = $DEF_KMER_LEN;
@@ -74,7 +76,6 @@
 $work_on_disk = "";
 $hash_size = "";
 $max_db_size = "";
 $add_taxonomy_ids_for_seq = 0;
-$rebuild = 0;
 
 # variables corresponding to task options
 my @TASK_LIST = (
@@ -112,6 +113,7 @@ GetOptions(
   "upgrade" => \$upgrade,
   "standard" => \$standard,
   "clean" => \$clean,
+  "verbose" => \$verbose,
 
   "generate-taxonomy-ids-for-sequences" => \$add_taxonomy_ids_for_seq
 ) or usage();
@@ -293,9 +295,10 @@ sub standard_installation {
 }
 
 sub build_database {
-  $ENV{"KRAKEN_REBUILD_DATABASE"} = $rebuild;
+  $ENV{"KRAKEN_REBUILD_DATABASE"} = (defined $rebuild? 1 : 0);
   $ENV{"KRAKEN_ADD_TAXIDS_FOR_SEQ"} = $add_taxonomy_ids_for_seq;
-  exec "build_kraken_db.sh";
+  my $opt = ($verbose? "-x" : "");
+  exec "build_kraken_db.sh $opt";
 }
 
 sub clean_database {
diff --git a/src/classify.cpp b/src/classify.cpp
index f5bf9d6..891f30b 100644
--- a/src/classify.cpp
+++ b/src/classify.cpp
@@ -51,14 +51,13 @@ bool Fastq_input = false;
 bool Print_classified = false;
 bool Print_unclassified = false;
 bool Print_kraken = true;
-bool Print_kraken_report = true;
+bool Print_kraken_report = false;
 bool Populate_memory = false;
 bool Only_classified_kraken_output = false;
 bool Print_sequence = false;
 bool Print_Progress = true;
 uint32_t Minimum_hit_count = 1;
 unordered_map<uint32_t, uint32_t> Parent_map;
-vector<KrakenDB*> KrakenDatabases;
 string Classified_output_file, Unclassified_output_file, Kraken_output_file,
   Report_output_file, TaxDB_file;
 ostream *Classified_output;
 ostream *Unclassified_output;
@@ -68,6 +67,7 @@ vector<ofstream*> Open_fstreams;
 vector<ogzstream*> Open_gzstreams;
 size_t Work_unit_size = DEF_WORK_UNIT_SIZE;
 TaxonomyDB<uint32_t> taxdb;
+static vector<KrakenDB*> KrakenDatabases (DB_filenames.size());
 
 uint64_t total_classified = 0;
 uint64_t total_sequences = 0;
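A minimal sketch of the lookup chain this patch enables, assuming a toy map in place of the real `KrakenDB`/`get_taxon_for_kmer` machinery: each database gets its own cursor (modelled on the `db_status` struct introduced below), because a cached bin-key/position range from one database's index is meaningless in another's, and the first database that knows a k-mer decides its taxon:

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

// Per-database cursor, mirroring the db_status struct added in this patch.
struct DBStatus {
  uint64_t current_bin_key = 0;
  int64_t current_min_pos = 1;  // min > max marks the cached range as empty
  int64_t current_max_pos = 0;
};

using FakeDB = std::unordered_map<uint64_t, uint32_t>;  // kmer -> taxid stand-in

// Stand-in for get_taxon_for_kmer(): the real code binary-searches the
// database's sorted k-mer bin, reusing the cursor's cached range when it can.
uint32_t lookup(const FakeDB &db, uint64_t kmer, DBStatus &) {
  auto it = db.find(kmer);
  return it == db.end() ? 0 : it->second;
}

int main() {
  std::vector<FakeDB> dbs = {{{42, 9606}},            // "host" database first
                             {{42, 562}, {7, 562}}};  // then a "bacterial" one
  std::vector<DBStatus> statuses(dbs.size());         // one cursor per database
  for (uint64_t kmer : {42, 7, 13}) {
    uint32_t taxon = 0;
    for (size_t i = 0; i < dbs.size() && taxon == 0; ++i)
      taxon = lookup(dbs[i], kmer, statuses[i]);      // first database wins
    std::cout << "kmer " << kmer << " -> taxon " << taxon << "\n";
  }
}
```

Database order thus encodes priority: k-mer 42 above resolves to the host taxon even though the bacterial database also contains it.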
" << endl; + static vector idx_files (DB_filenames.size()); + static vector db_files (DB_filenames.size()); + static vector db_indices (DB_filenames.size()); + + // TODO: Check DB_filenames and Index_filesnames have the same length for (size_t i=0; i < DB_filenames.size(); ++i) { - //cerr << "\t " << DB_filenames[i] << endl; - static QuickFile db_file; - db_file.open_file(DB_filenames[i]); + cerr << " Database " << DB_filenames[i] << endl; + db_files[i].open_file(DB_filenames[i]); if (Populate_memory) - db_file.load_file(); - static KrakenDB Database = KrakenDB(db_file.ptr()); - KmerScanner::set_k(Database.get_k()); - - static QuickFile idx_file; - idx_file.open_file(Index_filenames[i]); + db_files[i].load_file(); + + KrakenDatabases.push_back(new KrakenDB(db_files[i].ptr())); + idx_files[i].open_file(Index_filenames[i]); if (Populate_memory) - idx_file.load_file(); - static KrakenDBIndex db_index(idx_file.ptr()); - Database.set_index(&db_index); - - - KrakenDatabases.push_back(&Database); + idx_files[i].load_file(); + db_indices[i] = KrakenDBIndex(idx_files[i].ptr()); + KrakenDatabases[i]->set_index(&db_indices[i]); } // TODO: Check all databases have the same k @@ -325,7 +324,14 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, uint32_t taxon = 0; uint32_t hits = 0; // only maintained if in quick mode - uint64_t current_bin_key; int64_t current_min_pos = 1; int64_t current_max_pos = 0; + + struct db_status { + uint64_t current_bin_key; + int64_t current_min_pos = 1; + int64_t current_max_pos = 0; + }; + + vector db_statuses(KrakenDatabases.size()); if (dna.seq.size() >= KrakenDatabases[0]->get_k()) { KmerScanner scanner(dna.seq); @@ -337,8 +343,9 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, else { ambig_list.push_back(0); - for (auto& db : KrakenDatabases) { - taxon = get_taxon_for_kmer(*db, kmer_ptr, current_bin_key, current_min_pos, current_max_pos); + for (size_t i=0; i Date: Wed, 22 Feb 2017 14:09:51 -0500 Subject: [PATCH 021/105] Update README.md --- README.md | 49 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index c414dfe..988deb5 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,45 @@ -Kraken taxonomic sequence classification system +Kraken taxonomic sequence classification system with Unique K-mer Counting =============================================== -Please see the [Kraken webpage] or the [Kraken manual] -for information on installing and operating Kraken. -A local copy of the [Kraken manual] is also present here -in the `docs/` directory (`MANUAL.html` and `MANUAL.markdown`). +[Kraken](https://github.com/DerrickWood/kraken) is a fast taxonomic classifier for metagenomics data. This project, kraken-hll, adds some additional functionality - most notably a unique k-mer count. Spurious identifications due to sequence contamination in the dataset or database often leads to many reads, however they usually cover only a small portion of the genome. -[Kraken webpage]: http://ccb.jhu.edu/software/kraken/ -[Kraken manual]: http://ccb.jhu.edu/software/kraken/MANUAL.html +kraken-hll adds two additional columns to the Kraken report - total number of k-mers observed for taxon, and the total number of unique k-mers observed for taxon (columns 3 and 4, resp.). + +Here's a small example of a classification against a viral database with k=25. 
+Here's a small example of a classification against a viral database with k=25. There are three species identified by just one read - Enterobacteria phage BP-4795, Salmonella phage SEN22, and Sulfolobus monocaudavirus SMV1. Of those, the identification of Salmonella phage SEN22 is the strongest, as its read matched 116 k-mers that are unique to the sequence, while the match to Sulfolobus monocaudavirus SMV1 is based on only a single 25-mer.
+
+```
+99.0958 2192 2192 255510 272869 no rank 0 unclassified
+0.904159 20 0 2361 2318 no rank 1 root
+0.904159 20 0 2361 2318 superkingdom 10239 Viruses
+0.904159 20 0 2361 2318 no rank 35237 dsDNA viruses, no RNA stage
+0.768535 17 0 2074 2063 order 548681 Herpesvirales
+0.768535 17 0 2074 2063 family 10292 Herpesviridae
+0.768535 17 0 2074 2063 subfamily 10374 Gammaherpesvirinae
+0.768535 17 0 2074 2063 genus 10375 Lymphocryptovirus
+0.768535 17 16 2001 1987 species 10376 Human gammaherpesvirus 4
+0.045208 1 1 4 4 sequence 1000041143 KC207814.1 Human herpesvirus 4 strain Mutu, complete genome
+0.0904159 2 0 254 254 order 28883 Caudovirales
+0.045208 1 0 28 28 family 10699 Siphoviridae
+0.045208 1 0 28 28 genus 186765 Lambdavirus
+0.045208 1 0 28 28 no rank 335795 unclassified Lambda-like viruses
+0.045208 1 1 28 28 species 196242 Enterobacteria phage BP-4795
+0.045208 1 0 116 116 family 10744 Podoviridae
+0.045208 1 0 116 116 no rank 196895 unclassified Podoviridae
+0.045208 1 0 116 116 no rank 1758253 Escherichia phage phi191 sensu lato
+0.045208 1 1 116 116 species 1647458 Salmonella phage SEN22
+0.045208 1 0 1 1 no rank 51368 unclassified dsDNA viruses
+0.045208 1 1 1 1 species 1351702 Sulfolobus monocaudavirus SMV1
+```
+
+## Usage
+
+For usage, see `kraken_hll --help`. Note that you can use the same database as Kraken, with one difference - instead of the files `DB_DIR/taxonomy/nodes.dmp` and `DB_DIR/taxonomy/names.dmp` that kraken relies upon, `kraken-hll` needs the file `DB_DIR/taxDB`. This can be generated with the `build_taxdb` program: `KRAKEN_DIR/build_taxdb DB_DIR/taxonomy/names.dmp DB_DIR/taxonomy/nodes.dmp > DB_DIR/taxDB`. The code behind the taxDB is based on [k-SLAM](https://github.com/aindj/k-SLAM).
+
+### Differences to `kraken`
+ - Use `kraken_hll --report-file FILENAME ...` to write the kraken report to `FILENAME`.
+ - Use `kraken_hll --db DB1 --db DB2 --db DB3 ...` to attempt, for each k-mer, an assignment based on DB1 first, then DB2, then DB3. You can use this to prefer identifications based on DB1 (e.g. human and contaminant sequences), then DB2 (e.g. completed bacterial genomes), then DB3, etc. Note that this option is incompatible with `kraken_hll-build --generate-taxonomy-ids-for-sequences`, since the taxDB has to be exactly the same across all databases.
+ - Add the suffix `.gz` to output file names to generate gzipped output files.
+
+### Differences to `kraken-build`
+ - Use `kraken_hll-build --generate-taxonomy-ids-for-sequences ...` to add pseudo-taxonomy IDs for each sequence header. An example of the result is in the output above - one read has been assigned specifically to `KC207814.1 Human herpesvirus 4 strain Mutu, complete genome`.
+ - `seqid2taxid.map` mapping sequence IDs to taxonomy IDs does NOT parse or require `>gi|`, but rather the sequence ID is the header up to just before the first space From 8f4b1841bd23a75d0726b95f8f2847f3d03f3b63 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 11 Apr 2017 22:58:45 -0400 Subject: [PATCH 022/105] Do not need taxonomy nodes anymore --- scripts/kraken | 7 ------- src/classify.cpp | 13 ------------- 2 files changed, 20 deletions(-) diff --git a/scripts/kraken b/scripts/kraken index 29cce0d..6f2e290 100755 --- a/scripts/kraken +++ b/scripts/kraken @@ -95,12 +95,6 @@ if ($@) { die "$PROG: $@"; } -my $taxonomy = $db_prefix[0]."/taxonomy/nodes.dmp"; -if ($quick) { - undef $taxonomy; # Skip loading nodes file, not needed in quick mode -} - - my @kdb_files = map { "$_/database.kdb" } @db_prefix; my @idx_files = map { "$_/database.idx" } @db_prefix; @@ -140,7 +134,6 @@ my @flags; push @flags, map { ("-d", $_) } @kdb_files; push @flags, map { ("-i", $_) } @idx_files; push @flags, "-t", $threads if $threads > 1; -push @flags, "-n", $taxonomy if defined $taxonomy; push @flags, "-q" if $quick; push @flags, "-m", $min_hits if $min_hits > 1; push @flags, "-f" if $fastq_input && ! $paired; # merger always outputs FASTA diff --git a/src/classify.cpp b/src/classify.cpp index 9f7933e..ef7d616 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -44,7 +44,6 @@ unordered_map taxon_counts; // stats per taxon int Num_threads = 1; vector DB_filenames; vector Index_filenames; -string Nodes_filename; bool Quick_mode = false; bool Fastq_input = false; bool Print_classified = false; @@ -117,10 +116,6 @@ int main(int argc, char **argv) { #endif parse_command_line(argc, argv); - //if (! Nodes_filename.empty()) { - // cerr << "Building parent node map " << endl; - // Parent_map = build_parent_map(Nodes_filename); - //} if (!TaxDB_file.empty()) { taxdb = TaxonomyDB(TaxDB_file); @@ -481,9 +476,6 @@ void parse_command_line(int argc, char **argv) { omp_set_num_threads(Num_threads); #endif break; - case 'n' : - Nodes_filename = optarg; - break; case 'q' : Quick_mode = true; break; @@ -542,10 +534,6 @@ void parse_command_line(int argc, char **argv) { cerr << "Missing mandatory option -i" << endl; usage(); } - if (Nodes_filename.empty() && ! 
Quick_mode) { - cerr << "Must specify one of -q or -n" << endl; - usage(); - } if (optind == argc) { cerr << "No sequence data files specified" << endl; } @@ -557,7 +545,6 @@ void usage(int exit_code) { << "Options: (*mandatory)" << endl << "* -d filename Kraken DB filename" << endl << "* -i filename Kraken DB index filename" << endl - << " -n filename NCBI Taxonomy nodes file" << endl << " -o filename Output file for Kraken output" << endl << " -r filename Output file for Kraken report output" << endl << " -a filename TaxDB" << endl From 77cac1358ff7eaca1b4507135775f933c9184d9a Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 11 Apr 2017 22:59:48 -0400 Subject: [PATCH 023/105] Check if there's a taxonomy entry --- src/taxdb.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/taxdb.h b/src/taxdb.h index 0d93207..56bd341 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -402,8 +402,12 @@ bool TaxonomyDB::isSubSpecies(uint32_t taxonomyID) const { void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { for (auto& elem : taxon_counts) { - //cerr << "fill: "<< elem.first << endl; - TaxonomyEntry* tax = &taxIDsAndEntries.at(elem.first); + auto it = taxIDsAndEntries.find(elem.first); + if (it == taxIDsAndEntries.end()) { + cerr << "No taxonomy entry for " << elem.first << "!!" << endl; + continue; + } + TaxonomyEntry* tax = &it->second; //cerr << "fill done: "<< elem.first << endl; tax->numReadsAligned += elem.second.n_reads; tax->numKmers += elem.second.n_kmers; From dcaec2956743c5fb518a5d0240a3198320dfd6cd Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 11 Apr 2017 23:06:55 -0400 Subject: [PATCH 024/105] Use template for ReadCounts --- src/build_taxdb.cpp | 2 +- src/classify.cpp | 23 +++ src/report-cols.h | 3 +- src/taxdb.h | 340 ++++++++++++++++++++++++++++---------------- 4 files changed, 247 insertions(+), 121 deletions(-) diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index 8e1f11e..2710d82 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -27,7 +27,7 @@ int main(int argc, char **argv) { std::cout << "Provide names.dmp and nodes.dmp\n"; return 1; } - TaxonomyDB taxdb; + TaxonomyDB taxdb; taxdb.writeTaxonomyIndex( std::cout, argv[1], argv[2]); diff --git a/src/classify.cpp b/src/classify.cpp index 891f30b..1f5bbac 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -79,6 +79,29 @@ inline bool ends_with(std::string const & value, std::string const & ending) return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); } +struct ReadCounts { + uint64_t n_reads = 0; + uint64_t n_kmers = 0; + HyperLogLogPlusMinus kmers; // unique k-mer count per taxon + void add_kmer(uint64_t kmer) { + ++ n_kmers; + kmers.add(kmer); + } + ReadCounts& operator+=(const ReadCounts& b) { + n_reads += b.n_reads; + n_kmers += b.n_kmers; + kmers += b.kmers; + return *this; + } +}; + +inline +uint64_t reads(const ReadCounts& read_count) { + return(read_count.n_reads); +} + + + ostream* cout_or_file(string file) { if (file == "-") return &cout; diff --git a/src/report-cols.h b/src/report-cols.h index a34a755..007eef5 100644 --- a/src/report-cols.h +++ b/src/report-cols.h @@ -9,6 +9,7 @@ #define REPORT_COLS_H #include +#include enum class REPORTCOLS : uint8_t { SPACED_NAME, @@ -29,7 +30,7 @@ enum class REPORTCOLS : uint8_t { }; -static const std::map report_col_name_map = { +static const std::map report_col_name_map = { {"name", REPORTCOLS::NAME}, {"spaced_name", REPORTCOLS::SPACED_NAME}, {"taxID", REPORTCOLS::TAX_ID}, diff --git 
a/src/taxdb.h b/src/taxdb.h index ff2b2f3..ce45bf8 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -28,25 +28,10 @@ #include #include #include -#include "hyperloglogplus.h" +#include #include "report-cols.h" -struct ReadCounts { - uint64_t n_reads = 0; - uint64_t n_kmers = 0; - HyperLogLogPlusMinus kmers; // unique k-mer count per taxon - void add_kmer(uint64_t kmer) { - ++ n_kmers; - kmers.add(kmer); - } - ReadCounts& operator+=(const ReadCounts& b) { - n_reads += b.n_reads; - n_kmers += b.n_kmers; - kmers += b.kmers; - return *this; - } -}; - +using namespace std; void log (const std::string& s) { std::cerr << s << "\n"; @@ -60,26 +45,85 @@ uint64_t string_to_T(string str) { return result; } -std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields, size_t end_chars) { +template +inline +uint64_t reads(const T read_count) { + cerr << "No reads function for type!! " << endl; + throw ; + return(0); +} + + + +inline +uint64_t reads(const uint64_t read_count) { + return(read_count); +} + +std::vector in_betweens(const std::string &s, const char start_char, const char end_char, size_t start_at = 0) { + std::vector tokens; + size_t i = 0; + size_t next_end = start_at-1; + + for (size_t next_start = s.find(start_char, next_end + 1); \ + next_start != string::npos; + next_start = s.find(start_char, next_end + 1), ++i) { + + next_end = s.find(end_char, next_start + 1); + if (next_end == string::npos) + throw std::runtime_error("unmatched start and end!"); + + tokens.push_back(s.substr(next_start+1, next_end-1)); + } + + return tokens; +} + + + +std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields = 0, size_t end_chars = 0) { std::vector tokens(max_fields); size_t delim_length = delimiter.length(); size_t last = 0; size_t i = 0; for (size_t next = s.find(delimiter, last); - i < max_fields && next != string::npos; + (max_fields > 0 && i < max_fields) && next != string::npos; next = s.find(delimiter, last), ++i) { tokens[i] = s.substr(last, next-last); last = next + delim_length; } - if (i < max_fields) { + if (max_fields > 0 && i < max_fields) { tokens[max_fields-1] = s.substr(last, s.length()-last-end_chars); } return tokens; } -template +std::vector get_fields(const std::string &s, const std::string& delimiter, vector fields) { + std::vector tokens; + tokens.reserve(fields.size()); + size_t delim_length = delimiter.length(); + size_t last = 0; + size_t i = 0; + size_t current_field = 0; + + for (size_t next = s.find(delimiter, last); + tokens.size() < fields.size() && next != string::npos; + next = s.find(delimiter, last), ++i) { + if (i == fields[current_field]) { + tokens.push_back(s.substr(last, next-last)); + ++current_field; + } + last = next + delim_length; + } + + return tokens; +} + + + +template class TaxonomyEntry { public: TAXID taxonomyID = 0; @@ -106,31 +150,35 @@ class TaxonomyEntry { TaxonomyEntry* parent = nullptr; std::vector children; - unsigned numReadsAligned = 0; - unsigned numReadsAlignedToChildren = 0; + READCOUNTS read_counts = 0; + READCOUNTS read_counts_children = 0; + bool used = false; uint64_t genomeSize = 0; uint64_t genomeSizeOfChildren = 0; uint64_t numBelow = 0; - uint64_t numKmers = 0; - HyperLogLogPlusMinus kmers; }; +template<> +TaxonomyEntry::TaxonomyEntry () { + read_counts = 0; + read_counts_children = 0; +} -template +template struct TaxonomyEntryPtr_comp { - bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { - return 
((a->numReadsAligned+a->numReadsAlignedToChildren) > (b->numReadsAligned+b->numReadsAlignedToChildren)); + bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { + return ((reads(a->read_counts)+reads(a->read_counts_children)) > (reads(b->read_counts)+reads(b->read_counts_children))); } }; -template +template class TaxonomyDB { public: TaxonomyDB(const std::string inFileName); TaxonomyDB() {}; //std::unordered_map seqIDsAndTaxIds; - std::unordered_map > taxIDsAndEntries; + std::unordered_map > taxIDsAndEntries; void parseNamesDump(const std::string namesDumpFileName); void parseNodesDump(const std::string nodesDumpFileName); TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const; @@ -138,6 +186,8 @@ class TaxonomyDB { std::string getRank(const TAXID taxID) const; TAXID getLowestCommonAncestor(const std::vector& taxIDs) const; TAXID getParentTaxID(const TAXID taxID) const; + std::unordered_map getParentMap() const; + std::unordered_map getScientificNameMap() const; std::string getLineage(TAXID taxonomyID) const; std::string getMetaPhlAnLineage(TAXID taxonomyID) const; char* getIndexFileName(const TAXID hostTaxID) const; @@ -148,13 +198,34 @@ class TaxonomyDB { const std::string nodesDumpFileName); bool isSubSpecies(TAXID taxonomyID) const; int isBelowInTree(TAXID upper, TAXID lower) const; - void fillCounts(const unordered_map& taxon_counts); + void addCounts(const TAXID taxid, const READCOUNTS& read_counts_); + void fillCounts(const unordered_map& taxon_counts); void createPointers(); void printReport(); }; -template -void TaxonomyDB::createPointers() { +template +std::unordered_map TaxonomyDB::getScientificNameMap() const { + std::unordered_map scientificNameMap; + for (const auto & tax : taxIDsAndEntries) { + scientificNameMap[tax.second.scientificName] = tax.first; + } + return scientificNameMap; +} + +template +unordered_map TaxonomyDB::getParentMap() const { + unordered_map Parent_map; + for (const auto & tax : taxIDsAndEntries) { + if (tax.first != 0) + Parent_map[tax.first] = tax.second.parentTaxonomyID; + } + Parent_map[1] = 1; + return Parent_map; +} + +template +void TaxonomyDB::createPointers() { for (auto& tax : taxIDsAndEntries) { if (tax.second.parentTaxonomyID != tax.first) { auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); @@ -166,8 +237,8 @@ void TaxonomyDB::createPointers() { } } -template -TaxonomyDB::TaxonomyDB(const std::string inFileName) { +template +TaxonomyDB::TaxonomyDB(const std::string inFileName) { log("Building taxonomy index"); readTaxonomyIndex(inFileName); createPointers(); @@ -175,8 +246,8 @@ TaxonomyDB::TaxonomyDB(const std::string inFileName) { " nodes"); } -template -void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { +template +void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { std::ifstream nodesDumpFile(nodesDumpFileName); if (!nodesDumpFile.is_open()) throw std::runtime_error("unable to open nodes file"); @@ -199,7 +270,7 @@ void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { auto entryIt = taxIDsAndEntries.find(taxonomyID); if (entryIt == taxIDsAndEntries.end()) { - taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); + taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); } else { entryIt->second.parentTaxonomyID = parentTaxonomyID; entryIt->second.rank = rank; @@ -207,8 +278,8 @@ void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { } } -template -void 
TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { +template +void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { std::ifstream namesDumpFile(namesDumpFileName); if (!namesDumpFile.is_open()) throw std::runtime_error("unable to open names file"); @@ -227,15 +298,15 @@ void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { auto entryIt = taxIDsAndEntries.find(taxonomyID); if (entryIt == taxIDsAndEntries.end()) { - taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, scientificName); + taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, scientificName); } else { entryIt->second.scientificName = scientificName; } } } -template -void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, +template +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, const std::string namesDumpFileName, const std::string nodesDumpFileName) { parseNodesDump(nodesDumpFileName); @@ -254,8 +325,8 @@ std::vector getSortedKeys(const std::unordered_map& return keys; } -template -void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { +template +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { for (TAXID& key : getSortedKeys(taxIDsAndEntries)) { const auto& entry = taxIDsAndEntries.at(key); outs << key << "\t" << entry.parentTaxonomyID << "\t" @@ -265,11 +336,11 @@ void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { -template -void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { +template +void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { std::ifstream inFile(inFileName); if (!inFile.is_open()) - throw std::runtime_error("unable to open taxonomy index file"); + throw std::runtime_error("unable to open taxonomy index file " + inFileName); TAXID taxonomyID, parentTaxonomyID; std::string scientificName, rank; @@ -280,7 +351,7 @@ void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { inFile.get(); // read tab std::getline(inFile, scientificName, '\t'); std::getline(inFile, rank, '\n'); - TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); + TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); //cerr << "inserting " << taxonomyID << ";" << parentTaxonomyID << ";" << rank << ";" << scientificName << endl; taxIDsAndEntries.insert({ @@ -292,16 +363,16 @@ void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { }); } -template -TAXID TaxonomyDB::getLowestCommonAncestor( +template +TAXID TaxonomyDB::getLowestCommonAncestor( const std::vector& taxIDs) const { if (taxIDs.size() == 0) { return 0; } - std::vector > paths; + std::vector > paths; for (auto& taxID : taxIDs) { bool good = true; - std::vector path; + std::vector path; TAXID tempTaxID = taxID; while (tempTaxID != 0) { path.push_back(tempTaxID); @@ -315,7 +386,7 @@ TAXID TaxonomyDB::getLowestCommonAncestor( for (auto& path : paths) std::reverse(path.begin(), path.end()); std::sort(paths.begin(), paths.end(), - [](std::vector i, std::vector j) { + [](std::vector i, std::vector j) { return i.size() < j.size(); }); TAXID consensus = 0; @@ -333,8 +404,8 @@ TAXID TaxonomyDB::getLowestCommonAncestor( return consensus; } -template -TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { +template +TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { auto entry = taxIDsAndEntries.find(taxID); if (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) return entry->second.parentTaxonomyID; @@ -342,8 +413,8 @@ TAXID TaxonomyDB::getParentTaxID(const TAXID 
taxID) const { return 0; } -template -std::string TaxonomyDB::getScientificName(const TAXID taxID) const { +template +std::string TaxonomyDB::getScientificName(const TAXID taxID) const { auto entry = taxIDsAndEntries.find(taxID); if (entry != taxIDsAndEntries.end()) { return entry->second.scientificName; @@ -351,8 +422,8 @@ std::string TaxonomyDB::getScientificName(const TAXID taxID) const { return std::string(); } -template -std::string TaxonomyDB::getRank(const TAXID taxID) const { +template +std::string TaxonomyDB::getRank(const TAXID taxID) const { auto entry = taxIDsAndEntries.find(taxID); if (entry != taxIDsAndEntries.end()) { return entry->second.rank; @@ -360,8 +431,8 @@ std::string TaxonomyDB::getRank(const TAXID taxID) const { return std::string(); } -template -std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { +template +std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { std::string lineage; while (true) { // 131567 = Cellular organisms @@ -379,8 +450,8 @@ std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { return lineage; } -template -std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { +template +std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { std::string rank = getRank(taxonomyID); if (rank == "superphylum") return std::string(); std::string lineage; @@ -420,8 +491,8 @@ std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { return lineage; } -template -TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, +template +TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, const std::string& rank) const { auto entry = taxIDsAndEntries.find(taxID); while (entry != taxIDsAndEntries.end() && @@ -434,8 +505,8 @@ TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, return 0; } -template -int TaxonomyDB::isBelowInTree(TAXID upper, TAXID lower) const { +template +int TaxonomyDB::isBelowInTree(TAXID upper, TAXID lower) const { auto entry = taxIDsAndEntries.find(lower); unsigned level = 0; while (entry != taxIDsAndEntries.end() && @@ -450,8 +521,8 @@ int TaxonomyDB::isBelowInTree(TAXID upper, TAXID lower) const { return -1; } -template -bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { +template +bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { bool isSubSpecies = false; auto entry = taxIDsAndEntries.find(taxonomyID); int numLevels = 0; @@ -469,70 +540,66 @@ bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { return isSubSpecies; } -template -void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { - for (auto& elem : taxon_counts) { - auto it = taxIDsAndEntries.find(elem.first); +template +void TaxonomyDB::addCounts(const TAXID taxid, const READCOUNTS& read_counts_) { + auto it = taxIDsAndEntries.find(taxid); if (it == taxIDsAndEntries.end()) { - cerr << "No taxonomy entry for " << elem.first << "!!" << endl; - continue; + cerr << "No taxonomy entry for " << taxid << "!!" 
<< endl; + return; } - TaxonomyEntry* tax = &it->second; - //cerr << "fill done: "<< elem.first << endl; - tax->numReadsAligned += elem.second.n_reads; - tax->numKmers += elem.second.n_kmers; - tax->kmers += elem.second.kmers; - - //std::cerr << "adding " << elem.second.n_reads << " to " << tax->scientificName << ": "; + TaxonomyEntry* tax = &it->second; + //cerr << taxid << " rc before: " << tax->read_counts << endl; + tax->read_counts += read_counts_; + //cerr << taxid << " rc after: " << tax->read_counts << endl; while (tax->parent != nullptr) { tax = tax->parent; - //std::cerr << " >> " << tax->scientificName; - tax->numReadsAlignedToChildren += elem.second.n_reads; - tax->numKmers += elem.second.n_kmers; - tax->kmers += elem.second.kmers; + tax->read_counts_children += read_counts_; } - //std::cerr << endl; +} + +template +void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { + for (auto& elem : taxon_counts) { + addCounts(elem.first, elem.second); } for (auto& tax : taxIDsAndEntries) { - std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); + std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); } } -template +template class TaxReport { private: std::ostream& _reportOfb; - TaxonomyDB & _taxdb; + TaxonomyDB & _taxdb; std::vector _report_cols; uint64_t _total_n_reads; bool _show_zeros; - void printLine(TaxonomyEntry& tax, unsigned depth); + void printLine(TaxonomyEntry& tax, unsigned depth); public: - TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); + TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); void printReport(std::string format, std::string rank); - void printReport(TaxonomyEntry& tax, unsigned depth); + void printReport(TaxonomyEntry& tax, unsigned depth); }; -template -TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { +template +TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; } -template -void TaxReport::printReport(std::string format, std::string rank) { +template +void TaxReport::printReport(std::string format, std::string rank) { _total_n_reads = - _taxdb.taxIDsAndEntries.at(0).numReadsAligned + - _taxdb.taxIDsAndEntries.at(0).numReadsAlignedToChildren + - _taxdb.taxIDsAndEntries.at(1).numReadsAligned + - _taxdb.taxIDsAndEntries.at(1).numReadsAlignedToChildren;// + - //_taxdb.taxIDsAndEntries.at(-1).numReadsAligned + - //_taxdb.taxIDsAndEntries.at(-1).numReadsAlignedToChildren; // -1 is a magic number in centrifuge for reads not matched to the taxonomy tree + reads(_taxdb.taxIDsAndEntries.at(0).read_counts) + + reads(_taxdb.taxIDsAndEntries.at(0).read_counts_children) + + reads(_taxdb.taxIDsAndEntries.at(1).read_counts) + + reads(_taxdb.taxIDsAndEntries.at(1).read_counts_children);// + if (_total_n_reads == 0) { std::cerr << "total number of reads is zero - not creating a report!" 
<< endl; return; @@ -553,34 +620,30 @@ void TaxReport::printReport(std::string format, std::string rank) { } } -template -void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { - - if (_show_zeros || (tax.numReadsAligned+tax.numReadsAlignedToChildren) > 0) { +template +void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { + if (_show_zeros || (reads(tax.read_counts)+reads(tax.read_counts_children)) > 0) { printLine(tax, depth); - - for (auto child : tax.children) { + for (auto child : tax.children) printReport(*child, depth+1); - } } - } -template -void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { +template +void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { for (auto& col : _report_cols) { switch (col) { case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? -1 : (int32_t) tax.taxonomyID); break; case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(tax.numReadsAligned + tax.numReadsAlignedToChildren)/_total_n_reads; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.read_counts) + reads(tax.read_counts_children))/_total_n_reads; break; //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; - case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (tax.numReadsAligned + tax.numReadsAlignedToChildren); break; - case REPORTCOLS::NUM_READS: _reportOfb << tax.numReadsAligned; break; - case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break; - case REPORTCOLS::NUM_KMERS: _reportOfb << tax.numKmers; break; + case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.read_counts) + reads(tax.read_counts_children)); break; + case REPORTCOLS::NUM_READS: _reportOfb << (tax.read_counts); break; + //case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break; + //case REPORTCOLS::NUM_KMERS: _reportOfb << tax.numKmers; break; //case REPORTCOLS::GENOME_SIZE: ; break; //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; //case REPORTCOLS::SUM_SCORE: ; break; @@ -596,6 +659,45 @@ void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { } + // Return lowest common ancestor of a and b + // LCA(0,x) = LCA(x,0) = x + // Default ancestor is 1 (root of tree) + uint32_t lca(unordered_map &parent_map, + uint32_t a, uint32_t b) + { + if (a == 0 || b == 0) + return a ? 
a : b; + + // create a path from a to the root + std::unordered_set a_path; + while (a > 0 && a != parent_map[a]) { + if (a == b) + return a; + a_path.insert(a); + a = parent_map[a]; + } + + // search for b in the path from a to the root + while (b > 0 && b != parent_map[b]) { + if (a_path.count(b) > 0) + return b; + b = parent_map[b]; + } + return 1; + } + +template +inline +V find_or_use_default(const std::unordered_map& my_map, const K& query, const V default_value) { + auto itr = my_map.find(query); + + if (itr == my_map.end()) { + return default_value; + } + + return itr->second; +} + #endif /* TAXD_DB_H_ */ From 53560de028f0781395e4200ca4f74fc1420efabe Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 11 Apr 2017 23:11:48 -0400 Subject: [PATCH 025/105] Renamed to kraken_hll --- install_kraken.sh | 4 ++-- scripts/{kraken => kraken_hll} | 0 scripts/{kraken-build => kraken_hll-build} | 0 scripts/{kraken-filter => kraken_hll-filter} | 0 scripts/{kraken-mpa-report => kraken_hll-mpa-report} | 0 scripts/{kraken-report => kraken_hll-report} | 0 scripts/{kraken-translate => kraken_hll-translate} | 0 scripts/standard_installation.sh | 8 ++++---- 8 files changed, 6 insertions(+), 6 deletions(-) rename scripts/{kraken => kraken_hll} (100%) rename scripts/{kraken-build => kraken_hll-build} (100%) rename scripts/{kraken-filter => kraken_hll-filter} (100%) rename scripts/{kraken-mpa-report => kraken_hll-mpa-report} (100%) rename scripts/{kraken-report => kraken_hll-report} (100%) rename scripts/{kraken-translate => kraken_hll-translate} (100%) diff --git a/install_kraken.sh b/install_kraken.sh index 803989c..b909336 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -19,7 +19,7 @@ set -e -VERSION="0.10.6-unreleased" +VERSION="0.10.7-kraken-hll" if [ -z "$1" ] || [ -n "$2" ] then @@ -58,7 +58,7 @@ echo "Kraken installation complete." 
echo echo "To make things easier for you, you may want to copy/symlink the following" echo "files into a directory in your PATH:" -for file in $KRAKEN_DIR/kraken* +for file in $KRAKEN_DIR/kraken_hll* do [ -x "$file" ] && echo " $file" done diff --git a/scripts/kraken b/scripts/kraken_hll similarity index 100% rename from scripts/kraken rename to scripts/kraken_hll diff --git a/scripts/kraken-build b/scripts/kraken_hll-build similarity index 100% rename from scripts/kraken-build rename to scripts/kraken_hll-build diff --git a/scripts/kraken-filter b/scripts/kraken_hll-filter similarity index 100% rename from scripts/kraken-filter rename to scripts/kraken_hll-filter diff --git a/scripts/kraken-mpa-report b/scripts/kraken_hll-mpa-report similarity index 100% rename from scripts/kraken-mpa-report rename to scripts/kraken_hll-mpa-report diff --git a/scripts/kraken-report b/scripts/kraken_hll-report similarity index 100% rename from scripts/kraken-report rename to scripts/kraken_hll-report diff --git a/scripts/kraken-translate b/scripts/kraken_hll-translate similarity index 100% rename from scripts/kraken-translate rename to scripts/kraken_hll-translate diff --git a/scripts/standard_installation.sh b/scripts/standard_installation.sh index b542a4f..341e4e0 100755 --- a/scripts/standard_installation.sh +++ b/scripts/standard_installation.sh @@ -31,10 +31,10 @@ then fi check_for_jellyfish.sh -kraken-build --db $KRAKEN_DB_NAME --download-taxonomy -kraken-build --db $KRAKEN_DB_NAME --download-library bacteria -kraken-build --db $KRAKEN_DB_NAME --download-library viruses -kraken-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ +kraken_hll-build --db $KRAKEN_DB_NAME --download-taxonomy +kraken_hll-build --db $KRAKEN_DB_NAME --download-library bacteria +kraken_hll-build --db $KRAKEN_DB_NAME --download-library viruses +kraken_hll-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ --jellyfish-hash-size "$KRAKEN_HASH_SIZE" \ --max-db-size "$KRAKEN_MAX_DB_SIZE" \ --minimizer-len $KRAKEN_MINIMIZER_LEN \ From c6cf5bcfd04c0efc5c5bdfaeefe25b4c2dc0fb51 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 12 Apr 2017 10:13:35 -0400 Subject: [PATCH 026/105] Fixed building of kraken_hll --- src/Makefile | 11 +- src/classify.cpp | 31 +----- src/krakenutil.hpp | 2 - src/readcounts.hpp | 30 ++++++ src/set_lcas.cpp | 7 +- src/taxdb.h | 244 ++++++++++++++++++++++++++------------------- 6 files changed, 183 insertions(+), 142 deletions(-) create mode 100644 src/readcounts.hpp diff --git a/src/Makefile b/src/Makefile index 73b6b9c..03f32cb 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ CXX = g++ -CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 -fmax-errors=3 -g +CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb LIBFLAGS = -L. 
-I./gzstream -L./gzstream -lz -lgzstream @@ -19,15 +19,14 @@ db_sort: krakendb.o quickfile.o set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o -classify: krakendb.o quickfile.o krakenutil.o seqreader.o taxdb.h +classify: krakendb.o quickfile.o krakenutil.o seqreader.o $(CXX) $(CXXFLAGS) -o classify classify.cpp $^ $(LIBFLAGS) -make_seqid_to_taxid_map: quickfile.o - build_taxdb: taxdb.h - $(CXX) $(CXXFLAGS) -o build_taxdb build_taxdb.cpp -krakenutil.o: krakenutil.cpp krakenutil.hpp +make_seqid_to_taxid_map: quickfile.o + +krakenutil.o: krakenutil.cpp krakenutil.hpp taxdb.h $(CXX) $(CXXFLAGS) -c krakenutil.cpp krakendb.o: krakendb.cpp krakendb.hpp quickfile.hpp diff --git a/src/classify.cpp b/src/classify.cpp index 1f5bbac..981f1f6 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -22,7 +22,7 @@ #include "krakenutil.hpp" #include "quickfile.hpp" #include "seqreader.hpp" -#include "hyperloglogplus.h" +#include "readcounts.hpp" #include "taxdb.h" #include "gzstream.h" @@ -66,7 +66,7 @@ ostream *Report_output; vector Open_fstreams; vector Open_gzstreams; size_t Work_unit_size = DEF_WORK_UNIT_SIZE; -TaxonomyDB taxdb; +TaxonomyDB taxdb; static vector KrakenDatabases (DB_filenames.size()); uint64_t total_classified = 0; @@ -79,29 +79,6 @@ inline bool ends_with(std::string const & value, std::string const & ending) return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); } -struct ReadCounts { - uint64_t n_reads = 0; - uint64_t n_kmers = 0; - HyperLogLogPlusMinus kmers; // unique k-mer count per taxon - void add_kmer(uint64_t kmer) { - ++ n_kmers; - kmers.add(kmer); - } - ReadCounts& operator+=(const ReadCounts& b) { - n_reads += b.n_reads; - n_kmers += b.n_kmers; - kmers += b.kmers; - return *this; - } -}; - -inline -uint64_t reads(const ReadCounts& read_count) { - return(read_count.n_reads); -} - - - ostream* cout_or_file(string file) { if (file == "-") return &cout; @@ -143,7 +120,7 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); if (!TaxDB_file.empty()) { - taxdb = TaxonomyDB(TaxDB_file); + taxdb = TaxonomyDB(TaxDB_file); for (const auto & tax : taxdb.taxIDsAndEntries) { if (tax.first != 0) Parent_map[tax.first] = tax.second.parentTaxonomyID; @@ -220,7 +197,7 @@ int main(int argc, char **argv) { if (Print_kraken_report) { taxdb.fillCounts(taxon_counts); - TaxReport rep = TaxReport(*Report_output, taxdb, false); + TaxReport rep = TaxReport(*Report_output, taxdb, false); rep.printReport("kraken","blu"); } diff --git a/src/krakenutil.hpp b/src/krakenutil.hpp index 30eb67d..97dd041 100644 --- a/src/krakenutil.hpp +++ b/src/krakenutil.hpp @@ -29,8 +29,6 @@ namespace kraken { // Return the lowest common ancestor of a and b, according to parent_map // NOTE: LCA(0,x) = LCA(x,0) = x - uint32_t lca(std::unordered_map &parent_map, - uint32_t a, uint32_t b); // Resolve classification tree uint32_t resolve_tree(std::unordered_map &hit_counts, diff --git a/src/readcounts.hpp b/src/readcounts.hpp new file mode 100644 index 0000000..9676514 --- /dev/null +++ b/src/readcounts.hpp @@ -0,0 +1,30 @@ + +#ifndef READCOUNTS_HPP +#define READCOUNTS_HPP + +#include "kraken_headers.hpp" +#include "hyperloglogplus.h" + +namespace kraken { + struct ReadCounts { + uint64_t n_reads = 0; + uint64_t n_kmers = 0; + HyperLogLogPlusMinus kmers; // unique k-mer count per taxon + void add_kmer(uint64_t kmer) { + ++ n_kmers; + kmers.add(kmer); + } + ReadCounts& operator+=(const ReadCounts& b) { + n_reads += b.n_reads; + n_kmers += b.n_kmers; + kmers += b.kmers; + return *this; + } + }; + 
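(A sketch of how the accumulator above is meant to be used; this fragment is illustrative and not part of the patch. A classifier loop fills one ReadCounts per taxon and later hands the whole map to taxdb.fillCounts(), as in the classify.cpp changes in this same patch.)

    // Illustrative only: accumulate a classified read's k-mers per taxon.
    // Assumes hyperloglogplus.h from this repository is on the include path.
    #include <cstdint>
    #include <unordered_map>
    #include <vector>
    #include "readcounts.hpp"

    void count_read(std::unordered_map<uint32_t, kraken::ReadCounts>& taxon_counts,
                    uint32_t taxid, const std::vector<uint64_t>& kmer_hashes) {
      kraken::ReadCounts& rc = taxon_counts[taxid];   // default-constructed on first use
      ++rc.n_reads;                // one more classified read for this taxon
      for (uint64_t h : kmer_hashes)
        rc.add_kmer(h);            // bumps n_kmers and feeds the HLL sketch
    }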
+ inline + uint64_t reads(const ReadCounts& read_count) { + return(read_count.n_reads); + } +} +#endif diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 3848d5d..0e60887 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -23,6 +23,7 @@ #include "krakenutil.hpp" #include "seqreader.hpp" #include "taxdb.h" +#include "readcounts.hpp" #include #define SKIP_LEN 50000 @@ -54,7 +55,7 @@ unordered_map Parent_map; unordered_map ID_to_taxon_map; unordered_map SeqId_added; KrakenDB Database; -TaxonomyDB taxdb; +TaxonomyDB taxdb; int main(int argc, char **argv) { #ifdef _OPENMP @@ -64,7 +65,7 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); if (!TaxDB_filename.empty() && !force_taxid) { - taxdb = TaxonomyDB(TaxDB_filename); + taxdb = TaxonomyDB(TaxDB_filename); for (const auto & tax : taxdb.taxIDsAndEntries) { if (tax.first != 0) Parent_map[tax.first] = tax.second.parentTaxonomyID; @@ -141,7 +142,7 @@ void process_single_file() { iss >> parent_taxid; taxid = ++New_taxid_start; Parent_map[taxid] = parent_taxid; - auto itEntry = taxdb.taxIDsAndEntries.insert({taxid, TaxonomyEntry(taxid, parent_taxid, "sequence")}); + auto itEntry = taxdb.taxIDsAndEntries.insert({taxid, TaxonomyEntry(taxid, parent_taxid, "sequence")}); if (!itEntry.second) cerr << "Taxonomy ID " << taxid << " already in Taxonomy DB? Shouldn't happen - run set_lcas without the XXX option." << endl; } else { diff --git a/src/taxdb.h b/src/taxdb.h index ce45bf8..ac3344f 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -33,7 +33,129 @@ using namespace std; -void log (const std::string& s) { +void log_msg (const std::string& s); + +template uint64_t string_to_T(std::string str); + +template +inline uint64_t reads(const T read_count); + +inline uint64_t reads(const uint64_t read_count); + +std::vector in_betweens(const std::string &s, const char start_char, const char end_char, size_t start_at = 0); + +std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields = 0, size_t end_chars = 0); + + +std::vector get_fields(const std::string &s, const std::string& delimiter, std::vector fields); + +template +class TaxonomyEntry { + public: + TAXID taxonomyID = 0; + TAXID parentTaxonomyID = 0; + std::string rank; + std::string scientificName; + + TaxonomyEntry() {} + + TaxonomyEntry(TAXID taxonomyID_, std::string scientificName_) : + taxonomyID(taxonomyID_), scientificName(scientificName_) {} + + TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_) : + taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_) {} + + TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_) : + taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_), scientificName(scientificName_) {} + + inline bool operator==(const TaxonomyEntry& other) const; + TaxonomyEntry* parent = nullptr; + std::vector children; + + READCOUNTS read_counts = READCOUNTS(); + READCOUNTS read_counts_children = READCOUNTS(); + + bool used = false; + uint64_t genomeSize = 0; + uint64_t genomeSizeOfChildren = 0; + uint64_t numBelow = 0; +}; + +//template<> +//TaxonomyEntry::TaxonomyEntry () { +// read_counts = 0; +// read_counts_children = 0; +//} + +template +struct TaxonomyEntryPtr_comp { + bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const; +}; + + +template +class TaxonomyDB { + public: + TaxonomyDB(const std::string inFileName); + TaxonomyDB(); + //std::unordered_map seqIDsAndTaxIds; + std::unordered_map > 
taxIDsAndEntries; + void parseNamesDump(const std::string namesDumpFileName); + void parseNodesDump(const std::string nodesDumpFileName); + TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const; + std::string getScientificName(const TAXID taxID) const; + std::string getRank(const TAXID taxID) const; + TAXID getLowestCommonAncestor(const std::vector& taxIDs) const; + TAXID getParentTaxID(const TAXID taxID) const; + std::unordered_map getParentMap() const; + std::unordered_map getScientificNameMap() const; + std::string getLineage(TAXID taxonomyID) const; + std::string getMetaPhlAnLineage(TAXID taxonomyID) const; + char* getIndexFileName(const TAXID hostTaxID) const; + void readTaxonomyIndex(const std::string inFileName); + void writeTaxonomyIndex(std::ostream & outs) const; + void writeTaxonomyIndex(std::ostream & outs, + const std::string namesDumpFileName, + const std::string nodesDumpFileName); + bool isSubSpecies(TAXID taxonomyID) const; + int isBelowInTree(TAXID upper, TAXID lower) const; + void addCounts(const TAXID taxid, const READCOUNTS& read_counts_); + void fillCounts(const std::unordered_map& taxon_counts); + void createPointers(); + void printReport(); +}; + + +template +class TaxReport { +private: + std::ostream& _reportOfb; + TaxonomyDB & _taxdb; + std::vector _report_cols; + uint64_t _total_n_reads; + bool _show_zeros; + + void printLine(TaxonomyEntry& tax, unsigned depth); + +public: + TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); + + void printReport(std::string format, std::string rank); + void printReport(TaxonomyEntry& tax, unsigned depth); +}; + + + // Return lowest common ancestor of a and b + // LCA(0,x) = LCA(x,0) = x + // Default ancestor is 1 (root of tree) +uint32_t lca(std::unordered_map &parent_map, uint32_t a, uint32_t b); + +template +inline +V find_or_use_default(const std::unordered_map& my_map, const K& query, const V default_value); + +//////////////////////////// DEFINITIONS +void log_msg (const std::string& s) { std::cerr << s << "\n"; } @@ -60,7 +182,7 @@ uint64_t reads(const uint64_t read_count) { return(read_count); } -std::vector in_betweens(const std::string &s, const char start_char, const char end_char, size_t start_at = 0) { +std::vector in_betweens(const std::string &s, const char start_char, const char end_char, size_t start_at) { std::vector tokens; size_t i = 0; size_t next_end = start_at-1; @@ -81,7 +203,7 @@ std::vector in_betweens(const std::string &s, const char start_char -std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields = 0, size_t end_chars = 0) { +std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields, size_t end_chars) { std::vector tokens(max_fields); size_t delim_length = delimiter.length(); size_t last = 0; @@ -123,86 +245,16 @@ std::vector get_fields(const std::string &s, const std::string& del +//template<> +//TaxonomyEntry::TaxonomyEntry () { +// read_counts = 0; +// read_counts_children = 0; +//} template -class TaxonomyEntry { - public: - TAXID taxonomyID = 0; - TAXID parentTaxonomyID = 0; - std::string rank; - std::string scientificName; - - TaxonomyEntry() {} - - TaxonomyEntry(TAXID taxonomyID_, std::string scientificName_) : - taxonomyID(taxonomyID_), scientificName(scientificName_) {} - - TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_) : - taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_) {} - - TaxonomyEntry(TAXID taxonomyID_, TAXID 
parentTaxonomyID_, std::string rank_, std::string scientificName_) : - taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_), scientificName(scientificName_) {} - - inline bool operator==(const TaxonomyEntry& other) const { - return this->taxonomyID == other.taxonomyID && - this->parentTaxonomyID == other.parentTaxonomyID && - this->scientificName == other.scientificName; - } - TaxonomyEntry* parent = nullptr; - std::vector children; - - READCOUNTS read_counts = 0; - READCOUNTS read_counts_children = 0; - - bool used = false; - uint64_t genomeSize = 0; - uint64_t genomeSizeOfChildren = 0; - uint64_t numBelow = 0; -}; - -template<> -TaxonomyEntry::TaxonomyEntry () { - read_counts = 0; - read_counts_children = 0; -} - -template -struct TaxonomyEntryPtr_comp { - bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { - return ((reads(a->read_counts)+reads(a->read_counts_children)) > (reads(b->read_counts)+reads(b->read_counts_children))); - } -}; +bool TaxonomyEntryPtr_comp::operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { + return ((reads(a->read_counts)+reads(a->read_counts_children)) > (reads(b->read_counts)+reads(b->read_counts_children))); + } -template -class TaxonomyDB { - public: - TaxonomyDB(const std::string inFileName); - TaxonomyDB() {}; - //std::unordered_map seqIDsAndTaxIds; - std::unordered_map > taxIDsAndEntries; - void parseNamesDump(const std::string namesDumpFileName); - void parseNodesDump(const std::string nodesDumpFileName); - TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const; - std::string getScientificName(const TAXID taxID) const; - std::string getRank(const TAXID taxID) const; - TAXID getLowestCommonAncestor(const std::vector& taxIDs) const; - TAXID getParentTaxID(const TAXID taxID) const; - std::unordered_map getParentMap() const; - std::unordered_map getScientificNameMap() const; - std::string getLineage(TAXID taxonomyID) const; - std::string getMetaPhlAnLineage(TAXID taxonomyID) const; - char* getIndexFileName(const TAXID hostTaxID) const; - void readTaxonomyIndex(const std::string inFileName); - void writeTaxonomyIndex(std::ostream & outs) const; - void writeTaxonomyIndex(std::ostream & outs, - const std::string namesDumpFileName, - const std::string nodesDumpFileName); - bool isSubSpecies(TAXID taxonomyID) const; - int isBelowInTree(TAXID upper, TAXID lower) const; - void addCounts(const TAXID taxid, const READCOUNTS& read_counts_); - void fillCounts(const unordered_map& taxon_counts); - void createPointers(); - void printReport(); -}; template std::unordered_map TaxonomyDB::getScientificNameMap() const { @@ -237,12 +289,15 @@ void TaxonomyDB::createPointers() { } } +template +TaxonomyDB::TaxonomyDB() { } + template TaxonomyDB::TaxonomyDB(const std::string inFileName) { - log("Building taxonomy index"); + log_msg("Building taxonomy index"); readTaxonomyIndex(inFileName); createPointers(); - log("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + + log_msg("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + " nodes"); } @@ -570,24 +625,6 @@ void TaxonomyDB::fillCounts(const unordered_map -class TaxReport { -private: - std::ostream& _reportOfb; - TaxonomyDB & _taxdb; - std::vector _report_cols; - uint64_t _total_n_reads; - bool _show_zeros; - - void printLine(TaxonomyEntry& tax, unsigned depth); - -public: - TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); - - void printReport(std::string format, std::string rank); - 
void printReport(TaxonomyEntry<TAXID, READCOUNTS>& tax, unsigned depth);
-};

-template<typename TAXID, typename READCOUNTS>
-TaxReport<TAXID, READCOUNTS>::TaxReport(std::ostream& reportOfb, TaxonomyDB<TAXID, READCOUNTS>& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) {
+template<typename TAXID, typename READCOUNTS>
+TaxReport<TAXID, READCOUNTS>::TaxReport(std::ostream& reportOfb, TaxonomyDB<TAXID, READCOUNTS>& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) {
 	_report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME};
 }
@@ -641,7 +678,7 @@ void TaxReport<TAXID, READCOUNTS>::printLine(TaxonomyEntry<TAXID, READCOUNTS>& tax
 			//case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break;
 			//case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break;
 			case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.read_counts) + reads(tax.read_counts_children)); break;
-			case REPORTCOLS::NUM_READS: _reportOfb << (tax.read_counts); break;
+			case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.read_counts); break;
 			//case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break;
 			//case REPORTCOLS::NUM_KMERS: _reportOfb << tax.numKmers; break;
 			//case REPORTCOLS::GENOME_SIZE: ; break;
 			//case REPORTCOLS::NUM_WEIGHTED_READS: ; break;
 			//case REPORTCOLS::SUM_SCORE: ; break;
@@ -662,8 +699,7 @@ void TaxReport<TAXID, READCOUNTS>::printLine(TaxonomyEntry<TAXID, READCOUNTS>& tax
 // Return lowest common ancestor of a and b
 // LCA(0,x) = LCA(x,0) = x
 // Default ancestor is 1 (root of tree)
-	uint32_t lca(unordered_map<uint32_t, uint32_t> &parent_map,
-		uint32_t a, uint32_t b)
+uint32_t lca(unordered_map<uint32_t, uint32_t> &parent_map, uint32_t a, uint32_t b)
 {
 	if (a == 0 || b == 0)
 		return a ? a : b;

From e76de48066c36941010560a1311fd9f5e3873662 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Wed, 12 Apr 2017 10:17:03 -0400
Subject: [PATCH 027/105] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 988deb5..a31d87c 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
-Kraken taxonomic sequence classification system with Unique K-mer Counting
+Kraken taxonomic sequence classification system with unique k-mer counting
 ===============================================

-[Kraken](https://github.com/DerrickWood/kraken) is a fast taxonomic classifier for metagenomics data. This project, kraken-hll, adds some additional functionality - most notably a unique k-mer count. Spurious identifications due to sequence contamination in the dataset or database often leads to many reads, however they usually cover only a small portion of the genome.
+[Kraken](https://github.com/DerrickWood/kraken) is a fast taxonomic classifier for metagenomics data. This project, kraken-hll, adds some additional functionality - most notably a unique k-mer count using the HyperLogLog algorithm. Spurious identifications due to sequence contamination in the dataset or database often lead to many reads; however, these reads usually cover only a small portion of the genome.

 kraken-hll adds two additional columns to the Kraken report - total number of k-mers observed for taxon, and the total number of unique k-mers observed for taxon (columns 3 and 4, resp.).
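(To make the README's claim concrete, here is an illustrative check built only from the ReadCounts fields introduced in patch 026. The 100-read floor and the one-unique-k-mer-per-read cutoff are made up for the example; they are not thresholds used by kraken-hll.)

    // Illustrative only: a contaminant hit tends to show many reads but
    // few distinct k-mers, i.e. low coverage of the reported genome.
    #include "readcounts.hpp"

    bool looks_spurious(kraken::ReadCounts& rc) {
      if (rc.n_reads < 100) return false;   // too few reads to judge
      double unique_per_read =
          double(rc.kmers.cardinality()) / double(rc.n_reads);
      return unique_per_read < 1.0;          // reads pile onto few k-mers
    }

kmers.cardinality() is the HyperLogLog estimate of the number of distinct k-mers (the same call the report columns use in patch 028), which is what makes a per-taxon check like this cheap.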
From 83c075f1b4cc329730bba8f032445a311e545a82 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Wed, 12 Apr 2017 15:10:49 -0400
Subject: [PATCH 028/105] Show number of unique k-mers

---
 src/gzstream/Makefile      |   2 +-
 src/gzstream/libgzstream.a | Bin 14254 -> 14622 bytes
 src/readcounts.hpp         |   2 +-
 src/taxdb.h                |   6 ++----
 4 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/gzstream/Makefile b/src/gzstream/Makefile
index 9884a9e..4c32088 100644
--- a/src/gzstream/Makefile
+++ b/src/gzstream/Makefile
@@ -35,7 +35,7 @@
 # CXX = CC -n32 -LANG:std # for SGI Irix 6.5, MIPSpro CC version 7.30
 CXX = g++ # for Linux RedHat 6.1, g++ version 2.95.2
-CPPFLAGS = -I. -O
+CPPFLAGS = -I. -O -fPIC
 LDFLAGS = -L. -lgzstream -lz
 AR = ar cr

diff --git a/src/gzstream/libgzstream.a b/src/gzstream/libgzstream.a
index 92861086535726f3d31314ecb9e4bfe8ba8724b8..5144238131823c9252a7f0c5ff9f3690b8235c5e 100644
GIT binary patch
literal 14622
[~14 KB of base85-encoded binary data omitted]

diff --git a/src/readcounts.hpp b/src/readcounts.hpp
index 9676514..486edbd 100644
--- a/src/readcounts.hpp
+++ b/src/readcounts.hpp
@@ -9,7 +9,7 @@ namespace kraken {
   struct ReadCounts {
     uint64_t n_reads = 0;
     uint64_t n_kmers = 0;
-    HyperLogLogPlusMinus kmers; // unique k-mer count per taxon
+    HyperLogLogPlusMinus kmers; // unique k-mer count per taxon
     void add_kmer(uint64_t kmer) {
       ++ n_kmers;
       kmers.add(kmer);

diff --git a/src/taxdb.h b/src/taxdb.h
index ac3344f..c8dd2bd 100644
--- a/src/taxdb.h
+++ b/src/taxdb.h
@@ -175,8 +175,6 @@
 uint64_t reads(const T read_count) {
 	return(0);
 }
-
-
 inline
 uint64_t reads(const uint64_t read_count) {
 	return(read_count);
@@ -679,8 +677,8 @@ void TaxReport<TAXID, READCOUNTS>::printLine(TaxonomyEntry<TAXID, READCOUNTS>& tax
 			//case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break;
 			case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.read_counts) + reads(tax.read_counts_children)); break;
 			case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.read_counts); break;
-			//case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break;
-			//case REPORTCOLS::NUM_KMERS: _reportOfb << tax.numKmers; break;
+			case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.read_counts.kmers.cardinality(); break;
+			case REPORTCOLS::NUM_KMERS: _reportOfb << tax.read_counts.n_kmers; break;
 			//case REPORTCOLS::GENOME_SIZE: ; break;
 			//case REPORTCOLS::NUM_WEIGHTED_READS: ; break;
 			//case REPORTCOLS::SUM_SCORE: ; break;

From cf4aeae02705e5250bc9dbeec93bb5d1c9caf3f3 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Thu, 4 May 2017 18:30:40 -0400
Subject: [PATCH 029/105] Start all scripts with kraken_hll

---
 scripts/kraken_hll                            |  4 +-
 ...ibrary.sh => kraken_hll-add_to_library.sh} |  0
 scripts/kraken_hll-build                      |  2 +-
 ...ld_kraken_db.sh => kraken_hll-build_db.sh} | 56 +++++++++++--------
 ...h.sh => kraken_hll-check_for_jellyfish.sh} | 20 +++++--
 .../{clean_db.sh => kraken_hll-clean_db.sh}   |  0
 ...file.pl => kraken_hll-cp_into_tempfile.pl} |  0
 ...=> kraken_hll-download_genomic_library.sh} |  0
 ...omy.sh => kraken_hll-download_taxonomy.sh} | 26 +++------
 ...ad_merger.pl => kraken_hll-read_merger.pl} |  0
 .../{shrink_db.sh => kraken_hll-shrink_db.sh} |  0
 ...sh => kraken_hll-standard_installation.sh} |  0
 ...upgrade_db.sh => kraken_hll-upgrade_db.sh} |  0
 ...ers.pl => kraken_hll-verify_gi_numbers.pl} |  0
 scripts/report_gi_numbers.pl                  | 51 -----------------
 src/set_lcas.cpp                              |  5 +-
 16 files changed, 63 insertions(+), 101 deletions(-)
 rename scripts/{add_to_library.sh => kraken_hll-add_to_library.sh} (100%)
 rename scripts/{build_kraken_db.sh => kraken_hll-build_db.sh} (76%)
 rename scripts/{check_for_jellyfish.sh => kraken_hll-check_for_jellyfish.sh} (68%)
 rename scripts/{clean_db.sh => kraken_hll-clean_db.sh} (100%)
 rename scripts/{cp_into_tempfile.pl => kraken_hll-cp_into_tempfile.pl} (100%)
 rename scripts/{download_genomic_library.sh => kraken_hll-download_genomic_library.sh} (100%)
 rename scripts/{download_taxonomy.sh => kraken_hll-download_taxonomy.sh} (75%)
 rename scripts/{read_merger.pl => kraken_hll-read_merger.pl} (100%)
 rename scripts/{shrink_db.sh =>
kraken_hll-shrink_db.sh} (100%)
 rename scripts/{standard_installation.sh => kraken_hll-standard_installation.sh} (100%)
 rename scripts/{upgrade_db.sh => kraken_hll-upgrade_db.sh} (100%)
 rename scripts/{verify_gi_numbers.pl => kraken_hll-verify_gi_numbers.pl} (100%)
 delete mode 100755 scripts/report_gi_numbers.pl

diff --git a/scripts/kraken_hll b/scripts/kraken_hll
index 6f2e290..b31fca3 100755
--- a/scripts/kraken_hll
+++ b/scripts/kraken_hll
@@ -206,8 +206,8 @@ sub usage {
 Usage: $PROG [options] <filename(s)>

 Options:
-  --db NAME               Name for Kraken DB
-                          (default: $default_db)
+  --db NAME               Name for Kraken DB (default: $default_db)
+  --report-file FILENAME  Write Kraken report to FILENAME
   --threads NUM           Number of threads (default: $def_thread_ct)
   --fasta-input           Input is FASTA format
   --fastq-input           Input is FASTQ format

diff --git a/scripts/add_to_library.sh b/scripts/kraken_hll-add_to_library.sh
similarity index 100%
rename from scripts/add_to_library.sh
rename to scripts/kraken_hll-add_to_library.sh

diff --git a/scripts/kraken_hll-build b/scripts/kraken_hll-build
index 1ddea52..8367fdd 100755
--- a/scripts/kraken_hll-build
+++ b/scripts/kraken_hll-build
@@ -298,7 +298,7 @@ sub build_database {
   $ENV{"KRAKEN_REBUILD_DATABASE"} = (defined $rebuild? 1 : 0);
   $ENV{"KRAKEN_ADD_TAXIDS_FOR_SEQ"} = $add_taxonomy_ids_for_seq;
   my $opt = ($verbose? "-x" : "");
-  exec "build_kraken_db.sh $opt";
+  exec "kraken_hll-build_db.sh $opt";
 }

 sub clean_database {

diff --git a/scripts/build_kraken_db.sh b/scripts/kraken_hll-build_db.sh
similarity index 76%
rename from scripts/build_kraken_db.sh
rename to scripts/kraken_hll-build_db.sh
index 6d86bf7..a8d3293 100755
--- a/scripts/build_kraken_db.sh
+++ b/scripts/kraken_hll-build_db.sh
@@ -37,6 +37,7 @@ function report_time_elapsed() {
 }

 start_time=$(date "+%s.%N")
+script_dir=`dirname $0`

 DATABASE_DIR="$KRAKEN_DB_NAME"
 FIND_OPTS=-L
@@ -59,17 +60,26 @@ fi

 if [ "$KRAKEN_REBUILD_DATABASE" == "1" ]
 then
-  rm -f database.* *.map lca.complete
+  rm -f database.* *.map lca.complete library/seq-files.txt
 fi

+if [ ! -f "library/seq-files.txt" ]; then
+  echo "Finding all library files"
+  find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' > library/seq-files.txt
+fi
+N_FILES=`cat library/seq-files.txt | wc -l`
+echo "Found $N_FILES sequence files (*.{fna,fa,ffn}) in the library"
+
 if [ -e "database.jdb" ]
 then
   echo "Skipping step 1, k-mer set already exists."
 else
-  echo "Creating k-mer set (step 1 of 5)..."
+  echo "Creating k-mer set (step 1 of 6)..."
   start_time1=$(date "+%s.%N")
-  check_for_jellyfish.sh
+  JELLYFISH_BIN=`$script_dir/kraken_hll-check_for_jellyfish.sh`
+  echo "Using $JELLYFISH_BIN"
+  [[ "$JELLYFISH_BIN" != "" ]] || exit 1
   # Estimate hash size as 1.15 * chars in library FASTA files
   if [ -z "$KRAKEN_HASH_SIZE" ]
   then
@@ -77,14 +87,14 @@ else
     echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'"
   fi

-  find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \
-  jellyfish count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \
+  cat library/seq-files.txt | tr '\n' '\0' | xargs -0 cat | \
+  $JELLYFISH_BIN count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \
     -o database /dev/fd/0

   # Merge only if necessary
   if [ -e "database_1" ]
   then
-    jellyfish merge -o database.jdb.tmp database_*
+    $JELLYFISH_BIN merge -o database.jdb.tmp database_*
   else
     mv database_0 database.jdb.tmp
   fi
@@ -111,7 +121,7 @@ else
   then
     echo "Skipping step 2, database reduction unnecessary."
else - echo "Reducing database size (step 2 of 5)..." + echo "Reducing database size (step 2 of 6)..." max_kdb_size=$(echo "$KRAKEN_MAX_DB_SIZE*2^30 - $idx_size" | bc) idx_size_gb=$(printf %.2f $(echo "$idx_size/2^30" | bc) ) if (( $(echo "$max_kdb_size < 0" | bc) == 1 )) @@ -143,7 +153,7 @@ if [ -e "database.kdb" ] then echo "Skipping step 3, k-mer set already sorted." else - echo "Sorting k-mer set (step 3 of 5)..." + echo "Sorting k-mer set (step 3 of 6)..." start_time1=$(date "+%s.%N") db_sort -z $MEMFLAG -t $KRAKEN_THREAD_CT -n $KRAKEN_MINIMIZER_LEN \ -d database.jdb -o database.kdb.tmp \ @@ -159,38 +169,37 @@ if [ -e "seqid2taxid.map" ] then echo "Skipping step 4, seqID to taxID map already complete." else - echo "Creating seqID to taxID map (step 4 of 5)... [blu]" -# start_time1=$(date "+%s.%N") -# make_seqid_to_taxid_map taxonomy/gi_taxid_nucl.dmp gi2seqid.map \ -# > seqid2taxid.map.tmp -# mv seqid2taxid.map.tmp seqid2taxid.map -# line_ct=$(wc -l seqid2taxid.map | awk '{print $1}') - -# echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" + echo "Creating seqID to taxID map (step 4 of 6).." + start_time1=$(date "+%s.%N") + cat library/seq-files.txt | tr '\n' '\0' | xargs -0 grep '^>' | sed 's/.//' | sed 's/ .*//' | sort > library/seq-headers.txt + join -t $'\t' nucl_gb.accession2taxid.sorted library/seq-headers.txt > seqid2taxid.map.tmp + mv seqid2taxid.map.tmp seqid2taxid.map + line_ct=$(wc -l seqid2taxid.map | awk '{print $1}') + + echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" fi if [ -s "taxDB" ] then - echo "Skipping step 4.5, taxDB exists." + echo "Skipping step 5, taxDB exists." else - echo "Creating taxDB (step 4.5 of 5)... " + echo "Creating taxDB (step 5 of 6)... " build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp > taxDB fi - if [ -e "lca.complete" ] then - echo "Skipping step 5, LCAs already set." + echo "Skipping step 6, LCAs already set." else - echo "Setting LCAs in database (step 5 of 5)..." + echo "Setting LCAs in database (step 6 of 6)..." PARAM="" if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ]]; then echo " Adding taxonomy IDs for sequences" PARAM=" -a" fi start_time1=$(date "+%s.%N") - find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \ + cat library/seq-files.txt | tr '\n' '\0' | xargs -0 cat | \ set_lcas $MEMFLAG -x -d database.kdb -i database.idx -v \ -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 touch "lca.complete" @@ -198,4 +207,5 @@ else echo "Database LCAs set. [$(report_time_elapsed $start_time1)]" fi -echo "Database construction complete. [Total: $(report_time_elapsed $start_time)]" +echo "Database construction complete. [Total: $(report_time_elapsed $start_time)] +You can delete all files but database.{kdb,idx} and taxDB now, if you want" diff --git a/scripts/check_for_jellyfish.sh b/scripts/kraken_hll-check_for_jellyfish.sh similarity index 68% rename from scripts/check_for_jellyfish.sh rename to scripts/kraken_hll-check_for_jellyfish.sh index 63cc620..9143b62 100755 --- a/scripts/check_for_jellyfish.sh +++ b/scripts/kraken_hll-check_for_jellyfish.sh @@ -1,6 +1,7 @@ #!/bin/bash # Copyright 2013-2015, Derrick Wood +# modified by Florian Breitwieser, 2017 # # This file is part of the Kraken taxonomic sequence classification system. # @@ -24,12 +25,23 @@ set -u # Protect against uninitialized vars. 
set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands -JELLYFISH_VERSION=$(jellyfish --version | awk '{print $2}') +JELLYFISH_BIN="jellyfish" +if hash jellyfish1 2>/dev/null; then + JELLYFISH_BIN="jellyfish1" +elif hash jellyfish 2>/dev/null; then + JELLYFISH_BIN="jellyfish" +else + echo "Did not find jellyfish!" 1>&2 + exit 1 +fi + +JELLYFISH_VERSION=$( $JELLYFISH_BIN --version | awk '{print $2}') if [[ $JELLYFISH_VERSION =~ ^1\. ]] then - echo "Found jellyfish v$JELLYFISH_VERSION" + echo "Found jellyfish v$JELLYFISH_VERSION" 1>&2 else - echo "Found jellyfish v$JELLYFISH_VERSION" - echo "Kraken requires jellyfish version 1" + echo "Found jellyfish v$JELLYFISH_VERSION" 1>&2 + echo "Kraken requires jellyfish version 1" 1>&2 exit 1 fi +echo $JELLYFISH_BIN diff --git a/scripts/clean_db.sh b/scripts/kraken_hll-clean_db.sh similarity index 100% rename from scripts/clean_db.sh rename to scripts/kraken_hll-clean_db.sh diff --git a/scripts/cp_into_tempfile.pl b/scripts/kraken_hll-cp_into_tempfile.pl similarity index 100% rename from scripts/cp_into_tempfile.pl rename to scripts/kraken_hll-cp_into_tempfile.pl diff --git a/scripts/download_genomic_library.sh b/scripts/kraken_hll-download_genomic_library.sh similarity index 100% rename from scripts/download_genomic_library.sh rename to scripts/kraken_hll-download_genomic_library.sh diff --git a/scripts/download_taxonomy.sh b/scripts/kraken_hll-download_taxonomy.sh similarity index 75% rename from scripts/download_taxonomy.sh rename to scripts/kraken_hll-download_taxonomy.sh index fa73616..fc27842 100755 --- a/scripts/download_taxonomy.sh +++ b/scripts/kraken_hll-download_taxonomy.sh @@ -31,30 +31,18 @@ THIS_DIR=$PWD mkdir -p "$TAXONOMY_DIR" cd "$TAXONOMY_DIR" -if [ ! -e "gimap.dlflag" ] +if [ ! -e "nucl_gb.accession2taxid.flag" ] then - wget $FTP_SERVER/pub/taxonomy/gi_taxid_nucl.dmp.gz - touch gimap.dlflag - echo "Downloaded GI to taxon map" -fi - -if [ ! -e "taxdump.dlflag" ] -then - wget $FTP_SERVER/pub/taxonomy/taxdump.tar.gz - touch taxdump.dlflag - echo "Downloaded taxonomy tree data" -fi - -if [ ! -e "gimap.flag" ] -then - gunzip gi_taxid_nucl.dmp.gz - touch gimap.flag - echo "Uncompressed GI to taxon map" + wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz + time gunzip -c nucl_gb.accession2taxid.gz | cut -f 2,3 | sort -k 1,1 > nucl_gb.accession2taxid.sorted + touch nucl_gb.accession2taxid.flag + echo "Downloaded and sorted GB to taxon map" fi if [ ! 
-e "taxdump.flag" ] then + wget $FTP_SERVER/pub/taxonomy/taxdump.tar.gz tar zxf taxdump.tar.gz touch taxdump.flag - echo "Uncompressed taxonomy tree data" + echo "Downloaded and uncompressed taxonomy tree data" fi diff --git a/scripts/read_merger.pl b/scripts/kraken_hll-read_merger.pl similarity index 100% rename from scripts/read_merger.pl rename to scripts/kraken_hll-read_merger.pl diff --git a/scripts/shrink_db.sh b/scripts/kraken_hll-shrink_db.sh similarity index 100% rename from scripts/shrink_db.sh rename to scripts/kraken_hll-shrink_db.sh diff --git a/scripts/standard_installation.sh b/scripts/kraken_hll-standard_installation.sh similarity index 100% rename from scripts/standard_installation.sh rename to scripts/kraken_hll-standard_installation.sh diff --git a/scripts/upgrade_db.sh b/scripts/kraken_hll-upgrade_db.sh similarity index 100% rename from scripts/upgrade_db.sh rename to scripts/kraken_hll-upgrade_db.sh diff --git a/scripts/verify_gi_numbers.pl b/scripts/kraken_hll-verify_gi_numbers.pl similarity index 100% rename from scripts/verify_gi_numbers.pl rename to scripts/kraken_hll-verify_gi_numbers.pl diff --git a/scripts/report_gi_numbers.pl b/scripts/report_gi_numbers.pl deleted file mode 100755 index 0d07b85..0000000 --- a/scripts/report_gi_numbers.pl +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/perl - -# Copyright 2013-2015, Derrick Wood -# -# This file is part of the Kraken taxonomic sequence classification system. -# -# Kraken is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Kraken is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Kraken. If not, see . - -# Reads multi-FASTA input and for each sequence ID reports a -# tab-delimited line: -# -# -# or in the case of a sequence with Kraken taxid information: -# -# TAXID -# -# Assumes all sequence IDs actually have GI numbers or Kraken -# taxid information. - -use strict; -use warnings; -use File::Basename; - -my $PROG = basename $0; - -while (<>) { - next unless /^>(\S+)/; - my $seq_id = $1; - if ($seq_id =~ /(^|\|)kraken:taxid\|(\d+)/) { - - print "TAXID\t$2\t$seq_id\t$_\n"; - next; - } - - if ($seq_id !~ /(^|\|)gi\|(\d+)/) { - die "$PROG: sequence ID $seq_id lacks GI number, aborting.\n"; - } - - print "$2\t$seq_id\t$_\n"; -} diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 0e60887..61504d7 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -144,7 +144,7 @@ void process_single_file() { Parent_map[taxid] = parent_taxid; auto itEntry = taxdb.taxIDsAndEntries.insert({taxid, TaxonomyEntry(taxid, parent_taxid, "sequence")}); if (!itEntry.second) - cerr << "Taxonomy ID " << taxid << " already in Taxonomy DB? Shouldn't happen - run set_lcas without the XXX option." << endl; + cerr << "Taxonomy ID " << taxid << " already in Taxonomy DB? Shouldn't happen - run set_lcas without the -a option." 
<< endl; } else { iss >> taxid; } @@ -172,6 +172,9 @@ void process_single_file() { string prefix = "kraken:taxid|"; if (dna.id.substr(0,prefix.size()) == prefix) { taxid = std::atoi(dna.id.substr(prefix.size()).c_str()); + const auto strBegin = dna.header_line.find_first_not_of("\t "); + if (strBegin != std::string::npos) + dna.header_line = dna.header_line.substr(strBegin); } else { taxid = ID_to_taxon_map[dna.id]; } From e683cbf7eae40e75f2de07c0d9cec62437705cc3 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 4 May 2017 19:44:21 -0400 Subject: [PATCH 030/105] Refactor TaxonomyDB constructor --- src/taxdb.h | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/taxdb.h b/src/taxdb.h index c8dd2bd..0e449e5 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -96,12 +96,10 @@ struct TaxonomyEntryPtr_comp { template class TaxonomyDB { public: + TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName); TaxonomyDB(const std::string inFileName); - TaxonomyDB(); - //std::unordered_map seqIDsAndTaxIds; - std::unordered_map > taxIDsAndEntries; - void parseNamesDump(const std::string namesDumpFileName); - void parseNodesDump(const std::string nodesDumpFileName); + void writeTaxonomyIndex(std::ostream & outs) const; + TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const; std::string getScientificName(const TAXID taxID) const; std::string getRank(const TAXID taxID) const; @@ -111,18 +109,21 @@ class TaxonomyDB { std::unordered_map getScientificNameMap() const; std::string getLineage(TAXID taxonomyID) const; std::string getMetaPhlAnLineage(TAXID taxonomyID) const; - char* getIndexFileName(const TAXID hostTaxID) const; - void readTaxonomyIndex(const std::string inFileName); - void writeTaxonomyIndex(std::ostream & outs) const; - void writeTaxonomyIndex(std::ostream & outs, - const std::string namesDumpFileName, - const std::string nodesDumpFileName); + bool isSubSpecies(TAXID taxonomyID) const; int isBelowInTree(TAXID upper, TAXID lower) const; + void addCounts(const TAXID taxid, const READCOUNTS& read_counts_); void fillCounts(const std::unordered_map& taxon_counts); - void createPointers(); void printReport(); + + std::unordered_map > taxIDsAndEntries; + private: + TaxonomyDB(); + void readTaxonomyIndex(const std::string inFileName); + void parseNamesDump(const std::string namesDumpFileName); + void parseNodesDump(const std::string nodesDumpFileName); + void createPointers(); }; @@ -292,13 +293,21 @@ TaxonomyDB::TaxonomyDB() { } template TaxonomyDB::TaxonomyDB(const std::string inFileName) { - log_msg("Building taxonomy index"); + log_msg("Building taxonomy index from " + inFileName); readTaxonomyIndex(inFileName); createPointers(); log_msg("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + " nodes"); } +template +TaxonomyDB::TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName) { + log_msg("Building taxonomy index from " + nodesDumpFileName + " and " + namesDumpFileName); + parseNodesDump(nodesDumpFileName); + parseNamesDump(namesDumpFileName); + log_msg("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + " nodes"); +} + template void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { std::ifstream nodesDumpFile(nodesDumpFileName); @@ -358,15 +367,6 @@ void TaxonomyDB::parseNamesDump(const std::string namesDumpFil } } -template -void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, - const 
std::string namesDumpFileName, - const std::string nodesDumpFileName) { - parseNodesDump(nodesDumpFileName); - parseNamesDump(namesDumpFileName); - writeTaxonomyIndex(outs); -} - template std::vector getSortedKeys(const std::unordered_map& unordered) { std::vector keys; From ffea4a786f5fbcb937780c4117edc40f6dd86de0 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 4 May 2017 19:44:43 -0400 Subject: [PATCH 031/105] Build taxDB at the end --- scripts/kraken_hll-build_db.sh | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/scripts/kraken_hll-build_db.sh b/scripts/kraken_hll-build_db.sh index a8d3293..da4187d 100755 --- a/scripts/kraken_hll-build_db.sh +++ b/scripts/kraken_hll-build_db.sh @@ -41,6 +41,7 @@ script_dir=`dirname $0` DATABASE_DIR="$KRAKEN_DB_NAME" FIND_OPTS=-L +JELLYFISH_BIN=`$script_dir/kraken_hll-check_for_jellyfish.sh` if [ ! -d "$DATABASE_DIR" ] then @@ -77,7 +78,6 @@ else echo "Creating k-mer set (step 1 of 6)..." start_time1=$(date "+%s.%N") - JELLYFISH_BIN=`$script_dir/kraken_hll-check_for_jellyfish.sh` echo "Using $JELLYFISH_BIN" [[ "$JELLYFISH_BIN" != "" ]] || exit 1 # Estimate hash size as 1.15 * chars in library FASTA files @@ -179,20 +179,11 @@ else echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" fi -if [ -s "taxDB" ] -then - echo "Skipping step 5, taxDB exists." -else - echo "Creating taxDB (step 5 of 6)... " - build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp > taxDB -fi - - if [ -e "lca.complete" ] then - echo "Skipping step 6, LCAs already set." + echo "Skipping step 5, LCAs already set." else - echo "Setting LCAs in database (step 6 of 6)..." + echo "Setting LCAs in database (step 5 of 6)..." PARAM="" if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ]]; then echo " Adding taxonomy IDs for sequences" @@ -207,5 +198,15 @@ else echo "Database LCAs set. [$(report_time_elapsed $start_time1)]" fi +if [ -s "taxDB" ] +then + echo "Skipping step 6, taxDB exists." +else + echo "Creating taxDB (step 6 of 6)... " + build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp > taxDB.dmp + mv taxDB.tmp taxDB +fi + + echo "Database construction complete.
[Total: $(report_time_elapsed $start_time)] You can delete all files but database.{kdb,idx} and taxDB now, if you want" From c550a1b8ef0b92e9c586f1a53d0f75f85dc4e4f2 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 4 May 2017 19:45:12 -0400 Subject: [PATCH 032/105] Update build_taxdb.cpp --- src/build_taxdb.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index 2710d82..3432294 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -27,8 +27,7 @@ int main(int argc, char **argv) { std::cout << "Provide names.dmp and nodes.dmp\n"; return 1; } - TaxonomyDB taxdb; - taxdb.writeTaxonomyIndex( - std::cout, argv[1], argv[2]); + TaxonomyDB taxdb(argv[1], argv[2]); + taxdb.writeTaxonomyIndex(std::cout); } From 112b89d5b31cdc9048eb4bbe4edbf33e2488c491 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 5 May 2017 13:19:47 -0400 Subject: [PATCH 033/105] Include number of k-mers in database in taxDB --- scripts/kraken_hll-build_db.sh | 3 +- src/build_taxdb.cpp | 20 +++-- src/classify.cpp | 2 +- src/gzstream/libgzstream.a | Bin 14622 -> 14814 bytes src/krakendb.cpp | 21 +++++ src/krakendb.hpp | 5 ++ src/report-cols.h | 1 + src/taxdb.h | 145 +++++++++++++++++++++------------ 8 files changed, 140 insertions(+), 57 deletions(-) diff --git a/scripts/kraken_hll-build_db.sh b/scripts/kraken_hll-build_db.sh index da4187d..75a678d 100755 --- a/scripts/kraken_hll-build_db.sh +++ b/scripts/kraken_hll-build_db.sh @@ -203,7 +203,8 @@ then echo "Skipping step 6, taxDB exists." else echo "Creating taxDB (step 6 of 6)... " - build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp > taxDB.dmp + jellyfish1 dump database.kdb | grep '^>' | sed 's/.//' | sort | uniq -c | sort -rn | sed 's/^ *\([0-9]\+\) \+\([0-9]\+\)$/\2\t\1/' > database.taxon_count + build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp database.taxon_count > taxDB.tmp mv taxDB.tmp taxDB fi diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index 3432294..f4a4957 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -18,16 +18,26 @@ */ #include "taxdb.h" - +#include "quickfile.hpp" #include +#include +#include + using namespace std; int main(int argc, char **argv) { - if (argc != 3) { - std::cout << "Provide names.dmp and nodes.dmp\n"; + if (argc < 3 || argc > 4) { + std::cerr << "Usage: a.out names.dmp nodes.dmp [taxon-counts]\n"; return 1; } - TaxonomyDB taxdb(argv[1], argv[2]); + TaxonomyDB taxdb {(string)argv[1], (string)argv[2]}; + if (argc == 4) { + ifstream ifs(argv[3]); + uint32_t taxon; uint64_t count; + while (ifs >> taxon >> count) { + taxdb.setGenomeSize(taxon, count); + } + taxdb.genomeSizes_are_set = true; + } taxdb.writeTaxonomyIndex(std::cout); - } diff --git a/src/classify.cpp b/src/classify.cpp index 981f1f6..690715e 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -196,7 +196,7 @@ int main(int argc, char **argv) { std::cerr << "Finishing up ..\n"; if (Print_kraken_report) { - taxdb.fillCounts(taxon_counts); + taxdb.setReadCounts(taxon_counts); TaxReport rep = TaxReport(*Report_output, taxdb, false); rep.printReport("kraken","blu"); } diff --git a/src/gzstream/libgzstream.a b/src/gzstream/libgzstream.a index 5144238131823c9252a7f0c5ff9f3690b8235c5e..916d33d4ec4647b96c6db9a1c8625146f6079612 100644 Binary files a/src/gzstream/libgzstream.a and b/src/gzstream/libgzstream.a differ
diff --git a/src/krakendb.cpp b/src/krakendb.cpp --- a/src/krakendb.cpp +++ b/src/krakendb.cpp using std::string; using std::vector; @@ -68,6 +69,26 @@ KrakenDB::KrakenDB(char *ptr) { key_len = key_bits / 8 + !! (key_bits % 8); } +std::unordered_map KrakenDB::count_taxons() { + throw std::runtime_error("count_taxons() is not working"); + // Not working currently!!
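+ // Note (editorial assumption, not in the original patch): a likely culprit
+ // is the pointer arithmetic below. In "(uint32_t *) ptr + pair_sz * i + key_len"
+ // the cast binds before the additions, so the byte offsets pair_sz*i and
+ // key_len are scaled by sizeof(uint32_t). Casting after offsetting would
+ // read the intended field: uint32_t *taxon = (uint32_t *) (ptr + pair_sz * i + key_len);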
+ char *ptr = get_pair_ptr(); + size_t pair_sz = pair_size(); + + std::unordered_map taxon_counts; + for (uint64_t i = 0; i < key_ct; i++) { + uint32_t* taxon = (uint32_t *) ptr + pair_sz * i + key_len; + if (taxon == NULL) { + std::cerr << "taxon is NULL (i is " << i << " and key_ct is " << key_ct << ")" << std::endl; + } else { + uint32_t taxon_i = *taxon; + ++taxon_counts[taxon_i]; + } + } + return taxon_counts; +} + + // Creates an index, indicating starting positions of each bin // Bins contain k-mer/taxon pairs with k-mers that share a bin key void KrakenDB::make_index(string index_filename, uint8_t nt) { diff --git a/src/krakendb.hpp b/src/krakendb.hpp index c30eeb5..f586026 100644 --- a/src/krakendb.hpp +++ b/src/krakendb.hpp @@ -21,6 +21,7 @@ #define KRAKENDB_HPP #include "kraken_headers.hpp" +#include namespace kraken { class KrakenDBIndex { @@ -60,6 +61,10 @@ namespace kraken { uint32_t *kmer_query(uint64_t kmer, uint64_t *last_bin_key, int64_t *min_pos, int64_t *max_pos, bool retry_on_failure=true); + + + // return a count of k-mers for all taxons + std::unordered_map count_taxons(); // return "bin key" for kmer, based on index // If idx_nt not specified, use index's value diff --git a/src/report-cols.h b/src/report-cols.h index 007eef5..ff19275 100644 --- a/src/report-cols.h +++ b/src/report-cols.h @@ -22,6 +22,7 @@ enum class REPORTCOLS : uint8_t { NUM_READS_CLADE, NUM_KMERS, NUM_UNIQUE_KMERS, + NUM_KMERS_IN_DATABASE, TOTAL_SCORE, TOTAL_HIT_LENGTH, ABUNDANCE, diff --git a/src/taxdb.h b/src/taxdb.h index 0e449e5..12518da 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -65,15 +65,16 @@ class TaxonomyEntry { TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_) : taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_) {} - TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_) : - taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_), scientificName(scientificName_) {} + TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_, uint64_t genomeSize_ = 0, uint64_t genomeSizeOfChildren_ = 0) : + taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_), scientificName(scientificName_), + genomeSize(genomeSize_), genomeSizeOfChildren(genomeSizeOfChildren_) {} inline bool operator==(const TaxonomyEntry& other) const; TaxonomyEntry* parent = nullptr; std::vector children; - READCOUNTS read_counts = READCOUNTS(); - READCOUNTS read_counts_children = READCOUNTS(); + READCOUNTS readCounts = READCOUNTS(); + READCOUNTS readCountsOfChildren = READCOUNTS(); bool used = false; uint64_t genomeSize = 0; @@ -83,8 +84,8 @@ class TaxonomyEntry { //template<> //TaxonomyEntry::TaxonomyEntry () { -// read_counts = 0; -// read_counts_children = 0; +// readCounts = 0; +// readCountsOfChildren = 0; //} template @@ -97,7 +98,8 @@ template class TaxonomyDB { public: TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName); - TaxonomyDB(const std::string inFileName); + TaxonomyDB(const std::string inFileName, bool hasGenomeSizes = false); + TaxonomyDB(); void writeTaxonomyIndex(std::ostream & outs) const; TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const; @@ -113,17 +115,21 @@ class TaxonomyDB { bool isSubSpecies(TAXID taxonomyID) const; int isBelowInTree(TAXID upper, TAXID lower) const; - void addCounts(const TAXID taxid, const READCOUNTS& read_counts_); - void fillCounts(const 
std::unordered_map& taxon_counts); + void setGenomeSizes(const std::unordered_map & genomeSizes); + void setReadCounts(const std::unordered_map& readCounts); + void setGenomeSize(const TAXID taxid, const uint64_t genomeSize); + void addReadCount(const TAXID taxid, const READCOUNTS& readCounts_); + void printReport(); std::unordered_map > taxIDsAndEntries; + bool genomeSizes_are_set = false; private: - TaxonomyDB(); - void readTaxonomyIndex(const std::string inFileName); + std::unordered_map > + readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes); void parseNamesDump(const std::string namesDumpFileName); void parseNodesDump(const std::string nodesDumpFileName); - void createPointers(); + void createPointers(std::unordered_map >& taxIDsAndEntries); }; @@ -243,15 +249,14 @@ std::vector get_fields(const std::string &s, const std::string& del } - //template<> //TaxonomyEntry::TaxonomyEntry () { -// read_counts = 0; -// read_counts_children = 0; +// readCounts = 0; +// readCountsOfChildren = 0; //} template bool TaxonomyEntryPtr_comp::operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { - return ((reads(a->read_counts)+reads(a->read_counts_children)) > (reads(b->read_counts)+reads(b->read_counts_children))); + return ((reads(a->readCounts)+reads(a->readCountsOfChildren)) > (reads(b->readCounts)+reads(b->readCountsOfChildren))); } @@ -276,7 +281,7 @@ unordered_map TaxonomyDB::getParentMap() const { } template -void TaxonomyDB::createPointers() { +void TaxonomyDB::createPointers(std::unordered_map >& taxIDsAndEntries) { for (auto& tax : taxIDsAndEntries) { if (tax.second.parentTaxonomyID != tax.first) { auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); @@ -292,20 +297,17 @@ template TaxonomyDB::TaxonomyDB() { } template -TaxonomyDB::TaxonomyDB(const std::string inFileName) { - log_msg("Building taxonomy index from " + inFileName); - readTaxonomyIndex(inFileName); - createPointers(); - log_msg("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + - " nodes"); -} +TaxonomyDB::TaxonomyDB(const std::string inFileName, bool hasGenomeSizes) : + taxIDsAndEntries( readTaxonomyIndex(inFileName, hasGenomeSizes) ), genomeSizes_are_set(hasGenomeSizes) + { } template TaxonomyDB::TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName) { log_msg("Building taxonomy index from " + nodesDumpFileName + " and " + namesDumpFileName); parseNodesDump(nodesDumpFileName); parseNamesDump(namesDumpFileName); - log_msg("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + " nodes"); + createPointers(taxIDsAndEntries); + log_msg("Built a tree with " + std::to_string(taxIDsAndEntries.size()) + " taxa"); } template @@ -382,29 +384,49 @@ template void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { for (TAXID& key : getSortedKeys(taxIDsAndEntries)) { const auto& entry = taxIDsAndEntries.at(key); - outs << key << "\t" << entry.parentTaxonomyID << "\t" - << entry.scientificName << "\t" << entry.rank << "\n"; + outs << key << '\t' << entry.parentTaxonomyID << '\t' + << entry.scientificName << '\t' << entry.rank; + if (genomeSizes_are_set) { + outs << '\t' << entry.genomeSize << '\t' << entry.genomeSizeOfChildren; + } + outs << '\n'; } + outs.flush(); } - +template +void TaxonomyDB::setGenomeSizes(const std::unordered_map & genomeSizes) { + for (const auto& it : genomeSizes) { + setGenomeSize(it.first, it.second); + } + genomeSizes_are_set = true; +} template -void TaxonomyDB::readTaxonomyIndex(const 
std::string inFileName) { +std::unordered_map > + TaxonomyDB::readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes) { + log_msg("Reading taxonomy index from " + inFileName); std::ifstream inFile(inFileName); if (!inFile.is_open()) throw std::runtime_error("unable to open taxonomy index file " + inFileName); + std::unordered_map > taxIDsAndEntries; TAXID taxonomyID, parentTaxonomyID; std::string scientificName, rank; + uint64_t genomeSize, genomeSizeOfChildren = 0; std::string line; while (!inFile.eof()) { inFile >> taxonomyID >> parentTaxonomyID; inFile.get(); // read tab std::getline(inFile, scientificName, '\t'); - std::getline(inFile, rank, '\n'); - TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); + if (hasGenomeSizes) { + std::getline(inFile, rank, '\t'); + inFile >> genomeSize >> genomeSizeOfChildren; + } else { + std::getline(inFile, rank, '\n'); + } + TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName, genomeSize, genomeSizeOfChildren); //cerr << "inserting " << taxonomyID << ";" << parentTaxonomyID << ";" << rank << ";" << scientificName << endl; taxIDsAndEntries.insert({ @@ -414,6 +436,9 @@ void TaxonomyDB::readTaxonomyIndex(const std::string inFileNam taxIDsAndEntries.insert({ 0, {0, 0, "no rank", "unclassified" } }); + createPointers(taxIDsAndEntries); + log_msg("Finished, read " + std::to_string(taxIDsAndEntries.size()) + " taxa"); + return(taxIDsAndEntries); } template @@ -594,27 +619,46 @@ bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { } template -void TaxonomyDB::addCounts(const TAXID taxid, const READCOUNTS& read_counts_) { +void TaxonomyDB::addReadCount(const TAXID taxid, const READCOUNTS& readCounts_) { + auto it = taxIDsAndEntries.find(taxid); + if (it == taxIDsAndEntries.end()) { + cerr << "No taxonomy entry for " << taxid << "!!" << endl; + return; + } + TaxonomyEntry* tax = &it->second; + //cerr << taxid << " rc before: " << tax->readCounts << endl; + tax->readCounts += readCounts_; + //cerr << taxid << " rc after: " << tax->readCounts << endl; + + while (tax->parent != nullptr) { + tax = tax->parent; + tax->readCountsOfChildren += readCounts_; + } +} + +template +void TaxonomyDB::setGenomeSize(const TAXID taxid, const uint64_t genomeSize) { auto it = taxIDsAndEntries.find(taxid); if (it == taxIDsAndEntries.end()) { cerr << "No taxonomy entry for " << taxid << "!!" 
<< endl; return; } TaxonomyEntry* tax = &it->second; - //cerr << taxid << " rc before: " << tax->read_counts << endl; - tax->read_counts += read_counts_; - //cerr << taxid << " rc after: " << tax->read_counts << endl; + tax->genomeSize += genomeSize; while (tax->parent != nullptr) { tax = tax->parent; - tax->read_counts_children += read_counts_; + //std::cerr << "setting genomeSizeOfChildren of parent" << std::endl; + tax->genomeSizeOfChildren += genomeSize; } } + + template -void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { - for (auto& elem : taxon_counts) { - addCounts(elem.first, elem.second); +void TaxonomyDB::setReadCounts(const unordered_map& readCounts) { + for (auto& elem : readCounts) { + addReadCount(elem.first, elem.second); } for (auto& tax : taxIDsAndEntries) { @@ -625,16 +669,16 @@ void TaxonomyDB::fillCounts(const unordered_map TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { - _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; + _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_KMERS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS_IN_DATABASE, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; } template void TaxReport::printReport(std::string format, std::string rank) { _total_n_reads = - reads(_taxdb.taxIDsAndEntries.at(0).read_counts) + - reads(_taxdb.taxIDsAndEntries.at(0).read_counts_children) + - reads(_taxdb.taxIDsAndEntries.at(1).read_counts) + - reads(_taxdb.taxIDsAndEntries.at(1).read_counts_children);// + + reads(_taxdb.taxIDsAndEntries.at(0).readCounts) + + reads(_taxdb.taxIDsAndEntries.at(0).readCountsOfChildren) + + reads(_taxdb.taxIDsAndEntries.at(1).readCounts) + + reads(_taxdb.taxIDsAndEntries.at(1).readCountsOfChildren);// + if (_total_n_reads == 0) { std::cerr << "total number of reads is zero - not creating a report!" << endl; return; @@ -657,7 +701,7 @@ void TaxReport::printReport(std::string format, std::string ra template void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { - if (_show_zeros || (reads(tax.read_counts)+reads(tax.read_counts_children)) > 0) { + if (_show_zeros || (reads(tax.readCounts)+reads(tax.readCountsOfChildren)) > 0) { printLine(tax, depth); for (auto child : tax.children) printReport(*child, depth+1); @@ -672,13 +716,14 @@ void TaxReport::printLine(TaxonomyEntry& tax case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.read_counts) + reads(tax.read_counts_children))/_total_n_reads; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.readCounts) + reads(tax.readCountsOfChildren))/_total_n_reads; break; //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; - case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.read_counts) + reads(tax.read_counts_children)); break; - case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.read_counts); break; - case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.read_counts.kmers.cardinality(); break; - case REPORTCOLS::NUM_KMERS: _reportOfb << tax.read_counts.n_kmers; break; + case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.readCounts) + reads(tax.readCountsOfChildren)); break; + case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.readCounts); break; + case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.readCounts.kmers.cardinality(); break; + case REPORTCOLS::NUM_KMERS: _reportOfb << tax.readCounts.n_kmers; break; + case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break; //case REPORTCOLS::GENOME_SIZE: ; break; //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; //case REPORTCOLS::SUM_SCORE: ; break; From 78a61d44008ba03e3b521174325e8ffb7b41c75e Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sat, 6 May 2017 23:53:59 -0400 Subject: [PATCH 034/105] Update to read columns --- scripts/kraken_hll-build_db.sh | 4 +-- src/classify.cpp | 15 ++++++++- src/report-cols.h | 15 +++++++-- src/taxdb.h | 58 ++++++++++++++++++++++++++-------- 4 files changed, 73 insertions(+), 19 deletions(-) diff --git a/scripts/kraken_hll-build_db.sh b/scripts/kraken_hll-build_db.sh index 75a678d..fa90e6b 100755 --- a/scripts/kraken_hll-build_db.sh +++ b/scripts/kraken_hll-build_db.sh @@ -203,8 +203,8 @@ then echo "Skipping step 6, taxDB exists." else echo "Creating taxDB (step 6 of 6)... 
" - jellyfish1 dump database.kdb | grep '^>' | sed 's/.//' | sort | uniq -c | sort -rn | sed 's/^ *\([0-9]\+\) \+\([0-9]\+\)$/\2\t\1/' > database.taxon_count - build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp database.taxon_count > taxDB.tmp + time $JELLYFISH_BIN histo --high 100000000 database.kdb | tee database.taxon_count + build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp database.taxon_count | sort -t$'\t' -rnk6,6 -rnk5,5 > taxDB.tmp mv taxDB.tmp taxDB fi diff --git a/src/classify.cpp b/src/classify.cpp index 690715e..a2a61b3 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -120,7 +120,7 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); if (!TaxDB_file.empty()) { - taxdb = TaxonomyDB(TaxDB_file); + taxdb = TaxonomyDB(TaxDB_file, true); for (const auto & tax : taxdb.taxIDsAndEntries) { if (tax.first != 0) Parent_map[tax.first] = tax.second.parentTaxonomyID; @@ -198,6 +198,19 @@ int main(int argc, char **argv) { if (Print_kraken_report) { taxdb.setReadCounts(taxon_counts); TaxReport rep = TaxReport(*Report_output, taxdb, false); + rep.setReportCols({ + "percReadsClade", + "numReadsClade", + "numReadsTaxon", + "numUniqueKmersClade", + "numUniqueKmersTaxon", + "numKmersClade", + "numKmersTaxon", + "numKmersInDatabaseClade", + "numKmersInDatabaseTaxon", + "taxID", + "taxRank", + "indentedName"}); rep.printReport("kraken","blu"); } diff --git a/src/report-cols.h b/src/report-cols.h index ff19275..2392bd8 100644 --- a/src/report-cols.h +++ b/src/report-cols.h @@ -21,8 +21,11 @@ enum class REPORTCOLS : uint8_t { NUM_READS, NUM_READS_CLADE, NUM_KMERS, + NUM_KMERS_CLADE, NUM_UNIQUE_KMERS, + NUM_UNIQUE_KMERS_CLADE, NUM_KMERS_IN_DATABASE, + NUM_KMERS_IN_DATABASE_CLADE, TOTAL_SCORE, TOTAL_HIT_LENGTH, ABUNDANCE, @@ -33,19 +36,25 @@ enum class REPORTCOLS : uint8_t { static const std::map report_col_name_map = { {"name", REPORTCOLS::NAME}, - {"spaced_name", REPORTCOLS::SPACED_NAME}, + {"indentedName", REPORTCOLS::SPACED_NAME}, {"taxID", REPORTCOLS::TAX_ID}, {"taxRank", REPORTCOLS::TAX_RANK}, {"depth", REPORTCOLS::DEPTH}, {"genomeSize", REPORTCOLS::GENOME_SIZE}, - {"numReads", REPORTCOLS::NUM_READS}, + {"numReadsTaxon", REPORTCOLS::NUM_READS}, {"numReadsClade", REPORTCOLS::NUM_READS_CLADE}, - {"numUniqueKmers", REPORTCOLS::NUM_UNIQUE_KMERS}, + {"numKmersTaxon", REPORTCOLS::NUM_KMERS}, + {"numKmersClade", REPORTCOLS::NUM_KMERS_CLADE}, + {"numUniqueKmersTaxon", REPORTCOLS::NUM_UNIQUE_KMERS}, + {"numUniqueKmersClade", REPORTCOLS::NUM_UNIQUE_KMERS_CLADE}, + {"numKmersInDatabaseTaxon", REPORTCOLS::NUM_KMERS_IN_DATABASE}, + {"numKmersInDatabaseClade", REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE}, {"totalHitLen", REPORTCOLS::TOTAL_HIT_LENGTH}, {"totalScore", REPORTCOLS::TOTAL_SCORE}, {"abundance", REPORTCOLS::ABUNDANCE}, {"abundance_len", REPORTCOLS::ABUNDANCE_LEN}, + {"percReadsClade", REPORTCOLS::PERCENTAGE}, {"percent", REPORTCOLS::PERCENTAGE}, {"taxId", REPORTCOLS::TAX_ID}, {"reads_clade", REPORTCOLS::NUM_READS_CLADE}, // Change to clade reads! 
diff --git a/src/taxdb.h b/src/taxdb.h index 12518da..3b825f2 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -138,17 +138,18 @@ class TaxReport { private: std::ostream& _reportOfb; TaxonomyDB & _taxdb; - std::vector _report_cols; uint64_t _total_n_reads; bool _show_zeros; - void printLine(TaxonomyEntry& tax, unsigned depth); public: TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); - void printReport(std::string format, std::string rank); void printReport(TaxonomyEntry& tax, unsigned depth); + void setReportCols(std::vector names); + + std::vector _report_col_names; + std::vector _report_cols; }; @@ -668,8 +669,23 @@ void TaxonomyDB::setReadCounts(const unordered_map -TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { - _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_KMERS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS_IN_DATABASE, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; + TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { + _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_KMERS_CLADE, REPORTCOLS::NUM_UNIQUE_KMERS_CLADE, REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; +} + + +template +void TaxReport::setReportCols(std::vector names) { + _report_cols.clear(); + for (auto& s : names) { + auto it = report_col_name_map.find(s); + if (it == report_col_name_map.end()) { + throw std::runtime_error(s + " is not a valid report column name"); + } + _report_cols.push_back(it->second); + } + _report_col_names = names; + } template @@ -683,6 +699,19 @@ void TaxReport::printReport(std::string format, std::string ra std::cerr << "total number of reads is zero - not creating a report!" << endl; return; } + if (_report_cols.size() == _report_col_names.size()) { + // print header + bool first_one = true; + for (std::string s : _report_col_names) { + if (first_one) { + first_one = false; + } else { + _reportOfb << '\t'; + } + _reportOfb << s; + } + _reportOfb << endl; + } if (format == "kraken") { // A: print number of unidentified reads @@ -712,18 +741,21 @@ template void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { for (auto& col : _report_cols) { switch (col) { - case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; + case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; - case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? -1 : (int32_t) tax.taxonomyID); break; - case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.readCounts) + reads(tax.readCountsOfChildren))/_total_n_reads; break; - //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; + case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; + case REPORTCOLS::DEPTH: _reportOfb << depth; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.readCounts) + reads(tax.readCountsOfChildren))/_total_n_reads; break; + //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; + case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.readCounts); break; case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.readCounts) + reads(tax.readCountsOfChildren)); break; - case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.readCounts); break; case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.readCounts.kmers.cardinality(); break; - case REPORTCOLS::NUM_KMERS: _reportOfb << tax.readCounts.n_kmers; break; - case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break; + case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << (tax.readCounts.kmers.cardinality() + tax.readCountsOfChildren.kmers.cardinality()); break; + case REPORTCOLS::NUM_KMERS: _reportOfb << tax.readCounts.n_kmers; break; + case REPORTCOLS::NUM_KMERS_CLADE: _reportOfb << tax.readCounts.n_kmers + tax.readCountsOfChildren.n_kmers; break; + case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break; + case REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break; //case REPORTCOLS::GENOME_SIZE: ; break; //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; //case REPORTCOLS::SUM_SCORE: ; break; From 7ae1f5de0e3f3bdae8decdad0fa118d8884f7161 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 13 Aug 2017 14:34:07 -0400 Subject: [PATCH 035/105] Added taxdb.cpp --- src/taxdb.cpp | 584 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 584 insertions(+) create mode 100644 src/taxdb.cpp diff --git a/src/taxdb.cpp b/src/taxdb.cpp new file mode 100644 index 0000000..41ba0ee --- /dev/null +++ b/src/taxdb.cpp @@ -0,0 +1,584 @@ +#include "taxdb.h" +using namespace std; + +void log_msg (const std::string& s) { + std::cerr << s << "\n"; +} + +template +uint64_t string_to_T(string str) { + stringstream stream(str); + T result; + stream >> result; + return result; +} + +template +inline +uint64_t reads(const T read_count) { + cerr << "No reads function for type!! 
" << endl; + throw ; + return(0); +} + + + +inline +uint64_t reads(const uint64_t read_count) { + return(read_count); +} + +std::vector in_betweens(const std::string &s, const char start_char, const char end_char, size_t start_at) { + std::vector tokens; + size_t i = 0; + size_t next_end = start_at-1; + + for (size_t next_start = s.find(start_char, next_end + 1); \ + next_start != string::npos; + next_start = s.find(start_char, next_end + 1), ++i) { + + next_end = s.find(end_char, next_start + 1); + if (next_end == string::npos) + throw std::runtime_error("unmatched start and end!"); + + tokens.push_back(s.substr(next_start+1, next_end-1)); + } + + return tokens; +} + + + +std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields, size_t end_chars) { + std::vector tokens(max_fields); + size_t delim_length = delimiter.length(); + size_t last = 0; + size_t i = 0; + + for (size_t next = s.find(delimiter, last); + (max_fields > 0 && i < max_fields) && next != string::npos; + next = s.find(delimiter, last), ++i) { + tokens[i] = s.substr(last, next-last); + last = next + delim_length; + } + if (max_fields > 0 && i < max_fields) { + tokens[max_fields-1] = s.substr(last, s.length()-last-end_chars); + } + + return tokens; +} + +std::vector get_fields(const std::string &s, const std::string& delimiter, vector fields) { + std::vector tokens; + tokens.reserve(fields.size()); + size_t delim_length = delimiter.length(); + size_t last = 0; + size_t i = 0; + size_t current_field = 0; + + for (size_t next = s.find(delimiter, last); + tokens.size() < fields.size() && next != string::npos; + next = s.find(delimiter, last), ++i) { + if (i == fields[current_field]) { + tokens.push_back(s.substr(last, next-last)); + ++current_field; + } + last = next + delim_length; + } + + return tokens; +} + + + +//template<> +//TaxonomyEntry::TaxonomyEntry () { +// read_counts = 0; +// read_counts_children = 0; +//} +template +bool TaxonomyEntryPtr_comp::operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { + return ((reads(a->read_counts)+reads(a->read_counts_children)) > (reads(b->read_counts)+reads(b->read_counts_children))); + } + + +template +std::unordered_map TaxonomyDB::getScientificNameMap() const { + std::unordered_map scientificNameMap; + for (const auto & tax : taxIDsAndEntries) { + scientificNameMap[tax.second.scientificName] = tax.first; + } + return scientificNameMap; +} + +template +unordered_map TaxonomyDB::getParentMap() const { + unordered_map Parent_map; + for (const auto & tax : taxIDsAndEntries) { + if (tax.first != 0) + Parent_map[tax.first] = tax.second.parentTaxonomyID; + } + Parent_map[1] = 1; + return Parent_map; +} + +template +void TaxonomyDB::createPointers() { + for (auto& tax : taxIDsAndEntries) { + if (tax.second.parentTaxonomyID != tax.first) { + auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); + if (parentIt != taxIDsAndEntries.end()) { + tax.second.parent = &(parentIt->second); + parentIt->second.children.push_back(&tax.second); + } + } + } +} + +template +TaxonomyDB::TaxonomyDB() { } + +template +TaxonomyDB::TaxonomyDB(const std::string inFileName) { + log_msg("Building taxonomy index"); + readTaxonomyIndex(inFileName); + createPointers(); + log_msg("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + + " nodes"); +} + +template +void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { + std::ifstream nodesDumpFile(nodesDumpFileName); + if (!nodesDumpFile.is_open()) + throw 
std::runtime_error("unable to open nodes file"); + std::string line; + + TAXID taxonomyID; + TAXID parentTaxonomyID; + std::string rank; + + while (nodesDumpFile.good()) { + getline(nodesDumpFile, line); + std::vector tokens = tokenise(line, "\t|\t", 3, 2); + if (tokens.size() < 3) { + continue; + } + + taxonomyID = string_to_T(tokens[0]); + parentTaxonomyID = string_to_T(tokens[1]); + rank = tokens[2]; + + auto entryIt = taxIDsAndEntries.find(taxonomyID); + if (entryIt == taxIDsAndEntries.end()) { + taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); + } else { + entryIt->second.parentTaxonomyID = parentTaxonomyID; + entryIt->second.rank = rank; + } + } +} + +template +void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { + std::ifstream namesDumpFile(namesDumpFileName); + if (!namesDumpFile.is_open()) + throw std::runtime_error("unable to open names file"); + std::string line; + + TAXID taxonomyID; + std::string scientificName; + while (namesDumpFile.good()) { + getline(namesDumpFile, line); + std::vector tokens = tokenise(line, "\t|\t", 4, 2); + if (tokens.size() < 4 || tokens[3] != "scientific name") { + continue; + } + taxonomyID = string_to_T(tokens[0]); + scientificName = tokens[1]; + + auto entryIt = taxIDsAndEntries.find(taxonomyID); + if (entryIt == taxIDsAndEntries.end()) { + taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, scientificName); + } else { + entryIt->second.scientificName = scientificName; + } + } +} + +template +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, + const std::string namesDumpFileName, + const std::string nodesDumpFileName) { + parseNodesDump(nodesDumpFileName); + parseNamesDump(namesDumpFileName); + writeTaxonomyIndex(outs); +} + +template +std::vector getSortedKeys(const std::unordered_map& unordered) { + std::vector keys; + keys.reserve (unordered.size()); + for (auto& it : unordered) { + keys.push_back(it.first); + } + std::sort (keys.begin(), keys.end()); + return keys; +} + +template +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { + for (TAXID& key : getSortedKeys(taxIDsAndEntries)) { + const auto& entry = taxIDsAndEntries.at(key); + outs << key << "\t" << entry.parentTaxonomyID << "\t" + << entry.scientificName << "\t" << entry.rank << "\n"; + } +} + + + +template +void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { + std::ifstream inFile(inFileName); + if (!inFile.is_open()) + throw std::runtime_error("unable to open taxonomy index file " + inFileName); + + TAXID taxonomyID, parentTaxonomyID; + std::string scientificName, rank; + + std::string line; + while (!inFile.eof()) { + inFile >> taxonomyID >> parentTaxonomyID; + inFile.get(); // read tab + std::getline(inFile, scientificName, '\t'); + std::getline(inFile, rank, '\n'); + TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); + + //cerr << "inserting " << taxonomyID << ";" << parentTaxonomyID << ";" << rank << ";" << scientificName << endl; + taxIDsAndEntries.insert({ + taxonomyID, newEntry + }); + } + taxIDsAndEntries.insert({ + 0, {0, 0, "no rank", "unclassified" } + }); +} + +template +TAXID TaxonomyDB::getLowestCommonAncestor( + const std::vector& taxIDs) const { + if (taxIDs.size() == 0) { + return 0; + } + std::vector > paths; + for (auto& taxID : taxIDs) { + bool good = true; + std::vector path; + TAXID tempTaxID = taxID; + while (tempTaxID != 0) { + path.push_back(tempTaxID); + tempTaxID = getParentTaxID(tempTaxID); + } + if (good) paths.push_back(path); + } + if 
(paths.size() == 0) { + return 0; + } + for (auto& path : paths) + std::reverse(path.begin(), path.end()); + std::sort(paths.begin(), paths.end(), + [](std::vector i, std::vector j) { + return i.size() < j.size(); + }); + TAXID consensus = 0; + for (unsigned i = 0; i < paths[0].size(); i++) { + TAXID temp = 0; + for (auto& path : paths) { + if (temp == 0) + temp = path[i]; + else if (temp != path[i]) { + return consensus; + } + } + consensus = temp; + } + return consensus; +} + +template +TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { + auto entry = taxIDsAndEntries.find(taxID); + if (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) + return entry->second.parentTaxonomyID; + else + return 0; +} + +template +std::string TaxonomyDB::getScientificName(const TAXID taxID) const { + auto entry = taxIDsAndEntries.find(taxID); + if (entry != taxIDsAndEntries.end()) { + return entry->second.scientificName; + } else + return std::string(); +} + +template +std::string TaxonomyDB::getRank(const TAXID taxID) const { + auto entry = taxIDsAndEntries.find(taxID); + if (entry != taxIDsAndEntries.end()) { + return entry->second.rank; + } else + return std::string(); +} + +template +std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { + std::string lineage; + while (true) { + // 131567 = Cellular organisms + if (taxonomyID != 131567) { + if (lineage.size()) lineage.insert(0, "; "); + lineage.insert(0, getScientificName(taxonomyID)); + if (getRank(taxonomyID) == "species") lineage.clear(); + } + taxonomyID = getParentTaxID(taxonomyID); + if (taxonomyID == 0) { + if (lineage.size()) lineage.append("."); + break; + } + } + return lineage; +} + +template +std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { + std::string rank = getRank(taxonomyID); + if (rank == "superphylum") return std::string(); + std::string lineage; + while (true) { + // 131567 = Cellular organisms + if (taxonomyID != 131567) { + std::string rank = getRank(taxonomyID); + if (rank == "species") { + lineage.insert(0, "|s__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "genus") { + lineage.insert(0, "|g__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "family") { + lineage.insert(0, "|f__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "order") { + lineage.insert(0, "|o__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "class") { + lineage.insert(0, "|c__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "phylum") { + lineage.insert(0, "|p__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "superkingdom") { + lineage.insert(0, "k__"); + lineage.insert(3, getScientificName(taxonomyID)); + } + } + taxonomyID = getParentTaxID(taxonomyID); + if (taxonomyID == 0) { + break; + } + } + std::replace(lineage.begin(), lineage.end(), ' ', '_'); + return lineage; +} + +template +TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, + const std::string& rank) const { + auto entry = taxIDsAndEntries.find(taxID); + while (entry != taxIDsAndEntries.end() && + entry->second.parentTaxonomyID != 1) { + if (entry->second.rank == rank) { + return entry->second.taxonomyID; + } else + entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + } + return 0; +} + +template +int TaxonomyDB::isBelowInTree(TAXID upper, TAXID lower) const { + auto entry = taxIDsAndEntries.find(lower); + unsigned level = 0; + while (entry != 
taxIDsAndEntries.end() && + entry->second.parentTaxonomyID != 1) { + if (entry->first == upper) { + return level; + } else { + entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + level++; + } + } + return -1; +} + +template +bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { + bool isSubSpecies = false; + auto entry = taxIDsAndEntries.find(taxonomyID); + int numLevels = 0; + while (entry != taxIDsAndEntries.end() && + entry->second.parentTaxonomyID != 1) { + if (entry->second.rank == "species") { + if (numLevels > 0) { + isSubSpecies = true; + } + break; + } else + entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + numLevels++; + } + return isSubSpecies; +} + +template +void TaxonomyDB::addCounts(const TAXID taxid, const READCOUNTS& read_counts_) { + auto it = taxIDsAndEntries.find(taxid); + if (it == taxIDsAndEntries.end()) { + cerr << "No taxonomy entry for " << taxid << "!!" << endl; + return; + } + TaxonomyEntry* tax = &it->second; + //cerr << taxid << " rc before: " << tax->read_counts << endl; + tax->read_counts += read_counts_; + //cerr << taxid << " rc after: " << tax->read_counts << endl; + + while (tax->parent != nullptr) { + tax = tax->parent; + tax->read_counts_children += read_counts_; + } +} + +template +void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { + for (auto& elem : taxon_counts) { + addCounts(elem.first, elem.second); + } + + for (auto& tax : taxIDsAndEntries) { + std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); + } +} + + +template +TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { + _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; +} + +template +void TaxReport::printReport(std::string format, std::string rank) { + _total_n_reads = + reads(_taxdb.taxIDsAndEntries.at(0).read_counts) + + reads(_taxdb.taxIDsAndEntries.at(0).read_counts_children) + + reads(_taxdb.taxIDsAndEntries.at(1).read_counts) + + reads(_taxdb.taxIDsAndEntries.at(1).read_counts_children);// + + if (_total_n_reads == 0) { + std::cerr << "total number of reads is zero - not creating a report!" << endl; + return; + } + + if (format == "kraken") { + // A: print number of unidentified reads + printReport(_taxdb.taxIDsAndEntries.at(0),0u); + // B: print normal results + printReport(_taxdb.taxIDsAndEntries.at(1),0u); + // C: Print Unclassified stuff + //printReport(_taxdb.taxIDsAndEntries.at(-1),0u); + } else { + // print stuff at a certain level .. + //_uid_abundance; + //_taxinfo + + } +} + +template +void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { + if (_show_zeros || (reads(tax.read_counts)+reads(tax.read_counts_children)) > 0) { + printLine(tax, depth); + for (auto child : tax.children) + printReport(*child, depth+1); + } +} + +template +void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { + for (auto& col : _report_cols) { + switch (col) { + case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; + case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; + case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; + case REPORTCOLS::DEPTH: _reportOfb << depth; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.read_counts) + reads(tax.read_counts_children))/_total_n_reads; break; + //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; + //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; + case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.read_counts) + reads(tax.read_counts_children)); break; + case REPORTCOLS::NUM_READS: _reportOfb << (tax.read_counts); break; + //case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break; + //case REPORTCOLS::NUM_KMERS: _reportOfb << tax.numKmers; break; + //case REPORTCOLS::GENOME_SIZE: ; break; + //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; + //case REPORTCOLS::SUM_SCORE: ; break; + case REPORTCOLS::TAX_RANK: _reportOfb << tax.rank; break; + default: _reportOfb << "NA"; + } + if (&col == &_report_cols.back()) { + _reportOfb << '\n'; + } else { + _reportOfb << '\t'; + } + } +} + + + // Return lowest common ancestor of a and b + // LCA(0,x) = LCA(x,0) = x + // Default ancestor is 1 (root of tree) +uint32_t lca(unordered_map &parent_map, uint32_t a, uint32_t b) + { + if (a == 0 || b == 0) + return a ? a : b; + + // create a path from a to the root + std::unordered_set a_path; + while (a > 0 && a != parent_map[a]) { + if (a == b) + return a; + a_path.insert(a); + a = parent_map[a]; + } + + // search for b in the path from a to the root + while (b > 0 && b != parent_map[b]) { + if (a_path.count(b) > 0) + return b; + b = parent_map[b]; + } + return 1; + } + +template +inline +V find_or_use_default(const std::unordered_map& my_map, const K& query, const V default_value) { + auto itr = my_map.find(query); + + if (itr == my_map.end()) { + return default_value; + } + + return itr->second; +} + + + From c5318e0245d5997251fa4db695984959f5dec3f3 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 25 Aug 2017 10:49:05 -0400 Subject: [PATCH 036/105] Add UID mapping to Kraken-HLL --- scripts/kraken_hll | 12 + scripts/kraken_hll-build_db.sh | 17 +- src/build_taxdb.cpp | 2 +- src/classify.cpp | 93 +++++- src/db_sort.cpp | 3 + src/krakenutil.cpp | 151 ++++++++- src/krakenutil.hpp | 14 +- src/query_taxdb.cpp | 138 ++++++++ src/read_uid_mapping.cpp | 76 +++++ src/set_lcas.cpp | 172 ++++++++-- src/taxdb.cpp | 584 --------------------------------- src/taxdb.h | 47 ++- 12 files changed, 655 insertions(+), 654 deletions(-) create mode 100644 src/query_taxdb.cpp create mode 100644 src/read_uid_mapping.cpp delete mode 100644 src/taxdb.cpp diff --git a/scripts/kraken_hll b/scripts/kraken_hll index b31fca3..e2d8412 100755 --- a/scripts/kraken_hll +++ b/scripts/kraken_hll @@ -58,6 +58,7 @@ my $classified_out; my $outfile; my $report_file; my $print_sequence = 0; +my $uid_mapping = 0; GetOptions( "help" => \&display_help, @@ -78,6 +79,7 @@ GetOptions( "check-names" => \$check_names, "gzip-compressed" => \$gunzip, "bzip2-compressed" => \$bunzip2, + "uid-mapping" => \$uid_mapping, "only-classified-output" => \$only_classified_output, ) or die $!; @@ -145,6 +147,16 @@ push @flags, "-M" if $preload; push @flags, "-r", $report_file if defined $report_file; push @flags, "-a", $db_prefix[0]."/taxDB"; push @flags, "-s" if $print_sequence; +if ($uid_mapping) { + my $uid_mapping_file = "$db_prefix[0]/uid_to_taxid"; + if (!-f $uid_mapping_file) { + print STDERR "Missing required file $uid_mapping_file for UID mapping.\n"; + exit(1); + } + push @flags, 
"-I", $uid_mapping_file; +} else { + +} # handle piping for decompression/merging my @pipe_argv; diff --git a/scripts/kraken_hll-build_db.sh b/scripts/kraken_hll-build_db.sh index fa90e6b..402dc45 100755 --- a/scripts/kraken_hll-build_db.sh +++ b/scripts/kraken_hll-build_db.sh @@ -23,6 +23,7 @@ set -u # Protect against uninitialized vars. set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands +set -x function report_time_elapsed() { curr_time=$(date "+%s.%N") @@ -61,14 +62,14 @@ fi if [ "$KRAKEN_REBUILD_DATABASE" == "1" ] then - rm -f database.* *.map lca.complete library/seq-files.txt + rm -f database.* *.map lca.complete library-files.txt fi -if [ !-f "library/seq-files.txt" ]; then +if [ ! -f "library-files.txt" ]; then echo "Finding all library files" - find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' > library/seq-files.txt + find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' > library-files.txt fi -N_FILES=`cat library/seq-files.txt | wc -l` +N_FILES=`cat library-files.txt | wc -l` echo "Found $N_FILES sequence files (*.{fna,fa,ffn} in the library)" if [ -e "database.jdb" ] @@ -87,7 +88,7 @@ else echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'" fi - cat library/seq-files.txt | tr '\n' '\0' | xargs -0 cat | \ + cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ $JELLYFISH_BIN count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \ -o database /dev/fd/0 @@ -171,8 +172,8 @@ then else echo "Creating seqID to taxID map (step 4 of 6).." start_time1=$(date "+%s.%N") - cat library/seq-files.txt | tr '\n' '\0' | xargs -0 grep '^>' | sed 's/.//' | sed 's/ .*//' | sort > library/seq-headers.txt - join -t $'\t' nucl_gb.accession2taxid.sorted library/seq-headers.txt > seqid2taxid.map.tmp + cat library-files.txt | tr '\n' '\0' | xargs -0 grep '^>' | sed 's/.//' | sed 's/ .*//' | sort > library-headers.txt + join -t $'\t' nucl_gb.accession2taxid.sorted library-headers.txt > seqid2taxid.map.tmp mv seqid2taxid.map.tmp seqid2taxid.map line_ct=$(wc -l seqid2taxid.map | awk '{print $1}') @@ -190,7 +191,7 @@ else PARAM=" -a" fi start_time1=$(date "+%s.%N") - cat library/seq-files.txt | tr '\n' '\0' | xargs -0 cat | \ + cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ set_lcas $MEMFLAG -x -d database.kdb -i database.idx -v \ -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 touch "lca.complete" diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index f4a4957..6f33763 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -27,7 +27,7 @@ using namespace std; int main(int argc, char **argv) { if (argc < 3 || argc > 4) { - std::cerr << "Usage: a.out names.dmp nodes.dmp [taxon-counts]\n"; + std::cerr << "Usage: build_taxdb names.dmp nodes.dmp [taxon-counts]\n"; return 1; } TaxonomyDB taxdb {(string)argv[1], (string)argv[2]}; diff --git a/src/classify.cpp b/src/classify.cpp index a2a61b3..990012f 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -25,6 +25,7 @@ #include "readcounts.hpp" #include "taxdb.h" #include "gzstream.h" +#include const size_t DEF_WORK_UNIT_SIZE = 500000; int New_taxid_start = 1000000000; @@ -56,6 +57,12 @@ bool Populate_memory = false; bool Only_classified_kraken_output = false; bool Print_sequence = false; bool Print_Progress = true; + +bool Map_UIDs = false; +string UID_to_TaxID_map_filename; +map > UID_to_taxids_map; +QuickFile UID_to_TaxID_map_file; + uint32_t Minimum_hit_count = 1; unordered_map Parent_map; string 
Classified_output_file, Unclassified_output_file, Kraken_output_file, Report_output_file, TaxDB_file; @@ -94,8 +101,6 @@ ostream* cout_or_file(string file) { } } - - void loadKrakenDB(KrakenDB& database, string DB_filename, string Index_filename) { QuickFile db_file; db_file.open_file(DB_filename); @@ -112,6 +117,35 @@ void loadKrakenDB(KrakenDB& database, string DB_filename, string Index_filename) database.set_index(&db_index); } +vector get_taxids_for_uid(uint32_t uid, char* fptr) { + size_t int_size = sizeof(int); + size_t block_size = sizeof(int)*2; + // TODO: Just get a uint64_t and shift the bits, probably faster + uint32_t taxid = *(uint32_t*)(fptr+(uid-1)*block_size); + uint32_t parent_uid = *(uint32_t*)(fptr+(uid-1)*block_size + int_size); + + vector taxids = {taxid}; + while (parent_uid != 0) { + taxid = *(uint32_t*)(fptr+(parent_uid-1)*block_size); + parent_uid = *(uint32_t*)(fptr+(parent_uid-1)*block_size + int_size); + taxids.push_back(taxid); + } + std::sort(taxids.begin(), taxids.end()); + return(taxids); +} + +vector get_taxids_for_uid_from_map(uint32_t uid, char* fptr, unordered_map >& uid_map ) { + auto it = uid_map.find(uid); + if (it != uid_map.end()) { + return it->second; + } + vector taxids = get_taxids_for_uid(uid, fptr); + uid_map[uid] = taxids; + return(taxids); +} + + + int main(int argc, char **argv) { #ifdef _OPENMP omp_set_num_threads(1); @@ -119,11 +153,25 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); + if (Map_UIDs) { + if (DB_filenames.size() > 1) { + cerr << "Cannot use more than one database with UID mapping!" << endl; + return 1; + } + + cerr << "Reading UID mapping file " << UID_to_TaxID_map_filename << endl; + UID_to_TaxID_map_file.open_file(UID_to_TaxID_map_filename); + if (Populate_memory) { + UID_to_TaxID_map_file.load_file(); + } + } + if (!TaxDB_file.empty()) { - taxdb = TaxonomyDB(TaxDB_file, true); + // TODO: Define if the taxDB has read counts or not!! + taxdb = TaxonomyDB(TaxDB_file, false); for (const auto & tax : taxdb.taxIDsAndEntries) { if (tax.first != 0) - Parent_map[tax.first] = tax.second.parentTaxonomyID; + Parent_map[tax.first] = tax.second.parentTaxonomyID; } Parent_map[1] = 0; } else { @@ -287,11 +335,12 @@ void process_file(char *filename) { kraken_output_ss.str(""); classified_output_ss.str(""); unclassified_output_ss.str(""); - for (size_t j = 0; j < work_unit.size(); j++) + for (size_t j = 0; j < work_unit.size(); j++) { my_total_classified += classify_sequence( work_unit[j], kraken_output_ss, classified_output_ss, unclassified_output_ss, my_taxon_counts); + } #pragma omp critical(write_output) { @@ -330,6 +379,7 @@ uint32_t get_taxon_for_kmer(KrakenDB& database, uint64_t* kmer_ptr, uint64_t& cu bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, unordered_map& my_taxon_counts) { + // TODO: use vector::reserve vector taxa; vector ambig_list; unordered_map hit_counts; @@ -356,12 +406,15 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, else { ambig_list.push_back(0); + // go through multiple databases to map k-mer for (size_t i=0; i= Minimum_hit_count ? taxon : 0; - else - call = resolve_tree(hit_counts, Parent_map); + if (Map_UIDs) { + if (Quick_mode) { + cerr << "Quick mode not available when mapping UIDs" << endl; + exit(1); + } else { + call = resolve_uids2(hit_counts, Parent_map, UID_to_TaxID_map_file.ptr()); + } + } else { + if (Quick_mode) + call = hits >= Minimum_hit_count ? 
taxon : 0; + else + call = resolve_tree(hit_counts, Parent_map); + } ++(my_taxon_counts[call].n_reads); @@ -482,7 +544,7 @@ void parse_command_line(int argc, char **argv) { if (argc > 1 && strcmp(argv[1], "-h") == 0) usage(0); - while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:Ma:r:s")) != -1) { + while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:Ma:r:sI:")) != -1) { switch (opt) { case 'd' : DB_filenames.push_back(optarg); @@ -545,6 +607,10 @@ void parse_command_line(int argc, char **argv) { case 'M' : Populate_memory = true; break; + case 'I' : + UID_to_TaxID_map_filename = optarg; + Map_UIDs = true; + break; default: usage(); break; @@ -573,6 +639,7 @@ void usage(int exit_code) { << " -o filename Output file for Kraken output" << endl << " -r filename Output file for Kraken report output" << endl << " -a filename TaxDB" << endl + << " -I filename UID to TaxId map" << endl << " -t # Number of threads" << endl << " -u # Thread work unit size (in bp)" << endl << " -q Quick operation" << endl diff --git a/src/db_sort.cpp b/src/db_sort.cpp index 1bafef3..713119a 100644 --- a/src/db_sort.cpp +++ b/src/db_sort.cpp @@ -44,6 +44,7 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); + cerr << "db_sort: Getting database into memory ..."; QuickFile input_db_file(Input_DB_filename); KrakenDB *input_db = new KrakenDB(input_db_file.ptr()); Key_len = input_db->get_key_len(); @@ -62,10 +63,12 @@ int main(int argc, char **argv) { input_db = new KrakenDB(header); input_db_file.close_file(); // Stop using memory-mapped file + cerr << "db_sort: Sorting ..."; char *data = new char[ key_ct * (Key_len + val_len) ]; // Populate data w/ pairs from DB and sort bins in parallel bin_and_sort_data(*input_db, data, db_index); + cerr << "db_sort: Sorting complete - writing database to disk ..." << endl; ofstream output_file(Output_DB_filename.c_str(), std::ofstream::binary); output_file.write(header, skip_len); output_file.write(data, key_ct * (Key_len + val_len)); diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index 48e54e9..28ca837 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -17,12 +17,14 @@ * along with Kraken. If not, see . */ +#include "assert_helpers.h" #include "kraken_headers.hpp" #include "krakenutil.hpp" using namespace std; namespace kraken { + // Build a node->parent unordered_map from NCBI Taxonomy nodes.dmp file unordered_map build_parent_map(string filename) { unordered_map pmap; @@ -47,7 +49,7 @@ namespace kraken { // Return lowest common ancestor of a and b // LCA(0,x) = LCA(x,0) = x // Default ancestor is 1 (root of tree) - uint32_t lca(unordered_map &parent_map, + uint32_t lca(const unordered_map &parent_map, uint32_t a, uint32_t b) { if (a == 0 || b == 0) @@ -57,35 +59,41 @@ namespace kraken { set a_path; while (a > 0) { a_path.insert(a); - a = parent_map[a]; + assert(parent_map.find(a) != parent_map.end()); + a = parent_map.at(a); } // search for b in the path from a to the root while (b > 0) { if (a_path.count(b) > 0) return b; - b = parent_map[b]; + assert(parent_map.find(b) != parent_map.end()); + b = parent_map.at(b); } return 1; } // Tree resolution: take all hit taxa (plus ancestors), then // return leaf of highest weighted leaf-to-root path. 
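// Illustrative example: with hit_counts {562: 3, 561: 1} and 561 the parent of
// 562, the leaf-to-root score of 562 is 3 + 1 = 4 versus 1 for 561, so 562 is
// returned.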
- uint32_t resolve_tree(unordered_map &hit_counts, - unordered_map &parent_map) + uint32_t resolve_tree(const unordered_map &hit_counts, + const unordered_map &parent_map) { set max_taxa; uint32_t max_taxon = 0, max_score = 0; - unordered_map::iterator it = hit_counts.begin(); // Sum each taxon's LTR path - while (it != hit_counts.end()) { + for (auto it = hit_counts.begin(); + it != hit_counts.end(); ++it) { uint32_t taxon = it->first; uint32_t node = taxon; uint32_t score = 0; while (node > 0) { - score += hit_counts[node]; - node = parent_map[node]; + auto it2 = hit_counts.find(node); + if (it2 != hit_counts.end()) { + score += it2->second; + } + node = parent_map.at(node); + } if (score > max_score) { @@ -98,8 +106,6 @@ namespace kraken { max_taxa.insert(max_taxon); max_taxa.insert(taxon); } - - it++; } // If two LTR paths are tied for max, return LCA of all @@ -113,6 +119,129 @@ namespace kraken { return max_taxon; } + + // Tree resolution: take all hit taxa (plus ancestors), then + // return leaf of highest weighted leaf-to-root path. + uint32_t resolve_uids( + const unordered_map &uid_hit_counts, + const unordered_map &parent_map, + const vector< vector > &UID_to_taxids_vec) { + unordered_map taxid_counts; + unordered_map frac_taxid_counts; + + if (uid_hit_counts.size() == 0) { + return(0); + } + + for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { + uint32_t uid = it->first; + double frac_count = ((double)it->second / (double)UID_to_taxids_vec[uid-1].size()); + for (auto taxid : UID_to_taxids_vec[uid-1]) { + taxid_counts[taxid] += it->second; + frac_taxid_counts[taxid] += frac_count; + } + } + vector max_taxids; + uint32_t max_count = 0; + double max_frac_count = 0; + for (auto it : taxid_counts) { + if (it.second == max_count) { + if (frac_taxid_counts[it.first] == max_frac_count) { + max_taxids.push_back(it.first); + } else if (frac_taxid_counts[it.first] > max_frac_count) { + max_frac_count = frac_taxid_counts[it.first]; + max_taxids = { it.first }; + } + } else if (it.second > max_count) { + max_taxids = { it.first }; + max_count = it.second; + max_frac_count = frac_taxid_counts[it.first]; + } + } + + uint32_t max_taxon = max_taxids[0]; + auto sit = max_taxids.begin(); + for (++sit; sit != max_taxids.end(); ++sit) { + max_taxon = lca(parent_map, max_taxon, *sit); + + } + + // return the taxid that appeared most often + return max_taxon; + } + + // Tree resolution: take all hit taxa (plus ancestors), then + // return leaf of highest weighted leaf-to-root path. 
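+  // resolve_uids2 (below) computes the same weighted vote as resolve_uids
+  // above, but follows each UID's (taxid, parent UID) chain directly in the
+  // memory-mapped UID map file via fptr instead of a prebuilt lookup vector;
+  // ties between top-scoring taxa are again broken by taking their LCA.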
+ uint32_t resolve_uids2( + const unordered_map &uid_hit_counts, + const unordered_map &parent_map, + char* fptr) { + unordered_map taxid_counts; + unordered_map frac_taxid_counts; + + if (uid_hit_counts.size() == 0) { + return(0); + } + + size_t int_size = sizeof(int); + size_t block_size = sizeof(int)*2; + for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { + uint32_t uid = it->first; + if (uid == 0) { + continue; + } + uint32_t taxid; + // TODO: Just get a uint64_t and shift the bits, probably faster + vector taxids; + do { + taxid = *(uint32_t*)(fptr+(uid-1)*block_size); + uid = *(uint32_t*)(fptr+(uid-1)*block_size + int_size); + + taxid_counts[taxid] += it->second; + taxids.push_back(taxid); + } while (uid != 0); + + double frac_count = (double)it->second / (double)taxids.size(); + for (uint32_t taxid : taxids) { + frac_taxid_counts[taxid] += frac_count; + } + } + + if (taxid_counts.size() == 0) { + return(0); + } + vector max_taxids; + uint32_t max_count = 0; + double max_frac_count = 0; + for (auto it : taxid_counts) { + if (it.second == max_count) { + if (frac_taxid_counts[it.first] == max_frac_count) { + max_taxids.push_back(it.first); + } else if (frac_taxid_counts[it.first] > max_frac_count) { + max_frac_count = frac_taxid_counts[it.first]; + max_taxids = { it.first }; + } + } else if (it.second > max_count) { + max_taxids = { it.first }; + max_count = it.second; + max_frac_count = frac_taxid_counts[it.first]; + } + } + + uint32_t max_taxon = max_taxids[0]; + auto sit = max_taxids.begin(); + for (++sit; sit != max_taxids.end(); ++sit) { + max_taxon = lca(parent_map, max_taxon, *sit); + + } + + // return the taxid that appeared most often + return max_taxon; + } + + + + uint8_t KmerScanner::k = 0; uint64_t KmerScanner::kmer_mask = 0; uint32_t KmerScanner::mini_kmer_mask = 0; diff --git a/src/krakenutil.hpp b/src/krakenutil.hpp index 97dd041..854e26b 100644 --- a/src/krakenutil.hpp +++ b/src/krakenutil.hpp @@ -31,8 +31,18 @@ namespace kraken { // NOTE: LCA(0,x) = LCA(x,0) = x // Resolve classification tree - uint32_t resolve_tree(std::unordered_map &hit_counts, - std::unordered_map &parent_map); + uint32_t resolve_tree(const std::unordered_map &hit_counts, + const std::unordered_map &parent_map); + + uint32_t resolve_uids( + const std::unordered_map &uid_hit_counts, + const std::unordered_map &parent_map, + const std::vector< std::vector > &UID_to_taxids_vec); + + uint32_t resolve_uids2( + const std::unordered_map &uid_hit_counts, + const std::unordered_map &parent_map, + char* fptr); class KmerScanner { public: diff --git a/src/query_taxdb.cpp b/src/query_taxdb.cpp new file mode 100644 index 0000000..7412792 --- /dev/null +++ b/src/query_taxdb.cpp @@ -0,0 +1,138 @@ +/* + * Copyright 2017, Florian Breitwieser + * + * This file is part of the Kraken taxonomic sequence classification system. + * + * Kraken is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Kraken is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Kraken. If not, see . 
+ */ + +#include "taxdb.h" +#include +#include +#include +#include +#include +#include + +using namespace std; + +string return_rank; + +void process_taxID(char mode, uint32_t taxID); +void process_taxIDs(char mode, vector taxIDs); +size_t parse_command_line(int argc, char **argv); +void usage(int exit_code=EX_USAGE); + +TaxonomyDB taxdb; + +int main(int argc, char **argv) { + size_t optind = parse_command_line(argc, argv); + + string line; + uint32_t taxID; + char mode = *argv[optind++]; + for (;optind < argc; ++optind) { + if (strcmp(argv[optind],"-") == 0) { + // read STDIN + if (mode == 'l') { + while (getline(std::cin, line)) { + stringstream ss(line); + vector taxIDs; + while (ss >> taxID) { + taxIDs.push_back(taxID); + } + process_taxIDs(mode,taxIDs); + } + } + while (std::cin >> taxID) { + process_taxID(mode,taxID); + } + } else { + taxID = atol(argv[optind]); + process_taxID(mode,taxID); + } + } + + exit(1); +} +void process_taxIDs(char mode, vector taxIDs) { + switch (mode) { + + case 'r': + if (!return_rank.empty()) { + cout << taxdb.getTaxIDAtRank(taxIDs[0], return_rank) << '\n'; + } + break; + case 'l': + cout << taxdb.getEntry(taxdb.getLowestCommonAncestor(taxIDs)).rank << endl; + break; + default: + usage(); + break; + } +} + + +void process_taxID(char mode, uint32_t taxID) { + switch (mode) { + case 'r': + if (!return_rank.empty()) { + cout << taxdb.getTaxIDAtRank(taxID, return_rank) << '\n'; + } + break; + case 'l': + default: + usage(); + break; + } +} + +size_t parse_command_line(int argc, char **argv) { + int opt; + long long sig; + + if (argc > 1 && strcmp(argv[1], "-h") == 0) + usage(0); + + while ((opt = getopt(argc, argv, "r:m:")) != -1) { + switch (opt) { + case 'r': + return_rank = optarg; + break; + default: + usage(); + break; + } + } + + if (argv[optind] == NULL || argv[optind + 1] == NULL) { + printf("Mandatory argument(s) missing\n"); + exit(1); + } + + taxdb.readTaxonomyIndex(argv[optind++], false); + return optind; +} + +void usage(int exit_code) { + cerr << "Usage: query_taxdb [options] taxDB mode [taxIDs]" << endl + << endl + << "Options: (*mandatory)" << endl + << " -m mode Mode: l for LCA, r for rank" << endl + << " -r rank Output parent rank of taxIDs" << endl + << " -h Print this message" << endl + << endl; + exit(exit_code); +} + diff --git a/src/read_uid_mapping.cpp b/src/read_uid_mapping.cpp new file mode 100644 index 0000000..76b839a --- /dev/null +++ b/src/read_uid_mapping.cpp @@ -0,0 +1,76 @@ + +#include "kraken_headers.hpp" +#include "quickfile.hpp" +#include +#include + +using namespace std; +using namespace kraken; + +vector get_taxids_for_uid(uint32_t uid, char* fptr) { + size_t int_size = sizeof(int); + size_t block_size = sizeof(int)*2; + // TODO: Just get a uint64_t and shift the bits, probably faster + uint32_t taxid = *(uint32_t*)(fptr+(uid-1)*block_size); + uint32_t parent_uid = *(uint32_t*)(fptr+(uid-1)*block_size + int_size); + + vector taxids = {taxid}; + while (parent_uid != 0) { + taxid = *(uint32_t*)(fptr+(parent_uid-1)*block_size); + parent_uid = *(uint32_t*)(fptr+(parent_uid-1)*block_size + int_size); + taxids.push_back(taxid); + } + std::sort(taxids.begin(), taxids.end()); + return(taxids); +} + + +vector get_taxids_for_uid_from_map(uint32_t uid, char* fptr, unordered_map >& uid_map ) { + auto it = uid_map.find(uid); + if (it != uid_map.end()) { + return it->second; + } + vector taxids = get_taxids_for_uid(uid, fptr); + uid_map[uid] = taxids; + return(taxids); +} + +int main(int argc, char **argv) { + if (argc < 2) { + 
std::cerr << "Usage: read_uid_mapping []" + "The file is supposed to have lines terminated by '\n'." + << std::endl; + return 1; + } + char *filename = argv[1]; + kraken::QuickFile UID_to_TaxID_map_file; + UID_to_TaxID_map_file.open_file(filename); + + char* fptr = UID_to_TaxID_map_file.ptr(); + if (argc == 2) { + vector< vector > UIDs_to_taxids; + uint32_t UID = 1; + size_t int_size = sizeof(UID); + size_t i = 0; + for (size_t pos = 0; pos < UID_to_TaxID_map_file.size(); pos += 2*int_size) { + uint32_t* taxid_ptr = (uint32_t*)(fptr+pos); + uint32_t* parent_uid = (uint32_t*)(fptr+pos+int_size); + //UIDs_to_taxids.push_back( { UIDs_to_taxids[] } ); + //pos += int_size; + cout << ++i << '\t' << *taxid_ptr << '\t' << *parent_uid << endl; + } + } else { + unordered_map > UID_to_TaxID_map; + for (int i=2; i taxids = get_taxids_for_uid(UID, UID_to_TaxID_map, fptr); + cout << UID << '\t'; + for (auto t : taxids) { + cout << t << ' '; + } + cout << endl; + } + } + + return 0; +} diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 61504d7..1396a7f 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -1,3 +1,4 @@ +// vim: noai:ts=2:sw=2:expandtab:smarttab /* * Copyright 2013-2015, Derrick Wood * @@ -25,6 +26,7 @@ #include "taxdb.h" #include "readcounts.hpp" #include +#include #define SKIP_LEN 50000 @@ -39,7 +41,8 @@ void process_file(string filename, uint32_t taxid); void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish); int Num_threads = 1; -string DB_filename, Index_filename, TaxDB_filename, +string DB_filename, Index_filename, + Output_DB_filename, TaxDB_filename, File_to_taxon_map_filename, ID_to_taxon_map_filename, Multi_fasta_filename; bool force_taxid = false; @@ -50,8 +53,23 @@ bool verbose = false; bool Operate_in_RAM = false; bool One_FASTA_file = false; bool Add_taxIds_for_Sequences = false; +bool Use_uids_instead_of_taxids = false; +bool Output_UID_map_to_STDOUT = false; +bool Pretend = false; +string UID_map_filename; +ofstream UID_map_file; + +uint32_t current_uid = 0; +uint32_t max_uid = -1; unordered_map Parent_map; +//unordered_multimap Children_map; +//typedef std::_Rb_tree_iterator, unsigned int> > map_it; +//typedef std::_Rb_tree_iterator, unsigned int> > map_it; +typedef const vector* map_it; +vector< map_it > UID_to_taxids_vec; +map< vector, uint32_t> Taxids_to_UID_map; + unordered_map ID_to_taxon_map; unordered_map SeqId_added; KrakenDB Database; @@ -65,15 +83,26 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); if (!TaxDB_filename.empty() && !force_taxid) { - taxdb = TaxonomyDB(TaxDB_filename); - for (const auto & tax : taxdb.taxIDsAndEntries) { - if (tax.first != 0) - Parent_map[tax.first] = tax.second.parentTaxonomyID; - } - Parent_map[1] = 0; + taxdb = TaxonomyDB(TaxDB_filename); + for (const auto & tax : taxdb.taxIDsAndEntries) { + if (tax.first != 0) + Parent_map[tax.first] = tax.second.parentTaxonomyID; +// Children_map[tax.second.parentTaxonomyID].insert(tax.first); + } + Parent_map[1] = 0; } else { - cerr << "TaxDB argument is required!" << endl; - return 1; + cerr << "TaxDB argument is required!" << endl; + return 1; + } + + if (Use_uids_instead_of_taxids) { + UID_map_file.open(UID_map_filename, ios_base::out | ios_base::binary); + + if (!UID_map_file.is_open()) { + cerr << "Something went wrong while creating the file." 
<< endl; + exit(1); + } + } QuickFile db_file(DB_filename, "rw"); @@ -90,6 +119,14 @@ int main(int argc, char **argv) { Database = KrakenDB(temp_ptr); cerr << "done" << endl; } else { + if (Output_DB_filename.size() > 0) { + cerr << "You need to operate in RAM (flag -M) to use output to a different file (flag -o)" << endl; + return 1; + } + //std::ifstream ifs("input.txt", std::ios::binary); + //std::ofstream ofs("output.txt", std::ios::binary); + //ofs << ifs.rdbuf(); + Database = KrakenDB(db_file.ptr()); } @@ -104,15 +141,22 @@ int main(int argc, char **argv) { else process_files(); - if (Operate_in_RAM) { + if (Operate_in_RAM && !Pretend) { + if (Output_DB_filename.size() > 0) { + DB_filename = Output_DB_filename; + } + cerr << "Writing database from RAM back to " << DB_filename << " ..." << endl; ofstream ofs(DB_filename.c_str(), ofstream::binary); ofs.write(temp_ptr, db_file_size); ofs.close(); delete temp_ptr; } + UID_map_file.close(); - if (Add_taxIds_for_Sequences && !TaxDB_filename.empty()) { + // Write new TaxDB file if new taxids were added + if (Add_taxIds_for_Sequences && !TaxDB_filename.empty() && !Pretend) { + cerr << "Writing new TaxDB ..." << endl; ofstream ofs(TaxDB_filename.c_str()); taxdb.writeTaxonomyIndex(ofs); ofs.close(); @@ -171,7 +215,10 @@ void process_single_file() { uint32_t taxid; string prefix = "kraken:taxid|"; if (dna.id.substr(0,prefix.size()) == prefix) { - taxid = std::atoi(dna.id.substr(prefix.size()).c_str()); + taxid = std::stol(dna.id.substr(prefix.size())); + if (taxid == 0) { + cerr << "Error: taxid is zero for the line '" << dna.id << "'?!" << endl; + } const auto strBegin = dna.header_line.find_first_not_of("\t "); if (strBegin != std::string::npos) dna.header_line = dna.header_line.substr(strBegin); @@ -181,9 +228,9 @@ void process_single_file() { if (Add_taxIds_for_Sequences) { auto entryIt = taxdb.taxIDsAndEntries.find(taxid); - if (entryIt == taxdb.taxIDsAndEntries.end()) { + if (entryIt == taxdb.taxIDsAndEntries.end()) { cerr << "Error! Didn't find " << taxid << " in TaxonomyDB!!" << endl; - } else { + } else { entryIt->second.scientificName = dna.header_line; } } @@ -195,10 +242,9 @@ void process_single_file() { ++seqs_processed; } else { - if (verbose) - cerr << "Skipping sequence with header [" << dna.header_line << "] - no taxid" << endl; - - ++seqs_no_taxid; + if (verbose) + cerr << "Skipping sequence with header [" << dna.header_line << "] - no taxid" << endl; + ++seqs_no_taxid; } cerr << "\rProcessed " << seqs_processed << " sequences"; @@ -255,7 +301,7 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { continue; val_ptr = Database.kmer_query( Database.canonical_representation(*kmer_ptr) - ); + ); if (val_ptr == NULL) { if (! Allow_extra_kmers) { errx(EX_DATAERR, "kmer found in sequence that is not in database"); @@ -265,10 +311,69 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { } continue; } - if (!force_taxid) - *val_ptr = lca(Parent_map, taxid, *val_ptr); - else - *val_ptr = taxid; + if (Use_uids_instead_of_taxids) { + uint32_t kmer_uid = *val_ptr; + bool new_taxid = kmer_uid == 0; + vector taxid_set; + if (new_taxid) { + taxid_set.push_back(taxid); + } else { + if (kmer_uid > UID_to_taxids_vec.size()) { + // This can happen when set_lcas is called on a database that is not all zeros + cerr << "kmer_uid ("<< kmer_uid <<") greater than UID vector size ("<< UID_to_taxids_vec.size()<<")!!" 
<< endl; + exit(1); + } + taxid_set = *(UID_to_taxids_vec.at(kmer_uid-1)); + auto it = std::lower_bound( taxid_set.begin(), taxid_set.end(), taxid); // find proper position in descending order + + if (it == taxid_set.end() || *it != taxid) { + // add the taxid to the set, in the right position + taxid_set.insert( it, taxid ); // insert before iterator it + new_taxid = true; + } + } + + if (new_taxid) { + if (max_uid <= current_uid) { + cerr << "Maxxed out on the UIDs!!" << endl; + exit(1); + } + + // get a new taxid for this set + #pragma omp critical(new_uid) + { + auto insert_res = Taxids_to_UID_map.insert( { std::move(taxid_set), current_uid + 1 } ); + if (insert_res.second) { + ++current_uid; + + // print result for map: + if (Output_UID_map_to_STDOUT) { + auto tid_it = insert_res.first->first.begin(); + cout << current_uid << '\t' << *tid_it++; + while (tid_it != insert_res.first->first.end()) { cout << ' ' << *tid_it++; } + cout << '\n'; + } + + // FORMAT: TAXID PARENT + // TODO: Consider using mmap here + UID_map_file.write((char*)&taxid, sizeof(taxid)); + UID_map_file.write((char*)&kmer_uid, sizeof(kmer_uid)); + + //UID_to_taxids_vec[current_uid] = taxid_set; + UID_to_taxids_vec.push_back( &(insert_res.first->first) ); + *val_ptr = current_uid; + } else { + *val_ptr = insert_res.first->second; + } + } + } + } else if (!force_taxid) { + *val_ptr = lca(Parent_map, taxid, *val_ptr); + } else { + // When force_taxid is set, do not compute lca, but assign the taxid + // of the (last) sequence to k-mers + *val_ptr = taxid; + } } } @@ -278,11 +383,18 @@ void parse_command_line(int argc, char **argv) { if (argc > 1 && strcmp(argv[1], "-h") == 0) usage(0); - while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTvb:a")) != -1) { + while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTvb:apI:o:S")) != -1) { switch (opt) { case 'f' : File_to_taxon_map_filename = optarg; break; + case 'I' : + Use_uids_instead_of_taxids = true; + UID_map_filename = optarg; + break; + case 'S' : + Output_UID_map_to_STDOUT = true; + break; case 'd' : DB_filename = optarg; break; @@ -324,7 +436,12 @@ void parse_command_line(int argc, char **argv) { case 'M' : Operate_in_RAM = true; break; - + case 'o' : + Output_DB_filename = optarg; + break; + case 'p' : + Pretend = true; + break; default: usage(); break; @@ -353,12 +470,16 @@ void usage(int exit_code) { << "* -b filename Taxonomy DB file" << endl << " -t # Number of threads" << endl << " -M Copy DB to RAM during operation" << endl + << " -o filename Output database to filename, instead of overwriting the input database" << endl << " -x K-mers not found in DB do not cause errors" << endl << " -f filename File to taxon map" << endl << " -F filename Multi-FASTA file with sequence data" << endl << " -m filename Sequence ID to taxon map" << endl << " -a Add taxonomy IDs (starting with "< -uint64_t string_to_T(string str) { - stringstream stream(str); - T result; - stream >> result; - return result; -} - -template -inline -uint64_t reads(const T read_count) { - cerr << "No reads function for type!! 
" << endl; - throw ; - return(0); -} - - - -inline -uint64_t reads(const uint64_t read_count) { - return(read_count); -} - -std::vector in_betweens(const std::string &s, const char start_char, const char end_char, size_t start_at) { - std::vector tokens; - size_t i = 0; - size_t next_end = start_at-1; - - for (size_t next_start = s.find(start_char, next_end + 1); \ - next_start != string::npos; - next_start = s.find(start_char, next_end + 1), ++i) { - - next_end = s.find(end_char, next_start + 1); - if (next_end == string::npos) - throw std::runtime_error("unmatched start and end!"); - - tokens.push_back(s.substr(next_start+1, next_end-1)); - } - - return tokens; -} - - - -std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields, size_t end_chars) { - std::vector tokens(max_fields); - size_t delim_length = delimiter.length(); - size_t last = 0; - size_t i = 0; - - for (size_t next = s.find(delimiter, last); - (max_fields > 0 && i < max_fields) && next != string::npos; - next = s.find(delimiter, last), ++i) { - tokens[i] = s.substr(last, next-last); - last = next + delim_length; - } - if (max_fields > 0 && i < max_fields) { - tokens[max_fields-1] = s.substr(last, s.length()-last-end_chars); - } - - return tokens; -} - -std::vector get_fields(const std::string &s, const std::string& delimiter, vector fields) { - std::vector tokens; - tokens.reserve(fields.size()); - size_t delim_length = delimiter.length(); - size_t last = 0; - size_t i = 0; - size_t current_field = 0; - - for (size_t next = s.find(delimiter, last); - tokens.size() < fields.size() && next != string::npos; - next = s.find(delimiter, last), ++i) { - if (i == fields[current_field]) { - tokens.push_back(s.substr(last, next-last)); - ++current_field; - } - last = next + delim_length; - } - - return tokens; -} - - - -//template<> -//TaxonomyEntry::TaxonomyEntry () { -// read_counts = 0; -// read_counts_children = 0; -//} -template -bool TaxonomyEntryPtr_comp::operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { - return ((reads(a->read_counts)+reads(a->read_counts_children)) > (reads(b->read_counts)+reads(b->read_counts_children))); - } - - -template -std::unordered_map TaxonomyDB::getScientificNameMap() const { - std::unordered_map scientificNameMap; - for (const auto & tax : taxIDsAndEntries) { - scientificNameMap[tax.second.scientificName] = tax.first; - } - return scientificNameMap; -} - -template -unordered_map TaxonomyDB::getParentMap() const { - unordered_map Parent_map; - for (const auto & tax : taxIDsAndEntries) { - if (tax.first != 0) - Parent_map[tax.first] = tax.second.parentTaxonomyID; - } - Parent_map[1] = 1; - return Parent_map; -} - -template -void TaxonomyDB::createPointers() { - for (auto& tax : taxIDsAndEntries) { - if (tax.second.parentTaxonomyID != tax.first) { - auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); - if (parentIt != taxIDsAndEntries.end()) { - tax.second.parent = &(parentIt->second); - parentIt->second.children.push_back(&tax.second); - } - } - } -} - -template -TaxonomyDB::TaxonomyDB() { } - -template -TaxonomyDB::TaxonomyDB(const std::string inFileName) { - log_msg("Building taxonomy index"); - readTaxonomyIndex(inFileName); - createPointers(); - log_msg("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + - " nodes"); -} - -template -void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { - std::ifstream nodesDumpFile(nodesDumpFileName); - if (!nodesDumpFile.is_open()) - throw 
std::runtime_error("unable to open nodes file"); - std::string line; - - TAXID taxonomyID; - TAXID parentTaxonomyID; - std::string rank; - - while (nodesDumpFile.good()) { - getline(nodesDumpFile, line); - std::vector tokens = tokenise(line, "\t|\t", 3, 2); - if (tokens.size() < 3) { - continue; - } - - taxonomyID = string_to_T(tokens[0]); - parentTaxonomyID = string_to_T(tokens[1]); - rank = tokens[2]; - - auto entryIt = taxIDsAndEntries.find(taxonomyID); - if (entryIt == taxIDsAndEntries.end()) { - taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); - } else { - entryIt->second.parentTaxonomyID = parentTaxonomyID; - entryIt->second.rank = rank; - } - } -} - -template -void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { - std::ifstream namesDumpFile(namesDumpFileName); - if (!namesDumpFile.is_open()) - throw std::runtime_error("unable to open names file"); - std::string line; - - TAXID taxonomyID; - std::string scientificName; - while (namesDumpFile.good()) { - getline(namesDumpFile, line); - std::vector tokens = tokenise(line, "\t|\t", 4, 2); - if (tokens.size() < 4 || tokens[3] != "scientific name") { - continue; - } - taxonomyID = string_to_T(tokens[0]); - scientificName = tokens[1]; - - auto entryIt = taxIDsAndEntries.find(taxonomyID); - if (entryIt == taxIDsAndEntries.end()) { - taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, scientificName); - } else { - entryIt->second.scientificName = scientificName; - } - } -} - -template -void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, - const std::string namesDumpFileName, - const std::string nodesDumpFileName) { - parseNodesDump(nodesDumpFileName); - parseNamesDump(namesDumpFileName); - writeTaxonomyIndex(outs); -} - -template -std::vector getSortedKeys(const std::unordered_map& unordered) { - std::vector keys; - keys.reserve (unordered.size()); - for (auto& it : unordered) { - keys.push_back(it.first); - } - std::sort (keys.begin(), keys.end()); - return keys; -} - -template -void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { - for (TAXID& key : getSortedKeys(taxIDsAndEntries)) { - const auto& entry = taxIDsAndEntries.at(key); - outs << key << "\t" << entry.parentTaxonomyID << "\t" - << entry.scientificName << "\t" << entry.rank << "\n"; - } -} - - - -template -void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { - std::ifstream inFile(inFileName); - if (!inFile.is_open()) - throw std::runtime_error("unable to open taxonomy index file " + inFileName); - - TAXID taxonomyID, parentTaxonomyID; - std::string scientificName, rank; - - std::string line; - while (!inFile.eof()) { - inFile >> taxonomyID >> parentTaxonomyID; - inFile.get(); // read tab - std::getline(inFile, scientificName, '\t'); - std::getline(inFile, rank, '\n'); - TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); - - //cerr << "inserting " << taxonomyID << ";" << parentTaxonomyID << ";" << rank << ";" << scientificName << endl; - taxIDsAndEntries.insert({ - taxonomyID, newEntry - }); - } - taxIDsAndEntries.insert({ - 0, {0, 0, "no rank", "unclassified" } - }); -} - -template -TAXID TaxonomyDB::getLowestCommonAncestor( - const std::vector& taxIDs) const { - if (taxIDs.size() == 0) { - return 0; - } - std::vector > paths; - for (auto& taxID : taxIDs) { - bool good = true; - std::vector path; - TAXID tempTaxID = taxID; - while (tempTaxID != 0) { - path.push_back(tempTaxID); - tempTaxID = getParentTaxID(tempTaxID); - } - if (good) paths.push_back(path); - } - if 
(paths.size() == 0) { - return 0; - } - for (auto& path : paths) - std::reverse(path.begin(), path.end()); - std::sort(paths.begin(), paths.end(), - [](std::vector i, std::vector j) { - return i.size() < j.size(); - }); - TAXID consensus = 0; - for (unsigned i = 0; i < paths[0].size(); i++) { - TAXID temp = 0; - for (auto& path : paths) { - if (temp == 0) - temp = path[i]; - else if (temp != path[i]) { - return consensus; - } - } - consensus = temp; - } - return consensus; -} - -template -TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { - auto entry = taxIDsAndEntries.find(taxID); - if (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) - return entry->second.parentTaxonomyID; - else - return 0; -} - -template -std::string TaxonomyDB::getScientificName(const TAXID taxID) const { - auto entry = taxIDsAndEntries.find(taxID); - if (entry != taxIDsAndEntries.end()) { - return entry->second.scientificName; - } else - return std::string(); -} - -template -std::string TaxonomyDB::getRank(const TAXID taxID) const { - auto entry = taxIDsAndEntries.find(taxID); - if (entry != taxIDsAndEntries.end()) { - return entry->second.rank; - } else - return std::string(); -} - -template -std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { - std::string lineage; - while (true) { - // 131567 = Cellular organisms - if (taxonomyID != 131567) { - if (lineage.size()) lineage.insert(0, "; "); - lineage.insert(0, getScientificName(taxonomyID)); - if (getRank(taxonomyID) == "species") lineage.clear(); - } - taxonomyID = getParentTaxID(taxonomyID); - if (taxonomyID == 0) { - if (lineage.size()) lineage.append("."); - break; - } - } - return lineage; -} - -template -std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { - std::string rank = getRank(taxonomyID); - if (rank == "superphylum") return std::string(); - std::string lineage; - while (true) { - // 131567 = Cellular organisms - if (taxonomyID != 131567) { - std::string rank = getRank(taxonomyID); - if (rank == "species") { - lineage.insert(0, "|s__"); - lineage.insert(4, getScientificName(taxonomyID)); - } else if (rank == "genus") { - lineage.insert(0, "|g__"); - lineage.insert(4, getScientificName(taxonomyID)); - } else if (rank == "family") { - lineage.insert(0, "|f__"); - lineage.insert(4, getScientificName(taxonomyID)); - } else if (rank == "order") { - lineage.insert(0, "|o__"); - lineage.insert(4, getScientificName(taxonomyID)); - } else if (rank == "class") { - lineage.insert(0, "|c__"); - lineage.insert(4, getScientificName(taxonomyID)); - } else if (rank == "phylum") { - lineage.insert(0, "|p__"); - lineage.insert(4, getScientificName(taxonomyID)); - } else if (rank == "superkingdom") { - lineage.insert(0, "k__"); - lineage.insert(3, getScientificName(taxonomyID)); - } - } - taxonomyID = getParentTaxID(taxonomyID); - if (taxonomyID == 0) { - break; - } - } - std::replace(lineage.begin(), lineage.end(), ' ', '_'); - return lineage; -} - -template -TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, - const std::string& rank) const { - auto entry = taxIDsAndEntries.find(taxID); - while (entry != taxIDsAndEntries.end() && - entry->second.parentTaxonomyID != 1) { - if (entry->second.rank == rank) { - return entry->second.taxonomyID; - } else - entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); - } - return 0; -} - -template -int TaxonomyDB::isBelowInTree(TAXID upper, TAXID lower) const { - auto entry = taxIDsAndEntries.find(lower); - unsigned level = 0; - while (entry != 
taxIDsAndEntries.end() && - entry->second.parentTaxonomyID != 1) { - if (entry->first == upper) { - return level; - } else { - entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); - level++; - } - } - return -1; -} - -template -bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { - bool isSubSpecies = false; - auto entry = taxIDsAndEntries.find(taxonomyID); - int numLevels = 0; - while (entry != taxIDsAndEntries.end() && - entry->second.parentTaxonomyID != 1) { - if (entry->second.rank == "species") { - if (numLevels > 0) { - isSubSpecies = true; - } - break; - } else - entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); - numLevels++; - } - return isSubSpecies; -} - -template -void TaxonomyDB::addCounts(const TAXID taxid, const READCOUNTS& read_counts_) { - auto it = taxIDsAndEntries.find(taxid); - if (it == taxIDsAndEntries.end()) { - cerr << "No taxonomy entry for " << taxid << "!!" << endl; - return; - } - TaxonomyEntry* tax = &it->second; - //cerr << taxid << " rc before: " << tax->read_counts << endl; - tax->read_counts += read_counts_; - //cerr << taxid << " rc after: " << tax->read_counts << endl; - - while (tax->parent != nullptr) { - tax = tax->parent; - tax->read_counts_children += read_counts_; - } -} - -template -void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { - for (auto& elem : taxon_counts) { - addCounts(elem.first, elem.second); - } - - for (auto& tax : taxIDsAndEntries) { - std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); - } -} - - -template -TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { - _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; -} - -template -void TaxReport::printReport(std::string format, std::string rank) { - _total_n_reads = - reads(_taxdb.taxIDsAndEntries.at(0).read_counts) + - reads(_taxdb.taxIDsAndEntries.at(0).read_counts_children) + - reads(_taxdb.taxIDsAndEntries.at(1).read_counts) + - reads(_taxdb.taxIDsAndEntries.at(1).read_counts_children);// + - if (_total_n_reads == 0) { - std::cerr << "total number of reads is zero - not creating a report!" << endl; - return; - } - - if (format == "kraken") { - // A: print number of unidentified reads - printReport(_taxdb.taxIDsAndEntries.at(0),0u); - // B: print normal results - printReport(_taxdb.taxIDsAndEntries.at(1),0u); - // C: Print Unclassified stuff - //printReport(_taxdb.taxIDsAndEntries.at(-1),0u); - } else { - // print stuff at a certain level .. - //_uid_abundance; - //_taxinfo - - } -} - -template -void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { - if (_show_zeros || (reads(tax.read_counts)+reads(tax.read_counts_children)) > 0) { - printLine(tax, depth); - for (auto child : tax.children) - printReport(*child, depth+1); - } -} - -template -void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { - for (auto& col : _report_cols) { - switch (col) { - case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; - case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; - case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; - case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.read_counts) + reads(tax.read_counts_children))/_total_n_reads; break; - //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; - //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; - case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.read_counts) + reads(tax.read_counts_children)); break; - case REPORTCOLS::NUM_READS: _reportOfb << (tax.read_counts); break; - //case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break; - //case REPORTCOLS::NUM_KMERS: _reportOfb << tax.numKmers; break; - //case REPORTCOLS::GENOME_SIZE: ; break; - //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; - //case REPORTCOLS::SUM_SCORE: ; break; - case REPORTCOLS::TAX_RANK: _reportOfb << tax.rank; break; - default: _reportOfb << "NA"; - } - if (&col == &_report_cols.back()) { - _reportOfb << '\n'; - } else { - _reportOfb << '\t'; - } - } -} - - - // Return lowest common ancestor of a and b - // LCA(0,x) = LCA(x,0) = x - // Default ancestor is 1 (root of tree) -uint32_t lca(unordered_map &parent_map, uint32_t a, uint32_t b) - { - if (a == 0 || b == 0) - return a ? a : b; - - // create a path from a to the root - std::unordered_set a_path; - while (a > 0 && a != parent_map[a]) { - if (a == b) - return a; - a_path.insert(a); - a = parent_map[a]; - } - - // search for b in the path from a to the root - while (b > 0 && b != parent_map[b]) { - if (a_path.count(b) > 0) - return b; - b = parent_map[b]; - } - return 1; - } - -template -inline -V find_or_use_default(const std::unordered_map& my_map, const K& query, const V default_value) { - auto itr = my_map.find(query); - - if (itr == my_map.end()) { - return default_value; - } - - return itr->second; -} - - - diff --git a/src/taxdb.h b/src/taxdb.h index 3b825f2..7c94f33 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -101,16 +101,21 @@ class TaxonomyDB { TaxonomyDB(const std::string inFileName, bool hasGenomeSizes = false); TaxonomyDB(); void writeTaxonomyIndex(std::ostream & outs) const; + void readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes); TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const; std::string getScientificName(const TAXID taxID) const; std::string getRank(const TAXID taxID) const; TAXID getLowestCommonAncestor(const std::vector& taxIDs) const; + TAXID getParentTaxID(const TAXID taxID) const; std::unordered_map getParentMap() const; std::unordered_map getScientificNameMap() const; std::string getLineage(TAXID taxonomyID) const; std::string getMetaPhlAnLineage(TAXID taxonomyID) const; + TaxonomyEntry getEntry(TAXID taxID) const; + + size_t distance(TAXID taxID1, TAXID taxID2) const; bool isSubSpecies(TAXID taxonomyID) const; int isBelowInTree(TAXID upper, TAXID lower) const; @@ -125,8 +130,9 @@ class TaxonomyDB { std::unordered_map > taxIDsAndEntries; bool genomeSizes_are_set = false; private: - std::unordered_map > - readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes); + + std::unordered_map > + readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes); void parseNamesDump(const std::string namesDumpFileName); void parseNodesDump(const std::string nodesDumpFileName); void createPointers(std::unordered_map >& taxIDsAndEntries); @@ -281,6 +287,17 @@ unordered_map TaxonomyDB::getParentMap() const { return Parent_map; } +template +TaxonomyEntry 
TaxonomyDB::getEntry(TAXID taxID) const { + auto it = taxIDsAndEntries.find(taxID); + if (it == taxIDsAndEntries.end()) { + TaxonomyEntry ti { 0, 0, "NA"}; + return ti; + } else { + return it->second; + } +} + template void TaxonomyDB::createPointers(std::unordered_map >& taxIDsAndEntries) { for (auto& tax : taxIDsAndEntries) { @@ -299,7 +316,7 @@ TaxonomyDB::TaxonomyDB() { } template TaxonomyDB::TaxonomyDB(const std::string inFileName, bool hasGenomeSizes) : - taxIDsAndEntries( readTaxonomyIndex(inFileName, hasGenomeSizes) ), genomeSizes_are_set(hasGenomeSizes) + taxIDsAndEntries( readTaxonomyIndex_(inFileName, hasGenomeSizes) ), genomeSizes_are_set(hasGenomeSizes) { } template @@ -403,9 +420,15 @@ void TaxonomyDB::setGenomeSizes(const std::unordered_map +void TaxonomyDB::readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes) { + taxIDsAndEntries = readTaxonomyIndex_(inFileName, hasGenomeSizes); + genomeSizes_are_set = hasGenomeSizes; +} + template std::unordered_map > - TaxonomyDB::readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes) { + TaxonomyDB::readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes) { log_msg("Reading taxonomy index from " + inFileName); std::ifstream inFile(inFileName); if (!inFile.is_open()) @@ -418,12 +441,12 @@ std::unordered_map > std::string line; while (!inFile.eof()) { - inFile >> taxonomyID >> parentTaxonomyID; - inFile.get(); // read tab - std::getline(inFile, scientificName, '\t'); + inFile >> taxonomyID >> parentTaxonomyID; + inFile.get(); // read tab + std::getline(inFile, scientificName, '\t'); if (hasGenomeSizes) { std::getline(inFile, rank, '\t'); - inFile >> genomeSize >> genomeSizeOfChildren; + inFile >> genomeSize >> genomeSizeOfChildren; } else { std::getline(inFile, rank, '\n'); } @@ -438,7 +461,7 @@ std::unordered_map > 0, {0, 0, "no rank", "unclassified" } }); createPointers(taxIDsAndEntries); - log_msg("Finished, read " + std::to_string(taxIDsAndEntries.size()) + " taxa"); + log_msg("done reading TaxDB, read " + std::to_string(taxIDsAndEntries.size()) + " taxa"); return(taxIDsAndEntries); } @@ -469,6 +492,7 @@ TAXID TaxonomyDB::getLowestCommonAncestor( return i.size() < j.size(); }); TAXID consensus = 0; + // assumes equal paths lengths?? 
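+  // (It does not: the paths are root-to-leaf and sorted by ascending length,
+  // so the loop below only indexes up to the end of the shortest path and
+  // keeps the deepest position on which all paths still agree.)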
for (unsigned i = 0; i < paths[0].size(); i++) { TAXID temp = 0; for (auto& path : paths) { @@ -574,12 +598,15 @@ template TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, const std::string& rank) const { auto entry = taxIDsAndEntries.find(taxID); + //cerr << "getTaxIDAtRank(" << taxID << "," << rank << ")" << endl; while (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) { + //cerr << "Checking rank of " << entry->second.taxonomyID << ": " << entry->second.rank << endl; if (entry->second.rank == rank) { return entry->second.taxonomyID; - } else + } else { entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + } } return 0; } From 2873a79b35b482eab30d9a196ece730ed631eb63 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 28 Aug 2017 09:54:52 -0400 Subject: [PATCH 037/105] Renamed to KrakenU --- .gitignore | 1 + README.md | 8 ++++---- install_kraken.sh | 4 ++-- scripts/{kraken_hll => krakenu} | 0 ...en_hll-add_to_library.sh => krakenu-add_to_library.sh} | 0 scripts/{kraken_hll-build => krakenu-build} | 2 +- scripts/{kraken_hll-build_db.sh => krakenu-build_db.sh} | 2 +- ...ck_for_jellyfish.sh => krakenu-check_for_jellyfish.sh} | 0 scripts/{kraken_hll-clean_db.sh => krakenu-clean_db.sh} | 0 ...ll-cp_into_tempfile.pl => krakenu-cp_into_tempfile.pl} | 0 ...mic_library.sh => krakenu-download_genomic_library.sh} | 0 ...-download_taxonomy.sh => krakenu-download_taxonomy.sh} | 0 scripts/{kraken_hll-filter => krakenu-filter} | 0 scripts/{kraken_hll-mpa-report => krakenu-mpa-report} | 0 .../{kraken_hll-read_merger.pl => krakenu-read_merger.pl} | 0 scripts/{kraken_hll-report => krakenu-report} | 0 scripts/{kraken_hll-shrink_db.sh => krakenu-shrink_db.sh} | 0 ...d_installation.sh => krakenu-standard_installation.sh} | 8 ++++---- scripts/{kraken_hll-translate => krakenu-translate} | 0 .../{kraken_hll-upgrade_db.sh => krakenu-upgrade_db.sh} | 0 ...-verify_gi_numbers.pl => krakenu-verify_gi_numbers.pl} | 0 src/Makefile | 2 +- 22 files changed, 14 insertions(+), 13 deletions(-) rename scripts/{kraken_hll => krakenu} (100%) rename scripts/{kraken_hll-add_to_library.sh => krakenu-add_to_library.sh} (100%) rename scripts/{kraken_hll-build => krakenu-build} (99%) rename scripts/{kraken_hll-build_db.sh => krakenu-build_db.sh} (99%) rename scripts/{kraken_hll-check_for_jellyfish.sh => krakenu-check_for_jellyfish.sh} (100%) rename scripts/{kraken_hll-clean_db.sh => krakenu-clean_db.sh} (100%) rename scripts/{kraken_hll-cp_into_tempfile.pl => krakenu-cp_into_tempfile.pl} (100%) rename scripts/{kraken_hll-download_genomic_library.sh => krakenu-download_genomic_library.sh} (100%) rename scripts/{kraken_hll-download_taxonomy.sh => krakenu-download_taxonomy.sh} (100%) rename scripts/{kraken_hll-filter => krakenu-filter} (100%) rename scripts/{kraken_hll-mpa-report => krakenu-mpa-report} (100%) rename scripts/{kraken_hll-read_merger.pl => krakenu-read_merger.pl} (100%) rename scripts/{kraken_hll-report => krakenu-report} (100%) rename scripts/{kraken_hll-shrink_db.sh => krakenu-shrink_db.sh} (100%) rename scripts/{kraken_hll-standard_installation.sh => krakenu-standard_installation.sh} (83%) rename scripts/{kraken_hll-translate => krakenu-translate} (100%) rename scripts/{kraken_hll-upgrade_db.sh => krakenu-upgrade_db.sh} (100%) rename scripts/{kraken_hll-verify_gi_numbers.pl => krakenu-verify_gi_numbers.pl} (100%) diff --git a/.gitignore b/.gitignore index 500b4a0..d6ff918 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /install/ +/Debug/ diff --git a/README.md 
b/README.md index a31d87c..83ae11b 100644 --- a/README.md +++ b/README.md @@ -33,13 +33,13 @@ Here's a small example of a classification against a viral database with k=25. T ## Usage -For usage, see `kraken_hll --help`. Note that you can use the same database as Kraken with one difference - instead of the files `DB_DIR/taxonomy/nodes.dmp` and `DB_DIR/taxonomy/names.dmp` than kraken relies upon, `kraken-hll` needs the file `DB_DIR/taxDB`. This can be generated with the script `build_taxdb`: `KRAKEN_DIR/build_taxdb DB_DIR/taxonomy/names.dmp DB_DIR/taxonomy/nodes.dmp > DB_DIR/taxDB`. The code behind the taxDB is based on [k-SLAM](https://github.com/aindj/k-SLAM). +For usage, see `krakenu --help`. Note that you can use the same database as Kraken with one difference - instead of the files `DB_DIR/taxonomy/nodes.dmp` and `DB_DIR/taxonomy/names.dmp` than kraken relies upon, `kraken-hll` needs the file `DB_DIR/taxDB`. This can be generated with the script `build_taxdb`: `KRAKEN_DIR/build_taxdb DB_DIR/taxonomy/names.dmp DB_DIR/taxonomy/nodes.dmp > DB_DIR/taxDB`. The code behind the taxDB is based on [k-SLAM](https://github.com/aindj/k-SLAM). ### Differences to `kraken` - - Use `kraken_hll --report-file FILENAME ...` to write the kraken report to `FILENAME`. - - Use `kraken_hll --db DB1 --db DB2 --db DB3 ...` to first attempt, for each k-mer, to assign it based on DB1, then DB2, then DB3. You can use this to prefer identifications based on DB1 (e.g. human and contaminant sequences), then DB2 (e.g. completed bacterial genomes), then DB3, etc. Note that this option is incompatible with `kraken_hll-build --generate-taxonomy-ids-for-sequences` since the taxDB between the databases has to be absolutely the same. + - Use `krakenu --report-file FILENAME ...` to write the kraken report to `FILENAME`. + - Use `krakenu --db DB1 --db DB2 --db DB3 ...` to first attempt, for each k-mer, to assign it based on DB1, then DB2, then DB3. You can use this to prefer identifications based on DB1 (e.g. human and contaminant sequences), then DB2 (e.g. completed bacterial genomes), then DB3, etc. Note that this option is incompatible with `krakenu-build --generate-taxonomy-ids-for-sequences` since the taxDB between the databases has to be absolutely the same. - Add a suffix `.gz` to output files to generate gzipped output files ### Differences to `kraken-build` - - Use `kraken_hll-build --generate-taxonomy-ids-for-sequences ...` to add pseudo-taxonomy IDs for each sequence header. An example for the result using this is in the ouput above - one read has been assigned specifically to `KC207814.1 Human herpesvirus 4 strain Mutu, complete genome`. + - Use `krakenu-build --generate-taxonomy-ids-for-sequences ...` to add pseudo-taxonomy IDs for each sequence header. An example for the result using this is in the ouput above - one read has been assigned specifically to `KC207814.1 Human herpesvirus 4 strain Mutu, complete genome`. - `seqid2taxid.map` mapping sequence IDs to taxonomy IDs does NOT parse or require `>gi|`, but rather the sequence ID is the header up to just before the first space diff --git a/install_kraken.sh b/install_kraken.sh index b909336..f6f5701 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -39,7 +39,7 @@ fi export KRAKEN_DIR=$(perl -MCwd=abs_path -le 'print abs_path(shift)' "$1") mkdir -p "$KRAKEN_DIR" -make -C src clean +#make -C src clean make -C src install for file in scripts/* do @@ -58,7 +58,7 @@ echo "Kraken installation complete." 
echo echo "To make things easier for you, you may want to copy/symlink the following" echo "files into a directory in your PATH:" -for file in $KRAKEN_DIR/kraken_hll* +for file in $KRAKEN_DIR/krakenu* do [ -x "$file" ] && echo " $file" done diff --git a/scripts/kraken_hll b/scripts/krakenu similarity index 100% rename from scripts/kraken_hll rename to scripts/krakenu diff --git a/scripts/kraken_hll-add_to_library.sh b/scripts/krakenu-add_to_library.sh similarity index 100% rename from scripts/kraken_hll-add_to_library.sh rename to scripts/krakenu-add_to_library.sh diff --git a/scripts/kraken_hll-build b/scripts/krakenu-build similarity index 99% rename from scripts/kraken_hll-build rename to scripts/krakenu-build index 8367fdd..9df965e 100755 --- a/scripts/kraken_hll-build +++ b/scripts/krakenu-build @@ -298,7 +298,7 @@ sub build_database { $ENV{"KRAKEN_REBUILD_DATABASE"} = (defined $rebuild? 1 : 0); $ENV{"KRAKEN_ADD_TAXIDS_FOR_SEQ"} = $add_taxonomy_ids_for_seq; my $opt = ($verbose? "-x" : ""); - exec "kraken_hll-build_db.sh $opt"; + exec "krakenu-build_db.sh"; } sub clean_database { diff --git a/scripts/kraken_hll-build_db.sh b/scripts/krakenu-build_db.sh similarity index 99% rename from scripts/kraken_hll-build_db.sh rename to scripts/krakenu-build_db.sh index 402dc45..e3d38ad 100755 --- a/scripts/kraken_hll-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -42,7 +42,7 @@ script_dir=`dirname $0` DATABASE_DIR="$KRAKEN_DB_NAME" FIND_OPTS=-L -JELLYFISH_BIN=`$script_dir/kraken_hll-check_for_jellyfish.sh` +JELLYFISH_BIN=`$script_dir/krakenu-check_for_jellyfish.sh` if [ ! -d "$DATABASE_DIR" ] then diff --git a/scripts/kraken_hll-check_for_jellyfish.sh b/scripts/krakenu-check_for_jellyfish.sh similarity index 100% rename from scripts/kraken_hll-check_for_jellyfish.sh rename to scripts/krakenu-check_for_jellyfish.sh diff --git a/scripts/kraken_hll-clean_db.sh b/scripts/krakenu-clean_db.sh similarity index 100% rename from scripts/kraken_hll-clean_db.sh rename to scripts/krakenu-clean_db.sh diff --git a/scripts/kraken_hll-cp_into_tempfile.pl b/scripts/krakenu-cp_into_tempfile.pl similarity index 100% rename from scripts/kraken_hll-cp_into_tempfile.pl rename to scripts/krakenu-cp_into_tempfile.pl diff --git a/scripts/kraken_hll-download_genomic_library.sh b/scripts/krakenu-download_genomic_library.sh similarity index 100% rename from scripts/kraken_hll-download_genomic_library.sh rename to scripts/krakenu-download_genomic_library.sh diff --git a/scripts/kraken_hll-download_taxonomy.sh b/scripts/krakenu-download_taxonomy.sh similarity index 100% rename from scripts/kraken_hll-download_taxonomy.sh rename to scripts/krakenu-download_taxonomy.sh diff --git a/scripts/kraken_hll-filter b/scripts/krakenu-filter similarity index 100% rename from scripts/kraken_hll-filter rename to scripts/krakenu-filter diff --git a/scripts/kraken_hll-mpa-report b/scripts/krakenu-mpa-report similarity index 100% rename from scripts/kraken_hll-mpa-report rename to scripts/krakenu-mpa-report diff --git a/scripts/kraken_hll-read_merger.pl b/scripts/krakenu-read_merger.pl similarity index 100% rename from scripts/kraken_hll-read_merger.pl rename to scripts/krakenu-read_merger.pl diff --git a/scripts/kraken_hll-report b/scripts/krakenu-report similarity index 100% rename from scripts/kraken_hll-report rename to scripts/krakenu-report diff --git a/scripts/kraken_hll-shrink_db.sh b/scripts/krakenu-shrink_db.sh similarity index 100% rename from scripts/kraken_hll-shrink_db.sh rename to scripts/krakenu-shrink_db.sh diff --git 
a/scripts/kraken_hll-standard_installation.sh b/scripts/krakenu-standard_installation.sh similarity index 83% rename from scripts/kraken_hll-standard_installation.sh rename to scripts/krakenu-standard_installation.sh index 341e4e0..e10254b 100755 --- a/scripts/kraken_hll-standard_installation.sh +++ b/scripts/krakenu-standard_installation.sh @@ -31,10 +31,10 @@ then fi check_for_jellyfish.sh -kraken_hll-build --db $KRAKEN_DB_NAME --download-taxonomy -kraken_hll-build --db $KRAKEN_DB_NAME --download-library bacteria -kraken_hll-build --db $KRAKEN_DB_NAME --download-library viruses -kraken_hll-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ +krakenu-build --db $KRAKEN_DB_NAME --download-taxonomy +krakenu-build --db $KRAKEN_DB_NAME --download-library bacteria +krakenu-build --db $KRAKEN_DB_NAME --download-library viruses +krakenu-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ --jellyfish-hash-size "$KRAKEN_HASH_SIZE" \ --max-db-size "$KRAKEN_MAX_DB_SIZE" \ --minimizer-len $KRAKEN_MINIMIZER_LEN \ diff --git a/scripts/kraken_hll-translate b/scripts/krakenu-translate similarity index 100% rename from scripts/kraken_hll-translate rename to scripts/krakenu-translate diff --git a/scripts/kraken_hll-upgrade_db.sh b/scripts/krakenu-upgrade_db.sh similarity index 100% rename from scripts/kraken_hll-upgrade_db.sh rename to scripts/krakenu-upgrade_db.sh diff --git a/scripts/kraken_hll-verify_gi_numbers.pl b/scripts/krakenu-verify_gi_numbers.pl similarity index 100% rename from scripts/kraken_hll-verify_gi_numbers.pl rename to scripts/krakenu-verify_gi_numbers.pl diff --git a/src/Makefile b/src/Makefile index 03f32cb..f721cf4 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ CXX = g++ -CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 +CXXFLAGS = -Wall -std=c++11 -fopenmp -O2 -g PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb LIBFLAGS = -L. 
-I./gzstream -L./gzstream -lz -lgzstream From 607cb0cf4826ec844abe11a6144a35a391f5be9b Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 31 Aug 2017 13:34:36 -0400 Subject: [PATCH 038/105] Put version string in separate file --- VERSION | 1 + install_kraken.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 VERSION diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..49d5957 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.1 diff --git a/install_kraken.sh b/install_kraken.sh index f6f5701..000c9b7 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -19,7 +19,7 @@ set -e -VERSION="0.10.7-kraken-hll" +VERSION=`cat $(dirname $0)/VERSION` if [ -z "$1" ] || [ -n "$2" ] then From 432d6ceb4eb87fb18e1f91529e324129e85940c9 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 31 Aug 2017 14:11:39 -0400 Subject: [PATCH 039/105] Fixed script paths --- scripts/krakenu-build | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/krakenu-build b/scripts/krakenu-build index 9df965e..267fc7d 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenu-build @@ -262,7 +262,7 @@ sub display_version { } sub download_taxonomy { - exec "download_taxonomy.sh"; + exec "krakenu-download_taxonomy.sh"; } sub download_library { @@ -271,12 +271,12 @@ sub download_library { warn "Unknown library type \"$type\"\n"; usage(); } - exec "download_genomic_library.sh", $type; + exec "krakenu-download_genomic_library.sh", $type; } sub add_to_library { my $arg = shift; - exec "add_to_library.sh", $arg; + exec "krakenu-add_to_library.sh", $arg; } sub shrink_db { @@ -287,11 +287,11 @@ sub shrink_db { if (! defined($new_db)) { die "Must specify new database name to perform shrink task\n"; } - exec "shrink_db.sh", $new_count, $new_db, $shrink_block_offset; + exec "krakenu-shrink_db.sh", $new_count, $new_db, $shrink_block_offset; } sub standard_installation { - exec "standard_installation.sh"; + exec "krakenu-standard_installation.sh"; } sub build_database { @@ -302,9 +302,9 @@ sub build_database { } sub clean_database { - exec "clean_db.sh"; + exec "krakenu-clean_db.sh"; } sub upgrade_database { - exec "upgrade_db.sh"; + exec "krakenu-upgrade_db.sh"; } From e52b7e0f8c183b19b11fc09b11cf06ea84456847 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 22 Sep 2017 11:32:44 -0400 Subject: [PATCH 040/105] Major update for UID mapping and having taxon entries for assemblies --- scripts/krakenu-build | 29 +- scripts/krakenu-build_db.sh | 127 ++++-- scripts/krakenu-download | 431 ++++++++++++++++++++ scripts/krakenu-download_genomic_library.sh | 119 ------ scripts/krakenu-download_taxonomy.sh | 48 --- scripts/krakenu-standard_installation.sh | 6 +- src/Makefile | 6 +- src/classify.cpp | 116 ++---- src/krakendb.cpp | 13 +- src/krakendb.hpp | 3 +- src/krakenutil.cpp | 153 +------ src/krakenutil.hpp | 18 +- src/read_uid_mapping.cpp | 3 +- src/set_lcas.cpp | 188 +++++---- src/taxdb.h | 64 +-- src/uid_mapping.cpp | 196 +++++++++ src/uid_mapping.hpp | 45 ++ 17 files changed, 1017 insertions(+), 548 deletions(-) create mode 100755 scripts/krakenu-download delete mode 100755 scripts/krakenu-download_genomic_library.sh delete mode 100755 scripts/krakenu-download_taxonomy.sh create mode 100644 src/uid_mapping.cpp create mode 100644 src/uid_mapping.hpp diff --git a/scripts/krakenu-build b/scripts/krakenu-build index 267fc7d..2303f76 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenu-build @@ -64,7 +64,12 @@ my ( $upgrade, $clean, + 
$build_uid_database, + $build_lca_database, + + $add_taxonomy_ids_for_genome, $add_taxonomy_ids_for_seq + ); my $verbose = 0; @@ -75,7 +80,10 @@ $kmer_len = $DEF_KMER_LEN; $work_on_disk = ""; $hash_size = ""; $max_db_size = ""; +$add_taxonomy_ids_for_genome = 0; $add_taxonomy_ids_for_seq = 0; +$build_uid_database = 1; +$build_lca_database = 1; # variables corresponding to task options my @TASK_LIST = ( @@ -115,7 +123,10 @@ GetOptions( "clean" => \$clean, "verbose" => \$verbose, - "generate-taxonomy-ids-for-sequences" => \$add_taxonomy_ids_for_seq + "taxids-for-genomes" => \$add_taxonomy_ids_for_genome, + "taxids-for-sequences" => \$add_taxonomy_ids_for_seq, + "lca-database!" => \$build_lca_database, + "uid-database!" => \$build_uid_database ) or usage(); if (@ARGV) { @@ -241,12 +252,17 @@ Options: --shrink-block-offset NUM When shrinking, select the k-mer that is NUM positions from the end of a block of k-mers (default: 1) + --uid-database Build a UID database (default no) + --lca-database Build a LCA database (default yes) + --no-lca-database Do not build a LCA database --work-on-disk Perform most operations on disk rather than in RAM (will slow down build in most cases) - --generate-taxonomy-ids-for-sequences - Generate taxonomy IDs for sequences, starting with 1000000000. - Can be useful to resolve classifications with multiple genomes - for one taxonomy ID. + --taxids-for-genomes Add taxonomy IDs (starting with 1bio) for genomes. + Only works with 3-column seqid2taxid map with third + column being the name + --taxids-for-sequences Add taxonomy IDs for sequences, starting with 1bio. + Can be useful to resolve classifications with multiple genomes + for one taxonomy ID. EOF exit $exit_code; } @@ -297,6 +313,9 @@ sub standard_installation { sub build_database { $ENV{"KRAKEN_REBUILD_DATABASE"} = (defined $rebuild? 1 : 0); $ENV{"KRAKEN_ADD_TAXIDS_FOR_SEQ"} = $add_taxonomy_ids_for_seq; + $ENV{"KRAKEN_ADD_TAXIDS_FOR_GENOME"} = $add_taxonomy_ids_for_genome; + $ENV{"KRAKEN_UID_DATABASE"} = $build_uid_database; + $ENV{"KRAKEN_LCA_DATABASE"} = $build_lca_database; my $opt = ($verbose? "-x" : ""); exec "krakenu-build_db.sh"; } diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index e3d38ad..aebff74 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -1,4 +1,5 @@ #!/bin/bash +#vim: noai:ts=2:sw=2 # Copyright 2013-2015, Derrick Wood # @@ -23,9 +24,9 @@ set -u # Protect against uninitialized vars. set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands -set -x function report_time_elapsed() { + set -x curr_time=$(date "+%s.%N") perl -e '$time = $ARGV[1] - $ARGV[0];' \ -e '$sec = int($time); $nsec = $time - $sec;' \ @@ -37,12 +38,25 @@ function report_time_elapsed() { $1 $curr_time } +export VERBOSE=1 + +function cmd () { + export start_time1=$(date "+%s.%N") + if [[ $VERBOSE -eq 1 ]]; then + echo "EXECUTING $@" + fi + $@ +} + + start_time=$(date "+%s.%N") script_dir=`dirname $0` DATABASE_DIR="$KRAKEN_DB_NAME" FIND_OPTS=-L JELLYFISH_BIN=`$script_dir/krakenu-check_for_jellyfish.sh` +NCBI_SERVER="ftp.ncbi.nih.gov" +FTP_SERVER="ftp://$NCBI_SERVER" if [ ! -d "$DATABASE_DIR" ] then @@ -72,7 +86,7 @@ fi N_FILES=`cat library-files.txt | wc -l` echo "Found $N_FILES sequence files (*.{fna,fa,ffn} in the library)" -if [ -e "database.jdb" ] +if [ -e "database.jdb" ] || [ -e "database0.kdb" ] then echo "Skipping step 1, k-mer set already exists." 
else @@ -150,18 +164,19 @@ else fi fi -if [ -e "database.kdb" ] +SORTED_DB_NAME=database0.kdb +if [ -e "$SORTED_DB_NAME" ] then echo "Skipping step 3, k-mer set already sorted." else echo "Sorting k-mer set (step 3 of 6)..." start_time1=$(date "+%s.%N") db_sort -z $MEMFLAG -t $KRAKEN_THREAD_CT -n $KRAKEN_MINIMIZER_LEN \ - -d database.jdb -o database.kdb.tmp \ + -d database.jdb -o $SORTED_DB_NAME.tmp \ -i database.idx # Once here, DB is sorted, can put file in proper place. - mv database.kdb.tmp database.kdb + mv $SORTED_DB_NAME.tmp $SORTED_DB_NAME echo "K-mer set sorted. [$(report_time_elapsed $start_time1)]" fi @@ -180,35 +195,99 @@ else echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" fi -if [ -e "lca.complete" ] + +if [ -s "taxDB" ] then - echo "Skipping step 5, LCAs already set." + echo "Skipping step 5, taxDB exists." else - echo "Setting LCAs in database (step 5 of 6)..." - PARAM="" - if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ]]; then - echo " Adding taxonomy IDs for sequences" - PARAM=" -a" - fi + echo "Creating taxDB (step 5 of 6)... " start_time1=$(date "+%s.%N") - cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ - set_lcas $MEMFLAG -x -d database.kdb -i database.idx -v \ - -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 - touch "lca.complete" + if [ ! -f taxonomy/names.dmp ] || [ ! -f taxonomy/nodes.dmp ]; then + echo "taxonomy/names.dmp or taxonomy/nodes.dmp does not exist - downloading it ..." + [ -d taxonomy ] || mkdir taxonomy + cd taxonomy + wget $FTP_SERVER/pub/taxonomy/taxdump.tar.gz + tar zxf taxdump.tar.gz + cd .. + fi + build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp | sort -t$'\t' -rnk6,6 -rnk5,5 > taxDB.tmp + mv taxDB.tmp taxDB + echo "taxDB construction finished. [$(report_time_elapsed $start_time1)]" +fi + +if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then + if [ -e "database.kdb" ] + then + echo "Skipping step 6, LCAs already set." + else + echo "Building standard Kraken LCA database (step 6 of 6)..." + PARAM="" + if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ]]; then + echo " Adding taxonomy IDs for sequences" + PARAM=" -a" + fi + if [[ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]]; then + echo " Adding taxonomy IDs for genomes" + PARAM="$PARAM -A" + fi + start_time1=$(date "+%s.%N") + set -x + cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ + set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -o database.kdb -i database.idx -v \ + -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c database.kmer_count \ + -F /dev/fd/0 > seqid2taxid-plus.map - echo "Database LCAs set. [$(report_time_elapsed $start_time1)]" + ## Make a classification report + krakenu --db . --report-file $(basename `pwd`).report --threads 10 --fasta-input library/archaea.fna > $(basename `pwd`).kraken + set +x + if [ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ] || [ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]; then + mv seqid2taxid.map seqid2taxid.map.orig + mv seqid2taxid-plus.map seqid2taxid.map + fi + echo "LCA database created. [$(report_time_elapsed $start_time1)]" + fi fi -if [ -s "taxDB" ] + +if [ "$KRAKEN_UID_DATABASE" != "0" ]; then + if [ -e "uid_database.complete" ] + then + echo "Skipping step 6.3, UIDs already set." + else + echo "Building UID database (step 6.3 of 6)..." 
+      PARAM=""
+      if [[ "$KRAKEN_LCA_DATABASE" == "0" ]]; then
+        if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ]]; then
+          echo " Adding taxonomy IDs for sequences"
+          PARAM=" -a"
+        fi
+        if [[ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]]; then
+          echo " Adding taxonomy IDs for genomes"
+          PARAM="$PARAM -A"
+        fi
+      fi
+      start_time1=$(date "+%s.%N")
+      cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \
+        set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -I uid_to_taxid.map -o uid_database.kdb -i database.idx -v \
+        -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0
+      touch "uid_database.complete"
+
+      echo "UID Database created. [$(report_time_elapsed $start_time1)]"
+    fi
+fi
+
+if [ -s "uid_database.kmer_count" ]
 then
-  echo "Skipping step 6, taxDB exists."
+  echo "Skipping step 6.4, uid_database.kmer_count exists."
 else
-  echo "Creating taxDB (step 6 of 6)... "
-  time $JELLYFISH_BIN histo --high 100000000 database.kdb | tee database.taxon_count
-  build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp database.taxon_count | sort -t$'\t' -rnk6,6 -rnk5,5 > taxDB.tmp
-  mv taxDB.tmp taxDB
+  echo "Creating uid_database.kmer_count (step 6.4 of 6)... "
+  start_time1=$(date "+%s.%N")
+  time $JELLYFISH_BIN histo --high 100000000 uid_database.kdb > uid_database.kmer_count
+  echo "uid_database.kmer_count finished. [$(report_time_elapsed $start_time1)]"
 fi
 
 echo "Database construction complete. [Total: $(report_time_elapsed $start_time)]
 You can delete all files but database.{kdb,idx} and taxDB now, if you want"
+
+
diff --git a/scripts/krakenu-download b/scripts/krakenu-download
new file mode 100755
index 0000000..f3aa4bd
--- /dev/null
+++ b/scripts/krakenu-download
@@ -0,0 +1,431 @@
+#!/usr/bin/env perl
+
+# krakenu-download.pl - based on centrifuge-download
+# (c) Florian Breitwieser, 2017
+
+use strict;
+use warnings;
+use File::Basename;
+use File::Fetch;
+use File::Copy;
+use File::Path qw/make_path remove_tree/;
+use IO::Uncompress::Gunzip qw/gunzip $GunzipError/;
+use autodie;
+use Term::ANSIColor;
+use Getopt::Long;
+use Parallel::ForkManager;
+
+sub download_taxonomy(@);
+sub download_contaminats(@);
+sub download(@);
+sub print_header_lines(@);
+sub download_domain(@);
+sub download_viral_neighbors(@);
+
+my $FTP="ftp://ftp.ncbi.nih.gov";
+my @ALL_GENOMES=qw/bacteria viral archaea fungi protozoa invertebrate plant vertebrate_mammalian vertebrate_other/;
+my @ALL_DATABASES=qw/refseq genbank taxonomy contaminants/;
+my @ALL_ASSEMBLY_LEVELS=("Complete Genome", "Chromosome", "Scaffold", "Contig");
+my @SMALL_GENOMES=qw/mitochondrion plasmid plastid/;
+
+## Option parsing
+my $DATABASE="refseq";
+my $ASSEMBLY_LEVEL="Complete Genome";
+my $REFSEQ_CATEGORY;
+my $TAXID;
+
+my $BASE_DIR;
+my $DB_DIR;
+my $N_PROC=5;
+my $CHANGE_HEADER=0;
+my $DOWNLOAD_RNA=0;
+my $DO_DUST=0;
+my $FILTER_UNPLACED=0;
+my $VERBOSE=0;
+my $OVERWRITE_FILES=0;
+my $INCLUDE_VIRAL_NEIGHBORS=0;
+my $DOMAINS;
+my $DL_MOD_RSYNC;
+
+my %ac_to_taxid;
+my $downloaded_viral_refseq=0;
+my $FNA_FILES="genomic";
+
+my $USAGE="\n".basename($0).
+" [<options>] <database>*
+
+ARGUMENT
+ <database>        One of refseq, genbank, contaminants or taxonomy:
+   - contaminants gets contaminant sequences from UniVec and EmVec,
+   - taxonomy for taxonomy mappings.
+   - use refseq or genbank for genomic sequences,
+   - refseq and genbank can be followed by '/DOMAIN' or '/DOMAIN/ASS_LEVEL', e.g.
+   - refseq/archaea, refseq/viral/Any, or genbank/bacteria
+   - if ASS_LEVEL is not given, the default is used
+
+COMMON OPTIONS
+ -o <directory>    Folder to which the files are downloaded. Default: '.'
+ --db <directory>  Alternative to -o: Download to <directory>/{library,taxonomy}.
+ -P <# of threads> Number of processes when downloading (uses Parallel::ForkManager). Default: '$N_PROC'
+ --rsync, -R       Download using rsync.
+ --overwrite       Redownload and overwrite files with the same name.
+ -v                Verbose.
+
+WHEN USING database refseq OR genbank:
+ -d <domain>       What domain to download. One or more of @ALL_GENOMES (comma separated).
+ -a <level>        Only download genomes with the specified assembly level. Default: '$ASSEMBLY_LEVEL'. Use 'Any' for any assembly level.
+ -c <category>     Only download genomes in the specified refseq category. Default: any.
+ -t <taxids>       Only download the specified taxonomy IDs, comma separated. Default: any.
+ --fna <types>     Comma-separated list of sequence types, including genomic, rna, rna_from_genomic, cds_from_genomic. Default: $FNA_FILES.
+ -u                Filter unplaced sequences.
+ -m                Mask low-complexity regions using dustmasker.
+ -l                Modify sequence header to include taxonomy ID for Kraken (i.e. add '>kraken:taxid|TAXID' to each sequence).
+ --include-viral-neighbors  Include neighbors for viral genomes as defined at https://www.ncbi.nlm.nih.gov/genome/viruses/.
+                   Only works if refseq viral is downloaded in the same session!
+";
+
+# Parse command-line options with Getopt::Long
+Getopt::Long::Configure('no_auto_abbrev','pass_through');
+GetOptions(
+  "output|o=s" => \$BASE_DIR,
+  "db=s" => \$DB_DIR,
+  "threads|P=i" => \$N_PROC,
+  "domain|d=s" => \$DOMAINS,
+  "assembly-level|a=s" => \$ASSEMBLY_LEVEL,
+  "category|c=s" => \$REFSEQ_CATEGORY,
+  "taxonomy-id|t=s" => \$TAXID,
+  "fna=s" => \$FNA_FILES,
+  "rsync|R" => \$DL_MOD_RSYNC,
+  "include-viral-neighbors" => \$INCLUDE_VIRAL_NEIGHBORS,
+  "filter-unplaced|u" => \$FILTER_UNPLACED,
+  "mask|m" => \$DO_DUST,
+  "change-header|l" => \$CHANGE_HEADER,
+  "overwrite|force" => \$OVERWRITE_FILES,
+  "verbose|v" => \$VERBOSE) or die "Error in command line arguments";
+
+if (!defined $ARGV[0] || $ARGV[0] !~ /refseq|genbank|taxonomy|contaminants/) {
+  print STDERR $USAGE;
+  exit 1;
+}
+
+if (defined $BASE_DIR && defined $DB_DIR) {
+  print STDERR "Define either --db or -o, not both!\n";
+  exit 1;
+}
+
+# Use current directory as base directory
+$BASE_DIR = "." unless defined $DB_DIR || defined $BASE_DIR;
+
+# If DB directory is defined, use that as base directory
+# -- kept -o and --db options to allow the use of either Kraken and Centrifuge type command line
+my $add_dir = defined $DB_DIR;
+$BASE_DIR = $DB_DIR if defined $DB_DIR;
+sub get_dir {
+  my ($dir, $name) = @_;
+  my $dir1 = $add_dir? "$dir/$name" : $dir;
+  make_path $dir1;
+  return $dir1;
+}
+
+my $pm = Parallel::ForkManager->new($N_PROC);
+$pm->run_on_finish(sub {
+    my ($pid, $exit_code, $indent, $exit_signal, $core_dump, $data) = @_;
+    if (defined $data) {
+      @ac_to_taxid{keys %$data} = values %$data;
+    }
+}
+);
+
+my %select_taxonomy_ids;
+if (defined $TAXID) {
+  %select_taxonomy_ids = map { $_ => 1 } split(/,/, $TAXID);
+}
+
+if (!defined $ARGV[0]) {
+  print STDERR $USAGE;
+  exit 1;
+}
+
+foreach my $DATABASE (@ARGV) {
+  if ( $DATABASE eq "taxonomy" ) {
+    download_taxonomy(get_dir($BASE_DIR,"taxonomy"));
+  } elsif ( $DATABASE eq "contaminants" ) {
+    download_contaminats(get_dir($BASE_DIR,"library/contaminants"));
+  } elsif ( $DATABASE =~ /^refseq/ || $DATABASE =~ /^genbank/ ) {
+    my ($db, $domain, @levels) = split(/\//, $DATABASE);
+    if (!defined $domain) {
+      foreach my $domain (split(/,/,$DOMAINS)) {
+        my $lib_dir = $add_dir?
"$BASE_DIR/library/$domain" : "$BASE_DIR/$domain"; + download_domain($lib_dir, $domain, $ASSEMBLY_LEVEL); + } + } else { + my $lib_dir = $add_dir? "$BASE_DIR/library/$domain" : "$BASE_DIR/$domain"; + my $level = $ASSEMBLY_LEVEL; + my $taxid = $TAXID; + foreach (@levels) { + if (/taxid(.*)/) { + $taxid = $1; + } else { + $level = $_; + } + } + download_domain($lib_dir, $domain, $level, $taxid); + } + } else { + print STDERR "Unknown database $DATABASE. \n"; + print STDERR $USAGE; + exit 1; + } +} + +if ($INCLUDE_VIRAL_NEIGHBORS) { + if (!$downloaded_viral_refseq) { + print STDERR "--include-viral-neighbors only works when RefSeq viral is downloaded in the same session!"; + } else { + my $lib_dir = $add_dir? "$BASE_DIR/library/viral-neighbors" : "$BASE_DIR/viral-neighbors"; + download_viral_neighbors($lib_dir); + } +} + + + +######################################################### +## Functions + +sub download(@) { + my ($url, $file) = @_; + if (-f $file && !$OVERWRITE_FILES) { + print STDERR "Not fetching $url - file $file exists.\n" if $VERBOSE; + return 1; + } + + $url =~ s/https/http/; + + if ( $DL_MOD_RSYNC && $url =~ /^ftp/ ) { + $url =~ s/^ftp/rsync/; + } + + print STDERR "Fetching $url to $file ...\n" if $VERBOSE; + my $ff = File::Fetch->new(uri=>"$url"); + my $where = $ff->fetch(to=> dirname($file)) or die $ff->error; + #my $where = $ff->fetch(to=> dirname($file)) or die "\n$ff->error for $url!"; + move($where, $file); + return -f $file; +} + +sub download_viral_neighbors(@) { + my ($dir) = @_; + print STDERR "Downloading viral neighbors into $dir ...\n"; + my $url = "https://www.ncbi.nlm.nih.gov/genomes/GenomesGroup.cgi?taxid=10239&cmd=download2"; + my $nbr_file = "$dir/viral_neighbors-taxid10239.nbr"; + download($url, $nbr_file); + open(my $F, "<", $nbr_file); + my @file = <$F>; + close($F); + my $i = 0; + my $n_genomes = scalar @file; + + foreach (@file) { + next if /^#/; + ++$i; + print STDERR "\r Downloading viral neighbor sequence $i/$n_genomes ..." unless $VERBOSE; + my $pid = $pm->start and next; + my ($rep_acs, $nbr_ac, undef, undef, $nname, $sname) = split /\t/; + my ($name, $taxid); + foreach my $rep_ac (split (/,/, $rep_acs)) { + if (defined $ac_to_taxid{$rep_ac}) { + ($name, $taxid) = @{$ac_to_taxid{$rep_ac}}; + last; + } + } + if (!defined $taxid) { + print STDERR "No mapping for viral neighbor $nbr_ac [rep: $rep_acs, $nname]!\n"; + $pm->finish(0); + next; + } + (my $name1 = $name) =~ s/[^a-zA-Z0-9_]/_/g; + $name1 =~ s/__/_/g; + my $file = "$dir/$name1-tax$taxid/$nbr_ac.fna"; + my $url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&retmode=text&id=$nbr_ac"; + if (download($url,$file)) { + print_header_lines($file, $taxid, "$nname $sname neighbor $nbr_ac"); + } + ## TODO: dust viral neighbors + $pm->finish(0); + } + print STDERR "\n"; + $pm->wait_all_children(); +} + +sub print_header_lines(@) { + my ($file, $taxid, $name, $map_ref) = @_; + #return if -f "$file.map"; + open (my $F, ">", "$file.map"); + open (my $G, "<", $file); + while (<$G>) { + next unless /^>([^ ]*)/; + my $ac = $1; + print $F "$ac\t$taxid\t$name\n"; + $ac =~ s/\.[0-9]*$//; + $map_ref->{$ac} = [$name, $taxid] if defined $map_ref; + #$ac_to_taxid{$ac} = [$name, $taxid] if $downloaded_viral_refseq && $INCLUDE_VIRAL_NEIGHBORS; + } + close($G); + close($F); +} + +sub download_contaminats(@) { + my ($CONTAMINANT_DIR) = @_; + print STDERR "Downloading contaminant databases ... 
\n"; + my $CONTAMINANT_TAXID=32630; + make_path $CONTAMINANT_DIR; + + # download UniVec and EmVec database + download("ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec","$CONTAMINANT_DIR/UniVec.fna"); + download("ftp://ftp.ebi.ac.uk/pub/databases/emvec/emvec.dat.gz","$CONTAMINANT_DIR/emvec.dat.gz"); + + open(my $E1, "|-", "gunzip -c emvec.dat.gz"); + open(my $E2, ">", "$CONTAMINANT_DIR/EmVec.fna"); + + my ($ac,$de); + my $in_seq = 0; + while(<$E1>) { + if (/^AC\s+(.*)/) { + $ac = $1; + $ac =~ s/;$//; + } elsif (/^DE\s+(.*)/) { + $de = $1; + } elsif (/^SQ/) { + $in_seq = 1; + print $E2 ">$ac $de\n"; + print "$ac\t$CONTAMINANT_TAXID\tEmVec\n"; + } elsif ($in_seq) { + if (/^\s+[agct]/) { + s/\s+[0-9]+$//; + s/ //g; + print $_; + } else { + $in_seq = 0; + } + } + } + close($E2); + close($E1); + unlink("emvec.dat.gz"); + + if ( $CHANGE_HEADER ) { + system("sed -i 's/^>/>taxid|$CONTAMINANT_TAXID /' $CONTAMINANT_DIR/UniVec.fna"); + system("sed -i 's/^>/>taxid|$CONTAMINANT_TAXID /' $CONTAMINANT_DIR/EmVec.fna"); + } else { + print_header_lines("$CONTAMINANT_DIR/UniVec.fna", $CONTAMINANT_TAXID, "UniVec"); + } +} + +sub download_taxonomy(@) { + my ($dir) = @_; + print STDERR "Downloading NCBI taxonomy ... \n"; + make_path $dir; + + download("$FTP/pub/taxonomy/taxdump.tar.gz", "$dir/taxdump.tar.gz"); + system("tar -C $dir -zxvf $dir/taxdump.tar.gz nodes.dmp names.dmp 1>&2"); + system("date > $dir/timestamp"); +} + +sub download_domain(@) { + my ($domain_dir, $domain, $_assembly_level, $_taxid) = @_; + print STDERR "Downloading assembly summary file for $domain genomes.\n"; + die unless defined $domain_dir && defined $domain; + if (-d $domain_dir) { + print STDERR "WARNING: $domain_dir already exists - potentially overwriting files.\n"; + } else { + make_path $domain_dir; + } + my $ass_file = "$domain_dir/assembly_summary.txt"; + my $ass_file_filtered = "$domain_dir/assembly_summary_filtered.txt"; + my $n_genomes = 0; + download("ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/$domain/assembly_summary.txt", $ass_file) or die "Could not download assembly summary file!"; + + $downloaded_viral_refseq =1 if $domain eq "viral"; + + my @genomes_to_dl; + open(my $A1, "<", $ass_file); + open(my $A2, ">", $ass_file_filtered); + while (<$A1>) { + next if /^#/; + my ($assembly_accession, $bioproject, $biosample, $wgs_master, $refseq_category, + $taxid, $species_taxid, $organism_name, $infraspecific_name, $isolate, $version_status, + $assembly_level, $release_type, $genome_rep, $seq_rel_date, $asm_name, $submitter, + $gbrs_paired_asm, $paired_asm_comp, $ftp_path, $excluded_from_refseq, $relation_to_type_material) = split /\t/; + + next unless $version_status eq "latest"; + next if ($_assembly_level ne "Any" && $assembly_level ne $_assembly_level); + next if (defined $REFSEQ_CATEGORY && $refseq_category ne $REFSEQ_CATEGORY); + next if (defined $_taxid && $taxid ne $_taxid); + print $A2 $_; + ++ $n_genomes; + push @genomes_to_dl, [$ftp_path, $taxid, $organism_name, $infraspecific_name, $assembly_accession]; + } + close $A2; + close $A1; + + my $i = 0; + foreach my $g (@genomes_to_dl) { + my ($ftp_path, $taxid, $organism_name, $infraspecific_name, $assembly_accession) = @$g; + ++$i; + + if (defined $infraspecific_name) { + (my $i1 = $infraspecific_name) =~ s/strain=//; + $organism_name .= " $infraspecific_name" unless $organism_name =~ /$i1/ || $i1 eq ""; + } + + print STDERR "\r Downloading $domain genomes: $i/$n_genomes ..." 
unless $VERBOSE; + + my $bname = basename($ftp_path); + ( my $organism_name1 = $organism_name ) =~ s/[^a-zA-Z0-9_]/_/g; + $organism_name1 = substr($organism_name1, 0, 100); + $organism_name1 =~ s/__/_/g; + $organism_name1 =~ s/_$//; + my $bname1 = "${organism_name1}-tax${taxid}-${bname}"; + my $pid = $pm->start and next; + my %local_ac_to_taxid; + + foreach my $ext (split(/,/, $FNA_FILES)) { + my $full_ftp_path = "$ftp_path/${bname}_${ext}.fna.gz"; + my $bfname = $bname1."_".$ext; + my $fname = $bfname.".fna"; + + if (!$OVERWRITE_FILES && -f "$domain_dir/$fname") { + print STDERR "$domain_dir/$fname exists - not downloading.. \n" if $VERBOSE; + } else { + download($full_ftp_path, "$domain_dir/$fname.gz"); + gunzip "$domain_dir/$fname.gz" => "$domain_dir/$fname" or die "gunzip failed: $GunzipError"; + unlink "$domain_dir/$fname.gz"; + } + + if ($CHANGE_HEADER) { + system("sed -i 's/^>/>kraken:taxid|$taxid /' '$domain_dir/$fname'"); + } + if ($FILTER_UNPLACED) { + ## Not implemented yet! + } + + ## Output sequenceID to taxonomy ID map to STDOUT + + if ($domain eq "viral" && $INCLUDE_VIRAL_NEIGHBORS) { + print_header_lines("$domain_dir/$fname", $taxid, "$organism_name $assembly_accession", \%local_ac_to_taxid); + } else { + print_header_lines("$domain_dir/$fname", $taxid, "$organism_name $assembly_accession"); + } + + if ($DO_DUST) { + ## TODO: Consider hard-masking only low-complexity stretches with 10 or more bps + system("dustmasker -infmt fasta -in '$domain_dir/$fname' -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]/N/g' > '$domain_dir/${bfname}_dustmasked.fna'"); + unlink("$domain_dir/$fname"); + } + } + $pm->finish(0, \%local_ac_to_taxid); + } + + $pm->wait_all_children; + print STDERR "\n"; +} diff --git a/scripts/krakenu-download_genomic_library.sh b/scripts/krakenu-download_genomic_library.sh deleted file mode 100755 index b1a7f13..0000000 --- a/scripts/krakenu-download_genomic_library.sh +++ /dev/null @@ -1,119 +0,0 @@ -#!/bin/bash - -# Copyright 2013-2015, Derrick Wood -# -# This file is part of the Kraken taxonomic sequence classification system. -# -# Kraken is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Kraken is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Kraken. If not, see . - -# Download specific genomic libraries for use with Kraken. -# Supported choices are: -# bacteria - NCBI RefSeq complete bacterial/archaeal genomes -# plasmids - NCBI RefSeq plasmid sequences -# viruses - NCBI RefSeq complete viral DNA and RNA genomes -# human - NCBI RefSeq GRCh38 human reference genome - -set -u # Protect against uninitialized vars. -set -e # Stop on error - -LIBRARY_DIR="$KRAKEN_DB_NAME/library" -NCBI_SERVER="ftp.ncbi.nih.gov" -FTP_SERVER="ftp://$NCBI_SERVER" -RSYNC_SERVER="rsync://$NCBI_SERVER" -THIS_DIR=$PWD - -case "$1" in - "bacteria") - mkdir -p $LIBRARY_DIR/Bacteria - cd $LIBRARY_DIR/Bacteria - if [ ! -e "lib.complete" ] - then - rm -f all.fna.tar.gz - wget $FTP_SERVER/genomes/Bacteria/all.fna.tar.gz - echo -n "Unpacking..." - tar zxf all.fna.tar.gz - rm all.fna.tar.gz - echo " complete." 
- touch "lib.complete" - else - echo "Skipping download of bacterial genomes, already downloaded here." - fi - ;; - "plasmids") - mkdir -p $LIBRARY_DIR/Plasmids - cd $LIBRARY_DIR/Plasmids - if [ ! -e "lib.complete" ] - then - rm -f plasmids.all.fna.tar.gz - wget $FTP_SERVER/genomes/Plasmids/plasmids.all.fna.tar.gz - echo -n "Unpacking..." - tar zxf plasmids.all.fna.tar.gz - rm plasmids.all.fna.tar.gz - echo " complete." - touch "lib.complete" - else - echo "Skipping download of plasmids, already downloaded here." - fi - ;; - "viruses") - mkdir -p $LIBRARY_DIR/Viruses - cd $LIBRARY_DIR/Viruses - if [ ! -e "lib.complete" ] - then - rm -f all.fna.tar.gz - rm -f all.ffn.tar.gz - wget $FTP_SERVER/genomes/Viruses/all.fna.tar.gz - wget $FTP_SERVER/genomes/Viruses/all.ffn.tar.gz - echo -n "Unpacking..." - tar zxf all.fna.tar.gz - tar zxf all.ffn.tar.gz - rm all.fna.tar.gz - rm all.ffn.tar.gz - echo " complete." - touch "lib.complete" - else - echo "Skipping download of viral genomes, already downloaded here." - fi - ;; - "human") - mkdir -p $LIBRARY_DIR/Human - cd $LIBRARY_DIR/Human - if [ ! -e "lib.complete" ] - then - # get list of CHR_* directories - wget --spider --no-remove-listing $FTP_SERVER/genomes/H_sapiens/ - directories=$(perl -nle '/^d/ and /(CHR_\w+)\s*$/ and print $1' .listing) - rm .listing - - # For each CHR_* directory, get GRCh* fasta gzip file name, d/l, unzip, and add - for directory in $directories - do - wget --spider --no-remove-listing $FTP_SERVER/genomes/H_sapiens/$directory/ - file=$(perl -nle '/^-/ and /\b(hs_ref_GRCh\S+\.fa\.gz)\s*$/ and print $1' .listing) - [ -z "$file" ] && exit 1 - rm .listing - wget $FTP_SERVER/genomes/H_sapiens/$directory/$file - gunzip "$file" - done - - touch "lib.complete" - else - echo "Skipping download of human genome, already downloaded here." - fi - ;; - *) - echo "Unsupported library. Valid options are: " - echo " bacteria plasmids virus human" - ;; -esac diff --git a/scripts/krakenu-download_taxonomy.sh b/scripts/krakenu-download_taxonomy.sh deleted file mode 100755 index fc27842..0000000 --- a/scripts/krakenu-download_taxonomy.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -# Copyright 2013-2015, Derrick Wood -# -# This file is part of the Kraken taxonomic sequence classification system. -# -# Kraken is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Kraken is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Kraken. If not, see . - -# Download NCBI taxonomy information for Kraken. -# Designed to be called by kraken_build - -set -u # Protect against uninitialized vars. -set -e # Stop on error - -TAXONOMY_DIR="$KRAKEN_DB_NAME/taxonomy" -NCBI_SERVER="ftp.ncbi.nih.gov" -FTP_SERVER="ftp://$NCBI_SERVER" -THIS_DIR=$PWD - -mkdir -p "$TAXONOMY_DIR" -cd "$TAXONOMY_DIR" - -if [ ! -e "nucl_gb.accession2taxid.flag" ] -then - wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz - time gunzip -c nucl_gb.accession2taxid.gz | cut -f 2,3 | sort -k 1,1 > nucl_gb.accession2taxid.sorted - touch nucl_gb.accession2taxid.flag - echo "Downloaded and sorted GB to taxon map" -fi - -if [ ! 
-e "taxdump.flag" ] -then - wget $FTP_SERVER/pub/taxonomy/taxdump.tar.gz - tar zxf taxdump.tar.gz - touch taxdump.flag - echo "Downloaded and uncompressed taxonomy tree data" -fi diff --git a/scripts/krakenu-standard_installation.sh b/scripts/krakenu-standard_installation.sh index e10254b..815d482 100755 --- a/scripts/krakenu-standard_installation.sh +++ b/scripts/krakenu-standard_installation.sh @@ -31,9 +31,9 @@ then fi check_for_jellyfish.sh -krakenu-build --db $KRAKEN_DB_NAME --download-taxonomy -krakenu-build --db $KRAKEN_DB_NAME --download-library bacteria -krakenu-build --db $KRAKEN_DB_NAME --download-library viruses +krakenu-download -o $KRAKEN_DB_NAME/taxonomy --download-taxonomy +krakenu-download -o $KRAKEN_DB_NAME/library -d archaea,bacteria refseq > $KRAKEN_DB_NAME/seqid2taxid.map +krakenu-download -o $KRAKEN_DB_NAME/library -d viral -a Any refseq >> $KRAKEN_DB_NAME/seqid2taxid.map krakenu-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ --jellyfish-hash-size "$KRAKEN_HASH_SIZE" \ --max-db-size "$KRAKEN_MAX_DB_SIZE" \ diff --git a/src/Makefile b/src/Makefile index f721cf4..82246e9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ CXX = g++ -CXXFLAGS = -Wall -std=c++11 -fopenmp -O2 -g +CXXFLAGS = -Wall -std=c++11 -fopenmp -O2 -g -Wfatal-errors PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream @@ -17,9 +17,9 @@ db_shrink: krakendb.o quickfile.o db_sort: krakendb.o quickfile.o -set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o +set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.cpp -classify: krakendb.o quickfile.o krakenutil.o seqreader.o +classify: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.cpp $(CXX) $(CXXFLAGS) -o classify classify.cpp $^ $(LIBFLAGS) build_taxdb: taxdb.h diff --git a/src/classify.cpp b/src/classify.cpp index 990012f..b5e196f 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -25,6 +25,7 @@ #include "readcounts.hpp" #include "taxdb.h" #include "gzstream.h" +#include "uid_mapping.hpp" #include const size_t DEF_WORK_UNIT_SIZE = 500000; @@ -39,7 +40,6 @@ void process_file(char *filename); bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, unordered_map&); -string hitlist_string(vector &taxa, vector &ambig); set get_ancestry(uint32_t taxon); void report_stats(struct timeval time1, struct timeval time2); unordered_map taxon_counts; // stats per taxon @@ -79,6 +79,7 @@ static vector KrakenDatabases (DB_filenames.size()); uint64_t total_classified = 0; uint64_t total_sequences = 0; uint64_t total_bases = 0; +uint32_t ambig_taxon = -1; inline bool ends_with(std::string const & value, std::string const & ending) { @@ -117,35 +118,6 @@ void loadKrakenDB(KrakenDB& database, string DB_filename, string Index_filename) database.set_index(&db_index); } -vector get_taxids_for_uid(uint32_t uid, char* fptr) { - size_t int_size = sizeof(int); - size_t block_size = sizeof(int)*2; - // TODO: Just get a uint64_t and shift the bits, probably faster - uint32_t taxid = *(uint32_t*)(fptr+(uid-1)*block_size); - uint32_t parent_uid = *(uint32_t*)(fptr+(uid-1)*block_size + int_size); - - vector taxids = {taxid}; - while (parent_uid != 0) { - taxid = *(uint32_t*)(fptr+(parent_uid-1)*block_size); - parent_uid = *(uint32_t*)(fptr+(parent_uid-1)*block_size + int_size); - taxids.push_back(taxid); - } - std::sort(taxids.begin(), taxids.end()); - return(taxids); -} - -vector 
get_taxids_for_uid_from_map(uint32_t uid, char* fptr, unordered_map >& uid_map ) { - auto it = uid_map.find(uid); - if (it != uid_map.end()) { - return it->second; - } - vector taxids = get_taxids_for_uid(uid, fptr); - uid_map[uid] = taxids; - return(taxids); -} - - - int main(int argc, char **argv) { #ifdef _OPENMP omp_set_num_threads(1); @@ -161,9 +133,11 @@ int main(int argc, char **argv) { cerr << "Reading UID mapping file " << UID_to_TaxID_map_filename << endl; UID_to_TaxID_map_file.open_file(UID_to_TaxID_map_filename); - if (Populate_memory) { - UID_to_TaxID_map_file.load_file(); - } + + // Always Populate memory + //if (Populate_memory) { + UID_to_TaxID_map_file.load_file(); + //} } if (!TaxDB_file.empty()) { @@ -376,12 +350,27 @@ uint32_t get_taxon_for_kmer(KrakenDB& database, uint64_t* kmer_ptr, uint64_t& cu return taxon; } +inline +void append_hitlist_string(string& hitlist_string, uint32_t& last_taxon, uint32_t& last_counter, uint32_t current_taxon) { + if (last_taxon == current_taxon) { + ++last_counter; + } else { + if (last_counter > 0) { + if (last_taxon == ambig_taxon) { + hitlist_string += "A:" + std::to_string(last_counter) + ' '; + } else { + hitlist_string += std::to_string(last_taxon) + ':' + std::to_string(last_counter) + ' '; + } + } + last_counter = 1; + last_taxon = current_taxon; + } +} + bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, unordered_map& my_taxon_counts) { // TODO: use vector::reserve - vector taxa; - vector ambig_list; unordered_map hit_counts; uint64_t *kmer_ptr; uint32_t taxon = 0; @@ -394,6 +383,10 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, int64_t current_max_pos = 0; }; + string hitlist_string; + uint32_t last_taxon; + uint32_t last_counter; + vector db_statuses(KrakenDatabases.size()); if (dna.seq.size() >= KrakenDatabases[0]->get_k()) { @@ -401,10 +394,9 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, while ((kmer_ptr = scanner.next_kmer()) != NULL) { taxon = 0; if (scanner.ambig_kmer()) { - ambig_list.push_back(1); + append_hitlist_string(hitlist_string, last_taxon, last_counter, ambig_taxon); } else { - ambig_list.push_back(0); // go through multiple databases to map k-mer for (size_t i=0; i &taxa, vector &ambig) -{ - int64_t last_code; - int code_count = 1; - ostringstream hitlist; - - if (ambig[0]) { last_code = -1; } - else { last_code = taxa[0]; } - - for (size_t i = 1; i < taxa.size(); i++) { - int64_t code; - if (ambig[i]) { code = -1; } - else { code = taxa[i]; } - - if (code == last_code) { - code_count++; - } - else { - if (last_code >= 0) { - hitlist << last_code << ":" << code_count << " "; - } - else { - hitlist << "A:" << code_count << " "; - } - code_count = 1; - last_code = code; - } - } - if (last_code >= 0) { - hitlist << last_code << ":" << code_count; - } - else { - hitlist << "A:" << code_count; - } - return hitlist.str(); -} - set get_ancestry(uint32_t taxon) { set path; diff --git a/src/krakendb.cpp b/src/krakendb.cpp index ec9927c..f89f869 100644 --- a/src/krakendb.cpp +++ b/src/krakendb.cpp @@ -69,15 +69,18 @@ KrakenDB::KrakenDB(char *ptr) { key_len = key_bits / 8 + !! (key_bits % 8); } -std::unordered_map KrakenDB::count_taxons() { - throw std::runtime_error("count_taxons() is not working"); - // Not working currently!! 
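For reference, the count_taxons() rewrite here walks the database's key/value pair block and tallies the 4-byte taxon value that follows each fixed-width k-mer key; returning a std::map instead of an unordered_map means the histogram comes out sorted by taxon ID. A minimal, self-contained sketch of that pair walk, with the key length and pair count passed in directly rather than read from the database header (names are illustrative, not the actual KrakenDB API):

    // Tally how many k-mers map to each taxon by walking a raw block of
    // back-to-back (key, uint32 taxon) pairs, as in a Kraken-style database.
    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <map>
    #include <vector>

    std::map<uint32_t, uint64_t> count_taxons_raw(const char *pairs, size_t pair_ct,
                                                  size_t key_len) {
      const size_t val_len = sizeof(uint32_t);
      const size_t pair_sz = key_len + val_len;
      std::map<uint32_t, uint64_t> counts;  // std::map keeps taxon IDs sorted
      for (size_t i = 0; i < pair_ct; i++) {
        uint32_t taxon;
        std::memcpy(&taxon, pairs + i * pair_sz + key_len, val_len);  // value follows key
        ++counts[taxon];
      }
      return counts;
    }

    int main() {
      // Two 8-byte keys, each followed by a 4-byte taxon value; both map to 9606.
      std::vector<char> buf(2 * (8 + 4), 0);
      uint32_t taxon = 9606;
      std::memcpy(buf.data() + 8, &taxon, 4);
      std::memcpy(buf.data() + 12 + 8, &taxon, 4);
      for (const auto &kv : count_taxons_raw(buf.data(), 2, 8))
        std::cout << kv.first << '\t' << kv.second << '\n';  // prints: 9606  2
    }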
+//using std::map to have the keys sorted +std::map KrakenDB::count_taxons() { char *ptr = get_pair_ptr(); size_t pair_sz = pair_size(); - std::unordered_map taxon_counts; + std::map taxon_counts; + if (ptr == NULL) { + std::cerr << "Kraken database pointer is NULL [pair_sz: " << pair_sz << ", key_ct: "< +#include namespace kraken { class KrakenDBIndex { @@ -64,7 +65,7 @@ namespace kraken { // return a count of k-mers for all taxons - std::unordered_map count_taxons(); + std::map count_taxons(); // return "bin key" for kmer, based on index // If idx_nt not specified, use index's value diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index 28ca837..2da433e 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -20,6 +20,7 @@ #include "assert_helpers.h" #include "kraken_headers.hpp" #include "krakenutil.hpp" +#include using namespace std; @@ -49,30 +50,30 @@ namespace kraken { // Return lowest common ancestor of a and b // LCA(0,x) = LCA(x,0) = x // Default ancestor is 1 (root of tree) - uint32_t lca(const unordered_map &parent_map, - uint32_t a, uint32_t b) - { + uint32_t lca(const unordered_map &parent_map, uint32_t a, uint32_t b) { if (a == 0 || b == 0) return a ? a : b; // create a path from a to the root - set a_path; - while (a > 0) { + std::unordered_set a_path; + while (a > 0 && a != parent_map.at(a)) { + if (a == b) + return a; a_path.insert(a); - assert(parent_map.find(a) != parent_map.end()); a = parent_map.at(a); } // search for b in the path from a to the root - while (b > 0) { + while (b > 0 && b != parent_map.at(b)) { if (a_path.count(b) > 0) return b; - assert(parent_map.find(b) != parent_map.end()); b = parent_map.at(b); } return 1; } + + // Tree resolution: take all hit taxa (plus ancestors), then // return leaf of highest weighted leaf-to-root path. uint32_t resolve_tree(const unordered_map &hit_counts, @@ -120,126 +121,6 @@ namespace kraken { } - // Tree resolution: take all hit taxa (plus ancestors), then - // return leaf of highest weighted leaf-to-root path. - uint32_t resolve_uids( - const unordered_map &uid_hit_counts, - const unordered_map &parent_map, - const vector< vector > &UID_to_taxids_vec) { - unordered_map taxid_counts; - unordered_map frac_taxid_counts; - - if (uid_hit_counts.size() == 0) { - return(0); - } - - for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { - uint32_t uid = it->first; - double frac_count = ((double)it->second / (double)UID_to_taxids_vec[uid-1].size()); - for (auto taxid : UID_to_taxids_vec[uid-1]) { - taxid_counts[taxid] += it->second; - frac_taxid_counts[taxid] += frac_count; - } - } - vector max_taxids; - uint32_t max_count = 0; - double max_frac_count = 0; - for (auto it : taxid_counts) { - if (it.second == max_count) { - if (frac_taxid_counts[it.first] == max_frac_count) { - max_taxids.push_back(it.first); - } else if (frac_taxid_counts[it.first] > max_frac_count) { - max_frac_count = frac_taxid_counts[it.first]; - max_taxids = { it.first }; - } - } else if (it.second > max_count) { - max_taxids = { it.first }; - max_count = it.second; - max_frac_count = frac_taxid_counts[it.first]; - } - } - - uint32_t max_taxon = max_taxids[0]; - auto sit = max_taxids.begin(); - for (++sit; sit != max_taxids.end(); ++sit) { - max_taxon = lca(parent_map, max_taxon, *sit); - - } - - // return the taxid that appeared most often - return max_taxon; - } - - // Tree resolution: take all hit taxa (plus ancestors), then - // return leaf of highest weighted leaf-to-root path. 
- uint32_t resolve_uids2( - const unordered_map &uid_hit_counts, - const unordered_map &parent_map, - char* fptr) { - unordered_map taxid_counts; - unordered_map frac_taxid_counts; - - if (uid_hit_counts.size() == 0) { - return(0); - } - - size_t int_size = sizeof(int); - size_t block_size = sizeof(int)*2; - for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { - uint32_t uid = it->first; - if (uid == 0) { - continue; - } - uint32_t taxid; - // TODO: Just get a uint64_t and shift the bits, probably faster - vector taxids; - do { - taxid = *(uint32_t*)(fptr+(uid-1)*block_size); - uid = *(uint32_t*)(fptr+(uid-1)*block_size + int_size); - - taxid_counts[taxid] += it->second; - taxids.push_back(taxid); - } while (uid != 0); - - double frac_count = (double)it->second / (double)taxids.size(); - for (uint32_t taxid : taxids) { - frac_taxid_counts[taxid] += frac_count; - } - } - - if (taxid_counts.size() == 0) { - return(0); - } - vector max_taxids; - uint32_t max_count = 0; - double max_frac_count = 0; - for (auto it : taxid_counts) { - if (it.second == max_count) { - if (frac_taxid_counts[it.first] == max_frac_count) { - max_taxids.push_back(it.first); - } else if (frac_taxid_counts[it.first] > max_frac_count) { - max_frac_count = frac_taxid_counts[it.first]; - max_taxids = { it.first }; - } - } else if (it.second > max_count) { - max_taxids = { it.first }; - max_count = it.second; - max_frac_count = frac_taxid_counts[it.first]; - } - } - - uint32_t max_taxon = max_taxids[0]; - auto sit = max_taxids.begin(); - for (++sit; sit != max_taxids.end(); ++sit) { - max_taxon = lca(parent_map, max_taxon, *sit); - - } - - // return the taxid that appeared most often - return max_taxon; - } - - uint8_t KmerScanner::k = 0; @@ -277,14 +158,19 @@ namespace kraken { } uint64_t *KmerScanner::next_kmer() { + bool skip_pos = false; if (curr_pos >= pos2) return NULL; if (loaded_nt) loaded_nt--; while (loaded_nt < k) { - loaded_nt++; - kmer <<= 2; - ambig <<= 1; + if (skip_pos) { + skip_pos = false; + } else { + loaded_nt++; + kmer <<= 2; + ambig <<= 1; + } switch ((*str)[curr_pos++]) { case 'A': case 'a': break; @@ -297,6 +183,11 @@ namespace kraken { case 'T': case 't': kmer |= 3; break; + case '\n': case '\r': + --loaded_nt; + skip_pos = true; + continue; + break; default: ambig |= 1; break; diff --git a/src/krakenutil.hpp b/src/krakenutil.hpp index 854e26b..46e8eb8 100644 --- a/src/krakenutil.hpp +++ b/src/krakenutil.hpp @@ -27,23 +27,17 @@ namespace kraken { // Build a map of node to parent from an NCBI taxonomy nodes.dmp file std::unordered_map build_parent_map(std::string filename); - // Return the lowest common ancestor of a and b, according to parent_map - // NOTE: LCA(0,x) = LCA(x,0) = x + // Return lowest common ancestor of a and b + // LCA(0,x) = LCA(x,0) = x + // Default ancestor is 1 (root of tree) +uint32_t lca(const std::unordered_map &parent_map, uint32_t a, uint32_t b); + + // Resolve classification tree uint32_t resolve_tree(const std::unordered_map &hit_counts, const std::unordered_map &parent_map); - uint32_t resolve_uids( - const std::unordered_map &uid_hit_counts, - const std::unordered_map &parent_map, - const std::vector< std::vector > &UID_to_taxids_vec); - - uint32_t resolve_uids2( - const std::unordered_map &uid_hit_counts, - const std::unordered_map &parent_map, - char* fptr); - class KmerScanner { public: diff --git a/src/read_uid_mapping.cpp b/src/read_uid_mapping.cpp index 76b839a..0ac84db 100644 --- a/src/read_uid_mapping.cpp +++ b/src/read_uid_mapping.cpp @@ 
-7,6 +7,7 @@ using namespace std; using namespace kraken; +inline vector get_taxids_for_uid(uint32_t uid, char* fptr) { size_t int_size = sizeof(int); size_t block_size = sizeof(int)*2; @@ -24,7 +25,7 @@ vector get_taxids_for_uid(uint32_t uid, char* fptr) { return(taxids); } - +inline vector get_taxids_for_uid_from_map(uint32_t uid, char* fptr, unordered_map >& uid_map ) { auto it = uid_map.find(uid); if (it != uid_map.end()) { diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 1396a7f..504e2b6 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -25,6 +25,7 @@ #include "seqreader.hpp" #include "taxdb.h" #include "readcounts.hpp" +#include "uid_mapping.hpp" #include #include @@ -43,6 +44,7 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish); int Num_threads = 1; string DB_filename, Index_filename, Output_DB_filename, TaxDB_filename, + Kmer_count_filename, File_to_taxon_map_filename, ID_to_taxon_map_filename, Multi_fasta_filename; bool force_taxid = false; @@ -52,6 +54,7 @@ bool Allow_extra_kmers = false; bool verbose = false; bool Operate_in_RAM = false; bool One_FASTA_file = false; +bool Add_taxIds_for_Assembly = false; bool Add_taxIds_for_Sequences = false; bool Use_uids_instead_of_taxids = false; bool Output_UID_map_to_STDOUT = false; @@ -61,20 +64,20 @@ string UID_map_filename; ofstream UID_map_file; uint32_t current_uid = 0; -uint32_t max_uid = -1; unordered_map Parent_map; //unordered_multimap Children_map; //typedef std::_Rb_tree_iterator, unsigned int> > map_it; //typedef std::_Rb_tree_iterator, unsigned int> > map_it; -typedef const vector* map_it; -vector< map_it > UID_to_taxids_vec; -map< vector, uint32_t> Taxids_to_UID_map; +vector< const TaxidSet* > UID_to_taxids_vec; +map< TaxidSet, uint32_t> Taxids_to_UID_map; unordered_map ID_to_taxon_map; unordered_map SeqId_added; KrakenDB Database; TaxonomyDB taxdb; +const string prefix = "kraken:taxid|"; + int main(int argc, char **argv) { #ifdef _OPENMP omp_set_num_threads(1); @@ -141,6 +144,16 @@ int main(int argc, char **argv) { else process_files(); + if (!Kmer_count_filename.empty()) { + ofstream ofs(Kmer_count_filename.c_str()); + cerr << "Writing kmer counts to " << Kmer_count_filename << "..." << endl; + auto counts = Database.count_taxons(); + for (auto const & kv : counts) { + ofs << kv.first << '\t' << kv.second << '\n'; + } + ofs.close(); + } + if (Operate_in_RAM && !Pretend) { if (Output_DB_filename.size() > 0) { DB_filename = Output_DB_filename; @@ -155,7 +168,7 @@ int main(int argc, char **argv) { UID_map_file.close(); // Write new TaxDB file if new taxids were added - if (Add_taxIds_for_Sequences && !TaxDB_filename.empty() && !Pretend) { + if ((Add_taxIds_for_Sequences || Add_taxIds_for_Assembly) && !TaxDB_filename.empty() && !Pretend) { cerr << "Writing new TaxDB ..." << endl; ofstream ofs(TaxDB_filename.c_str()); taxdb.writeTaxonomyIndex(ofs); @@ -165,35 +178,78 @@ int main(int argc, char **argv) { return 0; } -void process_single_file() { - cerr << "Processing FASTA files" << endl; +inline +uint32_t get_taxid( + unordered_map& name_to_taxid_map, + unordered_map& Parent_map, + string name, uint32_t parent_taxid, const string & rank_name) { + + auto it = name_to_taxid_map.find(name); + if (it == name_to_taxid_map.end()) { + uint32_t new_taxid = ++New_taxid_start; + bool insert_res = taxdb.insert(new_taxid, parent_taxid, rank_name, name); + if (!insert_res) + cerr << "Taxonomy ID " << new_taxid << " already in Taxonomy DB? Shouldn't happen - run set_lcas without the -a option." 
<< endl; + // insert_res shows if insert failed, but we don't care + // cerr << "Adding assembly: " << name << " with taxid " << new_taxid << endl; + Parent_map[new_taxid] = parent_taxid; + name_to_taxid_map[name] = new_taxid; + return new_taxid; + } else { + return it->second; + } +} + +unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_filename, + TaxonomyDB& taxdb, unordered_map& Parent_map, + bool Add_taxIds_for_Assembly, bool Add_taxIds_for_Sequences) { + + unordered_map ID_to_taxon_map; ifstream map_file(ID_to_taxon_map_filename.c_str()); if (map_file.rdstate() & ifstream::failbit) { err(EX_NOINPUT, "can't open %s", ID_to_taxon_map_filename.c_str()); } string line, seq_id; - uint32_t parent_taxid, taxid; + uint32_t taxid; + + // Used when adding new taxids for assembly or sequence + unordered_map name_to_taxid_map; + while (map_file.good()) { getline(map_file, line); if (line.empty()) break; istringstream iss(line); - iss >> seq_id; - if (ID_to_taxon_map.find(seq_id) != ID_to_taxon_map.end()) - continue; + iss >> seq_id >> taxid; + + auto it = ID_to_taxon_map.find(seq_id); + if (it != ID_to_taxon_map.end()) { + // The sequence ID has been seen before, ignore + continue; + } + + if (Add_taxIds_for_Assembly && iss.good()) { + iss.get(); + string name; + getline(iss, name); + taxid = get_taxid(name_to_taxid_map, Parent_map, name, taxid, "assembly"); + } if (Add_taxIds_for_Sequences) { - iss >> parent_taxid; - taxid = ++New_taxid_start; - Parent_map[taxid] = parent_taxid; - auto itEntry = taxdb.taxIDsAndEntries.insert({taxid, TaxonomyEntry(taxid, parent_taxid, "sequence")}); - if (!itEntry.second) - cerr << "Taxonomy ID " << taxid << " already in Taxonomy DB? Shouldn't happen - run set_lcas without the -a option." << endl; - } else { - iss >> taxid; + taxid = get_taxid(name_to_taxid_map, Parent_map, seq_id, taxid, "sequence"); + } + if (Add_taxIds_for_Assembly || Add_taxIds_for_Sequences) { + cout << seq_id << '\t' << taxid << '\n'; } ID_to_taxon_map[seq_id] = taxid; } + return std::move(ID_to_taxon_map); +} + +void process_single_file() { + cerr << "Processing FASTA files" << endl; + + ID_to_taxon_map = read_seqid_to_taxid_map(ID_to_taxon_map_filename, taxdb, Parent_map, Add_taxIds_for_Assembly, Add_taxIds_for_Sequences); FastaReader reader(Multi_fasta_filename); DNASequence dna; @@ -213,23 +269,27 @@ void process_single_file() { // Get the taxid. If the header specifies kraken:taxid, use that uint32_t taxid; - string prefix = "kraken:taxid|"; - if (dna.id.substr(0,prefix.size()) == prefix) { + auto it = ID_to_taxon_map.find(dna.id); + if (it != ID_to_taxon_map.end()) { + taxid = it->second; + } else if (dna.id.size() >= prefix.size() && dna.id.substr(0,prefix.size()) == prefix) { taxid = std::stol(dna.id.substr(prefix.size())); if (taxid == 0) { - cerr << "Error: taxid is zero for the line '" << dna.id << "'?!" << endl; + cerr << "Error: taxonomy ID is zero for sequence '" << dna.id << "'?!" << endl; } const auto strBegin = dna.header_line.find_first_not_of("\t "); if (strBegin != std::string::npos) dna.header_line = dna.header_line.substr(strBegin); } else { - taxid = ID_to_taxon_map[dna.id]; + cerr << "Error! Didn't find taxonomy ID mapping for sequence " << dna.id << "!!" << endl; + ++seqs_skipped; + continue; } if (Add_taxIds_for_Sequences) { auto entryIt = taxdb.taxIDsAndEntries.find(taxid); if (entryIt == taxdb.taxIDsAndEntries.end()) { - cerr << "Error! Didn't find " << taxid << " in TaxonomyDB!!" << endl; + cerr << "Error! 
Didn't find taxid " << taxid << " in TaxonomyDB - can't update it!! ["<second.scientificName = dna.header_line; } @@ -271,6 +331,7 @@ void process_files() { istringstream iss(line); iss >> filename; iss >> taxid; + // TODO: Support a mapping file with only file names, not taxids process_file(filename, taxid); cerr << "\rProcessed " << ++seqs_processed << " sequences"; } @@ -291,6 +352,11 @@ void process_file(string filename, uint32_t taxid) { set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1); } +void process_sequence(DNASequence dna) { + // TODO: Refactor such that a list of files + taxid can be given. + // Or maybe asembly_summary file? +} + void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { KmerScanner scanner(seq, start, finish); uint64_t *kmer_ptr; @@ -311,62 +377,11 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { } continue; } - if (Use_uids_instead_of_taxids) { - uint32_t kmer_uid = *val_ptr; - bool new_taxid = kmer_uid == 0; - vector taxid_set; - if (new_taxid) { - taxid_set.push_back(taxid); - } else { - if (kmer_uid > UID_to_taxids_vec.size()) { - // This can happen when set_lcas is called on a database that is not all zeros - cerr << "kmer_uid ("<< kmer_uid <<") greater than UID vector size ("<< UID_to_taxids_vec.size()<<")!!" << endl; - exit(1); - } - taxid_set = *(UID_to_taxids_vec.at(kmer_uid-1)); - auto it = std::lower_bound( taxid_set.begin(), taxid_set.end(), taxid); // find proper position in descending order - if (it == taxid_set.end() || *it != taxid) { - // add the taxid to the set, in the right position - taxid_set.insert( it, taxid ); // insert before iterator it - new_taxid = true; - } - } - - if (new_taxid) { - if (max_uid <= current_uid) { - cerr << "Maxxed out on the UIDs!!" << endl; - exit(1); - } - - // get a new taxid for this set - #pragma omp critical(new_uid) - { - auto insert_res = Taxids_to_UID_map.insert( { std::move(taxid_set), current_uid + 1 } ); - if (insert_res.second) { - ++current_uid; - - // print result for map: - if (Output_UID_map_to_STDOUT) { - auto tid_it = insert_res.first->first.begin(); - cout << current_uid << '\t' << *tid_it++; - while (tid_it != insert_res.first->first.end()) { cout << ' ' << *tid_it++; } - cout << '\n'; - } - - // FORMAT: TAXID PARENT - // TODO: Consider using mmap here - UID_map_file.write((char*)&taxid, sizeof(taxid)); - UID_map_file.write((char*)&kmer_uid, sizeof(kmer_uid)); - - //UID_to_taxids_vec[current_uid] = taxid_set; - UID_to_taxids_vec.push_back( &(insert_res.first->first) ); - *val_ptr = current_uid; - } else { - *val_ptr = insert_res.first->second; - } - } - } + // TODO: Should I use pragma omp critical here? 
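For orientation: each record in the UID map file written here is a pair of uint32s, the newly added taxid followed by the previous UID whose set it extends ("FORMAT: TAXID PARENT"), so UID u's record sits at byte offset (u-1)*8 and its full taxid set is recovered by walking the previous-UID links until 0, as get_taxids_for_uid does. A small in-memory sketch of that chain walk (the struct and names are illustrative, not the shipped reader):

    #include <cstdint>
    #include <algorithm>
    #include <iostream>
    #include <vector>

    // One record per UID, 1-based: the taxid added and the UID it extends (0 = none).
    struct UidRecord { uint32_t taxid; uint32_t prev_uid; };

    std::vector<uint32_t> taxids_for_uid(const std::vector<UidRecord> &recs, uint32_t uid) {
      std::vector<uint32_t> taxids;
      while (uid != 0) {
        const UidRecord &r = recs.at(uid - 1);  // UIDs start at 1
        taxids.push_back(r.taxid);
        uid = r.prev_uid;                       // follow the chain toward UID 0
      }
      std::sort(taxids.begin(), taxids.end());
      return taxids;
    }

    int main() {
      // UID 1 = {561}, UID 2 = {561,562} (extends UID 1), UID 3 = {561,562,9606}.
      std::vector<UidRecord> recs{{561, 0}, {562, 1}, {9606, 2}};
      for (uint32_t t : taxids_for_uid(recs, 3)) std::cout << t << ' ';
      std::cout << '\n';  // prints: 561 562 9606
    }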
+ if (Use_uids_instead_of_taxids) { + #pragma omp critical(new_uid) + *val_ptr = uid_mapping(Taxids_to_UID_map, UID_to_taxids_vec, taxid, *val_ptr, current_uid, UID_map_file); } else if (!force_taxid) { *val_ptr = lca(Parent_map, taxid, *val_ptr); } else { @@ -383,7 +398,7 @@ void parse_command_line(int argc, char **argv) { if (argc > 1 && strcmp(argv[1], "-h") == 0) usage(0); - while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTvb:apI:o:S")) != -1) { + while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTvb:aApI:o:Sc:")) != -1) { switch (opt) { case 'f' : File_to_taxon_map_filename = optarg; @@ -392,9 +407,6 @@ void parse_command_line(int argc, char **argv) { Use_uids_instead_of_taxids = true; UID_map_filename = optarg; break; - case 'S' : - Output_UID_map_to_STDOUT = true; - break; case 'd' : DB_filename = optarg; break; @@ -430,9 +442,15 @@ void parse_command_line(int argc, char **argv) { case 'a' : Add_taxIds_for_Sequences = true; break; + case 'A' : + Add_taxIds_for_Assembly = true; + break; case 'b' : TaxDB_filename = optarg; break; + case 'c' : + Kmer_count_filename = optarg; + break; case 'M' : Operate_in_RAM = true; break; @@ -475,10 +493,10 @@ void usage(int exit_code) { << " -f filename File to taxon map" << endl << " -F filename Multi-FASTA file with sequence data" << endl << " -m filename Sequence ID to taxon map" << endl - << " -a Add taxonomy IDs (starting with "< getParentMap() const; + TAXID getByScientificName(string name) const; std::unordered_map getScientificNameMap() const; std::string getLineage(TAXID taxonomyID) const; std::string getMetaPhlAnLineage(TAXID taxonomyID) const; TaxonomyEntry getEntry(TAXID taxID) const; + bool insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_); + size_t distance(TAXID taxID1, TAXID taxID2) const; bool isSubSpecies(TAXID taxonomyID) const; @@ -158,12 +161,6 @@ class TaxReport { std::vector _report_cols; }; - - // Return lowest common ancestor of a and b - // LCA(0,x) = LCA(x,0) = x - // Default ancestor is 1 (root of tree) -uint32_t lca(std::unordered_map &parent_map, uint32_t a, uint32_t b); - template inline V find_or_use_default(const std::unordered_map& my_map, const K& query, const V default_value); @@ -266,6 +263,15 @@ bool TaxonomyEntryPtr_comp::operator() ( const TaxonomyEntryreadCounts)+reads(a->readCountsOfChildren)) > (reads(b->readCounts)+reads(b->readCountsOfChildren))); } +template +TAXID TaxonomyDB::getByScientificName(string name) const { + for (const auto & tax : taxIDsAndEntries) { + if (tax.second.scientificName == name) { + return tax.first; + } + } + return 0; +} template std::unordered_map TaxonomyDB::getScientificNameMap() const { @@ -507,6 +513,26 @@ TAXID TaxonomyDB::getLowestCommonAncestor( return consensus; } +template +bool TaxonomyDB::insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, + std::string rank_, std::string scientificName_) { + + TaxonomyEntry newEntry(taxonomyID_, parentTaxonomyID_, rank_, scientificName_, 0, 0); + + auto parentIt = taxIDsAndEntries.find(parentTaxonomyID_); + if (parentIt == taxIDsAndEntries.end() || parentTaxonomyID_ == taxonomyID_) { + cerr << "ERROR while inserting taxonomy entry - taxonomy ID " << taxonomyID_ <<"; parent taxonomy ID " << parentTaxonomyID_ << "!" 
<< endl; + return false; + } + + newEntry.parent = &(parentIt->second); + auto insert_res = taxIDsAndEntries.insert({taxonomyID_, newEntry}); + parentIt->second.children.push_back(&insert_res.first->second); + + return insert_res.second; + +} + template TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { auto entry = taxIDsAndEntries.find(taxID); @@ -798,32 +824,6 @@ void TaxReport::printLine(TaxonomyEntry& tax } - // Return lowest common ancestor of a and b - // LCA(0,x) = LCA(x,0) = x - // Default ancestor is 1 (root of tree) -uint32_t lca(unordered_map &parent_map, uint32_t a, uint32_t b) - { - if (a == 0 || b == 0) - return a ? a : b; - - // create a path from a to the root - std::unordered_set a_path; - while (a > 0 && a != parent_map[a]) { - if (a == b) - return a; - a_path.insert(a); - a = parent_map[a]; - } - - // search for b in the path from a to the root - while (b > 0 && b != parent_map[b]) { - if (a_path.count(b) > 0) - return b; - b = parent_map[b]; - } - return 1; - } - template inline V find_or_use_default(const std::unordered_map& my_map, const K& query, const V default_value) { diff --git a/src/uid_mapping.cpp b/src/uid_mapping.cpp new file mode 100644 index 0000000..966a685 --- /dev/null +++ b/src/uid_mapping.cpp @@ -0,0 +1,196 @@ + +#include +#include "uid_mapping.hpp" +#include "krakenutil.hpp" +#include "assert_helpers.h" + +using namespace std; + +namespace kraken { + + static size_t INT_SIZE=sizeof(uint32_t); + static size_t UID_BLOCK_SIZE=2*INT_SIZE; + static uint32_t max_uid = -1; + + uint32_t uid_mapping( + map< TaxidSet, uint32_t>& Taxids_to_UID_map, + vector< const TaxidSet* >& UID_to_taxids_vec, + uint32_t taxid, + uint32_t kmer_uid, + uint32_t& current_uid, + ofstream& UID_map_file) { + + vector taxid_set; + if (kmer_uid == 0) { + taxid_set.push_back(taxid); + } else { + if (kmer_uid > UID_to_taxids_vec.size()) { + // This can happen when set_lcas is called more than once on a database (ie not all values start w/ 0) + cerr << "kmer_uid ("<< kmer_uid <<") greater than UID vector size ("<< UID_to_taxids_vec.size()<<")!!" << endl; + exit(1); + } + taxid_set = *(UID_to_taxids_vec.at(kmer_uid-1)); + auto it = std::lower_bound( taxid_set.begin(), taxid_set.end(), taxid); // find proper position in descending order + if (it == taxid_set.end() || *it != taxid) { + // add the taxid to the set, in the right position such that it remains sorted + taxid_set.insert( it, taxid ); // insert before iterator it + } else { + // the taxid is already part of the set for kmer_uid, return kmer_uid + return kmer_uid; + } + } + + // This taxid is not part of kmer_uids set, but is this new taxon_set already assigned to another UID? + // Try inserting .. + auto insert_res = Taxids_to_UID_map.insert( { std::move(taxid_set), current_uid + 1 } ); + if (!insert_res.second) { + // Insert unsuccessful, taxid set already has an UID + return insert_res.first->second; + } + + // Get a new UID + if (max_uid <= ++current_uid) { + cerr << "Maxxed out on UIDs!!" << endl; + exit(1); + } + + UID_to_taxids_vec.push_back( &(insert_res.first->first) ); + assert(UID_to_taxids_vec.size() == current_uid); + + // Write to mapping file + // format: TAXID PARENT + // read it with read_uid_mapping + UID_map_file.write((char*)&taxid, sizeof(taxid)); + UID_map_file.write((char*)&kmer_uid, sizeof(kmer_uid)); + + return current_uid; + } // end of uid_mapping + + + // Tree resolution: take all hit taxa (plus ancestors), then + // return leaf of highest weighted leaf-to-root path. 
+ uint32_t resolve_uids( + const unordered_map &uid_hit_counts, + const unordered_map &parent_map, + const vector< vector > &UID_to_taxids_vec) { + unordered_map taxid_counts; + unordered_map frac_taxid_counts; + + if (uid_hit_counts.size() == 0) { + return(0); + } + + for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { + uint32_t uid = it->first; + double frac_count = ((double)it->second / (double)UID_to_taxids_vec[uid-1].size()); + for (auto taxid : UID_to_taxids_vec[uid-1]) { + taxid_counts[taxid] += it->second; + frac_taxid_counts[taxid] += frac_count; + } + } + vector max_taxids; + uint32_t max_count = 0; + double max_frac_count = 0; + for (auto it : taxid_counts) { + if (it.second == max_count) { + if (frac_taxid_counts[it.first] == max_frac_count) { + max_taxids.push_back(it.first); + } else if (frac_taxid_counts[it.first] > max_frac_count) { + max_frac_count = frac_taxid_counts[it.first]; + max_taxids = { it.first }; + } + } else if (it.second > max_count) { + max_taxids = { it.first }; + max_count = it.second; + max_frac_count = frac_taxid_counts[it.first]; + } + } + + uint32_t max_taxon = max_taxids[0]; + auto sit = max_taxids.begin(); + for (++sit; sit != max_taxids.end(); ++sit) { + max_taxon = lca(parent_map, max_taxon, *sit); + + } + + // return the taxid that appeared most often + return max_taxon; + } + + // Tree resolution: take all hit taxa (plus ancestors), then + // return leaf of highest weighted leaf-to-root path. + uint32_t resolve_uids2( + const unordered_map &uid_hit_counts, + const unordered_map &parent_map, + const uint32_t* fptr, const size_t fsize) { + + unordered_map taxid_counts; + unordered_map frac_taxid_counts; + + if (uid_hit_counts.size() == 0) { + return(0); + } + + for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { + uint32_t next_uid = it->first; + if (next_uid == 0) { + continue; + } + uint32_t taxid; + // TODO: Just get a uint64_t and shift the bits, probably faster + vector taxids; + do { + // Check if the accessed memory is out of range + // -- move this to a DEBUG-only assert + // UID-1 is used because UIDs start at 1 + uint32_t offset = (next_uid-1)*UID_BLOCK_SIZE; + if (offset >= fsize) { + cerr << "It seems you are trying to access a block after the file end: \n" << + " fptr: " << fptr << "; uid: " << next_uid << "; " << " addr: " << (offset + INT_SIZE) << endl; + exit(1); + } + taxid = *(fptr + offset); + next_uid = *(fptr+ offset + INT_SIZE); + taxid_counts[taxid] += it->second; + taxids.push_back(taxid); + } while (next_uid != 0); + + double frac_count = (double)it->second / (double)taxids.size(); + for (uint32_t taxid : taxids) { + frac_taxid_counts[taxid] += frac_count; + } + } + + if (taxid_counts.size() == 0) { + return(0); + } + vector max_taxids; + uint32_t max_count = 0; + double max_frac_count = 0; + for (auto it : taxid_counts) { + if (it.second == max_count) { + if (frac_taxid_counts[it.first] == max_frac_count) { + max_taxids.push_back(it.first); + } else if (frac_taxid_counts[it.first] > max_frac_count) { + max_frac_count = frac_taxid_counts[it.first]; + max_taxids = { it.first }; + } + } else if (it.second > max_count) { + max_taxids = { it.first }; + max_count = it.second; + max_frac_count = frac_taxid_counts[it.first]; + } + } + + uint32_t max_taxon = max_taxids[0]; + auto sit = max_taxids.begin(); + for (++sit; sit != max_taxids.end(); ++sit) { + max_taxon = lca(parent_map, max_taxon, *sit); + + } + + // return the taxid that appeared most often + return max_taxon; + } + +} diff 
--git a/src/uid_mapping.hpp b/src/uid_mapping.hpp new file mode 100644 index 0000000..7c7d0fa --- /dev/null +++ b/src/uid_mapping.hpp @@ -0,0 +1,45 @@ + +#ifndef UID_MAPPING_H +#define UID_MAPPING_H + +#include <map> +#include <vector> +#include <unordered_map> +#include <fstream> +using namespace std; + + +// Takes the current UID kmer_uid, and checks whether +// - taxid is in taxon set T specified in UID_to_taxids_vec[kmer_uid]? +// - yes: return kmer_uid +// - no: is there a set (T,taxid) in Taxids_to_UID_map? +// - yes: return the uid of that set +// - no: +// - increment current_uid by one and set this as the set uid +// - add the set to Taxids_to_UID_map and UID_to_taxids_vec +// - write the mapping to UID_map_file +// + +using TaxidSet = vector<uint32_t>; + +namespace kraken { +uint32_t uid_mapping( + map< TaxidSet, uint32_t>& Taxids_to_UID_map, + vector< const TaxidSet* >& UID_to_taxids_vec, + uint32_t taxid, + uint32_t kmer_uid, + uint32_t& current_uid, + ofstream& UID_map_file); + + +uint32_t resolve_uids( + const unordered_map<uint32_t, uint32_t> &uid_hit_counts, + const unordered_map<uint32_t, uint32_t> &parent_map, + const vector< vector<uint32_t> > &UID_to_taxids_vec); + +uint32_t resolve_uids2( + const unordered_map<uint32_t, uint32_t> &uid_hit_counts, + const unordered_map<uint32_t, uint32_t> &parent_map, + const uint32_t* fptr, const size_t fsize); +} +#endif From bcca5a8569a378f4f16b92de64f6b67b20693ed1 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 10:54:19 -0400 Subject: [PATCH 041/105] Updated build script, and add some info when loading database --- scripts/krakenu-build_db.sh | 10 ++++++---- src/krakendb.cpp | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index aebff74..89dc7af 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -187,9 +187,10 @@ then else echo "Creating seqID to taxID map (step 4 of 6).." start_time1=$(date "+%s.%N") - cat library-files.txt | tr '\n' '\0' | xargs -0 grep '^>' | sed 's/.//' | sed 's/ .*//' | sort > library-headers.txt - join -t $'\t' nucl_gb.accession2taxid.sorted library-headers.txt > seqid2taxid.map.tmp - mv seqid2taxid.map.tmp seqid2taxid.map + #cat library-files.txt | tr '\n' '\0' | xargs -0 grep '^>' | sed 's/.//' | sed 's/ .*//' | sort > library-headers.txt + #join -t $'\t' nucl_gb.accession2taxid.sorted library-headers.txt > seqid2taxid.map.tmp + #mv seqid2taxid.map.tmp seqid2taxid.map + find library -name '*.map' -exec cat {} \; > seqid2taxid.map line_ct=$(wc -l seqid2taxid.map | awk '{print $1}') echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" @@ -238,7 +239,8 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then -F /dev/fd/0 > seqid2taxid-plus.map ## Make a classification report - krakenu --db . --report-file $(basename `pwd`).report --threads 10 --fasta-input library/archaea.fna > $(basename `pwd`).kraken + cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ + krakenu --db . --report-file $(basename `pwd`).report --threads 10 --fasta-input /dev/stdin > $(basename `pwd`).kraken set +x if [ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ] || [ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]; then mv seqid2taxid.map seqid2taxid.map.orig diff --git a/src/krakendb.cpp b/src/krakendb.cpp index f89f869..de33901 100644 --- a/src/krakendb.cpp +++ b/src/krakendb.cpp @@ -67,6 +67,7 @@ KrakenDB::KrakenDB(char *ptr) { errx(EX_DATAERR, "can only handle 4 byte DB values"); k = key_bits / 2; key_len = key_bits / 8 + !!
(key_bits % 8); + std::cerr << "Loaded database with " << key_ct << " keys with k of " << (size_t)k << " [val_len " << val_len << ", key_len " << key_len << "]." << std::endl; } //using std::map to have the keys sorted From a13e9fc28798d3e4ce8e2d09488a1870eaa7006d Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 18:03:44 -0400 Subject: [PATCH 042/105] Added jellyfish submodule --- .gitmodules | 3 +++ Jellyfish | 1 + 2 files changed, 4 insertions(+) create mode 100644 .gitmodules create mode 160000 Jellyfish diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..4bab269 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "Jellyfish"] + path = Jellyfish + url = https://github.com/gmarcais/Jellyfish diff --git a/Jellyfish b/Jellyfish new file mode 160000 index 0000000..fa9b676 --- /dev/null +++ b/Jellyfish @@ -0,0 +1 @@ +Subproject commit fa9b67610f604c0ca14a51dd68c5dd408c251317 From 4d3694058cc54c6a4c09354a6c13e9c37adfe4b0 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 18:05:49 -0400 Subject: [PATCH 043/105] specify branch for jellyfish --- .gitmodules | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitmodules b/.gitmodules index 4bab269..899f8c8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ [submodule "Jellyfish"] path = Jellyfish url = https://github.com/gmarcais/Jellyfish + branch = series-1.1 From e8f687330b3e345fd236ed1017279e5ec7b3dbaf Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 19:00:21 -0400 Subject: [PATCH 044/105] Install jellyfish from code archive --- .gitignore | 1 + .gitmodules | 4 ---- Jellyfish | 1 - install_kraken.sh | 31 +++++++++++++++++++++++--- scripts/krakenu-check_for_jellyfish.sh | 15 ++++++++----- 5 files changed, 38 insertions(+), 14 deletions(-) delete mode 100644 .gitmodules delete mode 160000 Jellyfish diff --git a/.gitignore b/.gitignore index d6ff918..4b1f087 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /install/ /Debug/ +/tests/dbs diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 899f8c8..0000000 --- a/.gitmodules +++ /dev/null @@ -1,4 +0,0 @@ -[submodule "Jellyfish"] - path = Jellyfish - url = https://github.com/gmarcais/Jellyfish - branch = series-1.1 diff --git a/Jellyfish b/Jellyfish deleted file mode 160000 index fa9b676..0000000 --- a/Jellyfish +++ /dev/null @@ -1 +0,0 @@ -Subproject commit fa9b67610f604c0ca14a51dd68c5dd408c251317 diff --git a/install_kraken.sh b/install_kraken.sh index 000c9b7..12c50f1 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -19,11 +19,22 @@ set -e +DIR=$(dirname $0) VERSION=`cat $(dirname $0)/VERSION` +if [ "$1" == "--install-jellyfish" ]; then + INSTALL_JELLYFISH=1; + shift; +fi + if [ -z "$1" ] || [ -n "$2" ] then - echo "Usage: $(basename $0) KRAKEN_DIR" + echo "Usage: $(basename $0) [--install-jellyfish] KRAKEN_DIR + +If --install-jellyfish is specified, the source code for version 1.1 +is downloaded from http://www.cbcb.umd.edu/software/jellyfish and installed +in KRAKEN_DIR. Note that this may overwrite other jellyfish installations in +the same path." exit 64 fi @@ -34,14 +45,28 @@ then exit 1 fi + # Perl cmd used to canonicalize dirname - "readlink -f" doesn't work # on OS X.
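# For example (illustrative): a relative argument like "../kraken-install" is resolved to an absolute path such as "/home/user/kraken-install".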
export KRAKEN_DIR=$(perl -MCwd=abs_path -le 'print abs_path(shift)' "$1") +if [ "$INSTALL_JELLYFISH" == "1" ]; then + WD=`pwd` + cd /tmp + wget http://www.cbcb.umd.edu/software/jellyfish/jellyfish-1.1.11.tar.gz + tar xvvf jellyfish-1.1.11.tar.gz + cd jellyfish-1.1.11 + ./configure + make + cp bin/jellyfish $KRAKEN_DIR + #rm -r jellyfish-1.1.11.tar.gz jellyfish-1.1.11 + cd $WD +fi + mkdir -p "$KRAKEN_DIR" #make -C src clean -make -C src install -for file in scripts/* +make -C $DIR/src install +for file in $DIR/scripts/* do perl -pl -e 'BEGIN { while (@ARGV) { $_ = shift; ($k,$v) = split /=/, $_, 2; $H{$k} = $v } }'\ -e 's/#####=(\w+)=#####/$H{$1}/g' \ diff --git a/scripts/krakenu-check_for_jellyfish.sh b/scripts/krakenu-check_for_jellyfish.sh index 9143b62..311e307 100755 --- a/scripts/krakenu-check_for_jellyfish.sh +++ b/scripts/krakenu-check_for_jellyfish.sh @@ -25,12 +25,15 @@ set -u # Protect against uninitialized vars. set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands -JELLYFISH_BIN="jellyfish" -if hash jellyfish1 2>/dev/null; then - JELLYFISH_BIN="jellyfish1" -elif hash jellyfish 2>/dev/null; then - JELLYFISH_BIN="jellyfish" -else +JELLYFISH_BIN="" +for JF in $(dirname $0)/jellyfish jellyfish1 jellyfish; do + if hash $JF 2>/dev/null; then + JELLYFISH_BIN=$JF; + break + fi +done + +if [ "$JELLYFISH_BIN" == "" ]; then echo "Did not find jellyfish!" 1>&2 exit 1 fi From c60e100ee2a291194c67a72b6b5273c02942b73f Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 19:13:37 -0400 Subject: [PATCH 045/105] Update .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 4b1f087..27c6246 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ /install/ /Debug/ /tests/dbs +/tests/data +/tests/install From a2f75cbe472fbade3e4c9ade9cac89205d69aaa1 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 19:21:38 -0400 Subject: [PATCH 046/105] Use /usr/bin/env --- scripts/krakenu | 2 +- scripts/krakenu-build | 2 +- scripts/krakenu-cp_into_tempfile.pl | 2 +- scripts/krakenu-filter | 2 +- scripts/krakenu-mpa-report | 2 +- scripts/krakenu-read_merger.pl | 2 +- scripts/krakenu-report | 2 +- scripts/krakenu-translate | 2 +- scripts/krakenu-verify_gi_numbers.pl | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/krakenu b/scripts/krakenu index e2d8412..243bcda 100755 --- a/scripts/krakenu +++ b/scripts/krakenu @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # diff --git a/scripts/krakenu-build b/scripts/krakenu-build index 2303f76..1461663 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenu-build @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # diff --git a/scripts/krakenu-cp_into_tempfile.pl b/scripts/krakenu-cp_into_tempfile.pl index 4e24ff2..c502d2e 100755 --- a/scripts/krakenu-cp_into_tempfile.pl +++ b/scripts/krakenu-cp_into_tempfile.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # diff --git a/scripts/krakenu-filter b/scripts/krakenu-filter index 04dcb7c..5ab01df 100755 --- a/scripts/krakenu-filter +++ b/scripts/krakenu-filter @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013, Derrick Wood # diff --git a/scripts/krakenu-mpa-report b/scripts/krakenu-mpa-report index 7813569..526a167 100755 --- a/scripts/krakenu-mpa-report +++ b/scripts/krakenu-mpa-report @@ -1,4 +1,4 @@ -#!/usr/bin/perl 
+#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # diff --git a/scripts/krakenu-read_merger.pl b/scripts/krakenu-read_merger.pl index 6e97099..adbecf9 100755 --- a/scripts/krakenu-read_merger.pl +++ b/scripts/krakenu-read_merger.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # diff --git a/scripts/krakenu-report b/scripts/krakenu-report index 99cab1b..e9cdaf5 100755 --- a/scripts/krakenu-report +++ b/scripts/krakenu-report @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # diff --git a/scripts/krakenu-translate b/scripts/krakenu-translate index 89a067a..46c9102 100755 --- a/scripts/krakenu-translate +++ b/scripts/krakenu-translate @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # diff --git a/scripts/krakenu-verify_gi_numbers.pl b/scripts/krakenu-verify_gi_numbers.pl index ec616f5..0bb5cdf 100755 --- a/scripts/krakenu-verify_gi_numbers.pl +++ b/scripts/krakenu-verify_gi_numbers.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # From 8a434fc21fe9083dfd021a9f76d118680ea81cef Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 19:22:36 -0400 Subject: [PATCH 047/105] Much faster krakenu-download using forks and LWP --- scripts/krakenu-download | 164 ++++++++++++++++++++++++++------------- 1 file changed, 109 insertions(+), 55 deletions(-) diff --git a/scripts/krakenu-download b/scripts/krakenu-download index f3aa4bd..7cf6fd3 100755 --- a/scripts/krakenu-download +++ b/scripts/krakenu-download @@ -1,4 +1,5 @@ -#!/bin/env perl +#!/usr/bin/env perl +#vim: et:ts=2:sw=2 # krakenu-download.pl - based on centrifuge-download # (c) Florian Breitwieser, 2017 @@ -13,7 +14,8 @@ use IO::Uncompress::Gunzip qw/gunzip $GunzipError/; use autodie; use Term::ANSIColor; use Getopt::Long; -use Parallel::ForkManager; +use LWP::UserAgent; + sub download_taxonomy(@); sub download_contaminats(@); @@ -46,8 +48,9 @@ my $OVERWRITE_FILES=0; my $INCLUDE_VIRAL_NEIGHBORS=0; my $DOMAINS; my $DL_MOD_RSYNC; +my $n_children = 0; +my @pids; -my %ac_to_taxid; my $downloaded_viral_refseq=0; my $FNA_FILES="genomic"; @@ -113,6 +116,8 @@ if (defined $BASE_DIR && defined $DB_DIR) { exit 1; } +my $ua = LWP::UserAgent->new( ssl_opts => { verify_hostname => 0 } ); + # Use current directory as base directory $BASE_DIR = "." unless defined $DB_DIR || defined $BASE_DIR; @@ -127,15 +132,6 @@ sub get_dir { return $dir1; } -my $pm = new Parallel::ForkManager($N_PROC); -$pm->run_on_finish(sub { - my ($pid, $exit_code, $indent, $exit_signal, $core_dump, $data) = @_; - if (defined $data) { - @ac_to_taxid{keys %$data} = values %$data; - } -} -); - my %select_taxonomy_ids; if (defined $TAXID) { %select_taxonomy_ids = map { $_ => 1 } split(/,/, $TAXID); @@ -182,8 +178,9 @@ if ($INCLUDE_VIRAL_NEIGHBORS) { if (!$downloaded_viral_refseq) { print STDERR "--include-viral-neighbors only works when RefSeq viral is downloaded in the same session!"; } else { - my $lib_dir = $add_dir? "$BASE_DIR/library/viral-neighbors" : "$BASE_DIR/viral-neighbors"; - download_viral_neighbors($lib_dir); + my $nbr_lib_dir = $add_dir? "$BASE_DIR/library/viral-neighbors" : "$BASE_DIR/viral-neighbors"; + my $viral_lib_dir = $add_dir? 
"$BASE_DIR/library/viral" : "$BASE_DIR/viral"; + download_viral_neighbors($viral_lib_dir, $nbr_lib_dir); } } @@ -193,35 +190,103 @@ if ($INCLUDE_VIRAL_NEIGHBORS) { ## Functions sub download(@) { - my ($url, $file) = @_; - if (-f $file && !$OVERWRITE_FILES) { + my ($url, $file, $gunzipped_filename) = @_; + if (-s $file && !$OVERWRITE_FILES) { print STDERR "Not fetching $url - file $file exists.\n" if $VERBOSE; return 1; } - $url =~ s/https/http/; + start_fork() and return; + if ($url =~ /^http/) { + print STDERR "Fetching $url to $file ..." if $VERBOSE; + if (!-d dirname($file)) { + make_path(dirname($file)); + } + my $response = $ua->get($url, ':content_file' => $file); + if (!$response->is_success) { + print STDERR "\nFAIL: Error downloading $url!\n"; + print STDERR $response->status_line."\n"; + exit; + } else { + print STDERR "SUCCESS\n" if $VERBOSE; + } + } else { + if ( $DL_MOD_RSYNC && $url =~ /^ftp/ ) { + $url =~ s/^ftp/rsync/; + } + print STDERR "Fetching $url to $file ..." if $VERBOSE; + + my $ff = File::Fetch->new(uri=>"$url"); + my $where = $ff->fetch(to=> dirname($file)) or die $ff->error; + move($where, $file); - if ( $DL_MOD_RSYNC && $url =~ /^ftp/ ) { - $url =~ s/^ftp/rsync/; + if (defined $gunzipped_filename) { + print STDERR " GUNZIPPING"; + gunzip $file => $gunzipped_filename or die "gunzip failed: $GunzipError"; + unlink $file; + $file = $gunzipped_filename; + } + print STDERR " SUCCESS\n" if $VERBOSE; } - - print STDERR "Fetching $url to $file ...\n" if $VERBOSE; - my $ff = File::Fetch->new(uri=>"$url"); - my $where = $ff->fetch(to=> dirname($file)) or die $ff->error; + exit; #my $where = $ff->fetch(to=> dirname($file)) or die "\n$ff->error for $url!"; - move($where, $file); - return -f $file; + return -s $file; +} + +sub start_fork() { + my $pid; + return if $N_PROC <= 1; + if ($n_children == $N_PROC) { + $pid = wait(); + --$n_children; + } + if (defined($pid = fork())) { + if ($pid) { + ++$n_children; + #print STDERR "Parent: forked child $pid\n"; + push @pids, $pid; + } + } else { + print STDERR "ERROR: Failed to fork\n"; + } + return $pid; +} + +sub wait_children() { + foreach my $pid (@pids) { + waitpid $pid, 0; + } + @pids = (); + $n_children = 0; +} + +sub end_fork() { + exit() unless $N_PROC == 1; } sub download_viral_neighbors(@) { - my ($dir) = @_; - print STDERR "Downloading viral neighbors into $dir ...\n"; + my ($viral_dir, $nbr_dir) = @_; + print STDERR "Reading map files from $viral_dir ... \n"; + my %ac_to_taxid; + foreach my $f (glob("$viral_dir/*.map")) { + open (my $F, "<", $f); + while (<$F>) { + chomp; + my ($ac, $taxid, $name) = split(/\t/); + $ac =~ s/\.[0-9]*$//; + $ac_to_taxid{$ac} = [$name, $taxid]; + } + close ($F); + } + + print STDERR "Downloading viral neighbors into $nbr_dir ...\n"; my $url = "https://www.ncbi.nlm.nih.gov/genomes/GenomesGroup.cgi?taxid=10239&cmd=download2"; - my $nbr_file = "$dir/viral_neighbors-taxid10239.nbr"; + my $nbr_file = "$nbr_dir/viral_neighbors-taxid10239.nbr"; download($url, $nbr_file); open(my $F, "<", $nbr_file); my @file = <$F>; close($F); + my $i = 0; my $n_genomes = scalar @file; @@ -229,46 +294,44 @@ sub download_viral_neighbors(@) { next if /^#/; ++$i; print STDERR "\r Downloading viral neighbor sequence $i/$n_genomes ..." 
unless $VERBOSE; - my $pid = $pm->start and next; +# my $pid = $pm->start and next; + my ($rep_acs, $nbr_ac, undef, undef, $nname, $sname) = split /\t/; my ($name, $taxid); foreach my $rep_ac (split (/,/, $rep_acs)) { if (defined $ac_to_taxid{$rep_ac}) { ($name, $taxid) = @{$ac_to_taxid{$rep_ac}}; last; - } + } } if (!defined $taxid) { - print STDERR "No mapping for viral neighbor $nbr_ac [rep: $rep_acs, $nname]!\n"; - $pm->finish(0); + print STDERR "\nNo mapping for viral neighbor $nbr_ac [rep: $rep_acs, $nname]!\n"; next; } (my $name1 = $name) =~ s/[^a-zA-Z0-9_]/_/g; $name1 =~ s/__/_/g; - my $file = "$dir/$name1-tax$taxid/$nbr_ac.fna"; + my $file = "$nbr_dir/$name1-tax$taxid/$nbr_ac.fna"; my $url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&retmode=text&id=$nbr_ac"; if (download($url,$file)) { print_header_lines($file, $taxid, "$nname $sname neighbor $nbr_ac"); } - ## TODO: dust viral neighbors - $pm->finish(0); } print STDERR "\n"; - $pm->wait_all_children(); + wait_children; + +# $pm->wait_all_children(); } sub print_header_lines(@) { - my ($file, $taxid, $name, $map_ref) = @_; - #return if -f "$file.map"; + my ($file, $taxid, $name) = @_; + return if -s "$file.map"; + print STDERR "Making map file for $file\n" if ($VERBOSE); open (my $F, ">", "$file.map"); open (my $G, "<", $file); while (<$G>) { next unless /^>([^ ]*)/; my $ac = $1; print $F "$ac\t$taxid\t$name\n"; - $ac =~ s/\.[0-9]*$//; - $map_ref->{$ac} = [$name, $taxid] if defined $map_ref; - #$ac_to_taxid{$ac} = [$name, $taxid] if $downloaded_viral_refseq && $INCLUDE_VIRAL_NEIGHBORS; } close($G); close($F); @@ -372,13 +435,13 @@ sub download_domain(@) { foreach my $g (@genomes_to_dl) { my ($ftp_path, $taxid, $organism_name, $infraspecific_name, $assembly_accession) = @$g; ++$i; + print STDERR "\r Downloading $domain genomes: $i/$n_genomes ..." unless $VERBOSE; if (defined $infraspecific_name) { (my $i1 = $infraspecific_name) =~ s/strain=//; $organism_name .= " $infraspecific_name" unless $organism_name =~ /$i1/ || $i1 eq ""; } - print STDERR "\r Downloading $domain genomes: $i/$n_genomes ..." unless $VERBOSE; my $bname = basename($ftp_path); ( my $organism_name1 = $organism_name ) =~ s/[^a-zA-Z0-9_]/_/g; @@ -386,20 +449,16 @@ sub download_domain(@) { $organism_name1 =~ s/__/_/g; $organism_name1 =~ s/_$//; my $bname1 = "${organism_name1}-tax${taxid}-${bname}"; - my $pid = $pm->start and next; - my %local_ac_to_taxid; foreach my $ext (split(/,/, $FNA_FILES)) { my $full_ftp_path = "$ftp_path/${bname}_${ext}.fna.gz"; my $bfname = $bname1."_".$ext; my $fname = $bfname.".fna"; - if (!$OVERWRITE_FILES && -f "$domain_dir/$fname") { + if (!$OVERWRITE_FILES && -s "$domain_dir/$fname") { print STDERR "$domain_dir/$fname exists - not downloading.. 
\n" if $VERBOSE; } else { - download($full_ftp_path, "$domain_dir/$fname.gz"); - gunzip "$domain_dir/$fname.gz" => "$domain_dir/$fname" or die "gunzip failed: $GunzipError"; - unlink "$domain_dir/$fname.gz"; + download($full_ftp_path, "$domain_dir/$fname.gz", "$domain_dir/$fname"); } if ($CHANGE_HEADER) { @@ -411,11 +470,7 @@ sub download_domain(@) { ## Output sequenceID to taxonomy ID map to STDOUT - if ($domain eq "viral" && $INCLUDE_VIRAL_NEIGHBORS) { - print_header_lines("$domain_dir/$fname", $taxid, "$organism_name $assembly_accession", \%local_ac_to_taxid); - } else { - print_header_lines("$domain_dir/$fname", $taxid, "$organism_name $assembly_accession"); - } + print_header_lines("$domain_dir/$fname", $taxid, "$organism_name $assembly_accession"); if ($DO_DUST) { ## TODO: Consider hard-masking only low-complexity stretches with 10 or more bps @@ -423,9 +478,8 @@ sub download_domain(@) { unlink("$domain_dir/$fname"); } } - $pm->finish(0, \%local_ac_to_taxid); } - $pm->wait_all_children; +# $pm->wait_all_children; print STDERR "\n"; } From 7185d736f17a4a33667c0936e6290952098bfeb5 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 19:24:02 -0400 Subject: [PATCH 048/105] Add test files --- .gitignore | 2 ++ scripts/krakenu-standard_installation.sh | 2 +- tests/init.sh | 7 +++++++ tests/install_viral_databases.sh | 15 +++++++++++++++ 4 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 tests/init.sh create mode 100755 tests/install_viral_databases.sh diff --git a/.gitignore b/.gitignore index 27c6246..10700cd 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ /tests/dbs /tests/data /tests/install + +\.idea/ diff --git a/scripts/krakenu-standard_installation.sh b/scripts/krakenu-standard_installation.sh index 815d482..e09de80 100755 --- a/scripts/krakenu-standard_installation.sh +++ b/scripts/krakenu-standard_installation.sh @@ -30,7 +30,7 @@ then WOD_FLAG="--work-on-disk" fi -check_for_jellyfish.sh +krakenu-check_for_jellyfish.sh krakenu-download -o $KRAKEN_DB_NAME/taxonomy --download-taxonomy krakenu-download -o $KRAKEN_DB_NAME/library -d archaea,bacteria refseq > $KRAKEN_DB_NAME/seqid2taxid.map krakenu-download -o $KRAKEN_DB_NAME/library -d viral -a Any refseq >> $KRAKEN_DB_NAME/seqid2taxid.map diff --git a/tests/init.sh b/tests/init.sh new file mode 100644 index 0000000..c6cd8f3 --- /dev/null +++ b/tests/init.sh @@ -0,0 +1,7 @@ + +## Install KrakenU locally into install/ +../install_kraken.sh `pwd`/install + +## Download taxonomy and genomic data into data/ +install/krakenu-download --db data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any + diff --git a/tests/install_viral_databases.sh b/tests/install_viral_databases.sh new file mode 100755 index 0000000..b7e6c3d --- /dev/null +++ b/tests/install_viral_databases.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -xeu + +mkdir -p dbs/refseq-viral/library +mkdir -p dbs/refseq-viral-plus/library + +[[ -L dbs/refseq-viral/taxonomy ]] || ln -s data/taxonomy dbs/refseq-viral +[[ -L dbs/refseq-viral/library/viral ]] || ln -s data/library/viral/ dbs/refseq-viral/library +[[ -L dbs/refseq-viral-plus/library/viral ]] || ln -s data/library/viral/ dbs/refseq-viral-plus/library +[[ -L dbs/refseq-viral-plus/library/viral-neighbors ]] || ln -s data/library/viral-neighbors/ dbs/refseq-viral-plus/library + +export PATH="install:$PATH" +krakenu-build --db refseq-viral --build + From c7ea4b89cfe54755a7f74f126dc1e53e8493bc03 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: 
Wed, 27 Sep 2017 23:44:08 -0400 Subject: [PATCH 049/105] Added files for automated tests --- tests/build-dbs.sh | 13 +++++++++++++ tests/classify-reads.sh | 10 ++++++++++ tests/init.sh | 13 +++++++++++-- tests/install_viral_databases.sh | 15 --------------- tests/simulate-reads.sh | 9 +++++++++ 5 files changed, 43 insertions(+), 17 deletions(-) create mode 100755 tests/build-dbs.sh create mode 100755 tests/classify-reads.sh mode change 100644 => 100755 tests/init.sh delete mode 100755 tests/install_viral_databases.sh create mode 100755 tests/simulate-reads.sh diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh new file mode 100755 index 0000000..3e489e3 --- /dev/null +++ b/tests/build-dbs.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -xeu + +[[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 + +mkdir -p $DIR/dbs/refseq-viral-plus/library +[[ -L $DIR/dbs/refseq-viral-plus/library/viral ]] || ln -s $DIR/data/library/viral/ $DIR/dbs/refseq-viral-plus/library/ +[[ -L $DIR/dbs/refseq-viral-plus/library/viral-neighbors ]] || ln -s $DIR/data/library/viral-neighbors/ $DIR/dbs/refseq-viral-plus/library/ + +export PATH="$DIR/install:$PATH" +krakenu-build --db $DIR/dbs/refseq-viral --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy + diff --git a/tests/classify-reads.sh b/tests/classify-reads.sh new file mode 100755 index 0000000..807d287 --- /dev/null +++ b/tests/classify-reads.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -xeu + +[[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 +SDIR=$DIR/simulated_reads +CDIR=$DIR/classification-results +mkdir -p $CDIR + +NAM=viral-neighbors-10m +time $DIR/install/krakenu --threads 4 --db $DIR/dbs/refseq-viral --fastq ~/kraken-hll-test/simulated_reads/$NAM.fq --report-file $CDIR/$NAM.krakenu.report > $CDIR/$NAM.krakenu diff --git a/tests/init.sh b/tests/init.sh old mode 100644 new mode 100755 index c6cd8f3..a289d0d --- a/tests/init.sh +++ b/tests/init.sh @@ -1,7 +1,16 @@ +#!/bin/bash + +DIR=$1 +[[ "$DIR" == "" ]] && DIR=`pwd` ## Install KrakenU locally into install/ -../install_kraken.sh `pwd`/install +$(dirname $0)/install_kraken.sh --install-jellyfish $DIR/install ## Download taxonomy and genomic data into data/ -install/krakenu-download --db data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any +$DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any +for i in viral viral-neighbors archaea bacteria; do + if [[ ! 
-f "$DIR/data/all-$i.fna" ]]; then + find $DIR/data/library/$i -name '*.fna' -exec cat {} \; > $DIR/data/all-$i.fna + fi +done diff --git a/tests/install_viral_databases.sh b/tests/install_viral_databases.sh deleted file mode 100755 index b7e6c3d..0000000 --- a/tests/install_viral_databases.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -set -xeu - -mkdir -p dbs/refseq-viral/library -mkdir -p dbs/refseq-viral-plus/library - -[[ -L dbs/refseq-viral/taxonomy ]] || ln -s data/taxonomy dbs/refseq-viral -[[ -L dbs/refseq-viral/library/viral ]] || ln -s data/library/viral/ dbs/refseq-viral/library -[[ -L dbs/refseq-viral-plus/library/viral ]] || ln -s data/library/viral/ dbs/refseq-viral-plus/library -[[ -L dbs/refseq-viral-plus/library/viral-neighbors ]] || ln -s data/library/viral-neighbors/ dbs/refseq-viral-plus/library - -export PATH="install:$PATH" -krakenu-build --db refseq-viral --build - diff --git a/tests/simulate-reads.sh b/tests/simulate-reads.sh new file mode 100755 index 0000000..d5fd965 --- /dev/null +++ b/tests/simulate-reads.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +set -xeu + +[[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 +SDIR=$DIR/simulated_reads +mkdir -p $SDIR + +randomreads.sh ref=$DIR/data/all-viral-neighbors.fna out=$SDIR/viral-neighbors-10m.fq reads=10m len=150 From f9307644f7801033def058755c44a8a97927fac6 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 27 Sep 2017 23:46:11 -0400 Subject: [PATCH 050/105] Added OSX files --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 10700cd..4979752 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,5 @@ /tests/dbs /tests/data /tests/install - +*.dSYM \.idea/ From a359ba388ed7ca28cb921edbb6d906876bede04b Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 27 Sep 2017 23:53:38 -0400 Subject: [PATCH 051/105] Fix jellyfish installation - don't use make install --- install_kraken.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/install_kraken.sh b/install_kraken.sh index 12c50f1..0e662b6 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -50,20 +50,24 @@ fi # on OS X. export KRAKEN_DIR=$(perl -MCwd=abs_path -le 'print abs_path(shift)' "$1") +mkdir -p "$KRAKEN_DIR" if [ "$INSTALL_JELLYFISH" == "1" ]; then WD=`pwd` - cd /tmp - wget http://www.cbcb.umd.edu/software/jellyfish/jellyfish-1.1.11.tar.gz - tar xvvf jellyfish-1.1.11.tar.gz - cd jellyfish-1.1.11 - ./configure + cd $KRAKEN_DIR + if [[ ! 
-d jellyfish ]]; then + wget http://www.cbcb.umd.edu/software/jellyfish/jellyfish-1.1.11.tar.gz + tar xvvf jellyfish-1.1.11.tar.gz + mv jellyfish-1.1.11 jellyfish + fi + cd jellyfish + [[ -f Makefile ]] || ./configure make - cp bin/jellyfish $KRAKEN_DIR + #make install ## does not work for me on OSX + #cp $KRAKEN_DIR/jellyfish-install/bin/jellyfish $KRAKEN_DIR #rm -r jellyfish-1.1.11.tar.gz jellyfish-1.1.11 cd $WD fi -mkdir -p "$KRAKEN_DIR" #make -C src clean make -C $DIR/src install for file in $DIR/scripts/* From b32fdd8a7733bd84c7c89222d9f7c5d3f3a268be Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 27 Sep 2017 23:54:34 -0400 Subject: [PATCH 052/105] Add parameters --library-dir and --taxonomy-dir --- scripts/krakenu-build | 17 +++++++-- scripts/krakenu-build_db.sh | 76 +++++++++++++++++++++---------------- 2 files changed, 57 insertions(+), 36 deletions(-) diff --git a/scripts/krakenu-build b/scripts/krakenu-build index 1461663..59b0d50 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenu-build @@ -68,7 +68,10 @@ my ( $build_lca_database, $add_taxonomy_ids_for_genome, - $add_taxonomy_ids_for_seq + $add_taxonomy_ids_for_seq, + + $library_dir, + $taxonomy_dir ); @@ -125,8 +128,12 @@ GetOptions( "taxids-for-genomes" => \$add_taxonomy_ids_for_genome, "taxids-for-sequences" => \$add_taxonomy_ids_for_seq, + "lca-database!" => \$build_lca_database, - "uid-database!" => \$build_uid_database + "uid-database!"
-s "library-files.txt" ]; then echo "Finding all library files" - find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' > library-files.txt + find $FIND_OPTS $LIBRARY_DIR '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' > library-files.txt fi + +files0() { + cat library-files.txt | tr '\n' '\0' +} +cat_library() { + cat library-files.txt | tr '\n' '\0' | xargs -0 cat +} + N_FILES=`cat library-files.txt | wc -l` -echo "Found $N_FILES sequence files (*.{fna,fa,ffn} in the library)" +if [[ "$N_FILES" -eq 0 ]]; then + echo "ERROR: No fna, fa, or ffn files found in $LIBRARY_DIR!"; + exit 1 +fi +echo "Found $N_FILES sequence files (*.{fna,fa,ffn}) in the library directory." + if [ -e "database.jdb" ] || [ -e "database0.kdb" ] then @@ -98,13 +117,12 @@ else # Estimate hash size as 1.15 * chars in library FASTA files if [ -z "$KRAKEN_HASH_SIZE" ] then - KRAKEN_HASH_SIZE=$(find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -printf '%s\n' | perl -nle '$sum += $_; END {print int(1.15 * $sum)}') + KRAKEN_HASH_SIZE=$( files0 | xargs -0 stat -f%z | perl -nle '$sum += $_; END {print int(1.15 * $sum)}') echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'" fi - cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ - $JELLYFISH_BIN count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \ - -o database /dev/fd/0 + $JELLYFISH_BIN count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \ + -o database <( cat_library ) # Merge only if necessary if [ -e "database_1" ] @@ -181,16 +199,13 @@ else echo "K-mer set sorted. [$(report_time_elapsed $start_time1)]" fi -if [ -e "seqid2taxid.map" ] +if [ -s "seqid2taxid.map" ] then echo "Skipping step 4, seqID to taxID map already complete." else echo "Creating seqID to taxID map (step 4 of 6).." start_time1=$(date "+%s.%N") - #cat library-files.txt | tr '\n' '\0' | xargs -0 grep '^>' | sed 's/.//' | sed 's/ .*//' | sort > library-headers.txt - #join -t $'\t' nucl_gb.accession2taxid.sorted library-headers.txt > seqid2taxid.map.tmp - #mv seqid2taxid.map.tmp seqid2taxid.map - find library -name '*.map' -exec cat {} \; > seqid2taxid.map + find -L $LIBRARY_DIR/ -name '*.map' -exec cat {} \; > seqid2taxid.map line_ct=$(wc -l seqid2taxid.map | awk '{print $1}') echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" @@ -233,33 +248,34 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then fi start_time1=$(date "+%s.%N") set -x - cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -o database.kdb -i database.idx -v \ -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c database.kmer_count \ - -F /dev/fd/0 > seqid2taxid-plus.map - - ## Make a classification report - cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ - krakenu --db . --report-file $(basename `pwd`).report --threads 10 --fasta-input /dev/stdin > $(basename `pwd`).kraken + -F <( cat_library ) > seqid2taxid-plus.map set +x if [ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ] || [ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]; then mv seqid2taxid.map seqid2taxid.map.orig mv seqid2taxid-plus.map seqid2taxid.map fi + echo "LCA database created. [$(report_time_elapsed $start_time1)]" fi + ## Make a classification report + if [[ ! -s $(basename `pwd`).report ]]; then + echo "Creating database summary report ..." + krakenu --db . 
--report-file $(basename `pwd`).report --threads $KRAKEN_THREAD_CT --fasta-input <( cat_library ) > $(basename `pwd`).kraken + fi fi if [ "$KRAKEN_UID_DATABASE" != "0" ]; then if [ -e "uid_database.complete" ] then - echo "Skipping step 6.3, UIDs already set." + echo "Skipping step 6.3, UID database already generated." else echo "Building UID database (step 6.3 of 6)..." PARAM="" if [[ "$KRAKEN_LCA_DATABASE" == "0" ]]; then - if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" && ]]; then + if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ]]; then echo " Adding taxonomy IDs for sequences" PARAM=" -a" fi @@ -269,26 +285,20 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then fi fi start_time1=$(date "+%s.%N") - cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -I uid_to_taxid.map -o uid_database.kdb -i database.idx -v \ - -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 + -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c uid_database.kmer_count -F <( cat_library ) touch "uid_database.complete" echo "UID Database created. [$(report_time_elapsed $start_time1)]" fi -fi -if [ -s "uid_database.count" ] -then - echo "Skipping step 6.4, uid_database.kmer_count exists." -else - echo "Creating uid_database.kmer_count (step 6.4 of 6)... " - start_time1=$(date "+%s.%N") - time $JELLYFISH_BIN histo --high 100000000 uid_database.kdb > uid_database.kmer_count - echo "uid_database.kmer_count finished. [$(report_time_elapsed $start_time1)]" + ## Make a classification report + if [[ ! -s $(basename `pwd`).uid_report ]]; then + echo "Creating database summary report ..." + krakenu --db . --report-file $(basename `pwd`).uid_report --threads $KRAKEN_THREAD_CT --fasta-input <(cat_library) > $(basename `pwd`).uid_kraken + fi fi - echo "Database construction complete.
[Total: $(report_time_elapsed $start_time)] You can delete all files but database.{kdb,idx} and taxDB now, if you want" From 3c4e7e44f70056b9b9db55afb532b11d91f0fbf1 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 27 Sep 2017 23:55:18 -0400 Subject: [PATCH 053/105] Look for locally installed jellyfish first --- scripts/krakenu-check_for_jellyfish.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/krakenu-check_for_jellyfish.sh b/scripts/krakenu-check_for_jellyfish.sh index 311e307..c2aa2d7 100755 --- a/scripts/krakenu-check_for_jellyfish.sh +++ b/scripts/krakenu-check_for_jellyfish.sh @@ -26,8 +26,8 @@ set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands JELLYFISH_BIN="" -for JF in $(dirname $0)/jellyfish jellyfish1 jellyfish; do - if hash $JF 2>/dev/null; then +for JF in $(dirname $0)/jellyfish/bin/jellyfish /usr/local/opt/jellyfish-1.1/bin/jellyfish jellyfish1 jellyfish; do + if test -f $JF || hash $JF 2>/dev/null; then JELLYFISH_BIN=$JF; break fi From 3b7642d1c0c6c45ed8578309afa1eaac8c099e79 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 27 Sep 2017 23:58:50 -0400 Subject: [PATCH 054/105] Add slurp_file for file reading - fix for OSX --- src/quickfile.cpp | 44 ++++++++++++++++++++++++++++++++++++++++ src/quickfile.hpp | 4 ++++ src/set_lcas.cpp | 34 +++++++++++++++++------------------ 3 files changed, 65 insertions(+), 17 deletions(-) diff --git a/src/quickfile.cpp b/src/quickfile.cpp index ddabe9a..c518dd9 100644 --- a/src/quickfile.cpp +++ b/src/quickfile.cpp @@ -129,4 +129,48 @@ void QuickFile::close_file() { valid = false; } +// from http://programanddesign.com/cpp/human-readable-file-size-in-c/ +char* readable_fs(double size/*in bytes*/, char *buf) { + int i = 0; + const char* units[] = {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"}; + while (size > 1024) { + size /= 1024; + i++; + } + sprintf(buf, "%.*f %s", i, size, units[i]); + return buf; +} + + + +std::vector<char> slurp_file(std::string filename, size_t lSize) { + FILE * pFile; + size_t result; + + pFile = fopen ( filename.c_str() , "rb" ); + if (pFile==NULL) {fputs ("File error",stderr); exit (1);} + + if (lSize == 0) { + // obtain file size: + fseek (pFile , 0 , SEEK_END); + lSize = ftell (pFile); + rewind (pFile); + } + + char buf[50]; + readable_fs(lSize, buf); + std::cerr << "Getting " << filename << " into memory (" << buf << ") ..."; + + // copy the file into the vector: + std::vector<char> buffer(lSize); + result = fread (buffer.data(),1,lSize,pFile); + if (result != lSize) {fputs ("Reading error",stderr); exit (3);} + fclose (pFile); + + std::cerr << " Done" << std::endl; + return(std::move(buffer)); +} + + + } // namespace diff --git a/src/quickfile.hpp b/src/quickfile.hpp index 5533580..8f57642 100644 --- a/src/quickfile.hpp +++ b/src/quickfile.hpp @@ -21,6 +21,7 @@ #define QUICKFILE_HPP #include "kraken_headers.hpp" +#include <vector> namespace kraken { class QuickFile { @@ -43,6 +44,9 @@ namespace kraken { char *fptr; size_t filesize; }; + + std::vector<char> slurp_file(std::string filename, size_t lSize = 0); + } #endif diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 504e2b6..8db1033 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -78,6 +78,7 @@ TaxonomyDB<uint32_t> taxdb; const string prefix = "kraken:taxid|"; + int main(int argc, char **argv) { #ifdef _OPENMP omp_set_num_threads(1); @@ -105,31 +106,24 @@ int main(int argc, char **argv) { cerr << "Something went wrong while creating the file."
<< endl; exit(1); } + } + if (!Operate_in_RAM && Output_DB_filename.size() > 0) { + cerr << "You need to operate in RAM (flag -M) to use output to a different file (flag -o)" << endl; + return 1; } QuickFile db_file(DB_filename, "rw"); - - char *temp_ptr = NULL; size_t db_file_size = db_file.size(); + vector<char> dat; if (Operate_in_RAM) { - cerr << "Getting " << DB_filename << " into memory ... "; db_file.close_file(); - temp_ptr = new char[ db_file_size ]; - ifstream ifs(DB_filename.c_str(), ifstream::binary); - ifs.read(temp_ptr, db_file_size); - ifs.close(); - Database = KrakenDB(temp_ptr); - cerr << "done" << endl; + dat = slurp_file(DB_filename, db_file_size); + Database = KrakenDB(dat.data()); } else { if (Output_DB_filename.size() > 0) { - cerr << "You need to operate in RAM (flag -M) to use output to a different file (flag -o)" << endl; - return 1; + //system("cp " + DB_filename + " " + Output_DB_filename); } - //std::ifstream ifs("input.txt", std::ios::binary); - //std::ofstream ofs("output.txt", std::ios::binary); - //ofs << ifs.rdbuf(); Database = KrakenDB(db_file.ptr()); } @@ -160,9 +154,9 @@ int main(int argc, char **argv) { } cerr << "Writing database from RAM back to " << DB_filename << " ..." << endl; ofstream ofs(DB_filename.c_str(), ofstream::binary); - ofs.write(temp_ptr, db_file_size); + ofs.write(dat.data(), db_file_size); ofs.close(); - delete temp_ptr; + dat.clear(); } UID_map_file.close(); @@ -204,6 +198,8 @@ unordered_map<string, uint32_t> read_seqid_to_taxid_map(string ID_to_taxon_map_fi TaxonomyDB<uint32_t>& taxdb, unordered_map<uint32_t, uint32_t>& Parent_map, bool Add_taxIds_for_Assembly, bool Add_taxIds_for_Sequences) { + cerr << "Reading sequence ID to taxonomy ID mapping ... "; + unordered_map<string, uint32_t> ID_to_taxon_map; ifstream map_file(ID_to_taxon_map_filename.c_str()); if (map_file.rdstate() & ifstream::failbit) { @@ -243,6 +239,10 @@ unordered_map<string, uint32_t> read_seqid_to_taxid_map(string ID_to_taxon_map_fi } ID_to_taxon_map[seq_id] = taxid; } + if (ID_to_taxon_map.size() == 0) { + cerr << "Error: No ID mappings present!!" << endl; + } + cerr << " Done - read " << ID_to_taxon_map.size() << " mappings."
<< endl; return std::move(ID_to_taxon_map); } From da0978e4d4e22705231f59d659be249d331336e5 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 28 Sep 2017 01:31:12 -0400 Subject: [PATCH 055/105] Added classification rater (not working yet) --- src/grade_classification.cpp | 76 ++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 src/grade_classification.cpp diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp new file mode 100644 index 0000000..8f9b1e0 --- /dev/null +++ b/src/grade_classification.cpp @@ -0,0 +1,76 @@ +/* + * Copyright 2017, Florian Breitwieser + * licensed under GPLv3 + */ + +#include "taxdb.h" +#include "quickfile.hpp" +#include <iostream> +#include <fstream> +#include <sstream> + +using namespace std; + +unordered_map<string, uint32_t> read_seqid_mapping(string filename) { + unordered_map<string, uint32_t> ID_to_taxon_map; + ifstream map_file(filename.c_str()); + if (map_file.rdstate() & ifstream::failbit) { + err(EX_NOINPUT, "can't open %s", filename.c_str()); + } + string line, seq_id; + uint32_t taxid; + + while (map_file.good()) { + getline(map_file, line); + if (line.empty()) + break; + istringstream iss(line); + iss >> seq_id >> taxid; + ID_to_taxon_map[seq_id] = taxid; + } + map_file.close(); + return ID_to_taxon_map; +} + +int main(int argc, char **argv) { + if (argc != 4) { + std::cerr << "Usage: grade_classification taxDB seqid2taxid.map classification_file\n"; + return 1; + } + TaxonomyDB<uint32_t> taxdb = TaxonomyDB<uint32_t>(argv[1], false); + unordered_map<string, uint32_t> seqid_map = read_seqid_mapping(argv[2]); + cerr << "Read " << seqid_map.size() << " taxa mappings" << endl; + + ifstream k_file(argv[3]); + if (k_file.rdstate() & ifstream::failbit) { + err(EX_NOINPUT, "can't open %s", argv[3]); + } + string line, classification_state, read_id, seq_id; + uint32_t taxid; + uint32_t seq_taxid; + + while (k_file.good()) { + getline(k_file, line); + if (line.empty()) + continue; + istringstream iss(line); + iss >> classification_state >> read_id >> taxid; + seq_id = read_id.substr(read_id.find_last_of("_")+1); + auto it = seqid_map.find(seq_id); + if (it == seqid_map.end()) { + cerr << "ERROR: Couldn't find taxid for " << seq_id << endl; + } else { + seq_taxid = it->second; + size_t distance_between_taxids; + string lowest_common_rank; + seq_taxid = taxdb.getTaxIDAtRank(seq_taxid, "species"); + taxid = taxdb.getTaxIDAtRank(taxid, "species"); + pair<uint32_t, size_t> lca_taxid_dist = taxdb.getLowestCommonAncestor(seq_taxid, taxid); + string lca_rank = taxdb.getRank(lca_taxid_dist.first); + cout << seq_taxid << '\t' << taxid << '\t' << lca_rank << '\t' << lca_taxid_dist.first << '\t' << lca_taxid_dist.second << endl; + } + } + k_file.close(); + + +} From f49630a3b8648ba57d57116ba93829e7e26e402c Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sat, 30 Sep 2017 13:15:51 -0400 Subject: [PATCH 056/105] Allow multiple library directories on command line --- scripts/krakenu-build | 7 ++++--- scripts/krakenu-build_db.sh | 14 +++++++------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/scripts/krakenu-build b/scripts/krakenu-build index 59b0d50..8f72697 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenu-build @@ -70,11 +70,12 @@ my ( $build_lca_database, $add_taxonomy_ids_for_genome, - $add_taxonomy_ids_for_seq, - $library_dir, + $add_taxonomy_ids_for_seq, $taxonomy_dir ); +my @library_dirs; + my $verbose = 0; $threads = $DEF_THREAD_CT; @@ -132,7 +133,7 @@ GetOptions( "lca-database!" => \$build_lca_database, "uid-database!"
=> \$build_uid_database, - "library-dir=s" => \$library_dir, + "library-dir=s" => \@library_dirs, "taxonomy-dir=s" => \$taxonomy_dir ) or usage(); @@ -325,7 +326,7 @@ sub build_database { $ENV{"KRAKEN_ADD_TAXIDS_FOR_GENOME"} = $add_taxonomy_ids_for_genome; $ENV{"KRAKEN_UID_DATABASE"} = $build_uid_database; $ENV{"KRAKEN_LCA_DATABASE"} = $build_lca_database; - $ENV{"KRAKEN_LIBRARY_DIR"} = $library_dir; + $ENV{"KRAKEN_LIBRARY_DIRS"} = "@library_dirs"; $ENV{"KRAKEN_TAXONOMY_DIR"} = $taxonomy_dir; my $opt = ($verbose? "-x" : ""); exec "krakenu-build_db.sh"; diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index 48260ad..fb79fac 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -80,9 +80,9 @@ then fi LIBRARY_DIR="library/" -[[ "$KRAKEN_LIBRARY_DIR" != "" ]] && LIBRARY_DIR="$KRAKEN_LIBRARY_DIR" +[[ "$KRAKEN_LIBRARY_DIRS" != "" ]] && LIBRARY_DIR="$KRAKEN_LIBRARY_DIRS" -TAXONOMY_DIR="library/" +TAXONOMY_DIR="taxonomy/" [[ "$KRAKEN_TAXONOMY_DIR" != "" ]] && TAXONOMY_DIR="$KRAKEN_TAXONOMY_DIR" if [ ! -s "library-files.txt" ]; then @@ -218,15 +218,15 @@ then else echo "Creating taxDB (step 5 of 6)... " start_time1=$(date "+%s.%N") - if [ ! -f taxonomy/names.dmp ] || [ ! -f taxonomy/nodes.dmp ]; then - echo "taxonomy/names.dmp or taxonomy/nodes.dmp does not exist - downloading it ..." - [ -d taxonomy ] || mkdir taxonomy - cd taxonomy + if [ ! -f $TAXONOMY_DIR/names.dmp ] || [ ! -f $TAXONOMY_DIR/nodes.dmp ]; then + echo "$TAXONOMY_DIR/names.dmp or $TAXONOMY_DIR/nodes.dmp does not exist - downloading it ..." + [ -d $TAXONOMY_DIR ] || mkdir $TAXONOMY_DIR + cd $TAXONOMY_DIR wget $FTP_SERVER/pub/taxonomy/taxdump.tar.gz tar zxf taxdump.tar.gz cd .. fi - build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp | sort -t$'\t' -rnk6,6 -rnk5,5 > taxDB.tmp + build_taxdb $TAXONOMY_DIR/names.dmp $TAXONOMY_DIR/nodes.dmp | sort -t$'\t' -rnk6,6 -rnk5,5 > taxDB.tmp mv taxDB.tmp taxDB echo "taxDB construction finished. [$(report_time_elapsed $start_time1)]" fi From 314f49c3966a02bb712b33bcbda71b0969b50364 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sat, 30 Sep 2017 13:17:44 -0400 Subject: [PATCH 057/105] Update on tests --- src/Makefile | 7 +- src/grade_classification.cpp | 129 +++++++++++++++++++++++--- src/krakendb.cpp | 17 +++- src/krakendb.hpp | 6 +- src/taxdb.h | 173 +++++++++++++++++++++++++++++++++-- tests/build-dbs.sh | 10 +- tests/classify-reads.sh | 8 +- tests/init.sh | 7 +- tests/simulate-reads.sh | 44 ++++++++- 9 files changed, 372 insertions(+), 29 deletions(-) diff --git a/src/Makefile b/src/Makefile index 82246e9..f127108 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,7 @@ CXX = g++ -CXXFLAGS = -Wall -std=c++11 -fopenmp -O2 -g -Wfatal-errors -PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb +FOPENMP?=-fopenmp +CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O2 -g -Wfatal-errors +PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification LIBFLAGS = -L. 
-I./gzstream -L./gzstream -lz -lgzstream .PHONY: all install clean @@ -19,6 +20,8 @@ db_sort: krakendb.o quickfile.o set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.cpp +grade_classification: taxdb.h + classify: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.cpp $(CXX) $(CXXFLAGS) -o classify classify.cpp $^ $(LIBFLAGS) diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp index 8f9b1e0..f787065 100644 --- a/src/grade_classification.cpp +++ b/src/grade_classification.cpp @@ -8,9 +8,12 @@ #include <iostream> #include <fstream> #include <sstream> +#include <iomanip> using namespace std; +using TAXID = uint32_t; + unordered_map<string, uint32_t> read_seqid_mapping(string filename) { unordered_map<string, uint32_t> ID_to_taxon_map; ifstream map_file(filename.c_str()); @@ -33,20 +36,35 @@ unordered_map<string, uint32_t> read_seqid_mapping(string filename) { } int main(int argc, char **argv) { - if (argc != 4) { - std::cerr << "Usage: grade_classification taxDB seqid2taxid.map classification_file\n"; + if (argc != 5) { + std::cerr << "Usage: grade_classification taxDB seqid2taxid.map classification_file result_file\n"; return 1; } TaxonomyDB<uint32_t> taxdb = TaxonomyDB<uint32_t>(argv[1], false); unordered_map<string, uint32_t> seqid_map = read_seqid_mapping(argv[2]); cerr << "Read " << seqid_map.size() << " taxa mappings" << endl; + ofstream out_file(argv[4]); + set<string> all_ranks; + unordered_map< string, size_t > rank_counts; + map< int, set<TAXID> > simulated_taxids_at_rank; + map< int, set<TAXID> > identified_taxids_at_rank; + map< int, size_t > correct_reads_at_rank; + map< int, size_t > incorrect_reads_at_rank; + map< int, size_t > reads_at_higher_rank; + size_t total_reads = 0; + size_t unidentified_reads = 0; + + + vector<TaxRank::RANK> ranks_of_interest = {TaxRank::RANK::assembly, TaxRank::RANK::species, TaxRank::RANK::genus, TaxRank::RANK::family, TaxRank::RANK::order}; + ifstream k_file(argv[3]); if (k_file.rdstate() & ifstream::failbit) { err(EX_NOINPUT, "can't open %s", argv[3]); } + string line, classification_state, read_id, seq_id; - uint32_t taxid; + uint32_t identified_taxid; uint32_t seq_taxid; while (k_file.good()) { getline(k_file, line); if (line.empty()) continue; istringstream iss(line); - iss >> classification_state >> read_id >> taxid; + iss >> classification_state >> read_id >> identified_taxid; + + ++total_reads; + if (identified_taxid == 0) { + ++unidentified_reads; + } + + // sequence id is after the 5th underscore with random_reads.sh - find it + size_t pos = 0; + size_t count = 0; + do { + pos = read_id.find("_", pos) + 1; + ++count; + } while (count <= 5 && pos != std::string::npos); + + seq_id = read_id.substr(pos); auto it = seqid_map.find(seq_id); if (it == seqid_map.end()) { cerr << "ERROR: Couldn't find taxid for " << seq_id << endl; + exit(1); } else { seq_taxid = it->second; - size_t distance_between_taxids; - string lowest_common_rank; - seq_taxid = taxdb.getTaxIDAtRank(seq_taxid, "species"); - taxid = taxdb.getTaxIDAtRank(taxid, "species"); - pair<uint32_t, size_t> lca_taxid_dist = taxdb.getLowestCommonAncestor(seq_taxid, taxid); - string lca_rank = taxdb.getRank(lca_taxid_dist.first); - cout << seq_taxid << '\t' << taxid << '\t' << lca_rank << '\t' << lca_taxid_dist.first << '\t' << lca_taxid_dist.second << endl; + + // go up to species level or next proper (i.e.
not 'no rank') rank for + // both real and assigned taxon + if (0) { + seq_taxid = taxdb.getTaxIDAtRank(seq_taxid, "species"); + uint32_t identified_species_taxid = taxdb.getTaxIDAtRank(identified_taxid, "species"); + if (identified_species_taxid != 0) { + identified_taxid = identified_species_taxid; + } else { + identified_taxid = taxdb.getTaxIDAtNextProperRank(identified_taxid); + } + } + + string seq_species = taxdb.getScientificName(seq_taxid); + // getLowestCommonAncestor returns lca taxon as well as distance between the taxa + pair lca_taxid_dist = taxdb.getLowestCommonAncestor(seq_taxid, identified_taxid); + string lca_rank_string = taxdb.getNextProperRank(lca_taxid_dist.first); + TaxRank::RANK lca_rank = TaxRank::toRank(lca_rank_string); + + TaxRank::RANK identified_rank = TaxRank::toRank(taxdb.getRank(identified_taxid)); + for (TaxRank::RANK rank : ranks_of_interest) { + TAXID simulated_taxid_at_rank = taxdb.getTaxIDAtRank(seq_taxid, TaxRank::toString(rank)); + TAXID identified_taxid_at_rank = taxdb.getTaxIDAtRank(identified_taxid, TaxRank::toString(rank)); + simulated_taxids_at_rank[rank].insert(simulated_taxid_at_rank); + // only consider identifications at the rank or more specific + // alternative: count identifications that are further up, too + if (identified_rank <= rank) { + identified_taxids_at_rank[rank].insert(identified_taxid_at_rank); + if (simulated_taxid_at_rank == identified_taxid_at_rank) { + ++correct_reads_at_rank[rank]; + } else { + ++incorrect_reads_at_rank[rank]; + } + } else { + ++reads_at_higher_rank[rank]; + } + } + + if (identified_taxid == 0) + lca_rank_string = "unidentified"; + ++rank_counts[lca_rank_string]; + out_file << seq_species << '\t' << seq_taxid << '\t' << identified_taxid << '\t' << lca_rank_string << '\t' << lca_taxid_dist.first << '\t' << lca_taxid_dist.second << '\n'; } } k_file.close(); + cout << "#LCA_RANK_READ_COUNTS" << endl; + for (const auto & kv : rank_counts) { + cout << kv.first << '\t' << kv.second << endl; + } + cout << "\n#rank; total_reads; correct; incorrect; at_higher_rank; unidentified" << endl; + for (TaxRank::RANK rank : ranks_of_interest) { + cout << TaxRank::toString(rank) << '\t' << total_reads + << '\t' << correct_reads_at_rank[rank] + << '\t' << incorrect_reads_at_rank[rank] + << '\t' << reads_at_higher_rank[rank] + << '\t' << unidentified_reads + << '\n'; + } + cout << "\n#rank;P;TP;FP;sens;prec" << endl; + for (TaxRank::RANK rank : ranks_of_interest) { + size_t true_positives = 0; + size_t false_positives = 0; + + for (const auto & tid : identified_taxids_at_rank[rank]) { + if (simulated_taxids_at_rank[rank].count(tid) == 1) { + ++true_positives; + } else { + ++false_positives; + } + } + + double sensitivity = 100.0*(double)true_positives/(double)simulated_taxids_at_rank[rank].size(); + double specificity = 100.0*(double)true_positives/(double)(true_positives+false_positives); + + cout << TaxRank::toString(rank) + << '\t' << simulated_taxids_at_rank[rank].size() + << '\t' << true_positives + << '\t' << false_positives << setprecision(2) << std::fixed + << '\t' << sensitivity << '%' + << '\t' << specificity << '%' + << '\n'; + } } diff --git a/src/krakendb.cpp b/src/krakendb.cpp index de33901..cae738f 100644 --- a/src/krakendb.cpp +++ b/src/krakendb.cpp @@ -52,14 +52,21 @@ KrakenDB::KrakenDB() { key_len = 0; key_bits = 0; k = 0; + _filesize = 0; } // Assumes ptr points to start of a readable mmap'ed file -KrakenDB::KrakenDB(char *ptr) { +KrakenDB::KrakenDB(char *ptr, size_t filesize) { + _filesize = filesize; 
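+ // Keeping the on-disk size here lets filesize() report it later without querying the filesystem again.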
index_ptr = NULL; fptr = ptr; - if (strncmp(ptr, DATABASE_FILE_TYPE, strlen(DATABASE_FILE_TYPE))) - errx(EX_DATAERR, "database in improper format"); + if (ptr == NULL) { + errx(EX_DATAERR, "pointer is NULL"); + } + if (strncmp(ptr, DATABASE_FILE_TYPE, strlen(DATABASE_FILE_TYPE))) { + string msg = "database in improper format - found " + string(ptr, strlen(DATABASE_FILE_TYPE)); + errx(EX_DATAERR, msg.c_str()); + } memcpy(&key_bits, ptr + 8, 8); memcpy(&val_len, ptr + 16, 8); memcpy(&key_ct, ptr + 48, 8); @@ -70,6 +77,10 @@ KrakenDB::KrakenDB(char *ptr) { std::cerr << "Loaded database with " << key_ct << " keys with k of " << (size_t)k << " [val_len " << val_len << ", key_len " << key_len << "]." << std::endl; } +size_t KrakenDB::filesize() const { + return _filesize; +} + //using std::map to have the keys sorted std::map KrakenDB::count_taxons() { char *ptr = get_pair_ptr(); diff --git a/src/krakendb.hpp b/src/krakendb.hpp index 4683654..5a19a71 100644 --- a/src/krakendb.hpp +++ b/src/krakendb.hpp @@ -86,14 +86,18 @@ namespace kraken { void set_index(KrakenDBIndex *i_ptr); + size_t filesize() const; + // Null constructor KrakenDB(); // ptr points to start of mmap'ed DB in read or read/write mode - KrakenDB(char *ptr); + KrakenDB(char *ptr, size_t filesize = 0); + private: + size_t _filesize; char *fptr; KrakenDBIndex *index_ptr; uint8_t k; diff --git a/src/taxdb.h b/src/taxdb.h index 0495c8c..28313f7 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -46,15 +46,109 @@ std::vector in_betweens(const std::string &s, const char start_char std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields = 0, size_t end_chars = 0); - std::vector get_fields(const std::string &s, const std::string& delimiter, std::vector fields); +// TODO: Consider using TaxRank instead of string in TaxonomyEntry +// However, then it would not be possible to define custom ranks.. 
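+// The enum below is ordered from most to least specific, so a plain integer
+// comparison means "at this rank or more specific". A minimal usage sketch
+// (illustration only, not part of the header):
+//
+//   TaxRank::RANK r = TaxRank::toRank(taxdb.getRank(taxid));
+//   if (r <= TaxRank::RANK::species) {
+//     // taxid is resolved at species level or below, e.g. a subspecies
+//   }
+//
+// grade_classification.cpp relies on exactly this ordering when it tests
+// identified_rank <= rank for each rank of interest.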
+struct TaxRank { + // All ranks that appear in the NCBI taxonomy database, + // plus 'sequence', 'assembly', and 'root' + //static constexpr vector rank_strings = { + // "no rank", "sequence", "assembly", + // "subspecies", "species", "subgenus", "genus", "tribe", "subfamily", + //"family", "superfamily", "parvorder", "infraorder", "suborder", + //"order", "superorder", "parvclass", "infraclass", "subclass", + //"class", "superclass", "subphylum", "phylum", "kingdom", + //"superkingdom", "root"}; + + enum RANK { unknown, no_rank, sequence, assembly, + subspecies, species, subgenus, genus, tribe, subfamily, + family, superfamily, parvorder, infraorder, suborder, + order, superorder, parvclass, infraclass, subclass, + class_, superclass, subphylum, phylum, kingdom, + superkingdom, root + }; + + static const unordered_map string_to_rank; + + static const RANK toRank(const string& rank) { + return string_to_rank.at(rank); + } + + static const char* toString(const TaxRank::RANK& rank) { + switch(rank) { + case RANK::unknown: return "unknown"; + case RANK::no_rank: return "no rank"; + case RANK::sequence: return "sequence"; + case RANK::assembly: return "assembly"; + case RANK::subspecies: return "subspecies"; + case RANK::species: return "species"; + case RANK::subgenus: return "subgenus"; + case RANK::genus: return "genus"; + case RANK::tribe: return "tribe"; + case RANK::subfamily: return "subfamily"; + case RANK::family: return "family"; + case RANK::superfamily: return "superfamily"; + case RANK::parvorder: return "parvorder"; + case RANK::infraorder: return "infraorder"; + case RANK::suborder: return "suborder"; + case RANK::order: return "order"; + case RANK::superorder: return "superorder"; + case RANK::parvclass: return "parvclass"; + case RANK::infraclass: return "infraclass"; + case RANK::subclass: return "subclass"; + case RANK::class_: return "class"; + case RANK::superclass: return "superclass"; + case RANK::subphylum: return "subphylum"; + case RANK::phylum: return "phylum"; + case RANK::kingdom: return "kingdom"; + case RANK::superkingdom: return "superkingdom"; + case RANK::root: return "root"; + default: + log_msg("Invalid rank!"); + } + return "NA"; + } + +}; + +const unordered_map TaxRank::string_to_rank = { + {"unknown", TaxRank::unknown}, + {"no rank", TaxRank::no_rank}, + {"sequence", TaxRank::sequence}, + {"assembly", TaxRank::assembly}, + {"subspecies", TaxRank::subspecies}, + {"species", TaxRank::species}, + {"subgenus", TaxRank::subgenus}, + {"genus", TaxRank::genus}, + {"tribe", TaxRank::tribe}, + {"subfamily", TaxRank::subfamily}, + {"family", TaxRank::family}, + {"superfamily", TaxRank::superfamily}, + {"parvorder", TaxRank::parvorder}, + {"infraorder", TaxRank::infraorder}, + {"suborder", TaxRank::suborder}, + {"order", TaxRank::order}, + {"superorder", TaxRank::superorder}, + {"parvclass", TaxRank::parvclass}, + {"infraclass", TaxRank::infraclass}, + {"subclass", TaxRank::subclass}, + {"class", TaxRank::class_}, + {"superclass", TaxRank::superclass}, + {"subphylum", TaxRank::subphylum}, + {"phylum", TaxRank::phylum}, + {"kingdom", TaxRank::kingdom}, + {"superkingdom", TaxRank::superkingdom}, + {"root", TaxRank::root} +}; + + template class TaxonomyEntry { public: TAXID taxonomyID = 0; TAXID parentTaxonomyID = 0; - std::string rank; + string rank; std::string scientificName; TaxonomyEntry() {} @@ -107,6 +201,9 @@ class TaxonomyDB { std::string getScientificName(const TAXID taxID) const; std::string getRank(const TAXID taxID) const; TAXID 
getLowestCommonAncestor(const std::vector<TAXID>& taxIDs) const; + pair<TAXID, int> getLowestCommonAncestor(TAXID a, TAXID b) const; + string getNextProperRank(TAXID a) const; + TAXID getTaxIDAtNextProperRank(TAXID a) const; TAXID getParentTaxID(const TAXID taxID) const; std::unordered_map<TAXID, TAXID> getParentMap() const; @@ -132,6 +229,7 @@ class TaxonomyDB { std::unordered_map<TAXID, TaxonomyEntry<TAXID, READCOUNTS> > taxIDsAndEntries; bool genomeSizes_are_set = false; + private: std::unordered_map<TAXID, TaxonomyEntry<TAXID, READCOUNTS> > @@ -471,6 +569,63 @@ std::unordered_map<TAXID, TaxonomyEntry<TAXID, READCOUNTS> > return(taxIDsAndEntries); } +template<typename TAXID, typename READCOUNTS> +string TaxonomyDB<TAXID, READCOUNTS>::getNextProperRank(TAXID a) const { + if (a == 0) { + return "NA"; + } + while (getRank(a) == "no rank" && a != getParentTaxID(a)) { + a = getParentTaxID(a); + } + if ( a == 1 ) { + return "root"; + } + return getRank(a); +} + +template<typename TAXID, typename READCOUNTS> +TAXID TaxonomyDB<TAXID, READCOUNTS>::getTaxIDAtNextProperRank(TAXID a) const { + if (a == 0 || a == 1) { + return 0; + } + while (getRank(a) == "no rank" && a != getParentTaxID(a)) { + a = getParentTaxID(a); + } + return a; +} + +template<typename TAXID, typename READCOUNTS> +pair<TAXID, int> TaxonomyDB<TAXID, READCOUNTS>::getLowestCommonAncestor(TAXID a, TAXID b) const { + if (a == 0 || b == 0) { + return a ? pair<TAXID, int>(a,-1) : pair<TAXID, int>(b,-1); + } + + // create a path from a to the root, remembering each node's distance from a + std::unordered_map<TAXID, int> a_path; + int distA = 0; + while (a > 0 && a != getParentTaxID(a)) { + if (a == b) + return pair<TAXID, int>{a, distA}; + a_path[a] = distA; + a = getParentTaxID(a); + ++distA; + } + + int distB = 0; + // search for b in the path from a to the root; on a hit, it->second is the + // distance from the original a to the common ancestor + while (b > 0 && b != getParentTaxID(b)) { + auto it = a_path.find(b); + if (it != a_path.end()) { + return pair<TAXID, int>(b, distB + it->second); + } + b = getParentTaxID(b); + ++distB; + } + return pair<TAXID, int>(1, distA+distB); +} + + + template<typename TAXID, typename READCOUNTS> TAXID TaxonomyDB<TAXID, READCOUNTS>::getLowestCommonAncestor( const std::vector<TAXID>& taxIDs) const { @@ -623,11 +778,13 @@ std::string TaxonomyDB<TAXID, READCOUNTS>::getMetaPhlAnLineage(TAXID taxonomyID) template<typename TAXID, typename READCOUNTS> TAXID TaxonomyDB<TAXID, READCOUNTS>::getTaxIDAtRank(const TAXID taxID, const std::string& rank) const { + if (taxID == 0 || taxID == 1) + return 0; auto entry = taxIDsAndEntries.find(taxID); - //cerr << "getTaxIDAtRank(" << taxID << "," << rank << ")" << endl; + // cerr << "getTaxIDAtRank(" << taxID << "," << rank << ")" << endl; while (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) { - //cerr << "Checking rank of " << entry->second.taxonomyID << ": " << entry->second.rank << endl; + // cerr << "Checking rank of " << entry->second.taxonomyID << ": " << entry->second.rank << endl; if (entry->second.rank == rank) { return entry->second.taxonomyID; } else { @@ -722,8 +879,12 @@ void TaxonomyDB<TAXID, READCOUNTS>::setReadCounts(const unordered_map<TAXID, READCOUNTS>& readCounts) - TaxReport<READCOUNTS>::TaxReport(std::ostream& reportOfb, TaxonomyDB<uint32_t, READCOUNTS>& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { - _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_KMERS_CLADE, REPORTCOLS::NUM_UNIQUE_KMERS_CLADE, REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; + TaxReport<READCOUNTS>::TaxReport(std::ostream& reportOfb, TaxonomyDB<uint32_t, READCOUNTS>& taxdb, + bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { + _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, + REPORTCOLS::NUM_KMERS_CLADE, REPORTCOLS::NUM_UNIQUE_KMERS_CLADE, + REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, + REPORTCOLS::SPACED_NAME}; } diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index 3e489e3..8002fe9 100755 --- a/tests/build-dbs.sh +++
b/tests/build-dbs.sh @@ -9,5 +9,13 @@ mkdir -p $DIR/dbs/refseq-viral-plus/library [[ -L $DIR/dbs/refseq-viral-plus/library/viral-neighbors ]] || ln -s $DIR/data/library/viral-neighbors/ $DIR/dbs/refseq-viral-plus/library/ export PATH="$DIR/install:$PATH" -krakenu-build --db $DIR/dbs/refseq-viral --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy +for K in 21 26 31; do + mkdir -p $DIR/dbs/refseq-viral-k$K + krakenu-build --kmer-len $K --minimizer-len 12 --threads 4 --db $DIR/dbs/refseq-viral-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy + + if [[ `uname` != "Darwin" ]]; then + krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --taxonomy-dir=$DIR/data/taxonomy + + fi +done diff --git a/tests/classify-reads.sh b/tests/classify-reads.sh index 807d287..802b29b 100755 --- a/tests/classify-reads.sh +++ b/tests/classify-reads.sh @@ -7,4 +7,10 @@ CDIR=$DIR/classification-results mkdir -p $CDIR NAM=viral-neighbors-10m -time $DIR/install/krakenu --threads 4 --db $DIR/dbs/refseq-viral --fastq ~/kraken-hll-test/simulated_reads/$NAM.fq --report-file $CDIR/$NAM.krakenu.report > $CDIR/$NAM.krakenu +for K in 21 26 31; do + KFILE=$CDIR/$NAM.k$K.krakenu + [[ -s $KFILE ]] || time $DIR/install/krakenu --threads 4 --db $DIR/dbs/refseq-viral-k$K --fastq ~/kraken-hll-test/simulated_reads/$NAM.fq --report-file $KFILE.report > $KFILE 2> $KFILE.log + [[ -s $KFILE.results ]] || $DIR/install/grade_classification $DIR/dbs/refseq-viral-k$K/taxDB $DIR/data/all-viral-neighbors.map $KFILE > $KFILE.results + [[ -s $KFILE.results.stats ]] || cut -f 4 $KFILE.results | sort | uniq -c | sort -n > $KFILE.results.stats + +done diff --git a/tests/init.sh b/tests/init.sh index a289d0d..0c341fa 100755 --- a/tests/init.sh +++ b/tests/init.sh @@ -4,13 +4,16 @@ DIR=$1 [[ "$DIR" == "" ]] && DIR=`pwd` ## Install KrakenU locally into install/ -$(dirname $0)/install_kraken.sh --install-jellyfish $DIR/install +$(dirname $0)/../install_kraken.sh --install-jellyfish $DIR/install ## Download taxonomy and genomic data into data/ -$DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any +#$DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any for i in viral viral-neighbors archaea bacteria; do if [[ ! -f "$DIR/data/all-$i.fna" ]]; then find $DIR/data/library/$i -name '*.fna' -exec cat {} \; > $DIR/data/all-$i.fna fi + if [[ ! 
-f "$DIR/data/all-$i.map" ]]; then + find $DIR/data/library/$i -name '*.map' -exec cat {} \; > $DIR/data/all-$i.map + fi done diff --git a/tests/simulate-reads.sh b/tests/simulate-reads.sh index d5fd965..09d7db7 100755 --- a/tests/simulate-reads.sh +++ b/tests/simulate-reads.sh @@ -4,6 +4,48 @@ set -xeu [[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 SDIR=$DIR/simulated_reads +CDIR=$DIR/classification-results +mkdir -p $CDIR mkdir -p $SDIR -randomreads.sh ref=$DIR/data/all-viral-neighbors.fna out=$SDIR/viral-neighbors-10m.fq reads=10m len=150 +run_krakenu_viral() { + FQ=$1 + NAM=$2 + K=$3 + DAT=$4 + + KFILE=$CDIR/$NAM.k$K.krakenu + [[ -s $KFILE ]] || time $DIR/install/krakenu --threads 4 --db $DIR/dbs/refseq-viral-k$K --fastq $FQ --report-file $KFILE.report > $KFILE 2> $KFILE.log + [[ "$DAT" == "viral" ]] && SEQMAP=$DIR/dbs/refseq-viral-k$K/seqid2taxid.map || SEQMAP=$DIR/data/all-$DAT.map + [[ -s $KFILE.results.stats ]] || $DIR/install/grade_classification $DIR/dbs/refseq-viral-k$K/taxDB $SEQMAP $KFILE $KFILE.results > $KFILE.results.stats +} + +run_kraken_viral() { + FQ=$1 + NAM=$2 + K=$3 + DAT=$4 + + KFILE=$CDIR/$NAM.k$K.kraken + [[ -s $KFILE ]] || time kraken --threads 4 --db $DIR/dbs/refseq-viral-k$K --fastq $FQ > $KFILE 2> $KFILE.log + [[ "$DAT" == "viral" ]] && SEQMAP=$DIR/dbs/refseq-viral-k$K/seqid2taxid.map || SEQMAP=$DIR/data/all-$DAT.map + #[[ -s $KFILE.results.stats ]] || + $DIR/install/grade_classification $DIR/dbs/refseq-viral-k$K/taxDB $SEQMAP $KFILE $KFILE.results > $KFILE.results.stats +} + + + +AB=1m +for i in 1 2 3; do + for dat in viral viral-neighbors bacteria archaea; do + for len in 75 100 150; do + NAM=$dat.$AB${len}bp.$i + FQ=$SDIR/$NAM.fq + [[ -f $FQ ]] || randomreads.sh -Xmx40g ref=$DIR/data/all-$dat.fna out=$FQ reads=$AB len=$len seed=$i + for K in 21 26 31; do + run_krakenu_viral $FQ $NAM $K $dat + run_kraken_viral $FQ $NAM $K $dat + done + done + done +done From 3898b25c75d47a2f74fc28cef6c02eeca56d3c4b Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 03:01:56 -0400 Subject: [PATCH 058/105] Added script to dump taxDB to NCBI dump format --- src/Makefile | 2 +- src/dump_taxdb.cpp | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 src/dump_taxdb.cpp diff --git a/src/Makefile b/src/Makefile index f127108..0ed70b3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@ CXX = g++ FOPENMP?=-fopenmp CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O2 -g -Wfatal-errors -PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification +PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb LIBFLAGS = -L. 
-I./gzstream -L./gzstream -lz -lgzstream .PHONY: all install clean diff --git a/src/dump_taxdb.cpp b/src/dump_taxdb.cpp new file mode 100644 index 0000000..b2c73f0 --- /dev/null +++ b/src/dump_taxdb.cpp @@ -0,0 +1,34 @@ +#include "taxdb.h" +#include "quickfile.hpp" +#include <iostream> +#include <fstream> +#include <string> + +using namespace std; + +int main(int argc, char **argv) { + if (argc != 3) { + std::cerr << "Usage: build_taxdb taxDB names.dmp nodes.dmp\n"; + return 1; + } + TaxonomyDB taxdb {(string)argv[1]}; + ofstream names_file(argv[2]); + names_file.exceptions(ifstream::failbit | ifstream::badbit); + ofstream nodes_file(argv[3]); + nodes_file.exceptions(ifstream::failbit | ifstream::badbit); + + for (const auto &taxon : taxdb.taxIDsAndEntries) { + std::string scientificName; + nodes_file << taxon.second.taxonomyID + << "\t|\t" << taxon.second.parentTaxonomyID + << "\t|\t" << taxon.second.rank + << "\t|\n"; // there are further columns, but Kraken does not care about them + + names_file << taxon.second.taxonomyID + << "\t|\t" << taxon.second.scientificName + << "\t|\t" + << "\t|\t" << "scientific name" << "\t|\n"; + } + names_file.close(); + nodes_file.close(); +} From 94b4326d5878490eb7336fd03a1b7589eecb089f Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 03:03:27 -0400 Subject: [PATCH 059/105] Added dusting and 'standard' db to testing --- tests/build-dbs.sh | 16 +++++++--------- tests/init.sh | 21 ++++++++++++--------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index 8002fe9..082bac5 100755 --- a/tests/build-dbs.sh +++ b/tests/build-dbs.sh @@ -4,18 +4,16 @@ set -xeu [[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 -mkdir -p $DIR/dbs/refseq-viral-plus/library -[[ -L $DIR/dbs/refseq-viral-plus/library/viral ]] || ln -s $DIR/data/library/viral/ $DIR/dbs/refseq-viral-plus/library/ -[[ -L $DIR/dbs/refseq-viral-plus/library/viral-neighbors ]] || ln -s $DIR/data/library/viral-neighbors/ $DIR/dbs/refseq-viral-plus/library/ - export PATH="$DIR/install:$PATH" -for K in 21 26 31; do - mkdir -p $DIR/dbs/refseq-viral-k$K - krakenu-build --kmer-len $K --minimizer-len 12 --threads 4 --db $DIR/dbs/refseq-viral-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy +for K in 31 26 21; do + #mkdir -p $DIR/dbs/refseq-viral-k$K + #krakenu-build --kmer-len $K --minimizer-len 12 --threads 4 --db $DIR/dbs/refseq-viral-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy if [[ `uname` != "Darwin" ]]; then - krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --taxonomy-dir=$DIR/data/taxonomy - + #mkdir -p $DIR/dbs/refseq-bacteria-k$K + #krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --library-dir=$DIR/data/library/archaea --taxonomy-dir=$DIR/data/taxonomy + mkdir -p $DIR/dbs/refseq-oct2017-k$K + krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-oct2017-k$K --build --taxids-for-genomes --library-dir=$DIR/data/library/viral-dusted --library-dir=$DIR/data/library/viral-neighbors-dusted --library-dir=$DIR/data/library/bacteria-dusted --library-dir=$DIR/data/library/archaea-dusted --library-dir=$DIR/data/library/vertebrate_mammalian --taxonomy-dir=$DIR/data/taxonomy fi
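+ # The refseq-oct2017 'standard' DB above is built from the *-dusted libraries
+ # prepared in init.sh: dustmasker output with non-AGCT bases turned into N,
+ # so low-complexity regions contribute no (non-ambiguous) k-mers.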
done diff --git a/tests/init.sh b/tests/init.sh index 0c341fa..f4c73d3 100755 --- a/tests/init.sh +++ b/tests/init.sh @@ -1,19 +1,22 @@ #!/bin/bash -DIR=$1 -[[ "$DIR" == "" ]] && DIR=`pwd` +set -xeu + +[[ $# -eq 1 ]] && DIR=$1 || DIR=`pwd` ## Install KrakenU locally into install/ -$(dirname $0)/../install_kraken.sh --install-jellyfish $DIR/install +#$(dirname $0)/../install_kraken.sh --install-jellyfish $DIR/install ## Download taxonomy and genomic data into data/ #$DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any +#$DIR/install/krakenu-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606 for i in viral viral-neighbors archaea bacteria; do - if [[ ! -f "$DIR/data/all-$i.fna" ]]; then - find $DIR/data/library/$i -name '*.fna' -exec cat {} \; > $DIR/data/all-$i.fna - fi - if [[ ! -f "$DIR/data/all-$i.map" ]]; then - find $DIR/data/library/$i -name '*.map' -exec cat {} \; > $DIR/data/all-$i.map - fi + [[ -s "$DIR/data/all-$i.fna" ]] || find $DIR/data/library/$i -name '*.fna' -exec cat {} \; > $DIR/data/all-$i.fna + [[ -s "$DIR/data/all-$i.map" ]] || find $DIR/data/library/$i -name '*.map' -exec cat {} \; > $DIR/data/all-$i.map + DUSTED_F="$DIR/data/all-$i-dusted.fna" + [[ -s $DUSTED_F ]] || dustmasker -infmt fasta -in $DIR/data/all-$i.fna -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]/N/g' > "$DUSTED_F" + mkdir -p $DIR/data/library/$i-dusted + [[ -f "$DIR/data/library/$i-dusted/all-$i-dusted.fna" ]] || ln "$DUSTED_F" "$DIR/data/library/$i-dusted/all-$i-dusted.fna" + [[ -f "$DIR/data/library/$i-dusted/all-$i-dusted.fna.map" ]] || ln "$DIR/data/all-$i.map" "$DIR/data/library/$i-dusted/all-$i.map" done From 4130390f5db074de0fa03df9e6cfccfc4f3663f8 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 03:05:11 -0400 Subject: [PATCH 060/105] Make building work for OSX and Linux --- scripts/krakenu-build_db.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index fb79fac..00e708a 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -24,6 +24,7 @@ set -u # Protect against uninitialized vars. set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands +set -x function report_time_elapsed() { set -x @@ -90,8 +91,9 @@ if [ ! 
-s "library-files.txt" ]; then find $FIND_OPTS $LIBRARY_DIR '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' > library-files.txt fi -files0() { - cat library-files.txt | tr '\n' '\0' +file_sizes() { + ## stat -c is for Linux, stat -f is for BSD/OSX + cat library-files.txt | tr '\n' '\0' | xargs -0 -I '{}' sh -c "stat -c '%s\n' {} 2> /dev/null || stat -f '%z' {}" } cat_library() { cat library-files.txt | tr '\n' '\0' | xargs -0 cat @@ -117,7 +119,7 @@ else # Estimate hash size as 1.15 * chars in library FASTA files if [ -z "$KRAKEN_HASH_SIZE" ] then - KRAKEN_HASH_SIZE=$( files0 | xargs -0 stat -f%z | perl -nle '$sum += $_; END {print int(1.15 * $sum)}') + KRAKEN_HASH_SIZE=$( file_sizes | perl -nle '$sum += $_; END {print int(1.15 * $sum)}') echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'" fi From cfd04227198252437d6874c9c0854d1ae4b882a0 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 03:05:29 -0400 Subject: [PATCH 061/105] Fixes for building and downloading --- scripts/krakenu-download | 22 ++++++++++++---------- src/set_lcas.cpp | 15 ++++++++------- src/taxdb.h | 21 +++++++++++++-------- 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/scripts/krakenu-download b/scripts/krakenu-download index 7cf6fd3..4508f98 100755 --- a/scripts/krakenu-download +++ b/scripts/krakenu-download @@ -40,7 +40,6 @@ my $BASE_DIR; my $DB_DIR; my $N_PROC=5; my $CHANGE_HEADER=0; -my $DOWNLOAD_RNA=0; my $DO_DUST=0; my $FILTER_UNPLACED=0; my $VERBOSE=0; @@ -80,6 +79,7 @@ WHEN USING database refseq OR genbank: -c Only download genomes in the specified refseq category. Default: any. -t Only download the specified taxonomy IDs, comma separated. Default: any. --fna Comma-separated list of sequence types, including genomic, rna, rna_from_genomic, cds_from_genomic. Default: $FNA_FILES. + See the assembly project FTP site for available sequences -u Filter unplaced sequences. -m Mask low-complexity regions using dustmasker. -l Modify sequence header to include taxonomy ID for Kraken (i.e. add '>kraken:taxid|TAXID' to each sequence). @@ -191,12 +191,11 @@ if ($INCLUDE_VIRAL_NEIGHBORS) { sub download(@) { my ($url, $file, $gunzipped_filename) = @_; - if (-s $file && !$OVERWRITE_FILES) { + if (!$OVERWRITE_FILES && (( defined $gunzipped_filename && -s $gunzipped_filename) || (!defined $gunzipped_filename && -s $file))) { print STDERR "Not fetching $url - file $file exists.\n" if $VERBOSE; return 1; } - start_fork() and return; if ($url =~ /^http/) { print STDERR "Fetching $url to $file ..." 
if $VERBOSE; if (!-d dirname($file)) { @@ -206,7 +205,6 @@ sub download(@) { if (!$response->is_success) { print STDERR "\nFAIL: Error downloading $url!\n"; print STDERR $response->status_line."\n"; - exit; } else { print STDERR "SUCCESS\n" if $VERBOSE; } @@ -221,14 +219,13 @@ sub download(@) { move($where, $file); if (defined $gunzipped_filename) { - print STDERR " GUNZIPPING"; + print STDERR " GUNZIPPING" if $VERBOSE; gunzip $file => $gunzipped_filename or die "gunzip failed: $GunzipError"; unlink $file; $file = $gunzipped_filename; } print STDERR " SUCCESS\n" if $VERBOSE; } - exit; #my $where = $ff->fetch(to=> dirname($file)) or die "\n$ff->error for $url!"; return -s $file; } @@ -261,7 +258,7 @@ sub wait_children() { } sub end_fork() { - exit() unless $N_PROC == 1; + exit() unless $N_PROC <= 1; } sub download_viral_neighbors(@) { @@ -312,12 +309,14 @@ sub download_viral_neighbors(@) { $name1 =~ s/__/_/g; my $file = "$nbr_dir/$name1-tax$taxid/$nbr_ac.fna"; my $url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&retmode=text&id=$nbr_ac"; + start_fork() and next; if (download($url,$file)) { print_header_lines($file, $taxid, "$nname $sname neighbor $nbr_ac"); } + end_fork(); } print STDERR "\n"; - wait_children; + wait_children(); # $pm->wait_all_children(); } @@ -396,7 +395,7 @@ sub download_taxonomy(@) { sub download_domain(@) { my ($domain_dir, $domain, $_assembly_level, $_taxid) = @_; - print STDERR "Downloading assembly summary file for $domain genomes.\n"; + print STDERR "Downloading assembly summary file for $domain genomes, and filtering to assembly level $_assembly_level and taxid $_taxid.\n"; die unless defined $domain_dir && defined $domain; if (-d $domain_dir) { print STDERR "WARNING: $domain_dir already exists - potentially overwriting files.\n"; @@ -451,6 +450,7 @@ sub download_domain(@) { my $bname1 = "${organism_name1}-tax${taxid}-${bname}"; foreach my $ext (split(/,/, $FNA_FILES)) { + start_fork() and next; my $full_ftp_path = "$ftp_path/${bname}_${ext}.fna.gz"; my $bfname = $bname1."_".$ext; my $fname = $bfname.".fna"; @@ -477,9 +477,11 @@ sub download_domain(@) { system("dustmasker -infmt fasta -in '$domain_dir/$fname' -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]/N/g' > '$domain_dir/${bfname}_dustmasked.fna'"); unlink("$domain_dir/$fname"); } + end_fork(); } } -# $pm->wait_all_children; + wait_children(); + print STDERR "\n"; } diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 8db1033..7e69bab 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -173,7 +173,7 @@ int main(int argc, char **argv) { } inline -uint32_t get_taxid( +uint32_t get_new_taxid( unordered_map& name_to_taxid_map, unordered_map& Parent_map, string name, uint32_t parent_taxid, const string & rank_name) { @@ -182,8 +182,9 @@ uint32_t get_taxid( if (it == name_to_taxid_map.end()) { uint32_t new_taxid = ++New_taxid_start; bool insert_res = taxdb.insert(new_taxid, parent_taxid, rank_name, name); - if (!insert_res) - cerr << "Taxonomy ID " << new_taxid << " already in Taxonomy DB? Shouldn't happen - run set_lcas without the -a option." 
<< endl; + if (!insert_res) { + return 0; + } // insert_res shows if insert failed, but we don't care // cerr << "Adding assembly: " << name << " with taxid " << new_taxid << endl; Parent_map[new_taxid] = parent_taxid; @@ -205,7 +206,7 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi if (map_file.rdstate() & ifstream::failbit) { err(EX_NOINPUT, "can't open %s", ID_to_taxon_map_filename.c_str()); } - string line, seq_id; + string line, seq_id, name; uint32_t taxid; // Used when adding new taxids for assembly or sequence @@ -226,13 +227,13 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi if (Add_taxIds_for_Assembly && iss.good()) { iss.get(); - string name; getline(iss, name); - taxid = get_taxid(name_to_taxid_map, Parent_map, name, taxid, "assembly"); + if (!name.empty()) + taxid = get_new_taxid(name_to_taxid_map, Parent_map, name, taxid, "assembly"); } if (Add_taxIds_for_Sequences) { - taxid = get_taxid(name_to_taxid_map, Parent_map, seq_id, taxid, "sequence"); + taxid = get_new_taxid(name_to_taxid_map, Parent_map, seq_id, taxid, "sequence"); } if (Add_taxIds_for_Assembly || Add_taxIds_for_Sequences) { cout << seq_id << '\t' << taxid << '\n'; diff --git a/src/taxdb.h b/src/taxdb.h index 28313f7..eb8fe78 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -561,9 +561,8 @@ std::unordered_map > taxonomyID, newEntry }); } - taxIDsAndEntries.insert({ - 0, {0, 0, "no rank", "unclassified" } - }); + taxIDsAndEntries.insert({0, {0, 0, "no rank", "unclassified" }}); + //taxIDsAndEntries.insert({-1, {-1, 0, "no rank", "uncategorized" }}); createPointers(taxIDsAndEntries); log_msg("done reading TaxDB, read " + std::to_string(taxIDsAndEntries.size()) + " taxa"); return(taxIDsAndEntries); @@ -671,15 +670,18 @@ TAXID TaxonomyDB::getLowestCommonAncestor( template bool TaxonomyDB::insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_) { + if (parentTaxonomyID_ == taxonomyID_) { + return false; + } - TaxonomyEntry newEntry(taxonomyID_, parentTaxonomyID_, rank_, scientificName_, 0, 0); - auto parentIt = taxIDsAndEntries.find(parentTaxonomyID_); - if (parentIt == taxIDsAndEntries.end() || parentTaxonomyID_ == taxonomyID_) { - cerr << "ERROR while inserting taxonomy entry - taxonomy ID " << taxonomyID_ <<"; parent taxonomy ID " << parentTaxonomyID_ << "!" << endl; + if (parentIt == taxIDsAndEntries.end()) { + cerr << "ERROR with taxon [" << taxonomyID_ <<";"< newEntry(taxonomyID_, parentTaxonomyID_, rank_, scientificName_, 0, 0); + newEntry.parent = &(parentIt->second); auto insert_res = taxIDsAndEntries.insert({taxonomyID_, newEntry}); parentIt->second.children.push_back(&insert_res.first->second); @@ -933,7 +935,10 @@ void TaxReport::printReport(std::string format, std::string ra // B: print normal results printReport(_taxdb.taxIDsAndEntries.at(1),0u); // C: Print Unclassified stuff - //printReport(_taxdb.taxIDsAndEntries.at(-1),0u); + auto it = _taxdb.taxIDsAndEntries.find(-1); + if (it != _taxdb.taxIDsAndEntries.end()) { + printReport(it->second,0u); + } } else { // print stuff at a certain level .. 
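+ // (left unimplemented: a rank-filtered report would walk taxIDsAndEntries
+ // and print only entries at the requested rank, roughly what the
+ // commented-out fields below were meant for)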
//_uid_abundance; From a88535a01bb3ce8bb22ea26b8d2c0aa4bc11901b Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 13:12:00 -0400 Subject: [PATCH 062/105] Fixed contaminants download --- scripts/krakenu-download | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/krakenu-download b/scripts/krakenu-download index 4508f98..1d67438 100755 --- a/scripts/krakenu-download +++ b/scripts/krakenu-download @@ -344,9 +344,9 @@ sub download_contaminats(@) { # download UniVec and EmVec database download("ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec","$CONTAMINANT_DIR/UniVec.fna"); - download("ftp://ftp.ebi.ac.uk/pub/databases/emvec/emvec.dat.gz","$CONTAMINANT_DIR/emvec.dat.gz"); + download("ftp://ftp.ebi.ac.uk/pub/databases/emvec/emvec.dat.gz","$CONTAMINANT_DIR/emvec.dat.gz", "$CONTAMINANT_DIR/emvec.dat"); - open(my $E1, "|-", "gunzip -c emvec.dat.gz"); + open(my $E1, "<", "$CONTAMINANT_DIR/emvec.dat"); open(my $E2, ">", "$CONTAMINANT_DIR/EmVec.fna"); my ($ac,$de); @@ -360,12 +360,11 @@ sub download_contaminats(@) { } elsif (/^SQ/) { $in_seq = 1; print $E2 ">$ac $de\n"; - print "$ac\t$CONTAMINANT_TAXID\tEmVec\n"; } elsif ($in_seq) { if (/^\s+[agct]/) { s/\s+[0-9]+$//; s/ //g; - print $_; + print $E2 $_; } else { $in_seq = 0; } @@ -373,13 +372,14 @@ sub download_contaminats(@) { } close($E2); close($E1); - unlink("emvec.dat.gz"); + unlink("$CONTAMINANT_DIR/emvec.dat"); if ( $CHANGE_HEADER ) { system("sed -i 's/^>/>taxid|$CONTAMINANT_TAXID /' $CONTAMINANT_DIR/UniVec.fna"); system("sed -i 's/^>/>taxid|$CONTAMINANT_TAXID /' $CONTAMINANT_DIR/EmVec.fna"); } else { print_header_lines("$CONTAMINANT_DIR/UniVec.fna", $CONTAMINANT_TAXID, "UniVec"); + print_header_lines("$CONTAMINANT_DIR/EmVec.fna", $CONTAMINANT_TAXID, "EmVec"); } } From 0145ef457994e1f2540bfb90f4da02c955c905de Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 14:04:54 -0400 Subject: [PATCH 063/105] Fix for Linux/OSX building --- scripts/krakenu-build_db.sh | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index 00e708a..a39432f 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -92,8 +92,11 @@ if [ ! -s "library-files.txt" ]; then fi file_sizes() { - ## stat -c is for Linux, stat -f is for BSD/OSX - cat library-files.txt | tr '\n' '\0' | xargs -0 -I '{}' sh -c "stat -c '%s\n' {} 2> /dev/null || stat -f '%z' {}" + if [[ `uname` == "Darwin" ]]; then + cat library-files.txt | tr '\n' '\0' | xargs -0 stat -f '%z' + else + cat library-files.txt | tr '\n' '\0' | xargs -0 stat -c '%s\n' + fi } cat_library() { cat library-files.txt | tr '\n' '\0' | xargs -0 cat @@ -234,7 +237,7 @@ else fi if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then - if [ -e "database.kdb" ] + if [ -s "database.kdb" ] then echo "Skipping step 6, LCAs already set." else @@ -262,15 +265,16 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then echo "LCA database created. [$(report_time_elapsed $start_time1)]" fi ## Make a classification report - if [[ ! -s $(basename `pwd`).report ]]; then - echo "Creating database summary report ..." - krakenu --db . --report-file $(basename `pwd`).report --threads $KRAKEN_THREAD_CT --fasta-input <( cat_library ) > $(basename `pwd`).kraken + REPNAME=database + if [[ ! -s $REPNAME.report.tsv ]]; then + echo "Creating database summary report $REPNAME.report.tsv ..." + krakenu --db . 
--report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --fasta-input <( cat_library ) > $REPNAME.kraken.tsv fi fi if [ "$KRAKEN_UID_DATABASE" != "0" ]; then - if [ -e "uid_database.complete" ] + if [ -s "uid_database.kdb" ] then echo "Skipping step 6.3, UID database already generated." else @@ -289,15 +293,15 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then start_time1=$(date "+%s.%N") set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -I uid_to_taxid.map -o uid_database.kdb -i database.idx -v \ -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c uid_database.kmer_count -F <( cat_library ) - touch "uid_database.complete" echo "UID Database created. [$(report_time_elapsed $start_time1)]" fi ## Make a classification report - if [[ ! -s $(basename `pwd`).uid_report ]]; then - echo "Creating database summary report ..." - krakenu --db . --report-file $(basename `pwd`).uid_report --threads $KRAKEN_THREAD_CT --fasta-input <(cat_library) > $(basename `pwd`).uid_kraken + REPNAME=uid_database + if [[ ! -s $REPNAME.report.tsv ]]; then + echo "Creating UID database summary report $REPNAME.report.tsv ..." + krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --fasta-input <(cat_library) > $REPNAME.kraken.tsv fi fi From bb1e65da3968fbc583661e32aba405724fe4a910 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 14:06:01 -0400 Subject: [PATCH 064/105] Minor speed improvements in classify --- src/classify.cpp | 113 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 27 deletions(-) diff --git a/src/classify.cpp b/src/classify.cpp index b5e196f..703b5d8 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -40,6 +40,10 @@ void process_file(char *filename); bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, unordered_map&); +inline void print_sequence(ostringstream* oss_ptr, const DNASequence& dna); +string hitlist_string(const vector<uint32_t> &taxa); + + set<uint32_t> get_ancestry(uint32_t taxon); void report_stats(struct timeval time1, struct timeval time2); unordered_map taxon_counts; // stats per taxon @@ -350,6 +354,20 @@ uint32_t get_taxon_for_kmer(KrakenDB& database, uint64_t* kmer_ptr, uint64_t& cu return taxon; } + +inline void print_sequence(ostringstream* oss_ptr, const DNASequence& dna) { + if (Fastq_input) { + (*oss_ptr) << "@" << dna.header_line << endl + << dna.seq << endl + << "+" << endl + << dna.quals << endl; + } + else { + (*oss_ptr) << ">" << dna.header_line << endl + << dna.seq << endl; + } +} + inline void append_hitlist_string(string& hitlist_string, uint32_t& last_taxon, uint32_t& last_counter, uint32_t current_taxon) { if (last_taxon == current_taxon) { @@ -367,10 +385,47 @@ void append_hitlist_string(string& hitlist_string, uint32_t& last_taxon, uint32_ } } +string hitlist_string(const vector<uint32_t> &taxa) +{ + uint32_t last_code = taxa[0]; + int code_count = 1; + ostringstream hitlist; + + for (size_t i = 1; i < taxa.size(); i++) { + uint32_t code = taxa[i]; + + if (code == last_code) { + code_count++; + } + else { + if (last_code >= 0) { + hitlist << last_code << ":" << code_count << " "; + } + else { + hitlist << "A:" << code_count << " "; + } + code_count = 1; + last_code = code; + } + } + if (last_code == -1) { + hitlist << "A:" << code_count; + } + else { + hitlist << last_code << ":" << code_count; + } + return hitlist.str(); +} + + bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, unordered_map& my_taxon_counts) {
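+ // Approach in this revision: buffer one taxon per k-mer in `taxa` (ambiguous
+ // k-mers are pushed as -1) and render the hit list once at the end via
+ // hitlist_string(), instead of appending to a string after every k-mer lookup.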
- // TODO: use vector::reserve + size_t n_kmers = dna.seq.size()-KrakenDatabases[0]->get_k()+1; + vector taxa; + taxa.reserve(n_kmers); + //vector ambig_list; + //ambig_list.reserve(n_kmers); unordered_map hit_counts; uint64_t *kmer_ptr; uint32_t taxon = 0; @@ -383,7 +438,7 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, int64_t current_max_pos = 0; }; - string hitlist_string; + //string hitlist_string; uint32_t last_taxon; uint32_t last_counter; @@ -394,9 +449,12 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, while ((kmer_ptr = scanner.next_kmer()) != NULL) { taxon = 0; if (scanner.ambig_kmer()) { - append_hitlist_string(hitlist_string, last_taxon, last_counter, ambig_taxon); + //append_hitlist_string(hitlist_string, last_taxon, last_counter, ambig_taxon); + //ambig_list.push_back(1); + taxa.push_back(-1); } else { + //ambig_list.push_back(0); // go through multiple databases to map k-mer for (size_t i=0; i= Minimum_hit_count) break; } + taxa.push_back(taxon); } - append_hitlist_string(hitlist_string, last_taxon, last_counter, taxon); + //append_hitlist_string(hitlist_string, last_taxon, last_counter, taxon); } } @@ -434,24 +497,16 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, call = resolve_tree(hit_counts, Parent_map); } + + #pragma omp atomic ++(my_taxon_counts[call].n_reads); - if (Print_unclassified || Print_classified) { - ostringstream *oss_ptr = call ? &coss : &uoss; - bool print = call ? Print_classified : Print_unclassified; - if (print) { - if (Fastq_input) { - (*oss_ptr) << "@" << dna.header_line << endl - << dna.seq << endl - << "+" << endl - << dna.quals << endl; - } - else { - (*oss_ptr) << ">" << dna.header_line << endl - << dna.seq << endl; - } - } - } + if (Print_unclassified && !call) + print_sequence(&uoss, dna); + + if (Print_classified && call) + print_sequence(&coss, dna); + if (! Print_kraken) return call; @@ -464,19 +519,23 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, return false; koss << "U\t"; } - koss << dna.id << "\t" << call << "\t" << dna.seq.size() << "\t"; + koss << dna.id << '\t' << call << '\t' << dna.seq.size() << '\t'; if (Quick_mode) { koss << "Q:" << hits; } else { - if (hitlist_string.empty() && last_counter == 0) + if (taxa.empty()) koss << "0:0"; - else { - koss << hitlist_string - << (last_taxon == ambig_taxon? "A" : std::to_string(last_taxon)) - << ':' << std::to_string(last_counter); - } + else + koss << hitlist_string(taxa); + //if (hitlist_string.empty() && last_counter == 0) + // koss << "0:0"; + //else { + // koss << hitlist_string + // << (last_taxon == ambig_taxon? 
"A" : std::to_string(last_taxon)) + // << ':' << std::to_string(last_counter); + //} } if (Print_sequence) From 7bbd9862eeb7567da6f89a35074e3bcf381023d7 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 14:06:24 -0400 Subject: [PATCH 065/105] Correct number of arguments --- src/dump_taxdb.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dump_taxdb.cpp b/src/dump_taxdb.cpp index b2c73f0..79e668f 100644 --- a/src/dump_taxdb.cpp +++ b/src/dump_taxdb.cpp @@ -7,8 +7,8 @@ using namespace std; int main(int argc, char **argv) { - if (argc != 3) { - std::cerr << "Usage: build_taxdb taxDB names.dmp nodes.dmp\n"; + if (argc != 4) { + std::cerr << "Usage: dump_taxdb taxDB names.dmp nodes.dmp\n"; return 1; } TaxonomyDB taxdb {(string)argv[1]}; From 10213696ebaea78e229325ba5e61ccacfbc51c8c Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 14:06:53 -0400 Subject: [PATCH 066/105] Ignore taxa not in taxonomy DB --- src/grade_classification.cpp | 5 +++++ src/taxdb.h | 17 +++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp index f787065..edfc999 100644 --- a/src/grade_classification.cpp +++ b/src/grade_classification.cpp @@ -94,6 +94,11 @@ int main(int argc, char **argv) { exit(1); } else { seq_taxid = it->second; + if (!taxdb.hasTaxon(seq_taxid)) { + cerr << "Ignoring taxon " << seq_taxid << " - not in database" << endl; + continue; + } + //cerr <<"seqid" << seq_taxid; // go up to species level or next proper (i.e. not 'no rank') rank for // both real and assigned taxon diff --git a/src/taxdb.h b/src/taxdb.h index eb8fe78..df45568 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -214,6 +214,7 @@ class TaxonomyDB { TaxonomyEntry getEntry(TAXID taxID) const; bool insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_); + bool hasTaxon(TAXID taxonomyID_); size_t distance(TAXID taxID1, TAXID taxID2) const; @@ -667,6 +668,12 @@ TAXID TaxonomyDB::getLowestCommonAncestor( return consensus; } + +template +bool TaxonomyDB::hasTaxon(TAXID taxonomyID_) { + return taxIDsAndEntries.find(taxonomyID_) != taxIDsAndEntries.end(); +} + template bool TaxonomyDB::insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_) { @@ -684,8 +691,9 @@ bool TaxonomyDB::insert(TAXID taxonomyID_, TAXID parentTaxono newEntry.parent = &(parentIt->second); auto insert_res = taxIDsAndEntries.insert({taxonomyID_, newEntry}); - parentIt->second.children.push_back(&insert_res.first->second); - + if (insert_res.second) { + parentIt->second.children.push_back(&insert_res.first->second); + } return insert_res.second; } @@ -784,8 +792,9 @@ TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, return 0; auto entry = taxIDsAndEntries.find(taxID); // cerr << "getTaxIDAtRank(" << taxID << "," << rank << ")" << endl; - while (entry != taxIDsAndEntries.end() && - entry->second.parentTaxonomyID != 1) { + while (entry != taxIDsAndEntries.end() + && entry->second.parentTaxonomyID != 1 + && entry->second.parentTaxonomyID != entry->first) { // cerr << "Checking rank of " << entry->second.taxonomyID << ": " << entry->second.rank << endl; if (entry->second.rank == rank) { return entry->second.taxonomyID; From c6b2d04ca59e95e08bc8f39221584cfdca16dfbc Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 14:08:08 -0400 Subject: [PATCH 067/105] Add comments --- src/set_lcas.cpp | 7 +++++++ 1 file changed, 7 
insertions(+) diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 7e69bab..7a20dc5 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -274,6 +274,7 @@ void process_single_file() { if (it != ID_to_taxon_map.end()) { taxid = it->second; } else if (dna.id.size() >= prefix.size() && dna.id.substr(0,prefix.size()) == prefix) { + // if the AC is not in the map, check if the fasta entry starts with '>kraken:taxid' taxid = std::stol(dna.id.substr(prefix.size())); if (taxid == 0) { cerr << "Error: taxonomy ID is zero for sequence '" << dna.id << "'?!" << endl; @@ -288,6 +289,7 @@ } if (Add_taxIds_for_Sequences) { + // Update entry based on header line auto entryIt = taxdb.taxIDsAndEntries.find(taxid); if (entryIt == taxdb.taxIDsAndEntries.end()) { cerr << "Error! Didn't find taxid " << taxid << " in TaxonomyDB - can't update it!! ["< 0) { + // exclude taxid! + //} + if (taxid) { #pragma omp parallel for schedule(dynamic) for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN) From 131daed7c77e6860d963bc0674609734a0a457eb Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 14:11:49 -0400 Subject: [PATCH 068/105] Update to read sim --- tests/classify-reads.sh | 16 ---------------- ...ulate-reads.sh => test-on-simulated-reads.sh} | 0 2 files changed, 16 deletions(-) delete mode 100755 tests/classify-reads.sh rename tests/{simulate-reads.sh => test-on-simulated-reads.sh} (100%) diff --git a/tests/classify-reads.sh b/tests/classify-reads.sh deleted file mode 100755 index 802b29b..0000000 --- a/tests/classify-reads.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -set -xeu - -[[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 -SDIR=$DIR/simulated_reads -CDIR=$DIR/classification-results -mkdir -p $CDIR - -NAM=viral-neighbors-10m -for K in 21 26 31; do - KFILE=$CDIR/$NAM.k$K.krakenu - [[ -s $KFILE ]] || time $DIR/install/krakenu --threads 4 --db $DIR/dbs/refseq-viral-k$K --fastq ~/kraken-hll-test/simulated_reads/$NAM.fq --report-file $KFILE.report > $KFILE 2> $KFILE.log - [[ -s $KFILE.results ]] || $DIR/install/grade_classification $DIR/dbs/refseq-viral-k$K/taxDB $DIR/data/all-viral-neighbors.map $KFILE > $KFILE.results - [[ -s $KFILE.results.stats ]] || cut -f 4 $KFILE.results | sort | uniq -c | sort -n > $KFILE.results.stats - -done diff --git a/tests/simulate-reads.sh b/tests/test-on-simulated-reads.sh similarity index 100% rename from tests/simulate-reads.sh rename to tests/test-on-simulated-reads.sh From a88535a01bb3ce8bb22ea26b8d2c0aa4bc11901b Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 22:29:28 -0400 Subject: [PATCH 070/105] Various improvements and fixes for building and classification --- scripts/krakenu | 7 +- scripts/krakenu-build_db.sh | 8 +++++--- src/Makefile | 16 +++++-- src/classify.cpp | 117 +++++++++++++++++++++++++++++++++++----------- src/taxdb.h | 32 ++++++++- src/uid_mapping.cpp | 62 ++++++++++------ src/uid_mapping.hpp | 7 +- tests/build-dbs.sh | 11 +-- tests/test-on-simulated-reads.sh | 58 ++++++++------- 9 files changed, 214 insertions(+), 104 deletions(-) diff --git a/scripts/krakenu b/scripts/krakenu index 243bcda..006a078 100755 --- a/scripts/krakenu +++ b/scripts/krakenu @@ -97,7 +97,9 @@ if ($@) { die "$PROG: $@"; } -my @kdb_files = map { "$_/database.kdb" } @db_prefix; +my $database = $uid_mapping? "uid_database.kdb" : "database.kdb"; +my @kdb_files = map { "$_/$database" } @db_prefix; + my @idx_files = map { "$_/database.idx" } @db_prefix; foreach my $file (@kdb_files,@idx_files) { @@ -148,7 +150,7 @@ push @flags, "-r", $report_file if defined $report_file; push @flags, "-a", $db_prefix[0]."/taxDB"; push @flags, "-s" if $print_sequence; if ($uid_mapping) { - my $uid_mapping_file = "$db_prefix[0]/uid_to_taxid"; + my $uid_mapping_file = "$db_prefix[0]/uid_to_taxid.map"; if (!-f $uid_mapping_file) { print STDERR "Missing required file $uid_mapping_file for UID mapping.\n"; exit(1); } @@ -220,6 +222,7 @@ Usage: $PROG [options] Options: --db NAME Name for Kraken DB (default: $default_db) --report-file FILENAME Write Kraken report to FILENAME + --uid-mapping Map using UID database --threads NUM Number of threads (default: $def_thread_ct) --fasta-input Input is FASTA format --fastq-input Input is FASTQ format diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index a39432f..96f1aa8 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -254,7 +254,7 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then start_time1=$(date "+%s.%N") set -x set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -o database.kdb -i database.idx -v \ - -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c database.kmer_count \ + -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c database.kdb.counts \ -F <( cat_library ) > seqid2taxid-plus.map set +x if [ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ] || [ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]; then @@ -292,7 +292,7 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then fi start_time1=$(date "+%s.%N") set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -I uid_to_taxid.map -o uid_database.kdb -i database.idx -v \ - -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c uid_database.kmer_count -F <( cat_library ) + -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c uid_database.kdb.counts -F <( cat_library ) echo "UID Database created. [$(report_time_elapsed $start_time1)]" fi @@ -300,8 +300,8 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then ## Make a classification report REPNAME=uid_database if [[ ! -s $REPNAME.report.tsv ]]; then - echo "Creating UID database summary report $REPNAME.report.tsv ..." - krakenu --db .
--report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --fasta-input <(cat_library) > $REPNAME.kraken.tsv + #echo "Creating UID database summary report $REPNAME.report.tsv ..." + #krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv fi fi diff --git a/src/Makefile b/src/Makefile index 0ed70b3..38e8e21 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,8 @@ CXX = g++ FOPENMP?=-fopenmp CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O2 -g -Wfatal-errors -PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb +#CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors +PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream .PHONY: all install clean @@ -18,17 +19,21 @@ db_shrink: krakendb.o quickfile.o db_sort: krakendb.o quickfile.o -set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.cpp +set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o grade_classification: taxdb.h -classify: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.cpp - $(CXX) $(CXXFLAGS) -o classify classify.cpp $^ $(LIBFLAGS) +read_uid_mapping: quickfile.o + +classify: classify.cpp krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o hyperloglogplus.h + $(CXX) $(CXXFLAGS) -o classify $^ $(LIBFLAGS) build_taxdb: taxdb.h make_seqid_to_taxid_map: quickfile.o +read_uid_mapping: quickfile.o krakenutil.o uid_mapping.o + krakenutil.o: krakenutil.cpp krakenutil.hpp taxdb.h $(CXX) $(CXXFLAGS) -c krakenutil.cpp @@ -40,3 +45,6 @@ seqreader.o: seqreader.cpp seqreader.hpp quickfile.hpp quickfile.o: quickfile.cpp quickfile.hpp $(CXX) $(CXXFLAGS) -c quickfile.cpp + +uid_mapping.o: krakenutil.hpp uid_mapping.hpp uid_mapping.cpp + $(CXX) $(CXXFLAGS) -c uid_mapping.cpp diff --git a/src/classify.cpp b/src/classify.cpp index 703b5d8..a6076d3 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -41,7 +41,7 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, unordered_map&); inline void print_sequence(ostringstream* oss_ptr, const DNASequence& dna); -string hitlist_string(const vector &taxa); +string hitlist_string(const vector &taxa, const vector& ambig_list); set get_ancestry(uint32_t taxon); @@ -211,7 +211,7 @@ int main(int argc, char **argv) { Report_output = cout_or_file(Report_output_file); } - cerr << "Print_kraken: " << Print_kraken << "; Print_kraken_report: " << Print_kraken_report << "; k: " << uint32_t(KrakenDatabases[0]->get_k()) << endl; + //cerr << "Print_kraken: " << Print_kraken << "; Print_kraken_report: " << Print_kraken_report << "; k: " << uint32_t(KrakenDatabases[0]->get_k()) << endl; struct timeval tv1, tv2; gettimeofday(&tv1, NULL); @@ -222,21 +222,26 @@ int main(int argc, char **argv) { std::cerr << "Finishing up ..\n"; if (Print_kraken_report) { + for (auto fname : DB_filenames) { + ifstream ifs(fname + ".counts"); + if (ifs.good()) { + ifs.close(); + taxdb.readGenomeSizes(fname+".counts"); + } + } + taxdb.setReadCounts(taxon_counts); TaxReport rep = TaxReport(*Report_output, taxdb, false); rep.setReportCols({ - "percReadsClade", - "numReadsClade", - "numReadsTaxon", - "numUniqueKmersClade", - "numUniqueKmersTaxon", - "numKmersClade", - "numKmersTaxon", - "numKmersInDatabaseClade", - "numKmersInDatabaseTaxon", + "%", + "reads", + 
"taxReads", + "kmers", + "dup", + "cov", "taxID", - "taxRank", - "indentedName"}); + "rank", + "taxName"}); rep.printReport("kraken","blu"); } @@ -336,8 +341,9 @@ void process_file(char *filename) { total_sequences += work_unit.size(); total_bases += total_nt; //if (Print_Progress && total_sequences % 100000 < work_unit.size()) - if (Print_Progress && total_sequences % 100000 < work_unit.size()) + if (Print_Progress) { cerr << "\rProcessed " << total_sequences << " sequences (" << total_classified << " classified) ..."; + } } } } // end parallel section @@ -345,13 +351,13 @@ void process_file(char *filename) { delete reader; } +inline uint32_t get_taxon_for_kmer(KrakenDB& database, uint64_t* kmer_ptr, uint64_t& current_bin_key, int64_t& current_min_pos, int64_t& current_max_pos) { uint32_t* val_ptr = database.kmer_query( database.canonical_representation(*kmer_ptr), ¤t_bin_key, ¤t_min_pos, ¤t_max_pos); - uint32_t taxon = val_ptr ? *val_ptr : 0; - return taxon; + return val_ptr ? *val_ptr : 0; } @@ -385,7 +391,45 @@ void append_hitlist_string(string& hitlist_string, uint32_t& last_taxon, uint32_ } } -string hitlist_string(const vector &taxa) +string hitlist_string(const vector &taxa, const vector &ambig) +{ + int64_t last_code; + int code_count = 1; + ostringstream hitlist; + + if (ambig[0]) { last_code = -1; } + else { last_code = taxa[0]; } + + for (size_t i = 1; i < taxa.size(); i++) { + int64_t code; + if (ambig[i]) { code = -1; } + else { code = taxa[i]; } + + if (code == last_code) { + code_count++; + } + else { + if (last_code >= 0) { + hitlist << last_code << ":" << code_count << " "; + } + else { + hitlist << "A:" << code_count << " "; + } + code_count = 1; + last_code = code; + } + } + if (last_code >= 0) { + hitlist << last_code << ":" << code_count; + } + else { + hitlist << "A:" << code_count; + } + return hitlist.str(); +} + + +string hitlist_string_depr(const vector &taxa) { uint32_t last_code = taxa[0]; int code_count = 1; @@ -421,11 +465,8 @@ string hitlist_string(const vector &taxa) bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, unordered_map& my_taxon_counts) { - size_t n_kmers = dna.seq.size()-KrakenDatabases[0]->get_k()+1; vector taxa; - taxa.reserve(n_kmers); - //vector ambig_list; - //ambig_list.reserve(n_kmers); + vector ambig_list; unordered_map hit_counts; uint64_t *kmer_ptr; uint32_t taxon = 0; @@ -445,39 +486,40 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, vector db_statuses(KrakenDatabases.size()); if (dna.seq.size() >= KrakenDatabases[0]->get_k()) { + size_t n_kmers = dna.seq.size()-KrakenDatabases[0]->get_k()+1; + taxa.reserve(n_kmers); + ambig_list.reserve(n_kmers); KmerScanner scanner(dna.seq); while ((kmer_ptr = scanner.next_kmer()) != NULL) { taxon = 0; if (scanner.ambig_kmer()) { //append_hitlist_string(hitlist_string, last_taxon, last_counter, ambig_taxon); - //ambig_list.push_back(1); - taxa.push_back(-1); + ambig_list.push_back(1); } else { - //ambig_list.push_back(0); - + ambig_list.push_back(0); // go through multiple databases to map k-mer for (size_t i=0; ikmer_query( + // KrakenDatabases[i]->canonical_representation(*kmer_ptr), &db_statuses[i].current_bin_key, + // &db_statuses[i].current_min_pos, &db_statuses[i].current_max_pos); + //taxon = val_ptr ? 
*val_ptr : 0; if (taxon) break; } - //cerr << "taxon for " << *kmer_ptr << " is " << taxon << endl; - + // cerr << "taxon for " << *kmer_ptr << " is " << taxon << endl; my_taxon_counts[taxon].add_kmer(*kmer_ptr); if (taxon) { - if (taxon == -1) { - cerr << "ERROR: Invalid taxon (-1)" << endl; - exit(1); - } hit_counts[taxon]++; if (Quick_mode && ++hits >= Minimum_hit_count) break; } - taxa.push_back(taxon); } + taxa.push_back(taxon); //append_hitlist_string(hitlist_string, last_taxon, last_counter, taxon); } } @@ -488,7 +530,8 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, cerr << "Quick mode not available when mapping UIDs" << endl; exit(1); } else { - call = resolve_uids2(hit_counts, Parent_map, (const uint32_t *)UID_to_TaxID_map_file.ptr(), UID_to_TaxID_map_file.size()); + call = resolve_uids2(hit_counts, Parent_map, + UID_to_TaxID_map_file.ptr(), UID_to_TaxID_map_file.size()); } } else { if (Quick_mode) @@ -497,8 +540,6 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, call = resolve_tree(hit_counts, Parent_map); } - - #pragma omp atomic ++(my_taxon_counts[call].n_reads); if (Print_unclassified && !call) @@ -528,7 +569,7 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, if (taxa.empty()) koss << "0:0"; else - koss << hitlist_string(taxa); + koss << hitlist_string(taxa, ambig_list); //if (hitlist_string.empty() && last_counter == 0) // koss << "0:0"; //else { diff --git a/src/taxdb.h b/src/taxdb.h index df45568..f49d9dc 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -29,6 +29,7 @@ #include #include #include +#include #include "report-cols.h" using namespace std; @@ -223,6 +224,7 @@ class TaxonomyDB { void setGenomeSizes(const std::unordered_map & genomeSizes); void setReadCounts(const std::unordered_map& readCounts); + void readGenomeSizes(string file); void setGenomeSize(const TAXID taxid, const uint64_t genomeSize); void addReadCount(const TAXID taxid, const READCOUNTS& readCounts_); @@ -876,6 +878,23 @@ void TaxonomyDB::setGenomeSize(const TAXID taxid, const uint64 } +template +void TaxonomyDB::readGenomeSizes(string file) { + for (auto& entry : taxIDsAndEntries) { + entry.second.genomeSize = 0; + entry.second.genomeSizeOfChildren = 0; + } + log_msg("Reading genome sizes from " + file); + std::ifstream inFile(file); + if (!inFile.is_open()) + throw std::runtime_error("unable to open file " + file); + TAXID taxonomyID; + uint64_t size; + while (!inFile.eof()) { + inFile >> taxonomyID >> size; + setGenomeSize(taxonomyID, size); + } +} template void TaxonomyDB::setReadCounts(const unordered_map& readCounts) { @@ -967,22 +986,29 @@ void TaxReport::printReport(TaxonomyEntry& t template void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { + + long long unique_kmers_for_clade = ( tax.readCounts.kmers.cardinality() + tax.readCountsOfChildren.kmers.cardinality()); + double genome_size = double(tax.genomeSize+tax.genomeSizeOfChildren); + for (auto& col : _report_cols) { switch (col) { case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.readCounts) + reads(tax.readCountsOfChildren))/_total_n_reads; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << setprecision(4) << 100.0*(reads(tax.readCounts) + reads(tax.readCountsOfChildren))/_total_n_reads; break; //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.readCounts); break; case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.readCounts) + reads(tax.readCountsOfChildren)); break; case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.readCounts.kmers.cardinality(); break; - case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << (tax.readCounts.kmers.cardinality() + tax.readCountsOfChildren.kmers.cardinality()); break; + case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << unique_kmers_for_clade; break; case REPORTCOLS::NUM_KMERS: _reportOfb << tax.readCounts.n_kmers; break; case REPORTCOLS::NUM_KMERS_CLADE: _reportOfb << tax.readCounts.n_kmers + tax.readCountsOfChildren.n_kmers; break; - case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break; + case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break; + case REPORTCOLS::CLADE_KMER_COVERAGE: if (genome_size == 0) { _reportOfb << "NA"; } else { + _reportOfb << setprecision(4) << (unique_kmers_for_clade / genome_size); }; break; + case REPORTCOLS::CLADE_KMER_DUPLICITY: _reportOfb << setprecision(3) << ( double(tax.readCounts.n_kmers + tax.readCountsOfChildren.n_kmers) / unique_kmers_for_clade ); break; case REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break; //case REPORTCOLS::GENOME_SIZE: ; break; //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; diff --git a/src/uid_mapping.cpp b/src/uid_mapping.cpp index 966a685..4a80946 100644 --- a/src/uid_mapping.cpp +++ b/src/uid_mapping.cpp @@ -122,7 +122,7 @@ namespace kraken { uint32_t resolve_uids2( const unordered_map &uid_hit_counts, const unordered_map &parent_map, - const uint32_t* fptr, const size_t fsize) { + const char* fptr, const size_t fsize) { unordered_map taxid_counts; unordered_map frac_taxid_counts; @@ -131,33 +131,17 @@ namespace kraken { return(0); } - for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { - uint32_t next_uid = it->first; - if (next_uid == 0) { + for (const auto& it : uid_hit_counts) { + if (it.first == 0) { continue; } - uint32_t taxid; // TODO: Just get a uint64_t and shift the bits, probably faster - vector taxids; - do { - // Check if the accessed memory is out of range - // -- move this to a DEBUG-only assert - // UID-1 is used because UIDs start at 1 - uint32_t offset = (next_uid-1)*UID_BLOCK_SIZE; - if (offset >= fsize) { - cerr << "It seems you are trying to access a block after the file end: \n" << - " fptr: " << fptr << "; uid: " << next_uid << "; " << " addr: " << (offset + INT_SIZE) << endl; - exit(1); - } - taxid = *(fptr + offset); - next_uid = *(fptr+ offset + INT_SIZE); - taxid_counts[taxid] += it->second; - taxids.push_back(taxid); - } while (next_uid != 0); + vector taxids = get_taxids_for_uid(it.first, fptr); - double frac_count = (double)it->second / (double)taxids.size(); + double frac_count = (double)it.second / (double)taxids.size(); for (uint32_t taxid : taxids) { frac_taxid_counts[taxid] += frac_count; + 
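      // Illustration of the two tallies kept here (hypothetical numbers): a
      // UID hit 6 times whose chain expands to the taxid set {562, 561, 543}
      // adds 6 to taxid_counts[t] for each of the three taxids, but only
      // 6/3 = 2.0 to frac_taxid_counts[t], so k-mers shared across many taxa
      // weigh less in the fractional tally than taxon-specific hits.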
taxid_counts[taxid] += it.second;
       }
     }
@@ -194,3 +178,37 @@ namespace kraken {
   }
 }
+
+vector get_taxids_for_uid(const uint32_t uid, const char* fptr) {
+  size_t int_size = sizeof(int);
+  size_t block_size = sizeof(int)*2;
+  // TODO: Just get a uint64_t and shift the bits, probably faster
+  uint32_t taxid = *(uint32_t*)(fptr+(uid-1)*block_size);
+  uint32_t parent_uid = *(uint32_t*)(fptr+(uid-1)*block_size + int_size);
+
+  vector taxids = {taxid};
+  while (parent_uid != 0) {
+    // TODO: Consider checking if the accessed memory is out of range.
+    // if (offset >= fsize) {
+    //   cerr << "It seems you are trying to access a block after the file end: \n" <<
+    //     " fptr: " << fptr << "; uid: " << next_uid << "; " << " addr: " << (offset + INT_SIZE) << endl;
+    //   exit(1);
+    //}
+    taxid = *(uint32_t*)(fptr+(parent_uid-1)*block_size);
+    parent_uid = *(uint32_t*)(fptr+(parent_uid-1)*block_size + int_size);
+    taxids.push_back(taxid);
+  }
+  //std::sort(taxids.begin(), taxids.end());
+  return(taxids);
+}
+
+vector get_taxids_for_uid_from_map(uint32_t uid, char* fptr, unordered_map >& uid_map ) {
+  auto it = uid_map.find(uid);
+  if (it != uid_map.end()) {
+    return it->second;
+  }
+  vector taxids = get_taxids_for_uid(uid, fptr);
+  uid_map[uid] = taxids;
+  return(taxids);
+}
+
diff --git a/src/uid_mapping.hpp b/src/uid_mapping.hpp
index 7c7d0fa..1f84c40 100644
--- a/src/uid_mapping.hpp
+++ b/src/uid_mapping.hpp
@@ -40,6 +40,11 @@ uint32_t resolve_uids(
 uint32_t resolve_uids2(
     const unordered_map &uid_hit_counts,
     const unordered_map &parent_map,
-    const uint32_t* fptr, const size_t fsize);
+    const char* fptr, const size_t fsize);
 }
+
+vector get_taxids_for_uid(const uint32_t uid, const char* fptr);
+
+vector get_taxids_for_uid_from_map(uint32_t uid, char* fptr, unordered_map >& uid_map );
+
 #endif
diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh
index 082bac5..9922df7 100755
--- a/tests/build-dbs.sh
+++ b/tests/build-dbs.sh
@@ -6,12 +6,15 @@ set -xeu
 export PATH="$DIR/install:$PATH"
 for K in 31 26 21; do
-  #mkdir -p $DIR/dbs/refseq-viral-k$K
-  #krakenu-build --kmer-len $K --minimizer-len 12 --threads 4 --db $DIR/dbs/refseq-viral-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy
+  mkdir -p $DIR/dbs/refseq-viral-k$K
+  time krakenu-build --kmer-len $K --minimizer-len 12 --threads 4 --db $DIR/dbs/refseq-viral-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy 2>&1 | tee $DIR/dbs/refseq-viral-k$K/build.log
+
+  mkdir -p $DIR/dbs/refseq-viral-k$K/taxonomy
+  dump_taxdb $DIR/dbs/refseq-viral-k$K/taxDB $DIR/dbs/refseq-viral-k$K/taxonomy/names.dmp $DIR/dbs/refseq-viral-k$K/taxonomy/nodes.dmp
 
   if [[ `uname` != "Darwin" ]]; then
-    #mkdir -p $DIR/dbs/refseq-bacteria-k$K
-    #krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --library-dir=$DIR/data/library/archaea --taxonomy-dir=$DIR/data/taxonomy
+    mkdir -p $DIR/dbs/refseq-bacteria-k$K
+    krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --library-dir=$DIR/data/library/archaea --taxonomy-dir=$DIR/data/taxonomy
     mkdir -p $DIR/dbs/refseq-oct2017-k$K
     krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-oct2017-k$K --build --taxids-for-genomes --library-dir=$DIR/data/library/viral-dusted 
--library-dir=$DIR/data/library/viral-neighbors-dusted --library-dir=$DIR/data/library/bacteria-dusted --library-dir=$DIR/data/library/archaea-dusted --library-dir=$DIR/data/libray/vertebrate_mammalia --taxonomy-dir=$DIR/data/taxonomy fi diff --git a/tests/test-on-simulated-reads.sh b/tests/test-on-simulated-reads.sh index 09d7db7..df18b14 100755 --- a/tests/test-on-simulated-reads.sh +++ b/tests/test-on-simulated-reads.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -xeu +set -eu [[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 SDIR=$DIR/simulated_reads @@ -8,33 +8,38 @@ CDIR=$DIR/classification-results mkdir -p $CDIR mkdir -p $SDIR -run_krakenu_viral() { - FQ=$1 - NAM=$2 - K=$3 - DAT=$4 - - KFILE=$CDIR/$NAM.k$K.krakenu - [[ -s $KFILE ]] || time $DIR/install/krakenu --threads 4 --db $DIR/dbs/refseq-viral-k$K --fastq $FQ --report-file $KFILE.report > $KFILE 2> $KFILE.log - [[ "$DAT" == "viral" ]] && SEQMAP=$DIR/dbs/refseq-viral-k$K/seqid2taxid.map || SEQMAP=$DIR/data/all-$DAT.map - [[ -s $KFILE.results.stats ]] || $DIR/install/grade_classification $DIR/dbs/refseq-viral-k$K/taxDB $SEQMAP $KFILE $KFILE.results > $KFILE.results.stats -} +[[ `uname` == "Darwin" ]] && THREADS=4 || THREADS=10 -run_kraken_viral() { +run_kraken() { FQ=$1 NAM=$2 - K=$3 - DAT=$4 - - KFILE=$CDIR/$NAM.k$K.kraken - [[ -s $KFILE ]] || time kraken --threads 4 --db $DIR/dbs/refseq-viral-k$K --fastq $FQ > $KFILE 2> $KFILE.log - [[ "$DAT" == "viral" ]] && SEQMAP=$DIR/dbs/refseq-viral-k$K/seqid2taxid.map || SEQMAP=$DIR/data/all-$DAT.map - #[[ -s $KFILE.results.stats ]] || - $DIR/install/grade_classification $DIR/dbs/refseq-viral-k$K/taxDB $SEQMAP $KFILE $KFILE.results > $KFILE.results.stats + DAT=$3 + DB_DAT=$4 + DB_K=$5 + PROG=$6 + DB=refseq-$DB_DAT-k$K + mkdir -p $CDIR/against-$DB + KFILE=$CDIR/against-$DB/$NAM.against-$DB.$PROG + + if [[ "$PROG" == "kraken" ]]; then + CMD="kraken" + elif [[ "$PROG" == "krakenu" ]]; then + CMD="$DIR/install/krakenu --report-file $KFILE.report" + elif [[ "$PROG" == "krakenuid" ]]; then + CMD="$DIR/install/krakenu --report-file $KFILE.report --uid-mapping" + else + echo "Unknown $PROG" + return; + fi + + if [[ ! 
-s $KFILE ]]; then
+    echo "$CMD --threads $THREADS --db $DIR/dbs/$DB --fastq $FQ --output $KFILE"
+    time $CMD --threads $THREADS --db $DIR/dbs/$DB --fastq $FQ --output $KFILE 2>&1 | tee $KFILE.log
+  fi
+  #[[ "$DAT" == "$DB_DAT" ]] && SEQMAP=$DIR/dbs/$DB/seqid2taxid.map || SEQMAP=$DIR/data/all-$DAT.map
+  #[[ -s $KFILE.results.stats ]] || $DIR/install/grade_classification $DIR/dbs/$DB/taxDB $SEQMAP $KFILE $KFILE.results > $KFILE.results.stats
 }
-
-
 AB=1m
 for i in 1 2 3; do
   for dat in viral viral-neighbors bacteria archaea; do
     for len in 75 100 150; do
       NAM=$dat.$AB${len}bp.$i
       FQ=$SDIR/$NAM.fq
       [[ -f $FQ ]] || randomreads.sh -Xmx40g ref=$DIR/data/all-$dat.fna out=$FQ reads=$AB len=$len seed=$i
-      for K in 21 26 31; do
-        run_krakenu_viral $FQ $NAM $K $dat
-        run_kraken_viral $FQ $NAM $K $dat
+      for K in 31; do
+        run_kraken $FQ $NAM $dat viral $K kraken
+        run_kraken $FQ $NAM $dat viral $K krakenu
+        #run_kraken $FQ $NAM $dat viral $K krakenuid
       done
     done
   done
done

From 69815b814cef06b39fe0dc43fd8ee31c7805bfa1 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Sun, 1 Oct 2017 22:30:54 -0400
Subject: [PATCH 071/105] Special treatment for host and contaminant taxids

---
 src/set_lcas.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp
index 7e69bab..653089c 100644
--- a/src/set_lcas.cpp
+++ b/src/set_lcas.cpp
@@ -77,6 +77,8 @@ KrakenDB Database;
 TaxonomyDB taxdb;
 const string prefix = "kraken:taxid|";
 
+unordered_set host_taxids = {9606};
+uint32_t contaminant_taxids = {32630};
 
 int main(int argc, char **argv) {
 
@@ -232,7 +234,7 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi
       taxid = get_new_taxid(name_to_taxid_map, Parent_map, name, taxid, "assembly");
     }
 
-    if (Add_taxIds_for_Sequences) {
+    if (Add_taxIds_for_Sequences && taxid != 9606) {
       taxid = get_new_taxid(name_to_taxid_map, Parent_map, seq_id, taxid, "sequence");
     }
     if (Add_taxIds_for_Assembly || Add_taxIds_for_Sequences) {
@@ -287,7 +289,7 @@ void process_single_file() {
       continue;
     }
 
-    if (Add_taxIds_for_Sequences) {
+    if (Add_taxIds_for_Sequences && taxid != 9606) {
       auto entryIt = taxdb.taxIDsAndEntries.find(taxid);
       if (entryIt == taxdb.taxIDsAndEntries.end()) {
        cerr << "Error! Didn't find taxid " << taxid << " in TaxonomyDB - can't update it!! 
["< Date: Sun, 1 Oct 2017 22:31:18 -0400 Subject: [PATCH 072/105] Added build --- tests/build-dbs.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index 082bac5..677d1c8 100755 --- a/tests/build-dbs.sh +++ b/tests/build-dbs.sh @@ -10,10 +10,10 @@ for K in 31 26 21; do #krakenu-build --kmer-len $K --minimizer-len 12 --threads 4 --db $DIR/dbs/refseq-viral-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy if [[ `uname` != "Darwin" ]]; then - #mkdir -p $DIR/dbs/refseq-bacteria-k$K - #krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --library-dir=$DIR/data/library/archaea --taxonomy-dir=$DIR/data/taxonomy mkdir -p $DIR/dbs/refseq-oct2017-k$K - krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-oct2017-k$K --build --taxids-for-genomes --library-dir=$DIR/data/library/viral-dusted --library-dir=$DIR/data/library/viral-neighbors-dusted --library-dir=$DIR/data/library/bacteria-dusted --library-dir=$DIR/data/library/archaea-dusted --library-dir=$DIR/data/libray/vertebrate_mammalia --taxonomy-dir=$DIR/data/taxonomy + krakenu-build --kmer-len $K --threads 20 --db $DIR/dbs/refseq-oct2017-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral-dusted --library-dir=$DIR/data/library/viral-neighbors-dusted --library-dir=$DIR/data/library/bacteria-dusted --library-dir=$DIR/data/library/archaea-dusted --library-dir=$DIR/data/library/vertebrate_mammalian --library-dir=$DIR/data/library/contaminants --taxonomy-dir=$DIR/data/taxonomy + mkdir -p $DIR/dbs/refseq-bacteria-k$K + krakenu-build --kmer-len $K --threads 20 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --library-dir=$DIR/data/library/archaea --taxonomy-dir=$DIR/data/taxonomy fi done From 4206549d90ef449450beccfb6257f6f364967882 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 22:36:08 -0400 Subject: [PATCH 073/105] Add new columns --- src/report-cols.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/report-cols.h b/src/report-cols.h index 2392bd8..e5fa0a5 100644 --- a/src/report-cols.h +++ b/src/report-cols.h @@ -26,6 +26,8 @@ enum class REPORTCOLS : uint8_t { NUM_UNIQUE_KMERS_CLADE, NUM_KMERS_IN_DATABASE, NUM_KMERS_IN_DATABASE_CLADE, + CLADE_KMER_COVERAGE, + CLADE_KMER_DUPLICITY, TOTAL_SCORE, TOTAL_HIT_LENGTH, ABUNDANCE, @@ -37,8 +39,10 @@ enum class REPORTCOLS : uint8_t { static const std::map report_col_name_map = { {"name", REPORTCOLS::NAME}, {"indentedName", REPORTCOLS::SPACED_NAME}, + {"taxName", REPORTCOLS::SPACED_NAME}, {"taxID", REPORTCOLS::TAX_ID}, {"taxRank", REPORTCOLS::TAX_RANK}, + {"rank", REPORTCOLS::TAX_RANK}, {"depth", REPORTCOLS::DEPTH}, {"genomeSize", REPORTCOLS::GENOME_SIZE}, {"numReadsTaxon", REPORTCOLS::NUM_READS}, @@ -54,8 +58,25 @@ static const std::map report_col_name_map = { {"abundance", REPORTCOLS::ABUNDANCE}, {"abundance_len", REPORTCOLS::ABUNDANCE_LEN}, + {"taxReads", REPORTCOLS::NUM_READS}, + {"reads", REPORTCOLS::NUM_READS_CLADE}, + {"cladeReads", REPORTCOLS::NUM_READS_CLADE}, + {"taxKmers", REPORTCOLS::NUM_KMERS}, + {"cladeKmers", REPORTCOLS::NUM_KMERS_CLADE}, + {"kmers", REPORTCOLS::NUM_UNIQUE_KMERS_CLADE}, + {"kmerDup", REPORTCOLS::CLADE_KMER_DUPLICITY}, + {"dup", 
REPORTCOLS::CLADE_KMER_DUPLICITY}, + {"kmerCov", REPORTCOLS::CLADE_KMER_COVERAGE}, + {"cov", REPORTCOLS::CLADE_KMER_COVERAGE}, + {"specificTaxKmers", REPORTCOLS::NUM_UNIQUE_KMERS}, + {"specificCladeKmers", REPORTCOLS::NUM_UNIQUE_KMERS_CLADE}, + {"taxKmersInDB", REPORTCOLS::NUM_KMERS_IN_DATABASE}, + {"cladeKmersInDB", REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE}, + + {"cladePerc", REPORTCOLS::PERCENTAGE}, {"percReadsClade", REPORTCOLS::PERCENTAGE}, {"percent", REPORTCOLS::PERCENTAGE}, + {"%", REPORTCOLS::PERCENTAGE}, {"taxId", REPORTCOLS::TAX_ID}, {"reads_clade", REPORTCOLS::NUM_READS_CLADE}, // Change to clade reads! {"reads_stay", REPORTCOLS::NUM_READS}, // Change to clade reads! From e84091ad848bbfb78b3164d69c85602842ed6a19 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 22:36:26 -0400 Subject: [PATCH 074/105] Up default precision to 12 --- src/hyperloglogplus.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h index 8cd2bdc..b4d9a81 100644 --- a/src/hyperloglogplus.h +++ b/src/hyperloglogplus.h @@ -237,7 +237,7 @@ class HyperLogLogPlusMinus { * @param precision * @param sparse */ - HyperLogLogPlusMinus(uint8_t precision=10, bool sparse=true):p(precision),sparse(sparse) { + HyperLogLogPlusMinus(uint8_t precision=12, bool sparse=true):p(precision),sparse(sparse) { if (precision > 18 || precision < 4) { throw std::invalid_argument("precision (number of register = 2^precision) must be between 4 and 18"); } From 0afdcf769a05f92240fabcd0af2febd48f93590a Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 22:36:53 -0400 Subject: [PATCH 075/105] use unordered maps --- src/grade_classification.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp index edfc999..8159ca8 100644 --- a/src/grade_classification.cpp +++ b/src/grade_classification.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include using namespace std; @@ -45,13 +46,14 @@ int main(int argc, char **argv) { cerr << "Read " << seqid_map.size() << " taxa mappings" << endl; ofstream out_file(argv[4]); - set all_ranks; + unordered_set all_ranks; unordered_map< string, size_t > rank_counts; - map< int, set > simulated_taxids_at_rank; - map< int, set > identified_taxids_at_rank; - map< int, size_t > correct_reads_at_rank; - map< int, size_t > incorrect_reads_at_rank; - map< int, size_t > reads_at_higher_rank; + unordered_map< int, set > simulated_taxids_at_rank; + unordered_map< int, set > identified_taxids_at_rank; + unordered_map< int, size_t > correct_reads_at_rank; + unordered_map< int, size_t > incorrect_reads_at_rank; + unordered_map< int, size_t > reads_at_higher_rank; + unordered_set ignored_taxa; size_t total_reads = 0; size_t unidentified_reads = 0; @@ -95,7 +97,10 @@ int main(int argc, char **argv) { } else { seq_taxid = it->second; if (!taxdb.hasTaxon(seq_taxid)) { - cerr << "Ignoring taxon " << seq_taxid << " - not in database" << endl; + if (ignored_taxa.count(seq_taxid) == 0) { + cerr << "Ignoring taxon " << seq_taxid << " - not in database" << endl; + ignored_taxa.insert(seq_taxid); + } continue; } //cerr <<"seqid" << seq_taxid; From 13adecff1625843aff578a2c5f0aff0fdb482766 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 22:38:16 -0400 Subject: [PATCH 076/105] Check library directories exist --- scripts/krakenu-build | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git 
a/scripts/krakenu-build b/scripts/krakenu-build index 8f72697..e90b353 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenu-build @@ -321,6 +321,17 @@ sub standard_installation { } sub build_database { + foreach (@library_dirs) { + if (!-d $_) { + print STDERR "Library directory $_ does not exist!\n"; + exit(1); + } + } + if (! -d $taxonomy_dir) { + print STDERR "Taxonomy directory $taxonomy_dir does not exist!\n"; + exit(1); + } + $ENV{"KRAKEN_REBUILD_DATABASE"} = (defined $rebuild? 1 : 0); $ENV{"KRAKEN_ADD_TAXIDS_FOR_SEQ"} = $add_taxonomy_ids_for_seq; $ENV{"KRAKEN_ADD_TAXIDS_FOR_GENOME"} = $add_taxonomy_ids_for_genome; From 1eda03ee57e15c094969c06bd2abfde6e1b1acdc Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 22:39:24 -0400 Subject: [PATCH 077/105] Use uid_mapping headers --- src/read_uid_mapping.cpp | 36 ++++-------------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) diff --git a/src/read_uid_mapping.cpp b/src/read_uid_mapping.cpp index 0ac84db..6c40d65 100644 --- a/src/read_uid_mapping.cpp +++ b/src/read_uid_mapping.cpp @@ -1,4 +1,5 @@ +#include "uid_mapping.hpp" #include "kraken_headers.hpp" #include "quickfile.hpp" #include @@ -7,39 +8,10 @@ using namespace std; using namespace kraken; -inline -vector get_taxids_for_uid(uint32_t uid, char* fptr) { - size_t int_size = sizeof(int); - size_t block_size = sizeof(int)*2; - // TODO: Just get a uint64_t and shift the bits, probably faster - uint32_t taxid = *(uint32_t*)(fptr+(uid-1)*block_size); - uint32_t parent_uid = *(uint32_t*)(fptr+(uid-1)*block_size + int_size); - - vector taxids = {taxid}; - while (parent_uid != 0) { - taxid = *(uint32_t*)(fptr+(parent_uid-1)*block_size); - parent_uid = *(uint32_t*)(fptr+(parent_uid-1)*block_size + int_size); - taxids.push_back(taxid); - } - std::sort(taxids.begin(), taxids.end()); - return(taxids); -} - -inline -vector get_taxids_for_uid_from_map(uint32_t uid, char* fptr, unordered_map >& uid_map ) { - auto it = uid_map.find(uid); - if (it != uid_map.end()) { - return it->second; - } - vector taxids = get_taxids_for_uid(uid, fptr); - uid_map[uid] = taxids; - return(taxids); -} - int main(int argc, char **argv) { if (argc < 2) { std::cerr << "Usage: read_uid_mapping []" - "The file is supposed to have lines terminated by '\n'." 
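// Sketch of the intended command-line use (the map file name is illustrative):
//   read_uid_mapping uid_to_taxid.map       -> walk and print every UID block
//   read_uid_mapping uid_to_taxid.map 42    -> print the taxid chain for UID 42
// Each block stores two 32-bit ints (taxid, parent UID), so UID n starts at
// byte offset (n-1)*8; this matches get_taxids_for_uid in uid_mapping.cpp.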
+ "The file is supposed to have lines terminated by '\n'.it.second" << std::endl; return 1; } @@ -61,10 +33,10 @@ int main(int argc, char **argv) { cout << ++i << '\t' << *taxid_ptr << '\t' << *parent_uid << endl; } } else { - unordered_map > UID_to_TaxID_map; + //unordered_map > UID_to_TaxID_map; for (int i=2; i taxids = get_taxids_for_uid(UID, UID_to_TaxID_map, fptr); + vector taxids = get_taxids_for_uid(UID, fptr); cout << UID << '\t'; for (auto t : taxids) { cout << t << ' '; From b84380b234d75eb9ade47f250127f7d5b34640dd Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 2 Oct 2017 14:01:01 -0400 Subject: [PATCH 078/105] update init.sh --- tests/init.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/init.sh b/tests/init.sh index f4c73d3..7d74c10 100755 --- a/tests/init.sh +++ b/tests/init.sh @@ -10,6 +10,7 @@ set -xeu ## Download taxonomy and genomic data into data/ #$DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any #$DIR/install/krakenu-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606 +$DIR/install/krakenu-download --db $DIR/data -R contaminants for i in viral viral-neighbors archaea bacteria; do [[ -s "$DIR/data/all-$i.fna" ]] || find $DIR/data/library/$i -name '*.fna' -exec cat {} \; > $DIR/data/all-$i.fna @@ -18,5 +19,5 @@ for i in viral viral-neighbors archaea bacteria; do [[ -s $DUSTED_F ]] || dustmasker -infmt fasta -in $DIR/data/all-$i.fna -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]/N/g' > "$DUSTED_F" mkdir -p $DIR/data/library/$i-dusted [[ -f "$DIR/data/library/$i-dusted/all-$i-dusted.fna" ]] || ln "$DUSTED_F" "$DIR/data/library/$i-dusted/all-$i-dusted.fna" - [[ -f "$DIR/data/library/$i-dusted/all-$i-dusted.fna.map" ]] || ln "$DIR/data/all-$i.map" "$DIR/data/library/$i-dusted/all-$i.map" + [[ -f "$DIR/data/library/$i-dusted/all-$i.map" ]] || ln "$DIR/data/all-$i.map" "$DIR/data/library/$i-dusted/all-$i.map" done From a352122ea68ec3f20f9bfff555b5ce98bf939b61 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 2 Oct 2017 15:33:26 -0400 Subject: [PATCH 079/105] Fix bug in reading names and nodes --- src/taxdb.h | 58 +++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/src/taxdb.h b/src/taxdb.h index f49d9dc..1593c10 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -445,25 +445,20 @@ void TaxonomyDB::parseNodesDump(const std::string nodesDumpFil TAXID taxonomyID; TAXID parentTaxonomyID; std::string rank; + char delim; - while (nodesDumpFile.good()) { - getline(nodesDumpFile, line); - std::vector tokens = tokenise(line, "\t|\t", 3, 2); - if (tokens.size() < 3) { - continue; - } - - taxonomyID = string_to_T(tokens[0]); - parentTaxonomyID = string_to_T(tokens[1]); - rank = tokens[2]; - + while (nodesDumpFile >> taxonomyID >> delim >> parentTaxonomyID) { + nodesDumpFile.ignore(3); + getline(nodesDumpFile, rank, '\t'); auto entryIt = taxIDsAndEntries.find(taxonomyID); - if (entryIt == taxIDsAndEntries.end()) { - taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); - } else { + if (entryIt == taxIDsAndEntries.end()) { + taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); + } else { entryIt->second.parentTaxonomyID = parentTaxonomyID; entryIt->second.rank = rank; } + + nodesDumpFile.ignore(2560, '\n'); } } @@ -475,22 +470,25 @@ void TaxonomyDB::parseNamesDump(const std::string namesDumpFil 
std::string line;
   TAXID taxonomyID;
-  std::string scientificName;
+  std::string scientificName, type;
 
   while (namesDumpFile.good()) {
-    getline(namesDumpFile, line);
-    std::vector tokens = tokenise(line, "\t|\t", 4, 2);
-    if (tokens.size() < 4 || tokens[3] != "scientific name") {
-      continue;
-    }
-    taxonomyID = string_to_T(tokens[0]);
-    scientificName = tokens[1];
-
-    auto entryIt = taxIDsAndEntries.find(taxonomyID);
-    if (entryIt == taxIDsAndEntries.end()) {
-      taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, scientificName);
-    } else {
-      entryIt->second.scientificName = scientificName;
+    namesDumpFile >> taxonomyID;
+    namesDumpFile.ignore(3);
+    getline(namesDumpFile, scientificName, '\t');
+    namesDumpFile.ignore(3);
+    namesDumpFile.ignore(256, '|');
+    namesDumpFile.ignore(1);
+    getline(namesDumpFile, type, '\t');
+
+    if (type == "scientific name") {
+      auto entryIt = taxIDsAndEntries.find(taxonomyID);
+      if (entryIt == taxIDsAndEntries.end()) {
+        taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, scientificName);
+      } else {
+        entryIt->second.scientificName = scientificName;
+      }
     }
+    namesDumpFile.ignore(2560, '\n');
   }
 }

@@ -549,6 +547,10 @@ std::unordered_map >
   std::string line;
   while (!inFile.eof()) {
     inFile >> taxonomyID >> parentTaxonomyID;
+    if (taxonomyID > 1 && taxonomyID == parentTaxonomyID) {
+      cerr << "ERROR: the parent of " << taxonomyID << " is itself. Should not happen!\n";
+      exit(1);
+    }
     inFile.get(); // read tab
     std::getline(inFile, scientificName, '\t');
     if (hasGenomeSizes) {

From f05a219624c58131edea7e7b6e8c2e3289e80ec3 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Wed, 18 Oct 2017 10:51:16 -0400
Subject: [PATCH 080/105] Dump taxdb without ending separators (for kraken-report)

---
 src/dump_taxdb.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/dump_taxdb.cpp b/src/dump_taxdb.cpp
index 79e668f..3e0d442 100644
--- a/src/dump_taxdb.cpp
+++ b/src/dump_taxdb.cpp
@@ -11,6 +11,8 @@ int main(int argc, char **argv) {
     std::cerr << "Usage: dump_taxdb taxDB names.dmp nodes.dmp\n";
     return 1;
   }
+
+  cerr << "Reading taxonomy database from " << argv[1] << ", writing nodes dump to " << argv[3] << " and names dump to " << argv[2] << "." 
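  // Example of the emitted dump format, using taxid 9606 for illustration:
  //   nodes.dmp: 9606\t|\t9605\t|\tspecies
  //   names.dmp: 9606\t|\tHomo sapiens\t|\t\t|\tscientific name
  // i.e. the NCBI-style "\t|\t"-separated layout, but each row now ends after
  // the last field that kraken-report actually parses.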
<< endl; TaxonomyDB taxdb {(string)argv[1]}; ofstream names_file(argv[2]); names_file.exceptions(ifstream::failbit | ifstream::badbit); @@ -22,12 +24,12 @@ int main(int argc, char **argv) { nodes_file << taxon.second.taxonomyID << "\t|\t" << taxon.second.parentTaxonomyID << "\t|\t" << taxon.second.rank - << "\t|\n"; // there are further columns, but Kraken does not care about them + << endl; // there are further columns, but Kraken does not care about them names_file << taxon.second.taxonomyID << "\t|\t" << taxon.second.scientificName << "\t|\t" - << "\t|\t" << "scientific name" << "\t|\n"; + << "\t|\t" << "scientific name" << endl; } names_file.close(); nodes_file.close(); From 2172caa02b029024bad88f8624b35e9e80d33bdc Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 18 Oct 2017 10:55:49 -0400 Subject: [PATCH 081/105] Allow to run build_taxdb with a taxdb as consistency check --- src/build_taxdb.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index 6f33763..763a8a0 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -26,11 +26,22 @@ using namespace std; int main(int argc, char **argv) { - if (argc < 3 || argc > 4) { - std::cerr << "Usage: build_taxdb names.dmp nodes.dmp [taxon-counts]\n"; + if (argc < 2 || argc > 4) { + std::cerr << "USAGE:\n" + << "With two or three arguments, echo taxDB based on NCBI taxonomy dump:\n" + << "build_taxdb names.dmp nodes.dmp [taxon-counts]\n" + << "\n" + << "With one argument, read in taxDB and echo it again for consistency checks:\n" + << "build_taxdb taxDB\n"; return 1; } - TaxonomyDB taxdb {(string)argv[1], (string)argv[2]}; + + TaxonomyDB taxdb; + if (argc == 2) { + taxdb = TaxonomyDB ((string)argv[1]); + } else { + taxdb = TaxonomyDB ((string)argv[1], (string)argv[2]); + } if (argc == 4) { ifstream ifs(argv[3]); uint32_t taxon; uint64_t count; From eb9447ebbb386853737455701b2a19c08627af60 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 18 Oct 2017 10:56:28 -0400 Subject: [PATCH 082/105] Update to classification grading --- src/grade_classification.cpp | 70 ++++++++++++++++++++------------ tests/build-dbs.sh | 49 ++++++++++++++++------ tests/test-on-simulated-reads.sh | 49 +++++++++++++++------- 3 files changed, 116 insertions(+), 52 deletions(-) diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp index 8159ca8..c4dec80 100644 --- a/src/grade_classification.cpp +++ b/src/grade_classification.cpp @@ -24,15 +24,12 @@ unordered_map read_seqid_mapping(string filename) { string line, seq_id; uint32_t taxid; - while (map_file.good()) { - getline(map_file, line); - if (line.empty()) - break; - istringstream iss(line); - iss >> seq_id >> taxid; + while (map_file >> seq_id >> taxid) { ID_to_taxon_map[seq_id] = taxid; + map_file.ignore(std::numeric_limits::max(), '\n'); } map_file.close(); + cerr << "Read " << ID_to_taxon_map.size() << " taxa mappings" << endl; return ID_to_taxon_map; } @@ -43,7 +40,6 @@ int main(int argc, char **argv) { } TaxonomyDB taxdb = TaxonomyDB(argv[1], false); unordered_map seqid_map = read_seqid_mapping(argv[2]); - cerr << "Read " << seqid_map.size() << " taxa mappings" << endl; ofstream out_file(argv[4]); unordered_set all_ranks; @@ -74,7 +70,11 @@ int main(int argc, char **argv) { if (line.empty()) continue; istringstream iss(line); - iss >> classification_state >> read_id >> identified_taxid; + string l; + string classi; + iss >> classification_state >> read_id >> identified_taxid >> l; + 
iss.get(); + getline(iss,classi); ++total_reads; if (identified_taxid == 0) { @@ -117,7 +117,7 @@ int main(int argc, char **argv) { } } - string seq_species = taxdb.getScientificName(seq_taxid); + string seq_name = taxdb.getScientificName(seq_taxid); // getLowestCommonAncestor returns lca taxon as well as distance between the taxa pair lca_taxid_dist = taxdb.getLowestCommonAncestor(seq_taxid, identified_taxid); string lca_rank_string = taxdb.getNextProperRank(lca_taxid_dist.first); @@ -144,31 +144,49 @@ int main(int argc, char **argv) { if (identified_taxid == 0) lca_rank_string = "unidentified"; + ++rank_counts[lca_rank_string]; - out_file << seq_species << '\t' << seq_taxid << '\t' << identified_taxid << '\t' << lca_rank_string << '\t' << lca_taxid_dist.first << '\t' << lca_taxid_dist.second << '\n'; + out_file + << read_id << '\t' << seq_name << '\t' << seq_taxid << '\t' + << identified_taxid << '\t' << taxdb.getRank(taxdb.getTaxIDAtNextProperRank(identified_taxid)) << '\t' + << lca_rank_string << '\t' << lca_taxid_dist.first << '\t' << lca_taxid_dist.second << '\t' << classi << '\n'; } } k_file.close(); - cout << "#LCA_RANK_READ_COUNTS" << endl; - for (const auto & kv : rank_counts) { - cout << kv.first << '\t' << kv.second << endl; + char delim = '\t'; + + if (0) { + cout << "#LCA_RANK_READ_COUNTS" << endl; + for (const auto & kv : rank_counts) { + cout << kv.first << delim << kv.second << endl; + } + cout << endl; } - cout << "\n#rank; total_reads; correct; incorrect; at_higher_rank; unidentified" << endl; + + cout << "#rank" << delim << "total_reads" << delim << "correct"<< delim << "incorrect"<< delim << "sensitivity" << delim << "precision" << delim << "higher_rank" << delim << "unidentified" << endl; for (TaxRank::RANK rank : ranks_of_interest) { - cout << TaxRank::toString(rank) << '\t' << total_reads - << '\t' << correct_reads_at_rank[rank] - << '\t' << incorrect_reads_at_rank[rank] - << '\t' << reads_at_higher_rank[rank] - << '\t' << unidentified_reads + size_t true_positives = correct_reads_at_rank.at(rank); + size_t false_positives = incorrect_reads_at_rank.at(rank); + double sensitivity = 100.0*(double)true_positives/(double)total_reads; + double specificity = 100.0*(double)true_positives/(double)(true_positives+false_positives); + cout << TaxRank::toString(rank) << delim << total_reads + << delim << true_positives + << delim << false_positives + << delim << sensitivity << '%' + << delim << specificity << '%' + << delim << reads_at_higher_rank.at(rank) + << delim << unidentified_reads + << setprecision(2) << std::fixed << '\n'; } - cout << "\n#rank;P;TP;FP;sens;prec" << endl; + cout << "#rank" << delim << "true_count" << delim << "correct" << delim << "incorrect" << delim << "recall" << delim << "precision" << endl; for (TaxRank::RANK rank : ranks_of_interest) { size_t true_positives = 0; size_t false_positives = 0; + if (identified_taxids_at_rank.find(rank) != identified_taxids_at_rank.end()) { for (const auto & tid : identified_taxids_at_rank[rank]) { if (simulated_taxids_at_rank[rank].count(tid) == 1) { ++true_positives; @@ -176,16 +194,18 @@ int main(int argc, char **argv) { ++false_positives; } } + } double sensitivity = 100.0*(double)true_positives/(double)simulated_taxids_at_rank[rank].size(); double specificity = 100.0*(double)true_positives/(double)(true_positives+false_positives); cout << TaxRank::toString(rank) - << '\t' << simulated_taxids_at_rank[rank].size() - << '\t' << true_positives - << '\t' << false_positives << setprecision(2) << std::fixed - << '\t' << 
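    // Sanity check of the sensitivity/precision arithmetic used in these
    // tables (illustrative numbers): with 1000 positives overall, 950 correct
    // calls and 30 incorrect calls,
    //   sensitivity = 100 * 950 / 1000       = 95.00%
    //   precision   = 100 * 950 / (950 + 30) = 96.94%
    // The denominators differ: sensitivity divides by everything that should
    // have been found, precision only by items that received a call.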
sensitivity << '%' - << '\t' << specificity << '%' + << delim << simulated_taxids_at_rank[rank].size() + << delim << true_positives + << delim << false_positives + << setprecision(2) << std::fixed + << delim << sensitivity << '%' + << delim << specificity << '%' << '\n'; } } diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index 864cb0c..affc323 100755 --- a/tests/build-dbs.sh +++ b/tests/build-dbs.sh @@ -1,22 +1,47 @@ #!/bin/bash -set -xeu +set -eu [[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 +[[ `uname` == "Darwin" ]] && THREADS=4 || THREADS=10 -export PATH="$DIR/install:$PATH" -for K in 31 26 21; do - mkdir -p $DIR/dbs/refseq-viral-k$K - time krakenu-build --kmer-len $K --minimizer-len 12 --threads 4 --db $DIR/dbs/refseq-viral-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy 2>&1 | tee $DIR/dbs/refseq-viral-k$K/build.log - mkdir -p $DIR/dbs/refseq-viral-k$K/taxonomy - dump_taxdb $DIR/dbs/refseq-viral-k$K/taxDB $DIR/dbs/refseq-viral-k$K/taxonomy/names.dmp $DIR/dbs/refseq-viral-k$K/taxonomy/nodes.dmp +build_db() { + K=$1; shift + NAM=$1; shift - if [[ `uname` != "Darwin" ]]; then - mkdir -p $DIR/dbs/refseq-oct2017-k$K - krakenu-build --kmer-len $K --threads 20 --db $DIR/dbs/refseq-oct2017-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral-dusted --library-dir=$DIR/data/library/viral-neighbors-dusted --library-dir=$DIR/data/library/bacteria-dusted --library-dir=$DIR/data/library/archaea-dusted --library-dir=$DIR/data/library/vertebrate_mammalian --library-dir=$DIR/data/library/contaminants --taxonomy-dir=$DIR/data/taxonomy - mkdir -p $DIR/dbs/refseq-bacteria-k$K - krakenu-build --kmer-len $K --threads 20 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --library-dir=$DIR/data/library/archaea --taxonomy-dir=$DIR/data/taxonomy + DB_NAM=refseq-$NAM-k$K + DB_DIR=$DIR/dbs/$DB_NAM + + mkdir -p $DB_DIR + CMD="krakenu-build --kmer-len $K --minimizer-len 12 --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database" + for L in $@; do + CMD="$CMD --library-dir=$DIR/data/library/$L" + done + if [[ ! -f "$DB_DIR/is.busy" ]]; then + echo "EXECUTING $CMD" + touch $DB_DIR/is.busy + $CMD 2>&1 | tee $DIR/dbs/$DB_NAM/build.log + if [[ ! 
-f "$DB_DIR/taxonomy/nodes.dmp" ]]; then + mkdir -p $DB_DIR/taxonomy + echo "EXECUTING dump_taxdb $DB_DIR/taxDB $DB_DIR/taxonomy/names.dmp $DB_DIR/nodes.dmp" + dump_taxdb $DB_DIR/taxDB $DB_DIR/taxonomy/names.dmp $DB_DIR/nodes.dmp + fi + rm $DB_DIR/is.busy + else + echo "IGNORING $DB_DIR" + fi +} + +#export PATH="$DIR/install:$PATH" +for K in 31 21; do + if [[ `uname` == "Darwin" ]]; then + build_db $K viral viral + build_db $K all-viral viral viral-neighbors + else + build_db $K oct2017 archaea-dusted bacteria-dusted viral-dusted viral-neighbors-dusted \ + vertebrate_mammalian contaminants + #build_db $K bacteria bacteria archaea fi done diff --git a/tests/test-on-simulated-reads.sh b/tests/test-on-simulated-reads.sh index df18b14..580f218 100755 --- a/tests/test-on-simulated-reads.sh +++ b/tests/test-on-simulated-reads.sh @@ -5,21 +5,27 @@ set -eu [[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 SDIR=$DIR/simulated_reads CDIR=$DIR/classification-results +CCDIR=$DIR/classification-stats mkdir -p $CDIR +mkdir -p $CCDIR mkdir -p $SDIR [[ `uname` == "Darwin" ]] && THREADS=4 || THREADS=10 run_kraken() { - FQ=$1 - NAM=$2 - DAT=$3 - DB_DAT=$4 - DB_K=$5 - PROG=$6 - DB=refseq-$DB_DAT-k$K + local FQ=$1 + local NAM=$2 + local DAT=$3 + local DB_DAT=$4 + local DB_K=$5 + local PROG=$6 + local ALWAYS_SEQMAP=$7; + local DB=refseq-$DB_DAT-k$K + mkdir -p $CDIR/against-$DB - KFILE=$CDIR/against-$DB/$NAM.against-$DB.$PROG + mkdir -p $CCDIR/against-$DB + local KFILE=$CDIR/against-$DB/$NAM.against-$DB.$PROG + local KKFILE=$CCDIR/against-$DB/$NAM.against-$DB.$PROG if [[ "$PROG" == "kraken" ]]; then CMD="kraken" @@ -36,21 +42,34 @@ run_kraken() { echo "$CMD --threads $THREADS --db $DIR/dbs/$DB --fastq $FQ --output $KFILE" time $CMD --threads $THREADS --db $DIR/dbs/$DB --fastq $FQ --output $KFILE 2>&1 | tee $KFILE.log fi - #[[ "$DAT" == "$DB_DAT" ]] && SEQMAP=$DIR/dbs/$DB/seqid2taxid.map || SEQMAP=$DIR/data/all-$DAT.map - #[[ -s $KFILE.results.stats ]] || $DIR/install/grade_classification $DIR/dbs/$DB/taxDB $SEQMAP $KFILE $KFILE.results > $KFILE.results.stats + + [[ "$DAT" == "$DB_DAT" ]] && SEQMAP=$DIR/dbs/$DB/seqid2taxid.map || SEQMAP=$DIR/data/all-$DAT.map + [[ "$ALWAYS_SEQMAP" == "ALWAYS_SEQMAP" ]] && SEQMAP=$DIR/dbs/$DB/seqid2taxid.map + + if [[ ! 
-s "$KKFILE.results.stats" ]]; then + $DIR/install/grade_classification $DIR/dbs/$DB/taxDB $SEQMAP $KFILE $KKFILE.results > $KKFILE.results.stats + else + echo "$KKFILE.results.stats exist" + fi } AB=1m -for i in 1 2 3; do +for i in 1; do # 2 3 for dat in viral viral-neighbors bacteria archaea; do - for len in 75 100 150; do + for len in 100; do ## 75 150 NAM=$dat.$AB${len}bp.$i FQ=$SDIR/$NAM.fq [[ -f $FQ ]] || randomreads.sh -Xmx40g ref=$DIR/data/all-$dat.fna out=$FQ reads=$AB len=$len seed=$i for K in 31; do - run_kraken $FQ $NAM $dat viral $K kraken - run_kraken $FQ $NAM $dat viral $K krakenu - #run_kraken $FQ $NAM $dat viral $K krakenuid + # run_kraken $FQ $NAM $dat viral $K krakenuid + if [[ `uname` != "Darwin" ]]; then + run_kraken $FQ $NAM $dat oct2017 $K kraken ALWAYS_SEQMAP + run_kraken $FQ $NAM $dat oct2017 $K krakenu ALWAYS_SEQMAP + run_kraken $FQ $NAM $dat oct2017 $K krakenuid ALWAYS_SEQMAP + else + run_kraken $FQ $NAM $dat viral $K kraken + run_kraken $FQ $NAM $dat viral $K krakenu + fi done done done From 75b24aefb9db83addd023215bb6883958c44f0e4 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 18 Oct 2017 10:57:25 -0400 Subject: [PATCH 083/105] Do not give separate taxids for human sequences (fixed now) --- src/set_lcas.cpp | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index a1a545a..3872c4d 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -227,6 +227,8 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi continue; } + uint32_t orig_taxid = taxid; + if (Add_taxIds_for_Assembly && iss.good()) { iss.get(); getline(iss, name); @@ -234,7 +236,7 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi taxid = get_new_taxid(name_to_taxid_map, Parent_map, name, taxid, "assembly"); } - if (Add_taxIds_for_Sequences && taxid != 9606) { + if (Add_taxIds_for_Sequences && orig_taxid != 9606) { taxid = get_new_taxid(name_to_taxid_map, Parent_map, seq_id, taxid, "sequence"); } if (Add_taxIds_for_Assembly || Add_taxIds_for_Sequences) { @@ -289,6 +291,12 @@ void process_single_file() { ++seqs_skipped; continue; } + + if (Parent_map.find(taxid) == Parent_map.end()) { + cerr << "Skipping sequence " << dna.id << " since taxonomy ID " << taxid << " is not in taxonomy database!" 
<< endl; + ++ seqs_skipped; + continue; + } if (Add_taxIds_for_Sequences && taxid != 9606) { // Update entry based on header line @@ -308,12 +316,12 @@ void process_single_file() { if (taxid) { if (Parent_map.find(taxid) == Parent_map.end()) { cerr << "Ignoring sequence for taxID " << taxid << " - not in taxDB\n"; + } else { + #pragma omp parallel for schedule(dynamic) + for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN) + set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1); + ++seqs_processed; } - #pragma omp parallel for schedule(dynamic) - for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN) - set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1); - - ++seqs_processed; } else { if (verbose) cerr << "Skipping sequence with header [" << dna.header_line << "] - no taxid" << endl; @@ -396,7 +404,9 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { #pragma omp critical(new_uid) *val_ptr = uid_mapping(Taxids_to_UID_map, UID_to_taxids_vec, taxid, *val_ptr, current_uid, UID_map_file); } else if (!force_taxid && taxid != contaminant_taxids) { - *val_ptr = lca(Parent_map, taxid, *val_ptr); + if (Parent_map.find(taxid) != Parent_map.end()) { + *val_ptr = lca(Parent_map, taxid, *val_ptr); + } } else { // When force_taxid is set, do not compute lca, but assign the taxid // of the (last) sequence to k-mers From f15112d5002abb349d956d849f6efec15982382e Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 18 Oct 2017 10:57:49 -0400 Subject: [PATCH 084/105] Add species subgroup and group ranks --- src/taxdb.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/taxdb.h b/src/taxdb.h index 1593c10..b522866 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -63,7 +63,7 @@ struct TaxRank { //"superkingdom", "root"}; enum RANK { unknown, no_rank, sequence, assembly, - subspecies, species, subgenus, genus, tribe, subfamily, + subspecies, species, species_subgroup, species_group, subgenus, genus, tribe, subfamily, family, superfamily, parvorder, infraorder, suborder, order, superorder, parvclass, infraclass, subclass, class_, superclass, subphylum, phylum, kingdom, @@ -73,7 +73,12 @@ struct TaxRank { static const unordered_map string_to_rank; static const RANK toRank(const string& rank) { - return string_to_rank.at(rank); + const auto& it = string_to_rank.find(rank); + if (it == string_to_rank.end()) { + cerr << "ERROR: Could not find rank " << rank << endl; + exit(1); + } + return it->second; } static const char* toString(const TaxRank::RANK& rank) { @@ -84,6 +89,8 @@ struct TaxRank { case RANK::assembly: return "assembly"; case RANK::subspecies: return "subspecies"; case RANK::species: return "species"; + case RANK::species_subgroup: return "species subgroup"; + case RANK::species_group: return "species group"; case RANK::subgenus: return "subgenus"; case RANK::genus: return "genus"; case RANK::tribe: return "tribe"; @@ -120,6 +127,8 @@ const unordered_map TaxRank::string_to_rank = { {"assembly", TaxRank::assembly}, {"subspecies", TaxRank::subspecies}, {"species", TaxRank::species}, + {"species subgroup", TaxRank::species_subgroup}, + {"species group", TaxRank::species_group}, {"subgenus", TaxRank::subgenus}, {"genus", TaxRank::genus}, {"tribe", TaxRank::tribe}, From 3090304362b635866a211fe91d68c5462117de4b Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 18 Oct 2017 11:00:02 -0400 Subject: [PATCH 085/105] Don't use .at() for vector access --- src/uid_mapping.cpp | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/uid_mapping.cpp b/src/uid_mapping.cpp index 4a80946..d2100d3 100644 --- a/src/uid_mapping.cpp +++ b/src/uid_mapping.cpp @@ -29,7 +29,7 @@ namespace kraken { cerr << "kmer_uid ("<< kmer_uid <<") greater than UID vector size ("<< UID_to_taxids_vec.size()<<")!!" << endl; exit(1); } - taxid_set = *(UID_to_taxids_vec.at(kmer_uid-1)); + taxid_set = *(UID_to_taxids_vec[kmer_uid-1]); auto it = std::lower_bound( taxid_set.begin(), taxid_set.end(), taxid); // find proper position in descending order if (it == taxid_set.end() || *it != taxid) { // add the taxid to the set, in the right position such that it remains sorted From 9c11d877d49e5efee7288e2eb385946c94d2de78 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 18 Oct 2017 11:34:49 -0400 Subject: [PATCH 086/105] Test modification of lca --- src/krakenutil.cpp | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index 2da433e..365238f 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -55,20 +55,23 @@ namespace kraken { return a ? a : b; // create a path from a to the root - std::unordered_set a_path; - while (a > 0 && a != parent_map.at(a)) { - if (a == b) - return a; + std::vector a_path; + do { + if (a == b) + return a; a_path.insert(a); a = parent_map.at(a); - } + } while (a != a_path.back()) // search for b in the path from a to the root - while (b > 0 && b != parent_map.at(b)) { - if (a_path.count(b) > 0) + uint32_t last_b = 0; + do { + if (a_path.find(b) != a_path.end()) return b; + + last_b = b; b = parent_map.at(b); - } + } while (last_b != b) return 1; } @@ -77,8 +80,8 @@ namespace kraken { // Tree resolution: take all hit taxa (plus ancestors), then // return leaf of highest weighted leaf-to-root path. uint32_t resolve_tree(const unordered_map &hit_counts, - const unordered_map &parent_map) - { + const unordered_map &parent_map) { + set max_taxa; uint32_t max_taxon = 0, max_score = 0; @@ -93,6 +96,10 @@ namespace kraken { if (it2 != hit_counts.end()) { score += it2->second; } + if (node == parent_map.at(node)) { + cerr << "Taxon " << node << " has itself as parent!" << endl; + break; + } node = parent_map.at(node); } From 7d1ab245ada965bae731f0505d3dfbe46fa9115d Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 23 Oct 2017 10:10:50 -0400 Subject: [PATCH 087/105] Update --- scripts/krakenu-build_db.sh | 6 +++--- src/Makefile | 6 +++--- src/krakenutil.cpp | 30 +++++++++++++++++++++++----- src/set_lcas.cpp | 39 +++++++++++++++++++++++-------------- tests/build-dbs.sh | 16 ++++++++++----- tests/init.sh | 5 +++-- 6 files changed, 69 insertions(+), 33 deletions(-) diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index 96f1aa8..e4707bd 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -255,7 +255,7 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then set -x set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -o database.kdb -i database.idx -v \ -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c database.kdb.counts \ - -F <( cat_library ) > seqid2taxid-plus.map + -F <( cat_library ) -T > seqid2taxid-plus.map set +x if [ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ] || [ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]; then mv seqid2taxid.map seqid2taxid.map.orig @@ -300,8 +300,8 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then ## Make a classification report REPNAME=uid_database if [[ ! 
-s $REPNAME.report.tsv ]]; then - #echo "Creating UID database summary report $REPNAME.report.tsv ..." - #krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv + echo "Creating UID database summary report $REPNAME.report.tsv ..." + krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv fi fi diff --git a/src/Makefile b/src/Makefile index 2145bd7..2603974 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,8 +1,8 @@ CXX = g++ FOPENMP?=-fopenmp -CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O2 -g -Wfatal-errors +CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -g -Wfatal-errors #CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors -PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping +PROGS = classify db_sort set_lcas make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream .PHONY: all install clean @@ -28,7 +28,7 @@ read_uid_mapping: quickfile.o classify: classify.cpp krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o hyperloglogplus.h taxdb.h report-cols.h $(CXX) $(CXXFLAGS) -o classify $^ $(LIBFLAGS) -build_taxdb: taxdb.h report-cols.h +build_taxdb: taxdb.h report-cols.h quickfile.o make_seqid_to_taxid_map: quickfile.o diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index 365238f..d58cf39 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -21,6 +21,7 @@ #include "kraken_headers.hpp" #include "krakenutil.hpp" #include +#include using namespace std; @@ -50,7 +51,26 @@ namespace kraken { // Return lowest common ancestor of a and b // LCA(0,x) = LCA(x,0) = x // Default ancestor is 1 (root of tree) - uint32_t lca(const unordered_map &parent_map, uint32_t a, uint32_t b) { + uint32_t lca(const unordered_map &parent_map, + uint32_t a, uint32_t b) + { + if (a == 0 || b == 0) + return a ? a : b; + + unordered_set a_path; + while (a > 0) { + a_path.insert(a); + a = parent_map.at(a); + } + while (b > 0) { + if (a_path.count(b) > 0) + return b; + b = parent_map.at(b); + } + return 1; + } + + uint32_t lca_vec(const unordered_map &parent_map, uint32_t a, uint32_t b) { if (a == 0 || b == 0) return a ? 
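    // Walk-through of the path-climbing scheme shared by lca() above and
    // lca_vec() here, on a toy parent_map (hypothetical taxids):
    //   parent_map = { 562->561, 561->543, 543->1, 1->0, 1280->543 }
    //   lca(562, 1280): a_path = {562, 561, 543, 1}; climbing from b = 1280
    //   reaches 543, the first node already in a_path, so the LCA is 543.
    // lca(0, x) = x by the zero guard, unrelated taxa fall through to the
    // root taxon 1, and termination relies on the root's parent being 0.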
a : b;
@@ -59,19 +79,19 @@ namespace kraken {
     do {
       if (a == b)
         return a;
-      a_path.insert(a);
+      a_path.push_back(a);
       a = parent_map.at(a);
-    } while (a != a_path.back())
+    } while (a != a_path.back());
 
     // search for b in the path from a to the root
     uint32_t last_b = 0;
     do {
-      if (a_path.find(b) != a_path.end())
+      if (std::find(a_path.begin(), a_path.end(), b) != a_path.end())
         return b;
 
       last_b = b;
       b = parent_map.at(b);
-    } while (last_b != b)
+    } while (last_b != b);
     return 1;
   }
 
diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp
index 3872c4d..2de456a 100644
--- a/src/set_lcas.cpp
+++ b/src/set_lcas.cpp
@@ -39,7 +39,7 @@ void usage(int exit_code=EX_USAGE);
 void process_files();
 void process_single_file();
 void process_file(string filename, uint32_t taxid);
-void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish);
+void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish, bool is_contaminant_taxid = false);
 
 int Num_threads = 1;
 string DB_filename, Index_filename,
@@ -47,8 +47,8 @@ string DB_filename, Index_filename,
   Kmer_count_filename,
   File_to_taxon_map_filename,
   ID_to_taxon_map_filename, Multi_fasta_filename;
-bool force_taxid = false;
-int New_taxid_start = 1000000000;
+bool force_contaminant_taxid = false;
+uint32_t New_taxid_start = 1000000000;
 
 bool Allow_extra_kmers = false;
 bool verbose = false;
@@ -88,7 +88,7 @@ int main(int argc, char **argv) {
 
   parse_command_line(argc, argv);
 
-  if (!TaxDB_filename.empty() && !force_taxid) {
+  if (!TaxDB_filename.empty()) {
     taxdb = TaxonomyDB(TaxDB_filename);
     for (const auto & tax : taxdb.taxIDsAndEntries) {
       if (tax.first != 0)
@@ -211,6 +211,15 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi
   string line, seq_id, name;
   uint32_t taxid;
 
+  if (Add_taxIds_for_Assembly && Add_taxIds_for_Sequences) {
+    for (const auto& k : taxdb.taxIDsAndEntries) {
+      if (k.first >= New_taxid_start) {
+        New_taxid_start = k.first;
+      }
+    }
+    cerr << "Starting new taxonomy IDs with " << (New_taxid_start+1) << endl;
+  }
+
   // Used when adding new taxids for assembly or sequence
   unordered_map name_to_taxid_map;
 
@@ -298,6 +307,8 @@ void process_single_file() {
       continue;
     }
 
+    bool is_contaminant_taxid = taxid == 32630 || taxid == 81077;
+
     if (Add_taxIds_for_Sequences && taxid != 9606) {
       // Update entry based on header line
       auto entryIt = taxdb.taxIDsAndEntries.find(taxid);
@@ -319,7 +330,7 @@ void process_single_file() {
       } else {
         #pragma omp parallel for schedule(dynamic)
         for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN)
-          set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1);
+          set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1, is_contaminant_taxid);
         ++seqs_processed;
       }
     } else {
@@ -378,7 +389,7 @@ void process_sequence(DNASequence dna) {
   // Or maybe assembly_summary file? 
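  // Preview of the contaminant handling in set_lcas below (k-mer values are
  // hypothetical): for sequences whose taxid is one of the contaminant taxa
  // (32630 "synthetic construct", 81077 "artificial sequences"), the -T switch
  // pins the k-mer instead of computing an LCA: with *val_ptr == 562 and
  // taxid == 32630, the stored value becomes 32630, so known contaminant
  // k-mers are never generalized up the tree.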
} -void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { +void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish, bool is_contaminant_taxid) { KmerScanner scanner(seq, start, finish); uint64_t *kmer_ptr; uint32_t *val_ptr; @@ -403,14 +414,12 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { if (Use_uids_instead_of_taxids) { #pragma omp critical(new_uid) *val_ptr = uid_mapping(Taxids_to_UID_map, UID_to_taxids_vec, taxid, *val_ptr, current_uid, UID_map_file); - } else if (!force_taxid && taxid != contaminant_taxids) { - if (Parent_map.find(taxid) != Parent_map.end()) { - *val_ptr = lca(Parent_map, taxid, *val_ptr); - } - } else { - // When force_taxid is set, do not compute lca, but assign the taxid + } else if (force_contaminant_taxid && is_contaminant_taxid) { + // When force_contaminant_taxid is set, do not compute lca, but assign the taxid // of the (last) sequence to k-mers *val_ptr = taxid; + } else { + *val_ptr = lca(Parent_map, taxid, *val_ptr); } } } @@ -454,7 +463,7 @@ void parse_command_line(int argc, char **argv) { #endif break; case 'T' : - force_taxid = true; + force_contaminant_taxid = true; break; case 'v' : verbose = true; @@ -516,8 +525,8 @@ void usage(int exit_code) { << " -f filename File to taxon map" << endl << " -F filename Multi-FASTA file with sequence data" << endl << " -m filename Sequence ID to taxon map" << endl - << " -a Add taxonomy IDs (starting with "< $DIR/data/all-$i.fna [[ -s "$DIR/data/all-$i.map" ]] || find $DIR/data/library/$i -name '*.map' -exec cat {} \; > $DIR/data/all-$i.map DUSTED_F="$DIR/data/all-$i-dusted.fna" From 5ea9f0602e37a44c9b9806405e70468639a84aa8 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 25 Oct 2017 12:51:41 -0400 Subject: [PATCH 088/105] Update --- scripts/krakenu-build_db.sh | 1 - scripts/krakenu-download | 136 +++++++++++++++++++++++++++--------- src/classify.cpp | 4 +- src/readcounts.hpp | 34 +++++---- src/set_lcas.cpp | 27 ++++--- tests/build-dbs.sh | 45 ++++++------ tests/init.sh | 13 ++-- 7 files changed, 173 insertions(+), 87 deletions(-) diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index e4707bd..f8c1450 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -24,7 +24,6 @@ set -u # Protect against uninitialized vars. set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands -set -x function report_time_elapsed() { set -x diff --git a/scripts/krakenu-download b/scripts/krakenu-download index 1d67438..b70a24f 100755 --- a/scripts/krakenu-download +++ b/scripts/krakenu-download @@ -23,6 +23,7 @@ sub download(@); sub print_header_lines(@); sub download_domain(@); sub download_viral_neighbors(@); +sub download_viral_neighbors2(@); my $FTP="ftp://ftp.ncbi.nih.gov"; my @ALL_GENOMES=qw/bacteria viral archaea fungi protozoa invertebrate plant vertebrate_mammalian vertebrate_other/; @@ -68,7 +69,7 @@ ARGUMENT COMMON OPTIONS -o Folder to which the files are downloaded. Default: '.' --db Alternative to -o: Download to /{library,taxonomy}. - -P <# of threads> Number of processes when downloading (uses xargs). Default: '$N_PROC' + --threads <# of threads> Number of processes when downloading (uses xargs). Default: '$N_PROC' --rsync, -R Download using rsync. --overwrite Redownload and overwrite files with the same name. -v Verbose. 
@@ -174,10 +175,13 @@ foreach my $DATABASE (@ARGV) { } } +my %taxid_name_map; + if ($INCLUDE_VIRAL_NEIGHBORS) { if (!$downloaded_viral_refseq) { print STDERR "--include-viral-neighbors only works when RefSeq viral is downloaded in the same session!"; } else { + my $nbr_lib_dir = $add_dir? "$BASE_DIR/library/viral-neighbors" : "$BASE_DIR/viral-neighbors"; my $viral_lib_dir = $add_dir? "$BASE_DIR/library/viral" : "$BASE_DIR/viral"; download_viral_neighbors($viral_lib_dir, $nbr_lib_dir); @@ -261,25 +265,64 @@ sub end_fork() { exit() unless $N_PROC <= 1; } +sub download_viral_neighbors2(@) { + my ($viral_dir, $nbr_dir) = @_; + my $dir = get_dir($BASE_DIR,"taxonomy"); + print STDERR "Reading names file ...\n"; + my $names_file = "$dir/names.dmp"; + open (my $N, "<", $names_file); + while (<$N>) { + my ($taxid, undef, $name, undef, $type) = split /\t|\t/; + next unless $type eq "scientific name"; + $taxid_name_map{$taxid} = $name; + } + close($N); + + print STDERR "Downloading nucl_gb.accession2taxid ...\n"; + my $url = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz"; + download($url, "$dir/nucl_gb.accession2taxid.gz"); + + print STDERR "Sorting mapping file ...\n"; + system("gunzip -c $dir/nucl_gb.accession2taxid.gz | cut -f 2,3 | sort --parallel $N_PROC > $dir/nucl_gb.accession2taxid.sorted") unless -s "$dir/nucl_gb.accession2taxid.sorted"; + + if (!-f "$nbr_dir/all-nbrs.fa"){ + my $FMT="fasta"; + my $TERM="Viruses[Organism]+NOT+cellular+organisms[ORGN]+NOT+wgs[PROP]+NOT+AC_000001:AC_999999[pacc]+NOT+gbdiv+syn[prop]+AND+nuccore+genome+samespecies[Filter]"; + my $ESEARCH_URL="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"; + ## TODO: Go through it 10,000 entries at a time + my $URL_PARAMS=`curl -g "$ESEARCH_URL?db=nuccore&usehistory=y&retmax=1&retmode=json&term=$TERM" | grep -e 'querykey' -e 'webenv' | sed -e 's/^ *"querykey": "/query_key=/' -e 's/^ *"webenv": "/WebEnv=/' -e 's/",//' | paste -sd\\&`; + chomp $URL_PARAMS; + download("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&query_key=1&$URL_PARAMS&rettype=fasta", "$nbr_dir/all-nbrs.fa"); +} + +} + sub download_viral_neighbors(@) { my ($viral_dir, $nbr_dir) = @_; - print STDERR "Reading map files from $viral_dir ... 
\n"; - my %ac_to_taxid; - foreach my $f (glob("$viral_dir/*.map")) { - open (my $F, "<", $f); - while (<$F>) { - chomp; - my ($ac, $taxid, $name) = split(/\t/); - $ac =~ s/\.[0-9]*$//; - $ac_to_taxid{$ac} = [$name, $taxid]; - } - close ($F); + + print STDERR "Reading names file ...\n"; + my $dir = get_dir($BASE_DIR,"taxonomy"); + my $names_file = "$dir/names.dmp"; + open (my $N, "<", $names_file); + while (<$N>) { + next unless /scientific name/; + my ($taxid, $name) = split /\t\|\t/; + $taxid_name_map{$taxid} = $name; } + close($N); + + print STDERR "Downloading nucl_gb.accession2taxid ...\n"; + my $url = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz"; + download($url, "$dir/nucl_gb.accession2taxid.gz"); + + my $sorted_map_f = "$dir/nucl_gb.accession2taxid.sorted"; + print STDERR "Sorting mapping file ...\n"; + system("gunzip -c $dir/nucl_gb.accession2taxid.gz | cut -f 2,3 | sort --parallel $N_PROC > $sorted_map_f") unless -s $sorted_map_f; print STDERR "Downloading viral neighbors into $nbr_dir ...\n"; - my $url = "https://www.ncbi.nlm.nih.gov/genomes/GenomesGroup.cgi?taxid=10239&cmd=download2"; + my $url1 = "https://www.ncbi.nlm.nih.gov/genomes/GenomesGroup.cgi?taxid=10239&cmd=download2"; my $nbr_file = "$nbr_dir/viral_neighbors-taxid10239.nbr"; - download($url, $nbr_file); + download($url1, $nbr_file); open(my $F, "<", $nbr_file); my @file = <$F>; close($F); @@ -292,28 +335,37 @@ sub download_viral_neighbors(@) { ++$i; print STDERR "\r Downloading viral neighbor sequence $i/$n_genomes ..." unless $VERBOSE; # my $pid = $pm->start and next; - + chomp; my ($rep_acs, $nbr_ac, undef, undef, $nname, $sname) = split /\t/; - my ($name, $taxid); - foreach my $rep_ac (split (/,/, $rep_acs)) { - if (defined $ac_to_taxid{$rep_ac}) { - ($name, $taxid) = @{$ac_to_taxid{$rep_ac}}; - last; - } + my $taxid = `look $nbr_ac $sorted_map_f | cut -f 2`; + chomp $taxid; + + if (!defined $taxid || !defined $taxid_name_map{$taxid}) { + my $res = `curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=$nbr_ac&rettype=fasta&retmode=xml" | head -n 12 | egrep '|' | sed -e 's###' -e 's#.*<.*>##' | paste -sd\$'\\t'`; + chomp $res; + ($taxid) = split /\t/, $res; } - if (!defined $taxid) { - print STDERR "\nNo mapping for viral neighbor $nbr_ac [rep: $rep_acs, $nname]!\n"; - next; + + my $name = $taxid_name_map{$taxid}; + if (!defined $taxid || !defined $name) { + print STDERR "\nNo mapping for viral neighbor $nbr_ac [rep: $rep_acs, $nname, $taxid]!\n"; + next; } (my $name1 = $name) =~ s/[^a-zA-Z0-9_]/_/g; $name1 =~ s/__/_/g; my $file = "$nbr_dir/$name1-tax$taxid/$nbr_ac.fna"; + system("mkdir -p $nbr_dir/$name1-tax$taxid"); + if (-s "$nbr_dir/$nbr_ac.fna") { + system("mv -v $nbr_dir/$nbr_ac.fna $nbr_dir/$name1-tax$taxid/$nbr_ac.fna"); + } my $url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&retmode=text&id=$nbr_ac"; - start_fork() and next; - if (download($url,$file)) { - print_header_lines($file, $taxid, "$nname $sname neighbor $nbr_ac"); + if (! -s $file || ! 
-s "$file.map") { + start_fork() and next; + if (download($url,$file)) { + print_header_lines($file, $taxid, "$nname neighbors"); + } + end_fork(); } - end_fork(); } print STDERR "\n"; wait_children(); @@ -330,7 +382,11 @@ sub print_header_lines(@) { while (<$G>) { next unless /^>([^ ]*)/; my $ac = $1; - print $F "$ac\t$taxid\t$name\n"; + if (defined $name) { + print $F "$ac\t$taxid\t$name\n"; + } else { + print $F "$ac\t$taxid\n"; + } } close($G); close($F); @@ -395,7 +451,8 @@ sub download_taxonomy(@) { sub download_domain(@) { my ($domain_dir, $domain, $_assembly_level, $_taxid) = @_; - print STDERR "Downloading assembly summary file for $domain genomes, and filtering to assembly level $_assembly_level and taxid $_taxid.\n"; + print STDERR "Downloading assembly summary file for $domain genomes, and filtering to assembly level $_assembly_level"; + print STDERR (defined $_taxid? "and taxid $_taxid.\n" : ".\n"); die unless defined $domain_dir && defined $domain; if (-d $domain_dir) { print STDERR "WARNING: $domain_dir already exists - potentially overwriting files.\n"; @@ -430,6 +487,9 @@ sub download_domain(@) { close $A2; close $A1; + my $downloaded_files = 0; + my $existing_files = 0; + my $i = 0; foreach my $g (@genomes_to_dl) { my ($ftp_path, $taxid, $organism_name, $infraspecific_name, $assembly_accession) = @$g; @@ -450,15 +510,22 @@ sub download_domain(@) { my $bname1 = "${organism_name1}-tax${taxid}-${bname}"; foreach my $ext (split(/,/, $FNA_FILES)) { - start_fork() and next; my $full_ftp_path = "$ftp_path/${bname}_${ext}.fna.gz"; my $bfname = $bname1."_".$ext; my $fname = $bfname.".fna"; + if (!$OVERWRITE_FILES && -s "$domain_dir/$fname") { + ++$existing_files; + } else { + ++$downloaded_files; + } + if (!$OVERWRITE_FILES && -s "$domain_dir/$fname") { print STDERR "$domain_dir/$fname exists - not downloading.. \n" if $VERBOSE; } else { + start_fork() and next; download($full_ftp_path, "$domain_dir/$fname.gz", "$domain_dir/$fname"); + end_fork(); } if ($CHANGE_HEADER) { @@ -470,18 +537,19 @@ sub download_domain(@) { ## Output sequenceID to taxonomy ID map to STDOUT - print_header_lines("$domain_dir/$fname", $taxid, "$organism_name $assembly_accession"); + print_header_lines("$domain_dir/$fname", $taxid, "$assembly_accession $organism_name"); if ($DO_DUST) { + start_fork() and next; ## TODO: Consider hard-masking only low-complexity stretches with 10 or more bps system("dustmasker -infmt fasta -in '$domain_dir/$fname' -level 20 -outfmt fasta | sed '/^>/! 
s/[^AGCT]/N/g' > '$domain_dir/${bfname}_dustmasked.fna'"); unlink("$domain_dir/$fname"); + end_fork(); } - end_fork(); } } wait_children(); - print STDERR "\n"; + print STDERR " downloaded $downloaded_files files, $existing_files already existed.\n"; } diff --git a/src/classify.cpp b/src/classify.cpp index a6076d3..1652b45 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -480,8 +480,8 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, }; //string hitlist_string; - uint32_t last_taxon; - uint32_t last_counter; + //uint32_t last_taxon; + //uint32_t last_counter; vector db_statuses(KrakenDatabases.size()); diff --git a/src/readcounts.hpp b/src/readcounts.hpp index 486edbd..e878381 100644 --- a/src/readcounts.hpp +++ b/src/readcounts.hpp @@ -7,24 +7,30 @@ namespace kraken { struct ReadCounts { - uint64_t n_reads = 0; - uint64_t n_kmers = 0; + uint64_t n_reads = 0; + uint64_t n_kmers = 0; HyperLogLogPlusMinus kmers; // unique k-mer count per taxon - void add_kmer(uint64_t kmer) { - ++ n_kmers; - kmers.add(kmer); - } - ReadCounts& operator+=(const ReadCounts& b) { - n_reads += b.n_reads; - n_kmers += b.n_kmers; - kmers += b.kmers; - return *this; - } + + ReadCounts() { } + + ReadCounts(size_t precision) : kmers(HyperLogLogPlusMinus(precision)) { + } + + void add_kmer(uint64_t kmer) { + ++ n_kmers; + kmers.add(kmer); + } + + ReadCounts& operator+=(const ReadCounts& b) { + n_reads += b.n_reads; + n_kmers += b.n_kmers; + kmers += b.kmers; + return *this; + } }; - inline uint64_t reads(const ReadCounts& read_count) { - return(read_count.n_reads); + return(read_count.n_reads); } } #endif diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 2de456a..7a5c6e0 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -74,6 +74,7 @@ map< TaxidSet, uint32_t> Taxids_to_UID_map; unordered_map ID_to_taxon_map; unordered_map SeqId_added; KrakenDB Database; +const size_t hll_prec = 10; TaxonomyDB taxdb; const string prefix = "kraken:taxid|"; @@ -211,7 +212,7 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi string line, seq_id, name; uint32_t taxid; - if (Add_taxIds_for_Assembly && Add_taxIds_for_Sequences) { + if (Add_taxIds_for_Assembly || Add_taxIds_for_Sequences) { for (const auto& k : taxdb.taxIDsAndEntries) { if (k.first >= New_taxid_start) { New_taxid_start = k.first; @@ -301,15 +302,15 @@ void process_single_file() { continue; } - if (Parent_map.find(taxid) == Parent_map.end()) { + auto it_p = Parent_map.find(taxid); + if (it_p == Parent_map.end()) { cerr << "Skipping sequence " << dna.id << " since taxonomy ID " << taxid << " is not in taxonomy database!" 
<< endl; ++ seqs_skipped; continue; } bool is_contaminant_taxid = taxid == 32630 || taxid == 81077; - - if (Add_taxIds_for_Sequences && taxid != 9606) { + if (Add_taxIds_for_Sequences && taxid != 9606 && it_p->second != 9606) { // Update entry based on header line auto entryIt = taxdb.taxIDsAndEntries.find(taxid); if (entryIt == taxdb.taxIDsAndEntries.end()) { @@ -414,12 +415,20 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish, bool is_ if (Use_uids_instead_of_taxids) { #pragma omp critical(new_uid) *val_ptr = uid_mapping(Taxids_to_UID_map, UID_to_taxids_vec, taxid, *val_ptr, current_uid, UID_map_file); - } else if (force_contaminant_taxid && is_contaminant_taxid) { - // When force_contaminant_taxid is set, do not compute lca, but assign the taxid - // of the (last) sequence to k-mers - *val_ptr = taxid; } else { - *val_ptr = lca(Parent_map, taxid, *val_ptr); + if (!force_contaminant_taxid) { + *val_ptr = lca(Parent_map, taxid, *val_ptr); + } else { + if (*val_ptr == 32630 || *val_ptr == 81077) { + // keep value + } else if (is_contaminant_taxid) { + // When force_contaminant_taxid is set, do not compute lca, but assign the taxid + // of the (last) sequence to k-mers + *val_ptr = taxid; + } else { + *val_ptr = lca(Parent_map, taxid, *val_ptr); + } + } } } } diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index d1ca09c..bfbd3f8 100755 --- a/tests/build-dbs.sh +++ b/tests/build-dbs.sh @@ -2,14 +2,15 @@ set -eu -[[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 +#[[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 +DIR=`pwd` [[ `uname` == "Darwin" ]] && THREADS=4 || THREADS=10 build_db() { local K=$1; shift + local MIN=$1; shift local NAM=$1; shift - local MIN=15 local DB_NAM=refseq-$NAM-k$K DB_DIR=$DIR/dbs/$DB_NAM @@ -19,7 +20,7 @@ build_db() { for L in $@; do CMD="$CMD --library-dir=$DIR/data/library/$L" done - if [[ ! -f "$DB_DIR/is.busy" ]]; then + #if [[ ! 
-f "$DB_DIR/is.busy" ]]; then echo "EXECUTING $CMD" touch $DB_DIR/is.busy $CMD 2>&1 | tee $DIR/dbs/$DB_NAM/build.log @@ -29,25 +30,27 @@ build_db() { dump_taxdb $DB_DIR/taxDB $DB_DIR/taxonomy/names.dmp $DB_DIR/nodes.dmp fi rm $DB_DIR/is.busy - else - echo "IGNORING $DB_DIR" - fi + #else + # echo "IGNORING $DB_DIR" + #fi } -#export PATH="$DIR/install:$PATH" -for K in 31; do - if [[ `uname` == "Darwin" ]]; then - build_db $K viral viral - build_db $K all-viral viral viral-neighbors - else - build_db $K oct2017 archaea-dusted bacteria-dusted viral-dusted viral-neighbors-dusted \ - vertebrate_mammalian contaminants - - EUKD=$DIR/dbs/refseq-euk-oct2017-k31 - [[ -d $EUKD ]] || mkdir -p $EUKD - [[ -f $EUKD/taxDB ]] || cp -v $DB_DIR/taxDB $EUKD - build_db $K euk-oct2017 fungi protozoa - #build_db $K bacteria bacteria archaea - fi +K=$1; shift; + +for VAR in $@; do + case "$VAR" in + viral) build_db $K 12 viral viral ;; + all-viral) build_db $K 12 all-viral viral viral-neighbors ;; + prok) build_db $K 15 prok archaea-dusted bacteria-dusted ;; + oct2017) build_db $K 15 oct2017 archaea-dusted bacteria-dusted viral-dusted viral-neighbors-dusted \ + vertebrate_mammalian contaminants ;; + euk-oct2017) + EUKD=$DIR/dbs/refseq-euk-oct2017-k31 + [[ -d $EUKD ]] || mkdir -p $EUKD + [[ -f $EUKD/taxDB ]] || cp -v $DB_DIR/taxDB $EUKD + build_db $K euk-oct2017 fungi protozoa ;; + *) echo "Usage: $0 K {viral|all-viral|prok|oct2017|euk-oct2017}" + exit 1 ;; + esac done diff --git a/tests/init.sh b/tests/init.sh index 6ec60a0..d029fb1 100755 --- a/tests/init.sh +++ b/tests/init.sh @@ -8,14 +8,15 @@ set -xeu #$(dirname $0)/../install_kraken.sh --install-jellyfish $DIR/install ## Download taxonomy and genomic data into data/ -#$DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any -$DIR/install/krakenu-download --db $DIR/data -R refseq/fungi refseq/fungi/Chromosome refseq/protozoa refseq/protozoa/Chromosome -#$DIR/install/krakenu-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606 -#$DIR/install/krakenu-download --db $DIR/data -R contaminants +time $DIR/install/krakenu-download --db $DIR/data -R taxonomy refseq/archaea refseq/bacteria +time $DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors refseq/viral/Any +time $DIR/install/krakenu-download --db $DIR/data -R refseq/fungi refseq/fungi/Chromosome refseq/protozoa refseq/protozoa/Chromosome +time $DIR/install/krakenu-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606 +time $DIR/install/krakenu-download --db $DIR/data -R contaminants for i in fungi protozoa viral viral-neighbors archaea bacteria; do - [[ -s "$DIR/data/all-$i.fna" ]] || find $DIR/data/library/$i -name '*.fna' -exec cat {} \; > $DIR/data/all-$i.fna - [[ -s "$DIR/data/all-$i.map" ]] || find $DIR/data/library/$i -name '*.map' -exec cat {} \; > $DIR/data/all-$i.map + [[ -s "$DIR/data/all-$i.fna" ]] || find $DIR/data/library/$i -name '*.fna' -print0 | xargs -0 -n 100 cat > $DIR/data/all-$i.fna + [[ -s "$DIR/data/all-$i.map" ]] || find $DIR/data/library/$i -name '*.map' -print0 | xargs -0 -n 100 cat > $DIR/data/all-$i.map DUSTED_F="$DIR/data/all-$i-dusted.fna" [[ -s $DUSTED_F ]] || dustmasker -infmt fasta -in $DIR/data/all-$i.fna -level 20 -outfmt fasta | sed '/^>/! 
s/[^AGCT]/N/g' > "$DUSTED_F" mkdir -p $DIR/data/library/$i-dusted From 970c4cac37a8beee4cf8061e3a58d03ce7677996 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 25 Oct 2017 17:15:05 -0400 Subject: [PATCH 089/105] Many small changes to make it working for GCC 4.4 / C++0x --- src/Makefile | 2 +- src/build_taxdb.cpp | 6 +- src/classify.cpp | 49 ++-- src/dump_taxdb.cpp | 8 +- src/grade_classification.cpp | 22 +- src/read_uid_mapping.cpp | 4 +- src/readcounts.hpp | 6 +- src/set_lcas.cpp | 27 +- src/taxdb.h | 554 ++++++++++++++++++----------------- src/uid_mapping.cpp | 21 +- src/uid_mapping.hpp | 5 +- 11 files changed, 373 insertions(+), 331 deletions(-) diff --git a/src/Makefile b/src/Makefile index 2603974..e51a28f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,6 @@ CXX = g++ FOPENMP?=-fopenmp -CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -g -Wfatal-errors +CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -g -Wfatal-errors #CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors PROGS = classify db_sort set_lcas make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index 763a8a0..fc81cec 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -36,11 +36,11 @@ int main(int argc, char **argv) { return 1; } - TaxonomyDB taxdb; + TaxonomyDB taxdb; if (argc == 2) { - taxdb = TaxonomyDB ((string)argv[1]); + taxdb = TaxonomyDB ((string)argv[1]); } else { - taxdb = TaxonomyDB ((string)argv[1], (string)argv[2]); + taxdb = TaxonomyDB ((string)argv[1], (string)argv[2]); } if (argc == 4) { ifstream ifs(argv[3]); diff --git a/src/classify.cpp b/src/classify.cpp index 1652b45..f2ac91b 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -77,9 +77,16 @@ ostream *Report_output; vector Open_fstreams; vector Open_gzstreams; size_t Work_unit_size = DEF_WORK_UNIT_SIZE; -TaxonomyDB taxdb; +TaxonomyDB taxdb; static vector KrakenDatabases (DB_filenames.size()); +struct db_status { + db_status() : current_bin_key(0), current_min_pos(1), current_max_pos(0) {} + uint64_t current_bin_key; + int64_t current_min_pos; + int64_t current_max_pos; +}; + uint64_t total_classified = 0; uint64_t total_sequences = 0; uint64_t total_bases = 0; @@ -146,12 +153,8 @@ int main(int argc, char **argv) { if (!TaxDB_file.empty()) { // TODO: Define if the taxDB has read counts or not!! - taxdb = TaxonomyDB(TaxDB_file, false); - for (const auto & tax : taxdb.taxIDsAndEntries) { - if (tax.first != 0) - Parent_map[tax.first] = tax.second.parentTaxonomyID; - } - Parent_map[1] = 0; + taxdb = TaxonomyDB(TaxDB_file, false); + Parent_map = taxdb.getParentMap(); } else { cerr << "TaxDB argument is required!" 
<< endl; return 1; @@ -222,7 +225,8 @@ int main(int argc, char **argv) { std::cerr << "Finishing up ..\n"; if (Print_kraken_report) { - for (auto fname : DB_filenames) { + for (size_t i = 0; i < DB_filenames.size(); ++i) { + const auto& fname = DB_filenames[i]; ifstream ifs(fname + ".counts"); if (ifs.good()) { ifs.close(); @@ -230,9 +234,8 @@ int main(int argc, char **argv) { } } - taxdb.setReadCounts(taxon_counts); - TaxReport rep = TaxReport(*Report_output, taxdb, false); - rep.setReportCols({ + TaxReport rep = TaxReport(*Report_output, taxdb, taxon_counts, false); + rep.setReportCols(vector { "%", "reads", "taxReads", @@ -245,10 +248,13 @@ int main(int argc, char **argv) { rep.printReport("kraken","blu"); } - for (ofstream* ofs : Open_fstreams) { + for (size_t i = 0; i < Open_fstreams.size(); ++i) { + ofstream* ofs = Open_fstreams[i]; ofs->close(); } - for (ogzstream* ogzs : Open_gzstreams) { + + for (size_t i = 0; i < Open_gzstreams.size(); ++i) { + ogzstream* ogzs = Open_gzstreams[i]; ogzs->close(); } @@ -328,8 +334,8 @@ void process_file(char *filename) { #pragma omp critical(write_output) { total_classified += my_total_classified; - for (auto &it : my_taxon_counts) { - taxon_counts[it.first] += it.second; + for (auto it = my_taxon_counts.begin(); it != my_taxon_counts.end(); ++it) { + taxon_counts[it->first] += it->second; } if (Print_kraken) @@ -374,6 +380,7 @@ inline void print_sequence(ostringstream* oss_ptr, const DNASequence& dna) { } } +/* inline void append_hitlist_string(string& hitlist_string, uint32_t& last_taxon, uint32_t& last_counter, uint32_t current_taxon) { if (last_taxon == current_taxon) { @@ -390,6 +397,7 @@ void append_hitlist_string(string& hitlist_string, uint32_t& last_taxon, uint32_ last_taxon = current_taxon; } } +*/ string hitlist_string(const vector &taxa, const vector &ambig) { @@ -428,7 +436,7 @@ string hitlist_string(const vector &taxa, const vector &ambig return hitlist.str(); } - +/* string hitlist_string_depr(const vector &taxa) { uint32_t last_code = taxa[0]; @@ -460,7 +468,7 @@ string hitlist_string_depr(const vector &taxa) } return hitlist.str(); } - +*/ bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, @@ -472,13 +480,6 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, uint32_t taxon = 0; uint32_t hits = 0; // only maintained if in quick mode - - struct db_status { - uint64_t current_bin_key; - int64_t current_min_pos = 1; - int64_t current_max_pos = 0; - }; - //string hitlist_string; //uint32_t last_taxon; //uint32_t last_counter; diff --git a/src/dump_taxdb.cpp b/src/dump_taxdb.cpp index 3e0d442..76246e4 100644 --- a/src/dump_taxdb.cpp +++ b/src/dump_taxdb.cpp @@ -13,16 +13,18 @@ int main(int argc, char **argv) { } cerr << "Reading taxonomy database from " << argv[1] << ", writing nodes dump to " << argv[3] << " and names dump to " << argv[2] << "." << endl; - TaxonomyDB taxdb {(string)argv[1]}; + TaxonomyDB taxdb {(string)argv[1]}; ofstream names_file(argv[2]); names_file.exceptions(ifstream::failbit | ifstream::badbit); ofstream nodes_file(argv[3]); nodes_file.exceptions(ifstream::failbit | ifstream::badbit); - for (const auto &taxon : taxdb.taxIDsAndEntries) { + for (auto it = taxdb.entries.begin(); it != taxdb.entries.end(); ++it) { + const auto &taxon = *it; std::string scientificName; + uint32_t parentTaxonomyID = taxon.second.parent == NULL? 
taxon.first : taxon.second.parent->taxonomyID; nodes_file << taxon.second.taxonomyID - << "\t|\t" << taxon.second.parentTaxonomyID + << "\t|\t" << parentTaxonomyID << "\t|\t" << taxon.second.rank << endl; // there are further columns, but Kraken does not care about them diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp index c4dec80..5ea7922 100644 --- a/src/grade_classification.cpp +++ b/src/grade_classification.cpp @@ -10,10 +10,12 @@ #include #include #include +#include using namespace std; -using TAXID = uint32_t; +//using TAXID = uint32_t; +typedef uint32_t TAXID; unordered_map read_seqid_mapping(string filename) { unordered_map ID_to_taxon_map; @@ -38,7 +40,7 @@ int main(int argc, char **argv) { std::cerr << "Usage: grade_classification taxDB seqid2taxid.map classification_file result_file\n"; return 1; } - TaxonomyDB taxdb = TaxonomyDB(argv[1], false); + TaxonomyDB taxdb = TaxonomyDB(argv[1], false); unordered_map seqid_map = read_seqid_mapping(argv[2]); ofstream out_file(argv[4]); @@ -124,7 +126,8 @@ int main(int argc, char **argv) { TaxRank::RANK lca_rank = TaxRank::toRank(lca_rank_string); TaxRank::RANK identified_rank = TaxRank::toRank(taxdb.getRank(identified_taxid)); - for (TaxRank::RANK rank : ranks_of_interest) { + for (size_t i=0; i < ranks_of_interest.size(); ++i) { + TaxRank::RANK rank = ranks_of_interest[i]; TAXID simulated_taxid_at_rank = taxdb.getTaxIDAtRank(seq_taxid, TaxRank::toString(rank)); TAXID identified_taxid_at_rank = taxdb.getTaxIDAtRank(identified_taxid, TaxRank::toString(rank)); simulated_taxids_at_rank[rank].insert(simulated_taxid_at_rank); @@ -158,14 +161,15 @@ int main(int argc, char **argv) { if (0) { cout << "#LCA_RANK_READ_COUNTS" << endl; - for (const auto & kv : rank_counts) { - cout << kv.first << delim << kv.second << endl; + for (auto it = rank_counts.begin(); it != rank_counts.end(); ++it) { + cout << it->first << delim << it->second << endl; } cout << endl; } cout << "#rank" << delim << "total_reads" << delim << "correct"<< delim << "incorrect"<< delim << "sensitivity" << delim << "precision" << delim << "higher_rank" << delim << "unidentified" << endl; - for (TaxRank::RANK rank : ranks_of_interest) { + for (size_t i=0; i < ranks_of_interest.size(); ++i) { + TaxRank::RANK rank = ranks_of_interest[i]; size_t true_positives = correct_reads_at_rank.at(rank); size_t false_positives = incorrect_reads_at_rank.at(rank); double sensitivity = 100.0*(double)true_positives/(double)total_reads; @@ -182,12 +186,14 @@ int main(int argc, char **argv) { } cout << "#rank" << delim << "true_count" << delim << "correct" << delim << "incorrect" << delim << "recall" << delim << "precision" << endl; - for (TaxRank::RANK rank : ranks_of_interest) { + for (size_t i=0; i < ranks_of_interest.size(); ++i) { + TaxRank::RANK rank = ranks_of_interest[i]; size_t true_positives = 0; size_t false_positives = 0; if (identified_taxids_at_rank.find(rank) != identified_taxids_at_rank.end()) { - for (const auto & tid : identified_taxids_at_rank[rank]) { + for (auto it = identified_taxids_at_rank[rank].begin(); it != identified_taxids_at_rank[rank].end(); ++it) { + const auto & tid = *it; if (simulated_taxids_at_rank[rank].count(tid) == 1) { ++true_positives; } else { diff --git a/src/read_uid_mapping.cpp b/src/read_uid_mapping.cpp index 6c40d65..8f83742 100644 --- a/src/read_uid_mapping.cpp +++ b/src/read_uid_mapping.cpp @@ -38,8 +38,8 @@ int main(int argc, char **argv) { uint32_t UID = atol(argv[i]); vector taxids = get_taxids_for_uid(UID, fptr); cout << UID 
<< '\t'; - for (auto t : taxids) { - cout << t << ' '; + for (auto it = taxids.begin(); it != taxids.end(); ++it) { + cout << *it << ' '; } cout << endl; } diff --git a/src/readcounts.hpp b/src/readcounts.hpp index e878381..eddca78 100644 --- a/src/readcounts.hpp +++ b/src/readcounts.hpp @@ -7,11 +7,11 @@ namespace kraken { struct ReadCounts { - uint64_t n_reads = 0; - uint64_t n_kmers = 0; + uint64_t n_reads; + uint64_t n_kmers; HyperLogLogPlusMinus kmers; // unique k-mer count per taxon - ReadCounts() { } + ReadCounts() : n_reads(0), n_kmers(0) { } ReadCounts(size_t precision) : kmers(HyperLogLogPlusMinus(precision)) { } diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 7a5c6e0..1b32721 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -75,7 +75,7 @@ unordered_map ID_to_taxon_map; unordered_map SeqId_added; KrakenDB Database; const size_t hll_prec = 10; -TaxonomyDB taxdb; +TaxonomyDB taxdb; const string prefix = "kraken:taxid|"; unordered_set host_taxids = {9606}; @@ -90,13 +90,8 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); if (!TaxDB_filename.empty()) { - taxdb = TaxonomyDB(TaxDB_filename); - for (const auto & tax : taxdb.taxIDsAndEntries) { - if (tax.first != 0) - Parent_map[tax.first] = tax.second.parentTaxonomyID; -// Children_map[tax.second.parentTaxonomyID].insert(tax.first); - } - Parent_map[1] = 0; + taxdb = TaxonomyDB(TaxDB_filename); + Parent_map = taxdb.getParentMap(); } else { cerr << "TaxDB argument is required!" << endl; return 1; @@ -145,8 +140,8 @@ int main(int argc, char **argv) { ofstream ofs(Kmer_count_filename.c_str()); cerr << "Writing kmer counts to " << Kmer_count_filename << "..." << endl; auto counts = Database.count_taxons(); - for (auto const & kv : counts) { - ofs << kv.first << '\t' << kv.second << '\n'; + for (auto it = counts.begin(); it != counts.end(); ++it) { + ofs << it->first << '\t' << it->second << '\n'; } ofs.close(); } @@ -199,7 +194,7 @@ uint32_t get_new_taxid( } unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_filename, - TaxonomyDB& taxdb, unordered_map& Parent_map, + TaxonomyDB& taxdb, unordered_map& Parent_map, bool Add_taxIds_for_Assembly, bool Add_taxIds_for_Sequences) { cerr << "Reading sequence ID to taxonomy ID mapping ... "; @@ -213,9 +208,9 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi uint32_t taxid; if (Add_taxIds_for_Assembly || Add_taxIds_for_Sequences) { - for (const auto& k : taxdb.taxIDsAndEntries) { - if (k.first >= New_taxid_start) { - New_taxid_start = k.first; + for (auto it = taxdb.entries.begin(); it != taxdb.entries.end(); ++it) { + if (it->first >= New_taxid_start) { + New_taxid_start = it->first+100; } } cerr << "Starting new taxonomy IDs with " << (New_taxid_start+1) << endl; @@ -312,8 +307,8 @@ void process_single_file() { bool is_contaminant_taxid = taxid == 32630 || taxid == 81077; if (Add_taxIds_for_Sequences && taxid != 9606 && it_p->second != 9606) { // Update entry based on header line - auto entryIt = taxdb.taxIDsAndEntries.find(taxid); - if (entryIt == taxdb.taxIDsAndEntries.end()) { + auto entryIt = taxdb.entries.find(taxid); + if (entryIt == taxdb.entries.end()) { cerr << "Error! Didn't find taxid " << taxid << " in TaxonomyDB - can't update it!! 
["<second.scientificName = dna.header_line; diff --git a/src/taxdb.h b/src/taxdb.h index b522866..608bd33 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -30,9 +30,25 @@ #include #include #include +#include +#include #include "report-cols.h" +//#include "readcounts.hpp" + using namespace std; +//using kraken::ReadCounts; + +namespace patch +{ + template < typename T > std::string to_string( const T& n ) + { + std::ostringstream stm ; + stm << n ; + return stm.str() ; + } +} + void log_msg (const std::string& s); @@ -113,7 +129,7 @@ struct TaxRank { case RANK::superkingdom: return "superkingdom"; case RANK::root: return "root"; default: - log_msg("Invalid rank!"); + log_msg("Invalid rank!\n"); } return "NA"; } @@ -153,37 +169,32 @@ const unordered_map TaxRank::string_to_rank = { }; -template +template class TaxonomyEntry { public: - TAXID taxonomyID = 0; - TAXID parentTaxonomyID = 0; + TAXID taxonomyID; + TaxonomyEntry* parent; + std::vector children; + string rank; std::string scientificName; + uint64_t genomeSize; + uint64_t genomeSizeOfChildren; - TaxonomyEntry() {} + TaxonomyEntry() : taxonomyID(0), parent(NULL), genomeSize(0), genomeSizeOfChildren(0) {} - TaxonomyEntry(TAXID taxonomyID_, std::string scientificName_) : - taxonomyID(taxonomyID_), scientificName(scientificName_) {} + TaxonomyEntry(TAXID taxonomyID_, TaxonomyEntry* parent_, std::string rank_, std::string scientificName_, uint64_t genomeSize_ = 0, uint64_t genomeSizeOfChildren_ = 0) : + taxonomyID(taxonomyID_), parent(parent_), rank(rank_), scientificName(scientificName_), + genomeSize(genomeSize_), genomeSizeOfChildren(genomeSizeOfChildren_) { + + if (parent_ != NULL) { + parent->children.push_back(this); + } - TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_) : - taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_) {} - - TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_, uint64_t genomeSize_ = 0, uint64_t genomeSizeOfChildren_ = 0) : - taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_), scientificName(scientificName_), - genomeSize(genomeSize_), genomeSizeOfChildren(genomeSizeOfChildren_) {} + } inline bool operator==(const TaxonomyEntry& other) const; - TaxonomyEntry* parent = nullptr; - std::vector children; - - READCOUNTS readCounts = READCOUNTS(); - READCOUNTS readCountsOfChildren = READCOUNTS(); - bool used = false; - uint64_t genomeSize = 0; - uint64_t genomeSizeOfChildren = 0; - uint64_t numBelow = 0; }; //template<> @@ -192,13 +203,15 @@ class TaxonomyEntry { // readCountsOfChildren = 0; //} -template +/* +template struct TaxonomyEntryPtr_comp { - bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const; + bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const; }; +*/ -template +template class TaxonomyDB { public: TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName); @@ -221,7 +234,7 @@ class TaxonomyDB { std::unordered_map getScientificNameMap() const; std::string getLineage(TAXID taxonomyID) const; std::string getMetaPhlAnLineage(TAXID taxonomyID) const; - TaxonomyEntry getEntry(TAXID taxID) const; + TaxonomyEntry getEntry(TAXID taxID) const; bool insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_); bool hasTaxon(TAXID taxonomyID_); @@ -232,23 +245,24 @@ class TaxonomyDB { int isBelowInTree(TAXID upper, TAXID lower) const; void setGenomeSizes(const std::unordered_map & 
genomeSizes); - void setReadCounts(const std::unordered_map& readCounts); void readGenomeSizes(string file); void setGenomeSize(const TAXID taxid, const uint64_t genomeSize); - void addReadCount(const TAXID taxid, const READCOUNTS& readCounts_); void printReport(); - std::unordered_map > taxIDsAndEntries; - bool genomeSizes_are_set = false; + std::unordered_map > entries; + bool genomeSizes_are_set; private: - std::unordered_map > + std::unordered_map > readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes); void parseNamesDump(const std::string namesDumpFileName); - void parseNodesDump(const std::string nodesDumpFileName); - void createPointers(std::unordered_map >& taxIDsAndEntries); + std::unordered_map parseNodesDump(const std::string nodesDumpFileName); + void createPointers( + std::unordered_map >& entries, + const std::unordered_map& parentMap + ); }; @@ -256,15 +270,17 @@ template class TaxReport { private: std::ostream& _reportOfb; - TaxonomyDB & _taxdb; + TaxonomyDB & _taxdb; + std::unordered_map _readCounts; + std::unordered_map _readCountsIncludingChildren; uint64_t _total_n_reads; bool _show_zeros; - void printLine(TaxonomyEntry& tax, unsigned depth); + void printLine(TaxonomyEntry& tax, unsigned depth); public: - TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); + TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, std::unordered_map, bool _show_zeros); void printReport(std::string format, std::string rank); - void printReport(TaxonomyEntry& tax, unsigned depth); + void printReport(TaxonomyEntry& tax, unsigned depth); void setReportCols(std::vector names); std::vector _report_col_names; @@ -277,7 +293,7 @@ V find_or_use_default(const std::unordered_map& my_map, const K& query, co //////////////////////////// DEFINITIONS void log_msg (const std::string& s) { - std::cerr << s << "\n"; + std::cerr << s; } template @@ -311,8 +327,10 @@ std::vector in_betweens(const std::string &s, const char start_char next_start = s.find(start_char, next_end + 1), ++i) { next_end = s.find(end_char, next_start + 1); - if (next_end == string::npos) - throw std::runtime_error("unmatched start and end!"); + if (next_end == string::npos) { + cerr << "unmatched start and end!"; + exit(1); + } tokens.push_back(s.substr(next_start+1, next_end-1)); } @@ -368,14 +386,18 @@ std::vector get_fields(const std::string &s, const std::string& del // readCounts = 0; // readCountsOfChildren = 0; //} -template -bool TaxonomyEntryPtr_comp::operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { - return ((reads(a->readCounts)+reads(a->readCountsOfChildren)) > (reads(b->readCounts)+reads(b->readCountsOfChildren))); - } +/* +template +bool TaxonomyEntryPtr_comp::operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { -template -TAXID TaxonomyDB::getByScientificName(string name) const { - for (const auto & tax : taxIDsAndEntries) { + return ( + (reads(a->readCounts)+reads(a->readCountsOfChildren)) > (reads(b->readCounts)+reads(b->readCountsOfChildren))); + } +*/ +/* +template +TAXID TaxonomyDB::getByScientificName(string name) const { + for (const auto & tax : entries) { if (tax.second.scientificName == name) { return tax.first; } @@ -383,96 +405,112 @@ TAXID TaxonomyDB::getByScientificName(string name) const { return 0; } -template -std::unordered_map TaxonomyDB::getScientificNameMap() const { +template +std::unordered_map TaxonomyDB::getScientificNameMap() const { std::unordered_map scientificNameMap; - for (const auto & tax : taxIDsAndEntries) { + 
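+    // One linear pass over all taxonomy entries inverts the taxid->name
+    // relation; callers should cache the returned map rather than rebuilding it.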
for (const auto & tax : entries) { scientificNameMap[tax.second.scientificName] = tax.first; } return scientificNameMap; } +*/ -template -unordered_map TaxonomyDB::getParentMap() const { +template +unordered_map TaxonomyDB::getParentMap() const { unordered_map Parent_map; - for (const auto & tax : taxIDsAndEntries) { - if (tax.first != 0) - Parent_map[tax.first] = tax.second.parentTaxonomyID; + //for (const auto & tax : entries) { + for (auto it = entries.begin(); it != entries.end(); ++it) { + const auto&tax = *it; + if (tax.first != 0) + continue; + if (tax.second.parent == NULL) + Parent_map[tax.first] = 0; // for kraken::lca + else + Parent_map[tax.first] = tax.second.parent->taxonomyID; } - Parent_map[1] = 1; return Parent_map; } -template -TaxonomyEntry TaxonomyDB::getEntry(TAXID taxID) const { - auto it = taxIDsAndEntries.find(taxID); - if (it == taxIDsAndEntries.end()) { - TaxonomyEntry ti { 0, 0, "NA"}; +template +TaxonomyEntry TaxonomyDB::getEntry(TAXID taxID) const { + auto it = entries.find(taxID); + if (it == entries.end()) { + TaxonomyEntry ti { 0, 0, "NA"}; return ti; } else { return it->second; } } -template -void TaxonomyDB::createPointers(std::unordered_map >& taxIDsAndEntries) { - for (auto& tax : taxIDsAndEntries) { - if (tax.second.parentTaxonomyID != tax.first) { - auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); - if (parentIt != taxIDsAndEntries.end()) { - tax.second.parent = &(parentIt->second); - parentIt->second.children.push_back(&tax.second); - } - } +template +void TaxonomyDB::createPointers( + std::unordered_map >& entries, + const std::unordered_map& parentMap) { + for (auto it = entries.begin(); it != entries.end(); ++it) { + TAXID taxonomyID = it->first; + TAXID parentTaxonomyID = parentMap.at(taxonomyID); + if (taxonomyID != parentTaxonomyID) { + auto parent_ptr = entries.find(parentTaxonomyID); + if (parent_ptr != entries.end()) { + it->second.parent = &parent_ptr->second; + parent_ptr->second.children.push_back(&it->second); + } else { + cerr << "Could not find parent with taxonomy ID " << parentTaxonomyID << " for taxonomy ID " << taxonomyID << endl; + } + } } } -template -TaxonomyDB::TaxonomyDB() { } +template +TaxonomyDB::TaxonomyDB() : genomeSizes_are_set(false) { } -template -TaxonomyDB::TaxonomyDB(const std::string inFileName, bool hasGenomeSizes) : - taxIDsAndEntries( readTaxonomyIndex_(inFileName, hasGenomeSizes) ), genomeSizes_are_set(hasGenomeSizes) +template +TaxonomyDB::TaxonomyDB(const std::string inFileName, bool hasGenomeSizes) : + entries( readTaxonomyIndex_(inFileName, hasGenomeSizes) ), genomeSizes_are_set(hasGenomeSizes) { } -template -TaxonomyDB::TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName) { +template +TaxonomyDB::TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName) { log_msg("Building taxonomy index from " + nodesDumpFileName + " and " + namesDumpFileName); - parseNodesDump(nodesDumpFileName); + unordered_map parentMap = parseNodesDump(nodesDumpFileName); parseNamesDump(namesDumpFileName); - createPointers(taxIDsAndEntries); - log_msg("Built a tree with " + std::to_string(taxIDsAndEntries.size()) + " taxa"); + createPointers(entries, parentMap); + log_msg(". 
Done, got " + patch::to_string(entries.size()) + " taxa\n"); } -template -void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { +template +std::unordered_map TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { std::ifstream nodesDumpFile(nodesDumpFileName); if (!nodesDumpFile.is_open()) throw std::runtime_error("unable to open nodes file"); + std::string line; TAXID taxonomyID; TAXID parentTaxonomyID; std::string rank; char delim; + std::unordered_map parentMap; while (nodesDumpFile >> taxonomyID >> delim >> parentTaxonomyID) { nodesDumpFile.ignore(3); getline(nodesDumpFile, rank, '\t'); - auto entryIt = taxIDsAndEntries.find(taxonomyID); - if (entryIt == taxIDsAndEntries.end()) { - taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); + auto entryIt = entries.find(taxonomyID); + if (entryIt == entries.end()) { + entries[taxonomyID] = TaxonomyEntry(taxonomyID, NULL, rank, ""); + parentMap[taxonomyID] = parentTaxonomyID; } else { - entryIt->second.parentTaxonomyID = parentTaxonomyID; + parentMap[taxonomyID] = parentTaxonomyID; entryIt->second.rank = rank; } nodesDumpFile.ignore(2560, '\n'); } + return parentMap; } -template -void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { +template +void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { std::ifstream namesDumpFile(namesDumpFileName); if (!namesDumpFile.is_open()) throw std::runtime_error("unable to open names file"); @@ -490,9 +528,9 @@ void TaxonomyDB::parseNamesDump(const std::string namesDumpFil getline(namesDumpFile, type, '\t'); if (type == "scientific name") { - auto entryIt = taxIDsAndEntries.find(taxonomyID); - if (entryIt == taxIDsAndEntries.end()) { - taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, scientificName); + auto entryIt = entries.find(taxonomyID); + if (entryIt == entries.end()) { + entries[taxonomyID] = TaxonomyEntry(taxonomyID, NULL, "", scientificName); } else { entryIt->second.scientificName = scientificName; } @@ -502,21 +540,24 @@ void TaxonomyDB::parseNamesDump(const std::string namesDumpFil } template -std::vector getSortedKeys(const std::unordered_map& unordered) { +std::vector getSortedKeys(const std::unordered_map& my_unordered_map) { std::vector keys; - keys.reserve (unordered.size()); - for (auto& it : unordered) { - keys.push_back(it.first); + keys.reserve (my_unordered_map.size()); + for (auto it = my_unordered_map.begin(); it != my_unordered_map.end(); ++it) { + keys.push_back(it->first); } std::sort (keys.begin(), keys.end()); return keys; } -template -void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { - for (TAXID& key : getSortedKeys(taxIDsAndEntries)) { - const auto& entry = taxIDsAndEntries.at(key); - outs << key << '\t' << entry.parentTaxonomyID << '\t' +template +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { + std::vector sorted_keys = getSortedKeys(entries); + for (size_t i = 0; i < sorted_keys.size(); ++i) { + TAXID taxonomyID = sorted_keys[i]; + const auto& entry = entries.at(taxonomyID); + TAXID parentTaxonomyID = (entry.parent==NULL? 
taxonomyID : entry.parent->taxonomyID); + outs << taxonomyID << '\t' << parentTaxonomyID << '\t' << entry.scientificName << '\t' << entry.rank; if (genomeSizes_are_set) { outs << '\t' << entry.genomeSize << '\t' << entry.genomeSizeOfChildren; @@ -526,29 +567,30 @@ void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const outs.flush(); } -template -void TaxonomyDB::setGenomeSizes(const std::unordered_map & genomeSizes) { - for (const auto& it : genomeSizes) { - setGenomeSize(it.first, it.second); +template +void TaxonomyDB::setGenomeSizes(const std::unordered_map & genomeSizes) { + for (auto it = genomeSizes.begin(); it != genomeSizes.end(); ++it) { + setGenomeSize(it->first, it->second); } genomeSizes_are_set = true; } -template -void TaxonomyDB::readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes) { - taxIDsAndEntries = readTaxonomyIndex_(inFileName, hasGenomeSizes); +template +void TaxonomyDB::readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes) { + entries = readTaxonomyIndex_(inFileName, hasGenomeSizes); genomeSizes_are_set = hasGenomeSizes; } -template -std::unordered_map > - TaxonomyDB::readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes) { +template +std::unordered_map > + TaxonomyDB::readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes) { log_msg("Reading taxonomy index from " + inFileName); std::ifstream inFile(inFileName); if (!inFile.is_open()) throw std::runtime_error("unable to open taxonomy index file " + inFileName); - std::unordered_map > taxIDsAndEntries; + std::unordered_map > entries; + std::unordered_map parentMap; TAXID taxonomyID, parentTaxonomyID; std::string scientificName, rank; uint64_t genomeSize, genomeSizeOfChildren = 0; @@ -557,7 +599,7 @@ std::unordered_map > while (!inFile.eof()) { inFile >> taxonomyID >> parentTaxonomyID; if (taxonomyID > 1 && taxonomyID == parentTaxonomyID) { - cerr << "ERROR: the parent of " << taxonomyID << " is itself. Should not happend!\n"; + cerr << "ERROR: the parent of " << taxonomyID << " is itself. Should not happend for taxa other than the root.\n"; exit(1); } inFile.get(); // read tab @@ -568,22 +610,20 @@ std::unordered_map > } else { std::getline(inFile, rank, '\n'); } - TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName, genomeSize, genomeSizeOfChildren); + TaxonomyEntry newEntry(taxonomyID, NULL, rank, scientificName, genomeSize, genomeSizeOfChildren); - //cerr << "inserting " << taxonomyID << ";" << parentTaxonomyID << ";" << rank << ";" << scientificName << endl; - taxIDsAndEntries.insert({ - taxonomyID, newEntry - }); + auto insert_res = entries.insert({ taxonomyID, newEntry }); + parentMap[taxonomyID] = parentTaxonomyID; } - taxIDsAndEntries.insert({0, {0, 0, "no rank", "unclassified" }}); - //taxIDsAndEntries.insert({-1, {-1, 0, "no rank", "uncategorized" }}); - createPointers(taxIDsAndEntries); - log_msg("done reading TaxDB, read " + std::to_string(taxIDsAndEntries.size()) + " taxa"); - return(taxIDsAndEntries); + entries.insert({0, {0, NULL, "no rank", "unclassified" }}); + //entries.insert({-1, {-1, 0, "no rank", "uncategorized" }}); + createPointers(entries, parentMap); + log_msg(". 
Done, read " + patch::to_string(entries.size()) + " taxa.\n"); + return(entries); } -template -string TaxonomyDB::getNextProperRank(TAXID a) const { +template +string TaxonomyDB::getNextProperRank(TAXID a) const { if (a == 0) { return "NA"; } @@ -596,8 +636,8 @@ string TaxonomyDB::getNextProperRank(TAXID a) const { return getRank(a); } -template -TAXID TaxonomyDB::getTaxIDAtNextProperRank(TAXID a) const { +template +TAXID TaxonomyDB::getTaxIDAtNextProperRank(TAXID a) const { if (a == 0 || a == 1) { return 0; } @@ -607,8 +647,8 @@ TAXID TaxonomyDB::getTaxIDAtNextProperRank(TAXID a) const { return a; } -template -pair TaxonomyDB::getLowestCommonAncestor(TAXID a, TAXID b) const { +template +pair TaxonomyDB::getLowestCommonAncestor(TAXID a, TAXID b) const { if (a == 0 || b == 0) { return a ? pair(a,-1) : pair(b,-1); } @@ -637,10 +677,10 @@ pair TaxonomyDB::getLowestCommonAncestor(TAXID a, T return pair(1, distA+distB); } +/* - -template -TAXID TaxonomyDB::getLowestCommonAncestor( +template +TAXID TaxonomyDB::getLowestCommonAncestor( const std::vector& taxIDs) const { if (taxIDs.size() == 0) { return 0; @@ -680,30 +720,30 @@ TAXID TaxonomyDB::getLowestCommonAncestor( } return consensus; } +*/ - -template -bool TaxonomyDB::hasTaxon(TAXID taxonomyID_) { - return taxIDsAndEntries.find(taxonomyID_) != taxIDsAndEntries.end(); +template +bool TaxonomyDB::hasTaxon(TAXID taxonomyID_) { + return entries.find(taxonomyID_) != entries.end(); } -template -bool TaxonomyDB::insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, +template +bool TaxonomyDB::insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_) { if (parentTaxonomyID_ == taxonomyID_) { return false; } - auto parentIt = taxIDsAndEntries.find(parentTaxonomyID_); - if (parentIt == taxIDsAndEntries.end()) { + auto parentIt = entries.find(parentTaxonomyID_); + if (parentIt == entries.end()) { cerr << "ERROR with taxon [" << taxonomyID_ <<";"< newEntry(taxonomyID_, parentTaxonomyID_, rank_, scientificName_, 0, 0); + TaxonomyEntry newEntry(taxonomyID_, &parentIt->second, rank_, scientificName_, 0, 0); newEntry.parent = &(parentIt->second); - auto insert_res = taxIDsAndEntries.insert({taxonomyID_, newEntry}); + auto insert_res = entries.insert({taxonomyID_, newEntry}); if (insert_res.second) { parentIt->second.children.push_back(&insert_res.first->second); } @@ -711,35 +751,35 @@ bool TaxonomyDB::insert(TAXID taxonomyID_, TAXID parentTaxono } -template -TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { - auto entry = taxIDsAndEntries.find(taxID); - if (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) - return entry->second.parentTaxonomyID; +template +TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { + auto entry = entries.find(taxID); + if (entry != entries.end() && entry->second.parent != NULL) + return entry->second.parent->taxonomyID; else return 0; } -template -std::string TaxonomyDB::getScientificName(const TAXID taxID) const { - auto entry = taxIDsAndEntries.find(taxID); - if (entry != taxIDsAndEntries.end()) { +template +std::string TaxonomyDB::getScientificName(const TAXID taxID) const { + auto entry = entries.find(taxID); + if (entry != entries.end()) { return entry->second.scientificName; } else return std::string(); } -template -std::string TaxonomyDB::getRank(const TAXID taxID) const { - auto entry = taxIDsAndEntries.find(taxID); - if (entry != taxIDsAndEntries.end()) { +template +std::string TaxonomyDB::getRank(const TAXID taxID) const { + auto entry = 
entries.find(taxID); + if (entry != entries.end()) { return entry->second.rank; } else return std::string(); } -template -std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { +template +std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { std::string lineage; while (true) { // 131567 = Cellular organisms @@ -757,8 +797,8 @@ std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { return lineage; } -template -std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { +template +std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { std::string rank = getRank(taxonomyID); if (rank == "superphylum") return std::string(); std::string lineage; @@ -798,102 +838,51 @@ std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) return lineage; } -template -TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, +template +TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, const std::string& rank) const { if (taxID == 0 || taxID == 1) return 0; - auto entry = taxIDsAndEntries.find(taxID); + auto entry_it = entries.find(taxID); // cerr << "getTaxIDAtRank(" << taxID << "," << rank << ")" << endl; - while (entry != taxIDsAndEntries.end() - && entry->second.parentTaxonomyID != 1 - && entry->second.parentTaxonomyID != entry->first) { + if (entry_it != entries.end()) { + const TaxonomyEntry* entry_ptr = &entry_it->second; + while (entry_ptr != NULL + && entry_ptr->parent != NULL) { // cerr << "Checking rank of " << entry->second.taxonomyID << ": " << entry->second.rank << endl; - if (entry->second.rank == rank) { - return entry->second.taxonomyID; + if (entry_ptr->rank == rank) { + return entry_ptr->taxonomyID; } else { - entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + entry_ptr = entry_ptr->parent; } } + } return 0; } -template -int TaxonomyDB::isBelowInTree(TAXID upper, TAXID lower) const { - auto entry = taxIDsAndEntries.find(lower); - unsigned level = 0; - while (entry != taxIDsAndEntries.end() && - entry->second.parentTaxonomyID != 1) { - if (entry->first == upper) { - return level; - } else { - entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); - level++; - } - } - return -1; -} -template -bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { - bool isSubSpecies = false; - auto entry = taxIDsAndEntries.find(taxonomyID); - int numLevels = 0; - while (entry != taxIDsAndEntries.end() && - entry->second.parentTaxonomyID != 1) { - if (entry->second.rank == "species") { - if (numLevels > 0) { - isSubSpecies = true; - } - break; - } else - entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); - numLevels++; - } - return isSubSpecies; -} - -template -void TaxonomyDB::addReadCount(const TAXID taxid, const READCOUNTS& readCounts_) { - auto it = taxIDsAndEntries.find(taxid); - if (it == taxIDsAndEntries.end()) { +template +void TaxonomyDB::setGenomeSize(const TAXID taxid, const uint64_t genomeSize) { + auto it = entries.find(taxid); + if (it == entries.end()) { cerr << "No taxonomy entry for " << taxid << "!!" 
<< endl;
     return;
   }
-  TaxonomyEntry* tax = &it->second;
+  TaxonomyEntry* tax = &it->second;
   tax->genomeSize += genomeSize;
 
-  while (tax->parent != nullptr) {
+  while (tax->parent != NULL) {
     tax = tax->parent;
     //std::cerr << "setting genomeSizeOfChildren of parent" << std::endl;
     tax->genomeSizeOfChildren += genomeSize;
   }
 }
 
-template
-void TaxonomyDB::readGenomeSizes(string file) {
-  for (auto& entry : taxIDsAndEntries) {
-    entry.second.genomeSize = 0;
-    entry.second.genomeSizeOfChildren = 0;
+template
+void TaxonomyDB::readGenomeSizes(string file) {
+  for (auto entry_it = entries.begin(); entry_it != entries.end(); ++entry_it) {
+    entry_it->second.genomeSize = 0;
+    entry_it->second.genomeSizeOfChildren = 0;
   }
   log_msg("Reading genome sizes from " + file);
   std::ifstream inFile(file);
@@ -907,21 +896,33 @@ void TaxonomyDB::readGenomeSizes(string file) {
   }
 }
 
+/*
 template
-void TaxonomyDB::setReadCounts(const unordered_map& readCounts) {
+void TaxonomyDB::setReadCounts(const unordered_map& readCounts) {
   for (auto& elem : readCounts) {
     addReadCount(elem.first, elem.second);
   }
 
-  for (auto& tax : taxIDsAndEntries) {
-    std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp());
+  for (auto& tax : entries) {
+    std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp());
   }
 }
+*/
 
 template
- TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb,
-   bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) {
+ TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb,
+   std::unordered_map readCounts,
+   bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _readCounts(readCounts), _show_zeros(show_zeros) {
+
+  for (auto it = _readCounts.begin(); it != _readCounts.end(); ++it) {
+    TaxonomyEntry* tax = &taxdb.entries.at(it->first);
+    while (tax != NULL) {
+      _readCountsIncludingChildren[tax->taxonomyID] += it->second;
+      tax = tax->parent;
+    }
+  }
+
   _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_KMERS_CLADE, REPORTCOLS::NUM_UNIQUE_KMERS_CLADE, REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID,
@@ -932,7 +933,8 @@ template
 template
 void TaxReport::setReportCols(std::vector names) {
   _report_cols.clear();
-  for (auto& s : names) {
+  for (size_t i = 0; i< names.size(); ++i) {
+    auto& s = names[i];
     auto it = report_col_name_map.find(s);
     if (it == report_col_name_map.end()) {
       throw std::runtime_error(s + " is not a valid report column name");
@@ -945,11 +947,7 @@ void TaxReport::setReportCols(std::vector names)
 
 template
 void TaxReport::printReport(std::string format, std::string rank) {
-  _total_n_reads =
-    reads(_taxdb.taxIDsAndEntries.at(0).readCounts) +
-    reads(_taxdb.taxIDsAndEntries.at(0).readCountsOfChildren) +
-    reads(_taxdb.taxIDsAndEntries.at(1).readCounts) +
-    reads(_taxdb.taxIDsAndEntries.at(1).readCountsOfChildren);// +
+  _total_n_reads = 
reads(_readCountsIncludingChildren[0]) + reads(_readCountsIncludingChildren[1]);
   if (_total_n_reads == 0) {
     std::cerr << "total number of reads is zero - not creating a report!" << endl;
     return;
   }
@@ -957,7 +955,8 @@ void TaxReport::printReport(std::string format, std::string ra
   if (_report_cols.size() == _report_col_names.size()) {
     // print header
     bool first_one = true;
-    for (std::string s : _report_col_names) {
+    for (size_t i=0; i < _report_col_names.size(); ++i) {
+      const std::string& s = _report_col_names[i];
       if (first_one) {
         first_one = false;
       } else {
@@ -970,12 +969,12 @@ void TaxReport::printReport(std::string format, std::string ra
 
   if (format == "kraken") {
     // A: print number of unidentified reads
-    printReport(_taxdb.taxIDsAndEntries.at(0),0u);
+    printReport(_taxdb.entries.at(0),0u);
     // B: print normal results
-    printReport(_taxdb.taxIDsAndEntries.at(1),0u);
+    printReport(_taxdb.entries.at(1),0u);
     // C: Print Unclassified stuff
-    auto it = _taxdb.taxIDsAndEntries.find(-1);
-    if (it != _taxdb.taxIDsAndEntries.end()) {
+    auto it = _taxdb.entries.find(-1);
+    if (it != _taxdb.entries.end()) {
       printReport(it->second,0u);
     }
   } else {
@@ -986,40 +985,69 @@ void TaxReport::printReport(std::string format, std::string ra
   }
 }
 
+template
+struct CompareReadCounts : std::binary_function {
+  CompareReadCounts(std::vector& counts_) : counts(counts_) {}
+
+  bool operator()(size_t a, size_t b) const {
+    if (counts[a]->n_reads == counts[b]->n_reads) {
+      return counts[a]->n_kmers < counts[b]->n_kmers;
+    } else {
+      return counts[a]->n_reads < counts[b]->n_reads;
+    }
+  }
+
+  std::vector& counts;
+};
+
 template
-void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) {
-  if (_show_zeros || (reads(tax.readCounts)+reads(tax.readCountsOfChildren)) > 0) {
+void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) {
+  if (_show_zeros || reads(_readCountsIncludingChildren[tax.taxonomyID]) > 0) {
     printLine(tax, depth);
 
-    for (auto child : tax.children)
-      printReport(*child, depth+1);
+    // Order children by read count before printing them
+
+    std::vector pos(tax.children.size());
+    std::vector counts(tax.children.size());
+    for (size_t i=0; i < tax.children.size(); ++i) {
+      pos[i] = i;
+      counts[i] = &_readCountsIncludingChildren[ tax.children[i]->taxonomyID ];
+    }
+
+    std::sort(pos.begin(), pos.end(), CompareReadCounts(counts));
+
+    for (size_t i=0; i < tax.children.size(); ++i) {
+      auto child_it = tax.children[ pos[i] ];
+      printReport(*child_it, depth+1);
+    }
   }
 }
 
 template
-void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) {
+void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) {
 
-  long long unique_kmers_for_clade = ( tax.readCounts.kmers.cardinality() + tax.readCountsOfChildren.kmers.cardinality());
+  long long unique_kmers_for_clade = _readCountsIncludingChildren[tax.taxonomyID].kmers.cardinality();
   double genome_size = double(tax.genomeSize+tax.genomeSizeOfChildren);
 
-  for (auto& col : _report_cols) {
+  for (size_t i = 0; i< _report_cols.size(); ++i) {
+    auto& col = _report_cols[i];
     switch (col) {
     case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break;
     case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break;
     case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << setprecision(4) << 100.0*(reads(tax.readCounts) + reads(tax.readCountsOfChildren))/_total_n_reads; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << setprecision(4) << 100.0*(reads(_readCountsIncludingChildren[tax.taxonomyID]))/_total_n_reads; break; //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; - case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.readCounts); break; - case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.readCounts) + reads(tax.readCountsOfChildren)); break; - case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.readCounts.kmers.cardinality(); break; + case REPORTCOLS::NUM_READS: _reportOfb << reads(_readCounts[tax.taxonomyID]); break; + case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(_readCountsIncludingChildren[tax.taxonomyID])); break; + case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << _readCounts[tax.taxonomyID].kmers.cardinality(); break; case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << unique_kmers_for_clade; break; - case REPORTCOLS::NUM_KMERS: _reportOfb << tax.readCounts.n_kmers; break; - case REPORTCOLS::NUM_KMERS_CLADE: _reportOfb << tax.readCounts.n_kmers + tax.readCountsOfChildren.n_kmers; break; + case REPORTCOLS::NUM_KMERS: _reportOfb << _readCounts[tax.taxonomyID].n_kmers; break; + case REPORTCOLS::NUM_KMERS_CLADE: _reportOfb << _readCountsIncludingChildren[tax.taxonomyID].n_kmers; break; case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break; case REPORTCOLS::CLADE_KMER_COVERAGE: if (genome_size == 0) { _reportOfb << "NA"; } else { _reportOfb << setprecision(4) << (unique_kmers_for_clade / genome_size); }; break; - case REPORTCOLS::CLADE_KMER_DUPLICITY: _reportOfb << setprecision(3) << ( double(tax.readCounts.n_kmers + tax.readCountsOfChildren.n_kmers) / unique_kmers_for_clade ); break; + case REPORTCOLS::CLADE_KMER_DUPLICITY: _reportOfb << setprecision(3) << ( double(_readCountsIncludingChildren[tax.taxonomyID].n_kmers) / unique_kmers_for_clade ); break; case REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break; //case REPORTCOLS::GENOME_SIZE: ; break; //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; diff --git a/src/uid_mapping.cpp b/src/uid_mapping.cpp index d2100d3..2914468 100644 --- a/src/uid_mapping.cpp +++ b/src/uid_mapping.cpp @@ -1,5 +1,6 @@ #include +#include #include "uid_mapping.hpp" #include "krakenutil.hpp" #include "assert_helpers.h" @@ -83,15 +84,18 @@ namespace kraken { for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { uint32_t uid = it->first; double frac_count = ((double)it->second / (double)UID_to_taxids_vec[uid-1].size()); - for (auto taxid : UID_to_taxids_vec[uid-1]) { - taxid_counts[taxid] += it->second; - frac_taxid_counts[taxid] += frac_count; + //for (auto taxid : UID_to_taxids_vec[uid-1]) { + for (auto taxid_it = UID_to_taxids_vec[uid-1].begin(); taxid_it != UID_to_taxids_vec[uid-1].end(); ++taxid_it) { // supporting gcc 4.4 + taxid_counts[*taxid_it] += it->second; + frac_taxid_counts[*taxid_it] += frac_count; } } vector max_taxids; uint32_t max_count = 0; double max_frac_count = 0; - for (auto it : taxid_counts) { + // for (auto it : taxid_counts) { + for (auto itt = taxid_counts.begin(); itt != taxid_counts.end(); ++itt) { // supporting gcc 4.4 + const auto& it = *itt; 
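+        // A UID's hit count is credited to every taxid in its set, so ties on
+        // the raw count are broken by the fractional count (hits divided by the
+        // set size); taxids that tie on both are collected in max_taxids.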
      if (it.second == max_count) {
        if (frac_taxid_counts[it.first] == max_frac_count) {
          max_taxids.push_back(it.first);
@@ -131,7 +135,8 @@ namespace kraken {
       return(0);
     }
 
-    for (const auto& it : uid_hit_counts) {
+    for (auto it1=uid_hit_counts.begin(); it1 != uid_hit_counts.end(); ++it1) { // supporting gcc 4.4
+      const auto &it = *it1;
       if (it.first == 0) {
         continue;
       }
@@ -139,7 +144,8 @@
       vector<uint32_t> taxids = get_taxids_for_uid(it.first, fptr);
       double frac_count = (double)it.second / (double)taxids.size();
-      for (uint32_t taxid : taxids) {
+      for (size_t i = 0; i < taxids.size(); ++i) { // supporting gcc 4.4
+        uint32_t taxid = taxids[i];
         frac_taxid_counts[taxid] += frac_count;
         taxid_counts[taxid] += it.second;
       }
@@ -151,7 +157,8 @@
     vector<uint32_t> max_taxids;
     uint32_t max_count = 0;
     double max_frac_count = 0;
-    for (auto it : taxid_counts) {
+    for (auto it1 = taxid_counts.begin(); it1 != taxid_counts.end(); ++it1) {
+      const auto& it = *it1;
       if (it.second == max_count) {
         if (frac_taxid_counts[it.first] == max_frac_count) {
           max_taxids.push_back(it.first);
diff --git a/src/uid_mapping.hpp b/src/uid_mapping.hpp
index 1f84c40..93d1680 100644
--- a/src/uid_mapping.hpp
+++ b/src/uid_mapping.hpp
@@ -20,9 +20,12 @@ using namespace std;
 //  - write the mapping to UID_map_file
 //
 
-using TaxidSet = vector<uint32_t>;
+//using TaxidSet = typename std::vector<uint32_t>;
+typedef std::vector<uint32_t> TaxidSet;
 
 namespace kraken {
+
+
   uint32_t uid_mapping(
       map< TaxidSet, uint32_t>& Taxids_to_UID_map,
       vector< const TaxidSet* >& UID_to_taxids_vec,

From d5b8dc2a3756a1c1518ac82a19529b27a832aff4 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Sun, 5 Nov 2017 12:52:00 -0500
Subject: [PATCH 090/105] Make sure TaxDB is not copied

---
 src/build_taxdb.cpp          |  6 +--
 src/grade_classification.cpp |  4 +-
 src/set_lcas.cpp             |  3 +-
 src/taxdb.h                  | 91 ++++++++++++++++++++++-----------
 tests/build-dbs.sh           | 99 ++++++++++++++++++++++++++++--------
 5 files changed, 145 insertions(+), 58 deletions(-)

diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp
index fc81cec..263de5c 100644
--- a/src/build_taxdb.cpp
+++ b/src/build_taxdb.cpp
@@ -35,12 +35,12 @@ int main(int argc, char **argv) {
       << "build_taxdb taxDB\n";
     return 1;
   }
-
+
   TaxonomyDB<uint32_t> taxdb;
   if (argc == 2) {
-    taxdb = TaxonomyDB<uint32_t> ((string)argv[1]);
+    taxdb = TaxonomyDB<uint32_t> ((string)argv[1]);
   } else {
-    taxdb = TaxonomyDB<uint32_t> ((string)argv[1], (string)argv[2]);
+    taxdb = TaxonomyDB<uint32_t> ((string)argv[1], (string)argv[2]);
   }
   if (argc == 4) {
     ifstream ifs(argv[3]);
diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp
index 5ea7922..148c7e9 100644
--- a/src/grade_classification.cpp
+++ b/src/grade_classification.cpp
@@ -40,7 +40,7 @@ int main(int argc, char **argv) {
     std::cerr << "Usage: grade_classification taxDB seqid2taxid.map classification_file result_file\n";
     return 1;
   }
-  TaxonomyDB<uint32_t> taxdb = TaxonomyDB<uint32_t>(argv[1], false);
+  TaxonomyDB<uint32_t> taxdb (argv[1], false);
 
   unordered_map<string, uint32_t> seqid_map = read_seqid_mapping(argv[2]);
   ofstream out_file(argv[4]);
@@ -123,7 +123,7 @@ int main(int argc, char **argv) {
       // getLowestCommonAncestor returns lca taxon as well as distance between the taxa pair
       lca_taxid_dist = taxdb.getLowestCommonAncestor(seq_taxid, identified_taxid);
       string lca_rank_string = taxdb.getNextProperRank(lca_taxid_dist.first);
-      TaxRank::RANK lca_rank = TaxRank::toRank(lca_rank_string);
+      // TaxRank::RANK lca_rank = TaxRank::toRank(lca_rank_string);
       TaxRank::RANK identified_rank = TaxRank::toRank(taxdb.getRank(identified_taxid));
 
       for (size_t i=0; i < ranks_of_interest.size(); ++i)
      {

diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp
index 1b32721..f4e28f6 100644
--- a/src/set_lcas.cpp
+++ b/src/set_lcas.cpp
@@ -180,11 +180,12 @@ uint32_t get_new_taxid(
   if (it == name_to_taxid_map.end()) {
     uint32_t new_taxid = ++New_taxid_start;
     bool insert_res = taxdb.insert(new_taxid, parent_taxid, rank_name, name);
+    cerr << "Adding assembly: " << name << " with taxid " << new_taxid;
     if (!insert_res) {
       return 0;
     }
+    cerr << "Oida " << (insert_res? "success" : "naaa") << endl;
     // insert_res shows if insert failed, but we don't care
-    // cerr << "Adding assembly: " << name << " with taxid " << new_taxid << endl;
     Parent_map[new_taxid] = parent_taxid;
     name_to_taxid_map[name] = new_taxid;
     return new_taxid;
diff --git a/src/taxdb.h b/src/taxdb.h
index 608bd33..91e60ec 100644
--- a/src/taxdb.h
+++ b/src/taxdb.h
@@ -173,7 +173,7 @@ template<typename TAXID>
 class TaxonomyEntry {
  public:
   TAXID taxonomyID;
-  TaxonomyEntry<TAXID>* parent;
+  TaxonomyEntry<TAXID>* parent;
   std::vector<TaxonomyEntry<TAXID>*> children;
 
   string rank;
@@ -183,7 +183,7 @@
   TaxonomyEntry() : taxonomyID(0), parent(NULL), genomeSize(0), genomeSizeOfChildren(0) {}
 
-  TaxonomyEntry(TAXID taxonomyID_, TaxonomyEntry<TAXID>* parent_, std::string rank_, std::string scientificName_, uint64_t genomeSize_ = 0, uint64_t genomeSizeOfChildren_ = 0) :
+  TaxonomyEntry(TAXID taxonomyID_, TaxonomyEntry<TAXID>* parent_, std::string rank_, std::string scientificName_, uint64_t genomeSize_ = 0, uint64_t genomeSizeOfChildren_ = 0) :
     taxonomyID(taxonomyID_), parent(parent_), rank(rank_), scientificName(scientificName_),
     genomeSize(genomeSize_), genomeSizeOfChildren(genomeSizeOfChildren_) {
@@ -195,8 +195,17 @@
   inline bool operator==(const TaxonomyEntry& other) const;
 
+  friend std::ostream &operator<<(std::ostream &os, const TaxonomyEntry &m) {
+    TAXID parentTaxonomyID = (m.parent == NULL)? m.taxonomyID : m.parent->taxonomyID;
+    os << '[' << m.taxonomyID << ";parent="<< parentTaxonomyID << ";name=" << m.scientificName << ";rank=" << m.rank << ']';
+    return os;
+}
+
 };
 
 
 
 //template<>
 //TaxonomyEntry::TaxonomyEntry () {
 //  readCounts = 0;
@@ -217,6 +226,16 @@ class TaxonomyDB {
   TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName);
   TaxonomyDB(const std::string inFileName, bool hasGenomeSizes = false);
   TaxonomyDB();
+
+  TaxonomyDB(TaxonomyDB&& rhs) : entries(std::move(rhs.entries)) {
+  }
+
+  TaxonomyDB& operator=(TaxonomyDB&& rhs) {
+    entries = std::move(rhs.entries);
+    return *this;
+  }
+
+
   void writeTaxonomyIndex(std::ostream & outs) const;
   void readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes);
@@ -257,12 +276,6 @@
   std::unordered_map<TAXID, TaxonomyEntry<TAXID> > readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes);
 
-  void parseNamesDump(const std::string namesDumpFileName);
-  std::unordered_map<TAXID, TAXID> parseNodesDump(const std::string nodesDumpFileName);
-  void createPointers(
-      std::unordered_map<TAXID, TaxonomyEntry<TAXID> >& entries,
-      const std::unordered_map<TAXID, TAXID>& parentMap
-  );
 };
 
@@ -443,19 +456,24 @@ TaxonomyEntry<TAXID> TaxonomyDB<TAXID>::getEntry(TAXID taxID) const {
 }
 
 template<typename TAXID>
-void TaxonomyDB<TAXID>::createPointers(
+void createPointers(
     std::unordered_map<TAXID, TaxonomyEntry<TAXID> >& entries,
     const std::unordered_map<TAXID, TAXID>& parentMap) {
-  for (auto it = entries.begin(); it != entries.end(); ++it) {
-    TAXID taxonomyID = it->first;
-    TAXID parentTaxonomyID = parentMap.at(taxonomyID);
-    if (taxonomyID != parentTaxonomyID) {
-      auto parent_ptr = entries.find(parentTaxonomyID);
-      if (parent_ptr != entries.end()) {
-        it->second.parent = &parent_ptr->second;
-        parent_ptr->second.children.push_back(&it->second);
-      } else {
-        cerr << "Could not find parent with taxonomy ID " << parentTaxonomyID << " for taxonomy ID " << taxonomyID << endl;
+  for (auto entry_it = entries.begin(); entry_it != entries.end(); ++entry_it) {
+    TAXID taxonomyID = entry_it->first;
+    auto parent_it = parentMap.find(taxonomyID);
+    if (parent_it == parentMap.end()) {
+      cerr << "Cannot find parent for " << taxonomyID << endl;
+    } else {
+      TAXID parentTaxonomyID = parent_it->second;
+      if (taxonomyID != parentTaxonomyID) {
+        auto parent_ptr = entries.find(parentTaxonomyID);
+        if (parent_ptr != entries.end()) {
+          entry_it->second.parent = &parent_ptr->second;
+          parent_ptr->second.children.push_back(&entry_it->second);
+        } else {
+          cerr << "Could not find parent with taxonomy ID " << parentTaxonomyID << " for taxonomy ID " << taxonomyID << endl;
+        }
      }
    }
  }
@@ -470,16 +488,23 @@
 TaxonomyDB<TAXID>::TaxonomyDB(const std::string inFileName, bool hasGenomeSizes) {
 }
 
 template<typename TAXID>
-TaxonomyDB<TAXID>::TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName) {
+unordered_map<TAXID, TaxonomyEntry<TAXID> > readDumps(const std::string namesDumpFileName, const std::string nodesDumpFileName) {
+  std::unordered_map<TAXID, TaxonomyEntry<TAXID> > entries;
   log_msg("Building taxonomy index from " + nodesDumpFileName + " and " + namesDumpFileName);
-  unordered_map<TAXID, TAXID> parentMap = parseNodesDump(nodesDumpFileName);
-  parseNamesDump(namesDumpFileName);
+  unordered_map<TAXID, TAXID> parentMap = parseNodesDump(nodesDumpFileName, entries);
   createPointers(entries, parentMap);
+  parseNamesDump(namesDumpFileName, entries);
   log_msg(". Done, got " + patch::to_string(entries.size()) + " taxa\n");
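// ---- Editor's note (not part of the patch) ---------------------------------
// This is the heart of the "Make sure TaxDB is not copied" change: readDumps
// builds the entries map locally and returns it by value, and the new move
// constructor / move assignment added above let that return value be moved
// (or elided) straight into TaxonomyDB::entries instead of deep-copying an
// unordered_map with millions of TaxonomyEntry objects.
// ---- End editor's note ------------------------------------------------------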
Done, got " + patch::to_string(entries.size()) + " taxa\n"); + return(entries); +} + +template +TaxonomyDB::TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName) : + entries(readDumps(namesDumpFileName, nodesDumpFileName)) { } template -std::unordered_map TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { +std::unordered_map parseNodesDump(const std::string nodesDumpFileName, std::unordered_map >& entries) { std::ifstream nodesDumpFile(nodesDumpFileName); if (!nodesDumpFile.is_open()) throw std::runtime_error("unable to open nodes file"); @@ -510,7 +535,7 @@ std::unordered_map TaxonomyDB::parseNodesDump(const std::str } template -void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { +void parseNamesDump(const std::string namesDumpFileName, std::unordered_map >& entries) { std::ifstream namesDumpFile(namesDumpFileName); if (!namesDumpFile.is_open()) throw std::runtime_error("unable to open names file"); @@ -530,7 +555,8 @@ void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { if (type == "scientific name") { auto entryIt = entries.find(taxonomyID); if (entryIt == entries.end()) { - entries[taxonomyID] = TaxonomyEntry(taxonomyID, NULL, "", scientificName); + cerr << "Entry for " << taxonomyID << " does not exist - it should!" << '\n'; + //entries[taxonomyID] = TaxonomyEntry(taxonomyID, NULL, "", scientificName); } else { entryIt->second.scientificName = scientificName; } @@ -593,7 +619,8 @@ std::unordered_map > std::unordered_map parentMap; TAXID taxonomyID, parentTaxonomyID; std::string scientificName, rank; - uint64_t genomeSize, genomeSizeOfChildren = 0; + uint64_t genomeSize = 0; + uint64_t genomeSizeOfChildren = 0; std::string line; while (!inFile.eof()) { @@ -1044,10 +1071,14 @@ void TaxReport::printLine(TaxonomyEntry& tax, unsigned case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << unique_kmers_for_clade; break; case REPORTCOLS::NUM_KMERS: _reportOfb << _readCounts[tax.taxonomyID].n_kmers; break; case REPORTCOLS::NUM_KMERS_CLADE: _reportOfb << _readCountsIncludingChildren[tax.taxonomyID].n_kmers; break; - case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break; - case REPORTCOLS::CLADE_KMER_COVERAGE: if (genome_size == 0) { _reportOfb << "NA"; } else { - _reportOfb << setprecision(4) << (unique_kmers_for_clade / genome_size); }; break; - case REPORTCOLS::CLADE_KMER_DUPLICITY: _reportOfb << setprecision(3) << ( double(_readCountsIncludingChildren[tax.taxonomyID].n_kmers) / unique_kmers_for_clade ); break; + case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break; + case REPORTCOLS::CLADE_KMER_COVERAGE: + if (genome_size == 0) { + _reportOfb << "NA"; + } else { + _reportOfb << setprecision(4) << (unique_kmers_for_clade / genome_size); + }; break; + case REPORTCOLS::CLADE_KMER_DUPLICITY: _reportOfb << setprecision(3) << ( double(_readCountsIncludingChildren[tax.taxonomyID].n_kmers) / unique_kmers_for_clade ); break; case REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break; //case REPORTCOLS::GENOME_SIZE: ; break; //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index bfbd3f8..d9284b8 100755 --- a/tests/build-dbs.sh +++ b/tests/build-dbs.sh @@ -6,50 +6,105 @@ set -eu DIR=`pwd` [[ `uname` == "Darwin" ]] && THREADS=4 || THREADS=10 - build_db() { + local PROG=$1; shift local K=$1; shift local MIN=$1; shift local NAM=$1; shift + set -eu + local DB_NAM=refseq-$NAM-k$K - 
-  DB_DIR=$DIR/dbs/$DB_NAM
-
-  mkdir -p $DB_DIR
-  CMD="krakenu-build --kmer-len $K --minimizer-len $MIN --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database"
-  for L in $@; do
-    CMD="$CMD --library-dir=$DIR/data/library/$L"
-  done
-  #if [[ ! -f "$DB_DIR/is.busy" ]]; then
+  DB_DIR=$DIR/dbs-$PROG/$DB_NAM
+
+  if [[ "$PROG" == "kraken" ]]; then
+    mkdir -p $DB_DIR
+    CMD="krakenu-build --kmer-len $K --minimizer-len $MIN --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database"
+    for L in $@; do
+      CMD="$CMD --library-dir=$DIR/data/library/$L"
+    done
+  elif [[ "$PROG" == "kallisto" ]]; then
+    CMD="kallisto index -k $K -i $DB_DIR"
+    for L in $@; do
+      CMD="$CMD $DIR/data/all-$L.fna"
+    done
+  fi
+  if [[ ! -f "$DB_DIR-is.busy" ]]; then
     echo "EXECUTING $CMD"
-    touch $DB_DIR/is.busy
-    $CMD 2>&1 | tee $DIR/dbs/$DB_NAM/build.log
-    if [[ ! -f "$DB_DIR/taxonomy/nodes.dmp" ]]; then
+    touch $DB_DIR-is.busy
+    $CMD 2>&1 | tee $DIR/dbs-$PROG/$DB_NAM-build.log
+    if [[ $PROG == "kraken" && ! -f "$DB_DIR/taxonomy/nodes.dmp" ]]; then
       mkdir -p $DB_DIR/taxonomy
       echo "EXECUTING dump_taxdb $DB_DIR/taxDB $DB_DIR/taxonomy/names.dmp $DB_DIR/nodes.dmp"
       dump_taxdb $DB_DIR/taxDB $DB_DIR/taxonomy/names.dmp $DB_DIR/nodes.dmp
     fi
-    rm $DB_DIR/is.busy
-  #else
-  #  echo "IGNORING $DB_DIR"
-  #fi
+    rm $DB_DIR-is.busy
+  else
+    echo "$DB_DIR-is.busy exists, ignoring directory."
+  fi
 }
 
-K=$1; shift;
+
+VERBOSE=false
+HELP=false
+DRY_RUN=false
+K=31
+THREADS=10
+
+USAGE="
+`basename $0` [options] {kraken,kaiju} {viral|all-viral|prok|oct2017|euk-oct2017}
+
+Options:
+  -k KMER_SIZE  default $K
+  -t THREADS    default $THREADS
+"
+
+OPTS=`getopt -o vhnk:t:p: --long verbose,dry-run,help,kmer-size:,threads:,path: -n 'parse-options' -- "$@"`
+if [ $? != 0 ] ; then echo "Failed parsing options. Usage: $USAGE" >&2 ; exit 1 ; fi
+eval set -- "$OPTS"
+
+while true; do
+  case "$1" in
+    -v | --verbose ) VERBOSE=true; shift ;;
+    -h | --help ) HELP=true; shift ;;
+    -n | --dry-run ) DRY_RUN=true; shift ;;
+    -k | --kmer-size ) K="$2"; shift; shift ;;
+    -t | --threads ) THREADS="$2"; shift; shift ;;
+    -p | --path ) PATH1="$2"; shift; shift ;;
+    -- ) shift; break ;;
+    * ) break ;;
+  esac
+done
+shift $((OPTIND -1))
+
+if [[ "$#" -le 1 ]]; then
+  echo "$USAGE"
+  exit 1
+fi
+
+[[ "$PATH1" != "" ]] && export PATH="$PATH1:$PATH"
+
+PROG=$1
+shift
 
 for VAR in $@; do
   case "$VAR" in
-    viral) build_db $K 12 viral viral ;;
-    all-viral) build_db $K 12 all-viral viral viral-neighbors ;;
-    prok) build_db $K 15 prok archaea-dusted bacteria-dusted ;;
-    oct2017) build_db $K 15 oct2017 archaea-dusted bacteria-dusted viral-dusted viral-neighbors-dusted \
-      vertebrate_mammalian contaminants ;;
-    euk-oct2017)
+    viral) build_db $PROG $K 12 viral viral ;;
+    all-viral) build_db $PROG $K 12 all-viral viral viral-neighbors ;;
+    prok) build_db $PROG $K 15 prok archaea-dusted bacteria-dusted ;;
+    oct2017) build_db $PROG $K 15 oct2017 archaea-dusted bacteria-dusted viral-dusted viral-neighbors-dusted \
+      vertebrate_mammalian contaminants ;;
+    euk-oct2017)
+      DB_DIR=$DIR/dbs/refseq-oct2017-k31
       EUKD=$DIR/dbs/refseq-euk-oct2017-k31
+      if [[ ! -f "$DB_DIR/taxDB" ]]; then
-f "$DB_DIR/taxDB" ]]; then + echo "Build oct2017 database first!"; + exit 1; + fi [[ -d $EUKD ]] || mkdir -p $EUKD [[ -f $EUKD/taxDB ]] || cp -v $DB_DIR/taxDB $EUKD build_db $K euk-oct2017 fungi protozoa ;; - *) echo "Usage: $0 K {viral|all-viral|prok|oct2017|euk-oct2017}" + *) echo "$USAGE" exit 1 ;; esac done From 16813a7e1ee2634c2fb75bb787d323f6732c8c23 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 5 Nov 2017 15:04:01 -0500 Subject: [PATCH 091/105] Fox parent map generation in taxDB --- src/krakenutil.cpp | 19 +++++++++++++++---- src/set_lcas.cpp | 9 ++++----- src/taxdb.h | 20 +++++++++++--------- tests/build-dbs.sh | 4 +++- 4 files changed, 33 insertions(+), 19 deletions(-) diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index d58cf39..46fd953 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -58,14 +58,25 @@ namespace kraken { return a ? a : b; unordered_set a_path; - while (a > 0) { + while (a > 1) { a_path.insert(a); - a = parent_map.at(a); + auto a_it = parent_map.find(a); + if (a_it == parent_map.end()) { + cerr << "No parent for " << a << "!\n"; + break; + } + a = a_it->second; } - while (b > 0) { + while (b > 1) { if (a_path.count(b) > 0) return b; - b = parent_map.at(b); + + auto b_it = parent_map.find(b); + if (b_it == parent_map.end()) { + cerr << "No parent for " << b << "!\n"; + break; + } + b = b_it->second; } return 1; } diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index f4e28f6..dc75c63 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -180,11 +180,10 @@ uint32_t get_new_taxid( if (it == name_to_taxid_map.end()) { uint32_t new_taxid = ++New_taxid_start; bool insert_res = taxdb.insert(new_taxid, parent_taxid, rank_name, name); - cerr << "Adding assembly: " << name << " with taxid " << new_taxid; + //cerr << "Adding assembly: " << name << " with taxid " << new_taxid; if (!insert_res) { return 0; } - cerr << "Oida " << (insert_res? "success" : "naaa") << endl; // insert_res shows if insert failed, but we don't care Parent_map[new_taxid] = parent_taxid; name_to_taxid_map[name] = new_taxid; @@ -214,7 +213,7 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi New_taxid_start = it->first+100; } } - cerr << "Starting new taxonomy IDs with " << (New_taxid_start+1) << endl; + cerr << "[starting new taxonomy IDs with " << (New_taxid_start+1) << ']'; } // Used when adding new taxids for assembly or sequence @@ -253,7 +252,7 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi if (ID_to_taxon_map.size() == 0) { cerr << "Error: No ID mappings present!!" << endl; } - cerr << " Done - read " << ID_to_taxon_map.size() << " mappings." << endl; + cerr << " got " << ID_to_taxon_map.size() << " mappings." 
   return std::move(ID_to_taxon_map);
 }
@@ -322,7 +321,7 @@ void process_single_file() {
     //}
 
     if (taxid) {
-      if (Parent_map.find(taxid) == Parent_map.end()) {
+      if (Parent_map.find(taxid) == Parent_map.end() || taxdb.entries.find(taxid) == taxdb.entries.end()) {
         cerr << "Ignoring sequence for taxID " << taxid << " - not in taxDB\n";
       } else {
         #pragma omp parallel for schedule(dynamic)
diff --git a/src/taxdb.h b/src/taxdb.h
index 91e60ec..aef1e50 100644
--- a/src/taxdb.h
+++ b/src/taxdb.h
@@ -432,14 +432,15 @@ template<typename TAXID>
 unordered_map<TAXID, TAXID> TaxonomyDB<TAXID>::getParentMap() const {
   unordered_map<TAXID, TAXID> Parent_map;
   //for (const auto & tax : entries) {
-  for (auto it = entries.begin(); it != entries.end(); ++it) {
-    const auto&tax = *it;
-    if (tax.first != 0)
+  for (auto tax_it = entries.begin(); tax_it != entries.end(); ++tax_it) {
+    if (tax_it->first == 0)
       continue;
-    if (tax.second.parent == NULL)
-      Parent_map[tax.first] = 0; // for kraken::lca
-    else
-      Parent_map[tax.first] = tax.second.parent->taxonomyID;
+    if (tax_it->second.parent == NULL) {
+      //cerr << "Parent for " << tax.first << " is 0\n";
+      Parent_map[tax_it->first] = 0; // for kraken::lca
+    } else {
+      Parent_map[tax_it->first] = tax_it->second.parent->taxonomyID;
+    }
   }
   return Parent_map;
 }
@@ -639,8 +640,9 @@
   }
   TaxonomyEntry<TAXID> newEntry(taxonomyID, NULL, rank, scientificName, genomeSize, genomeSizeOfChildren);
-  auto insert_res = entries.insert({ taxonomyID, newEntry });
-  parentMap[taxonomyID] = parentTaxonomyID;
+  //auto insert_res = entries.insert({ taxonomyID, newEntry });
+  entries.insert({ taxonomyID, newEntry });
+  parentMap[taxonomyID] = parentTaxonomyID;
   }
   entries.insert({0, {0, NULL, "no rank", "unclassified" }});
   //entries.insert({-1, {-1, 0, "no rank", "uncategorized" }});
diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh
index d9284b8..d0675f2 100755
--- a/tests/build-dbs.sh
+++ b/tests/build-dbs.sh
@@ -51,9 +51,10 @@ HELP=false
 DRY_RUN=false
 K=31
 THREADS=10
+PATH1="."
 
 USAGE="
-`basename $0` [options] {kraken,kaiju} {viral|all-viral|prok|oct2017|euk-oct2017}
+`basename $0` [options] {kraken,kaiju} {viral|all-viral|prok|oct2017|euk-oct2017|archaea}
 
 Options:
   -k KMER_SIZE  default $K
   -t THREADS    default $THREADS
 "
@@ -92,6 +93,7 @@ for VAR in $@; do
     viral) build_db $PROG $K 12 viral viral ;;
     all-viral) build_db $PROG $K 12 all-viral viral viral-neighbors ;;
     prok) build_db $PROG $K 15 prok archaea-dusted bacteria-dusted ;;
+    archaea) build_db $PROG $K 15 archaea archaea ;;
     oct2017) build_db $PROG $K 15 oct2017 archaea-dusted bacteria-dusted viral-dusted viral-neighbors-dusted \
       vertebrate_mammalian contaminants ;;
     euk-oct2017)

From 746681052fdfeaa7bc27c98548b4ddc2d1f42479 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Sun, 5 Nov 2017 17:19:27 -0500
Subject: [PATCH 092/105] Add environment CPPFLAGS and LDFLAGS

---
 src/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index e51a28f..cc84b11 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,9 +1,9 @@
 CXX = g++
 FOPENMP?=-fopenmp
-CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -g -Wfatal-errors
+CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -O2 -Wfatal-errors ${CPPFLAGS}
 #CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors
 PROGS = classify db_sort set_lcas make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping
-LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream
+LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream ${LDFLAGS}
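# ---- Editor's note (not part of the patch) ---------------------------------
# With ${CPPFLAGS} and ${LDFLAGS} referenced here, extra include and library
# search paths can be injected from the environment without editing the
# Makefile, e.g.:
#
#   CPPFLAGS="-I$HOME/include" LDFLAGS="-L$HOME/lib" make
#
# $(FOPENMP) and ${CPPFLAGS} are equivalent make syntax; the former is set in
# the Makefile (with ?= so the environment can override it), the latter is
# simply inherited from the environment.
# ---- End editor's note ------------------------------------------------------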
 .PHONY: all install clean

From dd7fc4fd6cae0db0234f70577334df7afc5c5e18 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Mon, 6 Nov 2017 13:26:47 -0500
Subject: [PATCH 093/105] Change name to KrakenHLL

---
 README.md                                        | 12 ++++++------
 install_kraken.sh                                |  2 +-
 scripts/{krakenu => krakenhll}                   |  0
 ...to_library.sh => krakenhll-add_to_library.sh} |  0
 scripts/{krakenu-build => krakenhll-build}       | 16 ++++++++--------
 ...krakenu-build_db.sh => krakenhll-build_db.sh} |  6 +++---
 ...yfish.sh => krakenhll-check_for_jellyfish.sh} |  0
 ...krakenu-clean_db.sh => krakenhll-clean_db.sh} |  0
 ...tempfile.pl => krakenhll-cp_into_tempfile.pl} |  0
 scripts/{krakenu-download => krakenhll-download} |  2 +-
 scripts/{krakenu-filter => krakenhll-filter}     |  0
 .../{krakenu-mpa-report => krakenhll-mpa-report} |  0
 ...u-read_merger.pl => krakenhll-read_merger.pl} |  0
 scripts/{krakenu-report => krakenhll-report}     |  0
 ...akenu-shrink_db.sh => krakenhll-shrink_db.sh} |  0
 ...ion.sh => krakenhll-standard_installation.sh} | 10 +++++-----
 .../{krakenu-translate => krakenhll-translate}   |  0
 ...enu-upgrade_db.sh => krakenhll-upgrade_db.sh} |  0
 ...numbers.pl => krakenhll-verify_gi_numbers.pl} |  0
 src/Makefile                                     | 12 ++++++------
 src/classify.cpp                                 |  2 +-
 src/get_kmers.cpp                                |  2 +-
 src/krakenutil.cpp                               |  2 +-
 src/krakenutil.hpp                               |  4 ++--
 src/set_lcas.cpp                                 |  2 +-
 src/uid_mapping.cpp                              |  2 +-
 tests/build-dbs.sh                               |  2 +-
 tests/init.sh                                    | 12 ++++++------
 tests/test-on-simulated-reads.sh                 | 16 ++++++++--------
 29 files changed, 52 insertions(+), 52 deletions(-)
 rename scripts/{krakenu => krakenhll} (100%)
 rename scripts/{krakenu-add_to_library.sh => krakenhll-add_to_library.sh} (100%)
 rename scripts/{krakenu-build => krakenhll-build} (96%)
 rename scripts/{krakenu-build_db.sh => krakenhll-build_db.sh} (96%)
 rename scripts/{krakenu-check_for_jellyfish.sh => krakenhll-check_for_jellyfish.sh} (100%)
 rename scripts/{krakenu-clean_db.sh => krakenhll-clean_db.sh} (100%)
 rename scripts/{krakenu-cp_into_tempfile.pl => krakenhll-cp_into_tempfile.pl} (100%)
 rename scripts/{krakenu-download => krakenhll-download} (99%)
 rename scripts/{krakenu-filter => krakenhll-filter} (100%)
 rename scripts/{krakenu-mpa-report => krakenhll-mpa-report} (100%)
 rename scripts/{krakenu-read_merger.pl => krakenhll-read_merger.pl} (100%)
 rename scripts/{krakenu-report => krakenhll-report} (100%)
 rename scripts/{krakenu-shrink_db.sh => krakenhll-shrink_db.sh} (100%)
 rename scripts/{krakenu-standard_installation.sh => krakenhll-standard_installation.sh} (77%)
 rename scripts/{krakenu-translate => krakenhll-translate} (100%)
 rename scripts/{krakenu-upgrade_db.sh => krakenhll-upgrade_db.sh} (100%)
 rename scripts/{krakenu-verify_gi_numbers.pl => krakenhll-verify_gi_numbers.pl} (100%)

diff --git a/README.md b/README.md
index 83ae11b..bf23151 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
-Kraken taxonomic sequence classification system with unique k-mer counting
+KrakenHLL taxonomic sequence classification system with unique k-mer counting
 ===============================================
 
 [Kraken](https://github.com/DerrickWood/kraken) is a fast taxonomic classifier for metagenomics data. This project, kraken-hll, adds some additional functionality - most notably a unique k-mer count using the HyperLogLog algorithm. Spurious identifications due to sequence contamination in the dataset or database often lead to many reads; however, they usually cover only a small portion of the genome.
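*Editor's note (not part of the patch):* the "unique k-mer count using the HyperLogLog algorithm" mentioned above works by hashing every k-mer assigned to a taxon, using the first p bits of the hash to pick one of m = 2^p registers, and keeping per register the maximum number of leading zero bits (plus one) of the remaining hash bits; a corrected harmonic mean of the registers then estimates how many distinct k-mers were seen. Below is a toy sketch of that idea only, not the project's `hyperloglogplus.h` (which adds a sparse representation and bias correction); `__builtin_clzll` assumes GCC/Clang:

```cpp
#include <cmath>
#include <cstdint>
#include <vector>

struct ToyHLL {
  static const int p = 10;                        // 2^10 = 1024 registers
  std::vector<uint8_t> M;
  ToyHLL() : M(1 << p, 0) {}
  void add(uint64_t hash) {
    uint32_t idx = (uint32_t)(hash >> (64 - p));  // first p bits pick register
    // rank = leading zeros of the remaining bits, plus one; the |1 guards
    // against a zero argument to __builtin_clzll
    uint8_t rank = (uint8_t)(__builtin_clzll((hash << p) | 1) + 1);
    if (rank > M[idx]) M[idx] = rank;
  }
  double estimate() const {
    double sum = 0.0, m = (double)M.size();
    for (size_t i = 0; i < M.size(); ++i)
      sum += std::ldexp(1.0, -M[i]);              // sum of 2^-register
    return 0.7213 / (1.0 + 1.079 / m) * m * m / sum; // alpha_m * m^2 / sum
  }
};
```

With m = 1024 registers this takes about 1 KB per taxon yet estimates millions of distinct k-mers with a relative error around 1.04/sqrt(m), i.e. a few percent, which is why the report can afford a unique-k-mer column for every taxon.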
-kraken-hll adds two additional columns to the Kraken report - total number of k-mers observed for taxon, and the total number of unique k-mers observed for taxon (columns 3 and 4, resp.).
+KrakenHLL adds two additional columns to the Kraken report - total number of k-mers observed for taxon, and the total number of unique k-mers observed for taxon (columns 3 and 4, resp.).
 
 Here's a small example of a classification against a viral database with k=25. There are three species identified by just one read - Enterobacteria phage BP-4795, Salmonella phage SEN22, Sulfolobus monocaudavirus SMV1. Out of those, the identification of Salmonella phage SEN22 is the strongest, as the read was matched with 116 k-mers that are unique to the sequence, while the match to Sulfolobus monocaudavirus SMV1 is only based on a single 25-mer.
@@ -33,13 +33,13 @@ Here's a small example of a classification against a viral database with k=25. T
 ## Usage
-For usage, see `krakenu --help`. Note that you can use the same database as Kraken with one difference - instead of the files `DB_DIR/taxonomy/nodes.dmp` and `DB_DIR/taxonomy/names.dmp` that kraken relies upon, `kraken-hll` needs the file `DB_DIR/taxDB`. This can be generated with the script `build_taxdb`: `KRAKEN_DIR/build_taxdb DB_DIR/taxonomy/names.dmp DB_DIR/taxonomy/nodes.dmp > DB_DIR/taxDB`. The code behind the taxDB is based on [k-SLAM](https://github.com/aindj/k-SLAM).
+For usage, see `krakenhll --help`. Note that you can use the same database as Kraken with one difference - instead of the files `DB_DIR/taxonomy/nodes.dmp` and `DB_DIR/taxonomy/names.dmp` that kraken relies upon, `kraken-hll` needs the file `DB_DIR/taxDB`. This can be generated with the script `build_taxdb`: `KRAKEN_DIR/build_taxdb DB_DIR/taxonomy/names.dmp DB_DIR/taxonomy/nodes.dmp > DB_DIR/taxDB`. The code behind the taxDB is based on [k-SLAM](https://github.com/aindj/k-SLAM).
 
 ### Differences to `kraken`
- - Use `krakenu --report-file FILENAME ...` to write the kraken report to `FILENAME`.
- - Use `krakenu --db DB1 --db DB2 --db DB3 ...` to first attempt, for each k-mer, to assign it based on DB1, then DB2, then DB3. You can use this to prefer identifications based on DB1 (e.g. human and contaminant sequences), then DB2 (e.g. completed bacterial genomes), then DB3, etc. Note that this option is incompatible with `krakenu-build --generate-taxonomy-ids-for-sequences` since the taxDB between the databases has to be absolutely the same.
+ - Use `krakenhll --report-file FILENAME ...` to write the kraken report to `FILENAME`.
+ - Use `krakenhll --db DB1 --db DB2 --db DB3 ...` to first attempt, for each k-mer, to assign it based on DB1, then DB2, then DB3. You can use this to prefer identifications based on DB1 (e.g. human and contaminant sequences), then DB2 (e.g. completed bacterial genomes), then DB3, etc. Note that this option is incompatible with `krakenhll-build --generate-taxonomy-ids-for-sequences` since the taxDB between the databases has to be absolutely the same.
 - Add a suffix `.gz` to output files to generate gzipped output files
 
 ### Differences to `kraken-build`
- - Use `krakenu-build --generate-taxonomy-ids-for-sequences ...` to add pseudo-taxonomy IDs for each sequence header. An example for the result using this is in the ouput above - one read has been assigned specifically to `KC207814.1 Human herpesvirus 4 strain Mutu, complete genome`.
+ - Use `krakenhll-build --generate-taxonomy-ids-for-sequences ...` to add pseudo-taxonomy IDs for each sequence header. 
An example for the result using this is in the ouput above - one read has been assigned specifically to `KC207814.1 Human herpesvirus 4 strain Mutu, complete genome`. - `seqid2taxid.map` mapping sequence IDs to taxonomy IDs does NOT parse or require `>gi|`, but rather the sequence ID is the header up to just before the first space diff --git a/install_kraken.sh b/install_kraken.sh index 0e662b6..a251cd3 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -87,7 +87,7 @@ echo "Kraken installation complete." echo echo "To make things easier for you, you may want to copy/symlink the following" echo "files into a directory in your PATH:" -for file in $KRAKEN_DIR/krakenu* +for file in $KRAKEN_DIR/krakenhll* do [ -x "$file" ] && echo " $file" done diff --git a/scripts/krakenu b/scripts/krakenhll similarity index 100% rename from scripts/krakenu rename to scripts/krakenhll diff --git a/scripts/krakenu-add_to_library.sh b/scripts/krakenhll-add_to_library.sh similarity index 100% rename from scripts/krakenu-add_to_library.sh rename to scripts/krakenhll-add_to_library.sh diff --git a/scripts/krakenu-build b/scripts/krakenhll-build similarity index 96% rename from scripts/krakenu-build rename to scripts/krakenhll-build index e90b353..74f0eb7 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenhll-build @@ -288,7 +288,7 @@ sub display_version { } sub download_taxonomy { - exec "krakenu-download_taxonomy.sh"; + exec "krakenhll-download_taxonomy.sh"; } sub download_library { @@ -297,12 +297,12 @@ sub download_library { warn "Unknown library type \"$type\"\n"; usage(); } - exec "krakenu-download_genomic_library.sh", $type; + exec "krakenhll-download_genomic_library.sh", $type; } sub add_to_library { my $arg = shift; - exec "krakenu-add_to_library.sh", $arg; + exec "krakenhll-add_to_library.sh", $arg; } sub shrink_db { @@ -313,11 +313,11 @@ sub shrink_db { if (! defined($new_db)) { die "Must specify new database name to perform shrink task\n"; } - exec "krakenu-shrink_db.sh", $new_count, $new_db, $shrink_block_offset; + exec "krakenhll-shrink_db.sh", $new_count, $new_db, $shrink_block_offset; } sub standard_installation { - exec "krakenu-standard_installation.sh"; + exec "krakenhll-standard_installation.sh"; } sub build_database { @@ -340,13 +340,13 @@ sub build_database { $ENV{"KRAKEN_LIBRARY_DIRS"} = "@library_dirs"; $ENV{"KRAKEN_TAXONOMY_DIR"} = $taxonomy_dir; my $opt = ($verbose? "-x" : ""); - exec "krakenu-build_db.sh"; + exec "krakenhll-build_db.sh"; } sub clean_database { - exec "krakenu-clean_db.sh"; + exec "krakenhll-clean_db.sh"; } sub upgrade_database { - exec "krakenu-upgrade_db.sh"; + exec "krakenhll-upgrade_db.sh"; } diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenhll-build_db.sh similarity index 96% rename from scripts/krakenu-build_db.sh rename to scripts/krakenhll-build_db.sh index f8c1450..959f041 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenhll-build_db.sh @@ -54,7 +54,7 @@ script_dir=`dirname $0` DATABASE_DIR="$KRAKEN_DB_NAME" FIND_OPTS=-L -JELLYFISH_BIN=`$script_dir/krakenu-check_for_jellyfish.sh` +JELLYFISH_BIN=`$script_dir/krakenhll-check_for_jellyfish.sh` NCBI_SERVER="ftp.ncbi.nih.gov" FTP_SERVER="ftp://$NCBI_SERVER" @@ -267,7 +267,7 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then REPNAME=database if [[ ! -s $REPNAME.report.tsv ]]; then echo "Creating database summary report $REPNAME.report.tsv ..." - krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --fasta-input <( cat_library ) > $REPNAME.kraken.tsv + krakenhll --db . 
--report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --fasta-input <( cat_library ) > $REPNAME.kraken.tsv fi fi @@ -300,7 +300,7 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then REPNAME=uid_database if [[ ! -s $REPNAME.report.tsv ]]; then echo "Creating UID database summary report $REPNAME.report.tsv ..." - krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv + krakenhll --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv fi fi diff --git a/scripts/krakenu-check_for_jellyfish.sh b/scripts/krakenhll-check_for_jellyfish.sh similarity index 100% rename from scripts/krakenu-check_for_jellyfish.sh rename to scripts/krakenhll-check_for_jellyfish.sh diff --git a/scripts/krakenu-clean_db.sh b/scripts/krakenhll-clean_db.sh similarity index 100% rename from scripts/krakenu-clean_db.sh rename to scripts/krakenhll-clean_db.sh diff --git a/scripts/krakenu-cp_into_tempfile.pl b/scripts/krakenhll-cp_into_tempfile.pl similarity index 100% rename from scripts/krakenu-cp_into_tempfile.pl rename to scripts/krakenhll-cp_into_tempfile.pl diff --git a/scripts/krakenu-download b/scripts/krakenhll-download similarity index 99% rename from scripts/krakenu-download rename to scripts/krakenhll-download index b70a24f..c052463 100755 --- a/scripts/krakenu-download +++ b/scripts/krakenhll-download @@ -1,7 +1,7 @@ #!/usr/bin/env perl #vim: et:ts=2:sw=2 -# krakenu-download.pl - based on centrifuge-download +# krakenhll-download.pl - based on centrifuge-download # (c) Florian Breitwieser, 2017 use strict; diff --git a/scripts/krakenu-filter b/scripts/krakenhll-filter similarity index 100% rename from scripts/krakenu-filter rename to scripts/krakenhll-filter diff --git a/scripts/krakenu-mpa-report b/scripts/krakenhll-mpa-report similarity index 100% rename from scripts/krakenu-mpa-report rename to scripts/krakenhll-mpa-report diff --git a/scripts/krakenu-read_merger.pl b/scripts/krakenhll-read_merger.pl similarity index 100% rename from scripts/krakenu-read_merger.pl rename to scripts/krakenhll-read_merger.pl diff --git a/scripts/krakenu-report b/scripts/krakenhll-report similarity index 100% rename from scripts/krakenu-report rename to scripts/krakenhll-report diff --git a/scripts/krakenu-shrink_db.sh b/scripts/krakenhll-shrink_db.sh similarity index 100% rename from scripts/krakenu-shrink_db.sh rename to scripts/krakenhll-shrink_db.sh diff --git a/scripts/krakenu-standard_installation.sh b/scripts/krakenhll-standard_installation.sh similarity index 77% rename from scripts/krakenu-standard_installation.sh rename to scripts/krakenhll-standard_installation.sh index e09de80..b34dd44 100755 --- a/scripts/krakenu-standard_installation.sh +++ b/scripts/krakenhll-standard_installation.sh @@ -30,11 +30,11 @@ then WOD_FLAG="--work-on-disk" fi -krakenu-check_for_jellyfish.sh -krakenu-download -o $KRAKEN_DB_NAME/taxonomy --download-taxonomy -krakenu-download -o $KRAKEN_DB_NAME/library -d archaea,bacteria refseq > $KRAKEN_DB_NAME/seqid2taxid.map -krakenu-download -o $KRAKEN_DB_NAME/library -d viral -a Any refseq >> $KRAKEN_DB_NAME/seqid2taxid.map -krakenu-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ +krakenhll-check_for_jellyfish.sh +krakenhll-download -o $KRAKEN_DB_NAME/taxonomy --download-taxonomy +krakenhll-download -o $KRAKEN_DB_NAME/library -d archaea,bacteria refseq > $KRAKEN_DB_NAME/seqid2taxid.map +krakenhll-download -o 
$KRAKEN_DB_NAME/library -d viral -a Any refseq >> $KRAKEN_DB_NAME/seqid2taxid.map +krakenhll-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ --jellyfish-hash-size "$KRAKEN_HASH_SIZE" \ --max-db-size "$KRAKEN_MAX_DB_SIZE" \ --minimizer-len $KRAKEN_MINIMIZER_LEN \ diff --git a/scripts/krakenu-translate b/scripts/krakenhll-translate similarity index 100% rename from scripts/krakenu-translate rename to scripts/krakenhll-translate diff --git a/scripts/krakenu-upgrade_db.sh b/scripts/krakenhll-upgrade_db.sh similarity index 100% rename from scripts/krakenu-upgrade_db.sh rename to scripts/krakenhll-upgrade_db.sh diff --git a/scripts/krakenu-verify_gi_numbers.pl b/scripts/krakenhll-verify_gi_numbers.pl similarity index 100% rename from scripts/krakenu-verify_gi_numbers.pl rename to scripts/krakenhll-verify_gi_numbers.pl diff --git a/src/Makefile b/src/Makefile index cc84b11..65415db 100644 --- a/src/Makefile +++ b/src/Makefile @@ -19,23 +19,23 @@ db_shrink: krakendb.o quickfile.o db_sort: krakendb.o quickfile.o -set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o +set_lcas: krakendb.o quickfile.o krakenhlltil.o seqreader.o uid_mapping.o grade_classification: taxdb.h report-cols.h read_uid_mapping: quickfile.o -classify: classify.cpp krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o hyperloglogplus.h taxdb.h report-cols.h +classify: classify.cpp krakendb.o quickfile.o krakenhlltil.o seqreader.o uid_mapping.o hyperloglogplus.h taxdb.h report-cols.h $(CXX) $(CXXFLAGS) -o classify $^ $(LIBFLAGS) build_taxdb: taxdb.h report-cols.h quickfile.o make_seqid_to_taxid_map: quickfile.o -read_uid_mapping: quickfile.o krakenutil.o uid_mapping.o +read_uid_mapping: quickfile.o krakenhlltil.o uid_mapping.o -krakenutil.o: krakenutil.cpp krakenutil.hpp taxdb.h report-cols.h - $(CXX) $(CXXFLAGS) -c krakenutil.cpp +krakenhlltil.o: krakenhlltil.cpp krakenhlltil.hpp taxdb.h report-cols.h + $(CXX) $(CXXFLAGS) -c krakenhlltil.cpp krakendb.o: krakendb.cpp krakendb.hpp quickfile.hpp $(CXX) $(CXXFLAGS) -c krakendb.cpp @@ -46,5 +46,5 @@ seqreader.o: seqreader.cpp seqreader.hpp quickfile.hpp quickfile.o: quickfile.cpp quickfile.hpp $(CXX) $(CXXFLAGS) -c quickfile.cpp -uid_mapping.o: krakenutil.hpp uid_mapping.hpp uid_mapping.cpp +uid_mapping.o: krakenhlltil.hpp uid_mapping.hpp uid_mapping.cpp $(CXX) $(CXXFLAGS) -c uid_mapping.cpp diff --git a/src/classify.cpp b/src/classify.cpp index f2ac91b..b2a5723 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -19,7 +19,7 @@ #include "kraken_headers.hpp" #include "krakendb.hpp" -#include "krakenutil.hpp" +#include "krakenhlltil.hpp" #include "quickfile.hpp" #include "seqreader.hpp" #include "readcounts.hpp" diff --git a/src/get_kmers.cpp b/src/get_kmers.cpp index 9288078..22f19a4 100644 --- a/src/get_kmers.cpp +++ b/src/get_kmers.cpp @@ -20,7 +20,7 @@ #include "kraken_headers.hpp" #include "quickfile.hpp" #include "krakendb.hpp" -#include "krakenutil.hpp" +#include "krakenhlltil.hpp" #include "seqreader.hpp" #include diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index 46fd953..bec1d1c 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -19,7 +19,7 @@ #include "assert_helpers.h" #include "kraken_headers.hpp" -#include "krakenutil.hpp" +#include "krakenhlltil.hpp" #include #include diff --git a/src/krakenutil.hpp b/src/krakenutil.hpp index 46e8eb8..cbfd3d5 100644 --- a/src/krakenutil.hpp +++ b/src/krakenutil.hpp @@ -17,8 +17,8 @@ * along with Kraken. If not, see . 
*/ -#ifndef KRAKENUTIL_HPP -#define KRAKENUTIL_HPP +#ifndef KRAKENHLLTIL_HPP +#define KRAKENHLLTIL_HPP #include "kraken_headers.hpp" #include diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index dc75c63..8457599 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -21,7 +21,7 @@ #include "kraken_headers.hpp" #include "quickfile.hpp" #include "krakendb.hpp" -#include "krakenutil.hpp" +#include "krakenhlltil.hpp" #include "seqreader.hpp" #include "taxdb.h" #include "readcounts.hpp" diff --git a/src/uid_mapping.cpp b/src/uid_mapping.cpp index 2914468..5b4d001 100644 --- a/src/uid_mapping.cpp +++ b/src/uid_mapping.cpp @@ -2,7 +2,7 @@ #include #include #include "uid_mapping.hpp" -#include "krakenutil.hpp" +#include "krakenhlltil.hpp" #include "assert_helpers.h" using namespace std; diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index d0675f2..b10cac8 100755 --- a/tests/build-dbs.sh +++ b/tests/build-dbs.sh @@ -19,7 +19,7 @@ build_db() { if [[ "$PROG" == "kraken" ]]; then mkdir -p $DB_DIR - CMD="krakenu-build --kmer-len $K --minimizer-len $MIN --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database" + CMD="krakenhll-build --kmer-len $K --minimizer-len $MIN --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database" for L in $@; do CMD="$CMD --library-dir=$DIR/data/library/$L" done diff --git a/tests/init.sh b/tests/init.sh index d029fb1..495ee4a 100755 --- a/tests/init.sh +++ b/tests/init.sh @@ -4,15 +4,15 @@ set -xeu [[ $# -eq 1 ]] && DIR=$1 || DIR=`pwd` -## Install KrakenU locally into install/ +## Install KrakenHLL locally into install/ #$(dirname $0)/../install_kraken.sh --install-jellyfish $DIR/install ## Download taxonomy and genomic data into data/ -time $DIR/install/krakenu-download --db $DIR/data -R taxonomy refseq/archaea refseq/bacteria -time $DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors refseq/viral/Any -time $DIR/install/krakenu-download --db $DIR/data -R refseq/fungi refseq/fungi/Chromosome refseq/protozoa refseq/protozoa/Chromosome -time $DIR/install/krakenu-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606 -time $DIR/install/krakenu-download --db $DIR/data -R contaminants +time $DIR/install/krakenhll-download --db $DIR/data -R taxonomy refseq/archaea refseq/bacteria +time $DIR/install/krakenhll-download --db $DIR/data -R --include-viral-neighbors refseq/viral/Any +time $DIR/install/krakenhll-download --db $DIR/data -R refseq/fungi refseq/fungi/Chromosome refseq/protozoa refseq/protozoa/Chromosome +time $DIR/install/krakenhll-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606 +time $DIR/install/krakenhll-download --db $DIR/data -R contaminants for i in fungi protozoa viral viral-neighbors archaea bacteria; do [[ -s "$DIR/data/all-$i.fna" ]] || find $DIR/data/library/$i -name '*.fna' -print0 | xargs -0 -n 100 cat > $DIR/data/all-$i.fna diff --git a/tests/test-on-simulated-reads.sh b/tests/test-on-simulated-reads.sh index 580f218..21b6f0d 100755 --- a/tests/test-on-simulated-reads.sh +++ b/tests/test-on-simulated-reads.sh @@ -29,10 +29,10 @@ run_kraken() { if [[ "$PROG" == "kraken" ]]; then CMD="kraken" - elif [[ "$PROG" == "krakenu" ]]; then - CMD="$DIR/install/krakenu --report-file $KFILE.report" - elif [[ "$PROG" == "krakenuid" ]]; then - CMD="$DIR/install/krakenu --report-file $KFILE.report --uid-mapping" + 
elif [[ "$PROG" == "krakenhll" ]]; then + CMD="$DIR/install/krakenhll --report-file $KFILE.report" + elif [[ "$PROG" == "krakenhllid" ]]; then + CMD="$DIR/install/krakenhll --report-file $KFILE.report --uid-mapping" else echo "Unknown $PROG" return; @@ -61,14 +61,14 @@ for i in 1; do # 2 3 FQ=$SDIR/$NAM.fq [[ -f $FQ ]] || randomreads.sh -Xmx40g ref=$DIR/data/all-$dat.fna out=$FQ reads=$AB len=$len seed=$i for K in 31; do - # run_kraken $FQ $NAM $dat viral $K krakenuid + # run_kraken $FQ $NAM $dat viral $K krakenhllid if [[ `uname` != "Darwin" ]]; then run_kraken $FQ $NAM $dat oct2017 $K kraken ALWAYS_SEQMAP - run_kraken $FQ $NAM $dat oct2017 $K krakenu ALWAYS_SEQMAP - run_kraken $FQ $NAM $dat oct2017 $K krakenuid ALWAYS_SEQMAP + run_kraken $FQ $NAM $dat oct2017 $K krakenhll ALWAYS_SEQMAP + run_kraken $FQ $NAM $dat oct2017 $K krakenhllid ALWAYS_SEQMAP else run_kraken $FQ $NAM $dat viral $K kraken - run_kraken $FQ $NAM $dat viral $K krakenu + run_kraken $FQ $NAM $dat viral $K krakenhll fi done done From 0c33f0ecd357a37384de7afb004708d7df5603b4 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 6 Nov 2017 13:26:47 -0500 Subject: [PATCH 094/105] Change name to KrakenHLL --- README.md | 12 +++++----- install_kraken.sh | 18 ++++++++------- scripts/{krakenu => krakenhll} | 4 +++- ...library.sh => krakenhll-add_to_library.sh} | 0 scripts/{krakenu-build => krakenhll-build} | 22 ++++++++++--------- ...kenu-build_db.sh => krakenhll-build_db.sh} | 6 ++--- ...sh.sh => krakenhll-check_for_jellyfish.sh} | 0 ...kenu-clean_db.sh => krakenhll-clean_db.sh} | 0 ...pfile.pl => krakenhll-cp_into_tempfile.pl} | 0 .../{krakenu-download => krakenhll-download} | 2 +- scripts/{krakenu-filter => krakenhll-filter} | 0 ...rakenu-mpa-report => krakenhll-mpa-report} | 0 ...ead_merger.pl => krakenhll-read_merger.pl} | 0 scripts/{krakenu-report => krakenhll-report} | 0 ...nu-shrink_db.sh => krakenhll-shrink_db.sh} | 0 ....sh => krakenhll-standard_installation.sh} | 10 ++++----- ...{krakenu-translate => krakenhll-translate} | 0 ...-upgrade_db.sh => krakenhll-upgrade_db.sh} | 0 ...bers.pl => krakenhll-verify_gi_numbers.pl} | 0 src/hyperloglogplus.h | 3 ++- src/krakenutil.hpp | 4 ++-- tests/build-dbs.sh | 2 +- tests/init.sh | 12 +++++----- tests/test-on-simulated-reads.sh | 16 +++++++------- 24 files changed, 59 insertions(+), 52 deletions(-) rename scripts/{krakenu => krakenhll} (99%) rename scripts/{krakenu-add_to_library.sh => krakenhll-add_to_library.sh} (100%) rename scripts/{krakenu-build => krakenhll-build} (96%) rename scripts/{krakenu-build_db.sh => krakenhll-build_db.sh} (96%) rename scripts/{krakenu-check_for_jellyfish.sh => krakenhll-check_for_jellyfish.sh} (100%) rename scripts/{krakenu-clean_db.sh => krakenhll-clean_db.sh} (100%) rename scripts/{krakenu-cp_into_tempfile.pl => krakenhll-cp_into_tempfile.pl} (100%) rename scripts/{krakenu-download => krakenhll-download} (99%) rename scripts/{krakenu-filter => krakenhll-filter} (100%) rename scripts/{krakenu-mpa-report => krakenhll-mpa-report} (100%) rename scripts/{krakenu-read_merger.pl => krakenhll-read_merger.pl} (100%) rename scripts/{krakenu-report => krakenhll-report} (100%) rename scripts/{krakenu-shrink_db.sh => krakenhll-shrink_db.sh} (100%) rename scripts/{krakenu-standard_installation.sh => krakenhll-standard_installation.sh} (77%) rename scripts/{krakenu-translate => krakenhll-translate} (100%) rename scripts/{krakenu-upgrade_db.sh => krakenhll-upgrade_db.sh} (100%) rename scripts/{krakenu-verify_gi_numbers.pl => 
krakenhll-verify_gi_numbers.pl} (100%)

diff --git a/README.md b/README.md
index 83ae11b..bf23151 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
-Kraken taxonomic sequence classification system with unique k-mer counting
+KrakenHLL taxonomic sequence classification system with unique k-mer counting
 ===============================================
 
 [Kraken](https://github.com/DerrickWood/kraken) is a fast taxonomic classifier for metagenomics data. This project, kraken-hll, adds some additional functionality - most notably a unique k-mer count using the HyperLogLog algorithm. Spurious identifications due to sequence contamination in the dataset or database often lead to many reads; however, they usually cover only a small portion of the genome.
 
-kraken-hll adds two additional columns to the Kraken report - total number of k-mers observed for taxon, and the total number of unique k-mers observed for taxon (columns 3 and 4, resp.).
+KrakenHLL adds two additional columns to the Kraken report - total number of k-mers observed for taxon, and the total number of unique k-mers observed for taxon (columns 3 and 4, resp.).
 
 Here's a small example of a classification against a viral database with k=25. There are three species identified by just one read - Enterobacteria phage BP-4795, Salmonella phage SEN22, Sulfolobus monocaudavirus SMV1. Out of those, the identification of Salmonella phage SEN22 is the strongest, as the read was matched with 116 k-mers that are unique to the sequence, while the match to Sulfolobus monocaudavirus SMV1 is only based on a single 25-mer.
@@ -33,13 +33,13 @@ Here's a small example of a classification against a viral database with k=25. T
 ## Usage
-For usage, see `krakenu --help`. Note that you can use the same database as Kraken with one difference - instead of the files `DB_DIR/taxonomy/nodes.dmp` and `DB_DIR/taxonomy/names.dmp` that kraken relies upon, `kraken-hll` needs the file `DB_DIR/taxDB`. This can be generated with the script `build_taxdb`: `KRAKEN_DIR/build_taxdb DB_DIR/taxonomy/names.dmp DB_DIR/taxonomy/nodes.dmp > DB_DIR/taxDB`. The code behind the taxDB is based on [k-SLAM](https://github.com/aindj/k-SLAM).
+For usage, see `krakenhll --help`. Note that you can use the same database as Kraken with one difference - instead of the files `DB_DIR/taxonomy/nodes.dmp` and `DB_DIR/taxonomy/names.dmp` that kraken relies upon, `kraken-hll` needs the file `DB_DIR/taxDB`. This can be generated with the script `build_taxdb`: `KRAKEN_DIR/build_taxdb DB_DIR/taxonomy/names.dmp DB_DIR/taxonomy/nodes.dmp > DB_DIR/taxDB`. The code behind the taxDB is based on [k-SLAM](https://github.com/aindj/k-SLAM).
 
 ### Differences to `kraken`
- - Use `krakenu --report-file FILENAME ...` to write the kraken report to `FILENAME`.
- - Use `krakenu --db DB1 --db DB2 --db DB3 ...` to first attempt, for each k-mer, to assign it based on DB1, then DB2, then DB3. You can use this to prefer identifications based on DB1 (e.g. human and contaminant sequences), then DB2 (e.g. completed bacterial genomes), then DB3, etc. Note that this option is incompatible with `krakenu-build --generate-taxonomy-ids-for-sequences` since the taxDB between the databases has to be absolutely the same.
+ - Use `krakenhll --report-file FILENAME ...` to write the kraken report to `FILENAME`.
+ - Use `krakenhll --db DB1 --db DB2 --db DB3 ...` to first attempt, for each k-mer, to assign it based on DB1, then DB2, then DB3. You can use this to prefer identifications based on DB1 (e.g. 
human and contaminant sequences), then DB2 (e.g. completed bacterial genomes), then DB3, etc. Note that this option is incompatible with `krakenhll-build --generate-taxonomy-ids-for-sequences` since the taxDB between the databases has to be absolutely the same. - Add a suffix `.gz` to output files to generate gzipped output files ### Differences to `kraken-build` - - Use `krakenu-build --generate-taxonomy-ids-for-sequences ...` to add pseudo-taxonomy IDs for each sequence header. An example for the result using this is in the ouput above - one read has been assigned specifically to `KC207814.1 Human herpesvirus 4 strain Mutu, complete genome`. + - Use `krakenhll-build --generate-taxonomy-ids-for-sequences ...` to add pseudo-taxonomy IDs for each sequence header. An example for the result using this is in the ouput above - one read has been assigned specifically to `KC207814.1 Human herpesvirus 4 strain Mutu, complete genome`. - `seqid2taxid.map` mapping sequence IDs to taxonomy IDs does NOT parse or require `>gi|`, but rather the sequence ID is the header up to just before the first space diff --git a/install_kraken.sh b/install_kraken.sh index 0e662b6..3b12552 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -82,14 +82,16 @@ do fi done -echo -echo "Kraken installation complete." -echo -echo "To make things easier for you, you may want to copy/symlink the following" -echo "files into a directory in your PATH:" -for file in $KRAKEN_DIR/krakenu* +echo -n " +Kraken installation complete. + +To make things easier for you, you may want to copy/symlink the following +files into a directory in your PATH: + +ln -s" +for file in $KRAKEN_DIR/krakenhll* do - [ -x "$file" ] && echo " $file" + [ -x "$file" ] && echo -n " $file" done - +echo " DEST_DIR" exit 0 diff --git a/scripts/krakenu b/scripts/krakenhll similarity index 99% rename from scripts/krakenu rename to scripts/krakenhll index 006a078..e6d4df6 100755 --- a/scripts/krakenu +++ b/scripts/krakenhll @@ -222,7 +222,6 @@ Usage: $PROG [options] Options: --db NAME Name for Kraken DB (default: $default_db) --report-file FILENAME Write Kraken report to FILENAME - --uid-mapping Map using UID database --threads NUM Number of threads (default: $def_thread_ct) --fasta-input Input is FASTA format --fastq-input Input is FASTQ format @@ -246,6 +245,9 @@ Options: --help Print this message --version Print version information +Experimental: + --uid-mapping Map using UID database + If none of the *-input or *-compressed flags are specified, and the file is a regular file, automatic format detection is attempted. 
EOF diff --git a/scripts/krakenu-add_to_library.sh b/scripts/krakenhll-add_to_library.sh similarity index 100% rename from scripts/krakenu-add_to_library.sh rename to scripts/krakenhll-add_to_library.sh diff --git a/scripts/krakenu-build b/scripts/krakenhll-build similarity index 96% rename from scripts/krakenu-build rename to scripts/krakenhll-build index e90b353..8888cd8 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenhll-build @@ -86,7 +86,7 @@ $hash_size = ""; $max_db_size = ""; $add_taxonomy_ids_for_genome = 0; $add_taxonomy_ids_for_seq = 0; -$build_uid_database = 1; +$build_uid_database = 0; $build_lca_database = 1; # variables corresponding to task options @@ -260,7 +260,6 @@ Options: --shrink-block-offset NUM When shrinking, select the k-mer that is NUM positions from the end of a block of k-mers (default: 1) - --uid-database Build a UID database (default no) --lca-database Build a LCA database (default yes) --no-lca-database Do not build a LCA database --work-on-disk Perform most operations on disk rather than in @@ -273,6 +272,9 @@ Options: for one taxonomy ID. --library-dir DIR Use DIR for reference sequences instead of DBDIR/library. --taxonomy-dir DIR Use DIR for taxonomy instead of DBDIR/taxonomy. + +Experimental: + --uid-database Build a UID database (default no) EOF exit $exit_code; } @@ -288,7 +290,7 @@ sub display_version { } sub download_taxonomy { - exec "krakenu-download_taxonomy.sh"; + exec "krakenhll-download_taxonomy.sh"; } sub download_library { @@ -297,12 +299,12 @@ sub download_library { warn "Unknown library type \"$type\"\n"; usage(); } - exec "krakenu-download_genomic_library.sh", $type; + exec "krakenhll-download_genomic_library.sh", $type; } sub add_to_library { my $arg = shift; - exec "krakenu-add_to_library.sh", $arg; + exec "krakenhll-add_to_library.sh", $arg; } sub shrink_db { @@ -313,11 +315,11 @@ sub shrink_db { if (! defined($new_db)) { die "Must specify new database name to perform shrink task\n"; } - exec "krakenu-shrink_db.sh", $new_count, $new_db, $shrink_block_offset; + exec "krakenhll-shrink_db.sh", $new_count, $new_db, $shrink_block_offset; } sub standard_installation { - exec "krakenu-standard_installation.sh"; + exec "krakenhll-standard_installation.sh"; } sub build_database { @@ -340,13 +342,13 @@ sub build_database { $ENV{"KRAKEN_LIBRARY_DIRS"} = "@library_dirs"; $ENV{"KRAKEN_TAXONOMY_DIR"} = $taxonomy_dir; my $opt = ($verbose? "-x" : ""); - exec "krakenu-build_db.sh"; + exec "krakenhll-build_db.sh"; } sub clean_database { - exec "krakenu-clean_db.sh"; + exec "krakenhll-clean_db.sh"; } sub upgrade_database { - exec "krakenu-upgrade_db.sh"; + exec "krakenhll-upgrade_db.sh"; } diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenhll-build_db.sh similarity index 96% rename from scripts/krakenu-build_db.sh rename to scripts/krakenhll-build_db.sh index f8c1450..959f041 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenhll-build_db.sh @@ -54,7 +54,7 @@ script_dir=`dirname $0` DATABASE_DIR="$KRAKEN_DB_NAME" FIND_OPTS=-L -JELLYFISH_BIN=`$script_dir/krakenu-check_for_jellyfish.sh` +JELLYFISH_BIN=`$script_dir/krakenhll-check_for_jellyfish.sh` NCBI_SERVER="ftp.ncbi.nih.gov" FTP_SERVER="ftp://$NCBI_SERVER" @@ -267,7 +267,7 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then REPNAME=database if [[ ! -s $REPNAME.report.tsv ]]; then echo "Creating database summary report $REPNAME.report.tsv ..." - krakenu --db . 
--report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --fasta-input <( cat_library ) > $REPNAME.kraken.tsv + krakenhll --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --fasta-input <( cat_library ) > $REPNAME.kraken.tsv fi fi @@ -300,7 +300,7 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then REPNAME=uid_database if [[ ! -s $REPNAME.report.tsv ]]; then echo "Creating UID database summary report $REPNAME.report.tsv ..." - krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv + krakenhll --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv fi fi diff --git a/scripts/krakenu-check_for_jellyfish.sh b/scripts/krakenhll-check_for_jellyfish.sh similarity index 100% rename from scripts/krakenu-check_for_jellyfish.sh rename to scripts/krakenhll-check_for_jellyfish.sh diff --git a/scripts/krakenu-clean_db.sh b/scripts/krakenhll-clean_db.sh similarity index 100% rename from scripts/krakenu-clean_db.sh rename to scripts/krakenhll-clean_db.sh diff --git a/scripts/krakenu-cp_into_tempfile.pl b/scripts/krakenhll-cp_into_tempfile.pl similarity index 100% rename from scripts/krakenu-cp_into_tempfile.pl rename to scripts/krakenhll-cp_into_tempfile.pl diff --git a/scripts/krakenu-download b/scripts/krakenhll-download similarity index 99% rename from scripts/krakenu-download rename to scripts/krakenhll-download index b70a24f..c052463 100755 --- a/scripts/krakenu-download +++ b/scripts/krakenhll-download @@ -1,7 +1,7 @@ #!/usr/bin/env perl #vim: et:ts=2:sw=2 -# krakenu-download.pl - based on centrifuge-download +# krakenhll-download.pl - based on centrifuge-download # (c) Florian Breitwieser, 2017 use strict; diff --git a/scripts/krakenu-filter b/scripts/krakenhll-filter similarity index 100% rename from scripts/krakenu-filter rename to scripts/krakenhll-filter diff --git a/scripts/krakenu-mpa-report b/scripts/krakenhll-mpa-report similarity index 100% rename from scripts/krakenu-mpa-report rename to scripts/krakenhll-mpa-report diff --git a/scripts/krakenu-read_merger.pl b/scripts/krakenhll-read_merger.pl similarity index 100% rename from scripts/krakenu-read_merger.pl rename to scripts/krakenhll-read_merger.pl diff --git a/scripts/krakenu-report b/scripts/krakenhll-report similarity index 100% rename from scripts/krakenu-report rename to scripts/krakenhll-report diff --git a/scripts/krakenu-shrink_db.sh b/scripts/krakenhll-shrink_db.sh similarity index 100% rename from scripts/krakenu-shrink_db.sh rename to scripts/krakenhll-shrink_db.sh diff --git a/scripts/krakenu-standard_installation.sh b/scripts/krakenhll-standard_installation.sh similarity index 77% rename from scripts/krakenu-standard_installation.sh rename to scripts/krakenhll-standard_installation.sh index e09de80..b34dd44 100755 --- a/scripts/krakenu-standard_installation.sh +++ b/scripts/krakenhll-standard_installation.sh @@ -30,11 +30,11 @@ then WOD_FLAG="--work-on-disk" fi -krakenu-check_for_jellyfish.sh -krakenu-download -o $KRAKEN_DB_NAME/taxonomy --download-taxonomy -krakenu-download -o $KRAKEN_DB_NAME/library -d archaea,bacteria refseq > $KRAKEN_DB_NAME/seqid2taxid.map -krakenu-download -o $KRAKEN_DB_NAME/library -d viral -a Any refseq >> $KRAKEN_DB_NAME/seqid2taxid.map -krakenu-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ +krakenhll-check_for_jellyfish.sh +krakenhll-download -o $KRAKEN_DB_NAME/taxonomy --download-taxonomy 
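# ---- Editor's note (not part of the patch) ---------------------------------
# Reading order of this rewritten hunk: jellyfish check, taxonomy download,
# then the genomic libraries. Note the redirections on the two download lines
# below: the archaea/bacteria call creates seqid2taxid.map with ">", and the
# viral call appends to it with ">>", so a single mapping file feeds the
# subsequent krakenhll-build step.
# ---- End editor's note ------------------------------------------------------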
+krakenhll-download -o $KRAKEN_DB_NAME/library -d archaea,bacteria refseq > $KRAKEN_DB_NAME/seqid2taxid.map +krakenhll-download -o $KRAKEN_DB_NAME/library -d viral -a Any refseq >> $KRAKEN_DB_NAME/seqid2taxid.map +krakenhll-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ --jellyfish-hash-size "$KRAKEN_HASH_SIZE" \ --max-db-size "$KRAKEN_MAX_DB_SIZE" \ --minimizer-len $KRAKEN_MINIMIZER_LEN \ diff --git a/scripts/krakenu-translate b/scripts/krakenhll-translate similarity index 100% rename from scripts/krakenu-translate rename to scripts/krakenhll-translate diff --git a/scripts/krakenu-upgrade_db.sh b/scripts/krakenhll-upgrade_db.sh similarity index 100% rename from scripts/krakenu-upgrade_db.sh rename to scripts/krakenhll-upgrade_db.sh diff --git a/scripts/krakenu-verify_gi_numbers.pl b/scripts/krakenhll-verify_gi_numbers.pl similarity index 100% rename from scripts/krakenu-verify_gi_numbers.pl rename to scripts/krakenhll-verify_gi_numbers.pl diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h index b4d9a81..10baa14 100644 --- a/src/hyperloglogplus.h +++ b/src/hyperloglogplus.h @@ -32,7 +32,7 @@ using namespace std; // experimentally determined threshold values for p - 4 static const uint32_t threshold[] = {10, 20, 40, 80, 220, 400, 900, 1800, 3100, - 6500, 11500, 20000, 50000, 120000, 350000}; + 6500, 11500, 20000, 50000, 120000, 350000}; /////////////////////// @@ -69,6 +69,7 @@ inline uint64_t ranhash (uint64_t u) { return v; } +// from https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp inline uint64_t murmurhash3_finalizer (uint64_t key) { key += 1; // murmurhash returns a hash value of 0 for the key 0 - avoid that. key ^= key >> 33; diff --git a/src/krakenutil.hpp b/src/krakenutil.hpp index 46e8eb8..cbfd3d5 100644 --- a/src/krakenutil.hpp +++ b/src/krakenutil.hpp @@ -17,8 +17,8 @@ * along with Kraken. If not, see . 
 */

-#ifndef KRAKENUTIL_HPP
-#define KRAKENUTIL_HPP
+#ifndef KRAKENHLLUTIL_HPP
+#define KRAKENHLLUTIL_HPP

 #include "kraken_headers.hpp"
 #include
diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh
index d0675f2..b92d23a 100755
--- a/tests/build-dbs.sh
+++ b/tests/build-dbs.sh
@@ -19,7 +19,7 @@ build_db() {
   if [[ "$PROG" == "kraken" ]]; then
     mkdir -p $DB_DIR
-    CMD="krakenu-build --kmer-len $K --minimizer-len $MIN --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database"
+    CMD="krakenhll-build --kmer-len $K --minimizer-len $MIN --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy"
     for L in $@; do
       CMD="$CMD --library-dir=$DIR/data/library/$L"
     done
diff --git a/tests/init.sh b/tests/init.sh
index d029fb1..495ee4a 100755
--- a/tests/init.sh
+++ b/tests/init.sh
@@ -4,15 +4,15 @@ set -xeu
 [[ $# -eq 1 ]] && DIR=$1 || DIR=`pwd`

-## Install KrakenU locally into install/
+## Install KrakenHLL locally into install/
 #$(dirname $0)/../install_kraken.sh --install-jellyfish $DIR/install

 ## Download taxonomy and genomic data into data/
-time $DIR/install/krakenu-download --db $DIR/data -R taxonomy refseq/archaea refseq/bacteria
-time $DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors refseq/viral/Any
-time $DIR/install/krakenu-download --db $DIR/data -R refseq/fungi refseq/fungi/Chromosome refseq/protozoa refseq/protozoa/Chromosome
-time $DIR/install/krakenu-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606
-time $DIR/install/krakenu-download --db $DIR/data -R contaminants
+time $DIR/install/krakenhll-download --db $DIR/data -R taxonomy refseq/archaea refseq/bacteria
+time $DIR/install/krakenhll-download --db $DIR/data -R --include-viral-neighbors refseq/viral/Any
+time $DIR/install/krakenhll-download --db $DIR/data -R refseq/fungi refseq/fungi/Chromosome refseq/protozoa refseq/protozoa/Chromosome
+time $DIR/install/krakenhll-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606
+time $DIR/install/krakenhll-download --db $DIR/data -R contaminants

 for i in fungi protozoa viral viral-neighbors archaea bacteria; do
   [[ -s "$DIR/data/all-$i.fna" ]] || find $DIR/data/library/$i -name '*.fna' -print0 | xargs -0 -n 100 cat > $DIR/data/all-$i.fna
diff --git a/tests/test-on-simulated-reads.sh b/tests/test-on-simulated-reads.sh
index 580f218..61ff561 100755
--- a/tests/test-on-simulated-reads.sh
+++ b/tests/test-on-simulated-reads.sh
@@ -29,10 +29,10 @@ run_kraken() {
   if [[ "$PROG" == "kraken" ]]; then
     CMD="kraken"
-  elif [[ "$PROG" == "krakenu" ]]; then
-    CMD="$DIR/install/krakenu --report-file $KFILE.report"
-  elif [[ "$PROG" == "krakenuid" ]]; then
-    CMD="$DIR/install/krakenu --report-file $KFILE.report --uid-mapping"
+  elif [[ "$PROG" == "krakenhll" ]]; then
+    CMD="$DIR/install/krakenhll --report-file $KFILE.report"
+  elif [[ "$PROG" == "krakenhllid" ]]; then
+    CMD="$DIR/install/krakenhll --report-file $KFILE.report --uid-mapping"
   else
     echo "Unknown $PROG"
     return;
@@ -61,14 +61,14 @@ for i in 1; do # 2 3
   FQ=$SDIR/$NAM.fq
   [[ -f $FQ ]] || randomreads.sh -Xmx40g ref=$DIR/data/all-$dat.fna out=$FQ reads=$AB len=$len seed=$i
   for K in 31; do
-    # run_kraken $FQ $NAM $dat viral $K krakenuid
+    # run_kraken $FQ $NAM $dat viral $K krakenhllid
     if [[ `uname` != "Darwin" ]]; then
       run_kraken $FQ $NAM $dat oct2017 $K kraken ALWAYS_SEQMAP
-      run_kraken $FQ $NAM $dat oct2017 $K krakenu
ALWAYS_SEQMAP - run_kraken $FQ $NAM $dat oct2017 $K krakenuid ALWAYS_SEQMAP + run_kraken $FQ $NAM $dat oct2017 $K krakenhll ALWAYS_SEQMAP + run_kraken $FQ $NAM $dat oct2017 $K krakenhllid ALWAYS_SEQMAP else run_kraken $FQ $NAM $dat viral $K kraken - run_kraken $FQ $NAM $dat viral $K krakenu + run_kraken $FQ $NAM $dat viral $K krakenhll fi done done From 274e41fb87fc1d3ccd76926682559de51ddc3b5a Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 8 Nov 2017 11:39:01 -0500 Subject: [PATCH 095/105] Update --- tests/build-dbs.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index bfbd3f8..d087dd5 100755 --- a/tests/build-dbs.sh +++ b/tests/build-dbs.sh @@ -20,7 +20,7 @@ build_db() { for L in $@; do CMD="$CMD --library-dir=$DIR/data/library/$L" done - #if [[ ! -f "$DB_DIR/is.busy" ]]; then + if [[ ! -f "$DB_DIR/is.busy" ]]; then echo "EXECUTING $CMD" touch $DB_DIR/is.busy $CMD 2>&1 | tee $DIR/dbs/$DB_NAM/build.log @@ -30,9 +30,9 @@ build_db() { dump_taxdb $DB_DIR/taxDB $DB_DIR/taxonomy/names.dmp $DB_DIR/nodes.dmp fi rm $DB_DIR/is.busy - #else - # echo "IGNORING $DB_DIR" - #fi + else + echo "$DB_DIR/is.busy exists, ignoring directory." + fi } K=$1; shift; From 69d835234c475c866871ceb1a7cfe9bf228edcb0 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 8 Nov 2017 11:43:14 -0500 Subject: [PATCH 096/105] Create taxDB if not present --- scripts/krakenhll | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/scripts/krakenhll b/scripts/krakenhll index e6d4df6..8a4aad9 100755 --- a/scripts/krakenhll +++ b/scripts/krakenhll @@ -38,6 +38,7 @@ $ENV{"KRAKEN_DIR"} = $KRAKEN_DIR; $ENV{"PATH"} = "$KRAKEN_DIR:$ENV{PATH}"; my $CLASSIFY = "$KRAKEN_DIR/classify"; +my $CREATE_TAXDB = "$KRAKEN_DIR/build_taxdb"; my $GZIP_MAGIC = chr(hex "1f") . chr(hex "8b"); my $BZIP2_MAGIC = "BZ"; @@ -92,6 +93,12 @@ if (! @ARGV) { usage(); } +if (!defined $report_file) { + print STDERR "Need to specify a report file with --report-file! +See --help for more details.\n"; + exit 1; +} + eval { @db_prefix = map { krakenlib::find_db($_) } @db_prefix }; if ($@) { die "$PROG: $@"; @@ -160,6 +167,16 @@ if ($uid_mapping) { } +if (! -f $db_prefix[0]."/taxDB") { + print STDERR "Taxonomy database not at ".$db_prefix[0]."/taxDB - creating it ..."; + die "$db_prefix[0]/taxonomy/nodes.dmp does not exist!" unless -f $db_prefix[0]."/taxonomy/nodes.dmp"; + die "$db_prefix[0]/taxonomy/names.dmp does not exist!" unless -f $db_prefix[0]."/taxonomy/names.dmp"; + + my $cmd = "$CREATE_TAXDB $db_prefix[0]/taxonomy/names.dmp $db_prefix[0]/taxonomy/nodes.dmp > $db_prefix[0]/taxDB"; + print STDERR "$cmd\n"; + system $cmd; +} + # handle piping for decompression/merging my @pipe_argv; if ($paired) { From d83c579bb5bd5a1787f9790aedd069a64036395c Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 8 Nov 2017 11:43:44 -0500 Subject: [PATCH 097/105] Compute k-mer counts if not present --- src/classify.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/classify.cpp b/src/classify.cpp index f2ac91b..3b8a03e 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -153,7 +153,7 @@ int main(int argc, char **argv) { if (!TaxDB_file.empty()) { // TODO: Define if the taxDB has read counts or not!! - taxdb = TaxonomyDB(TaxDB_file, false); + taxdb = TaxonomyDB(TaxDB_file, false); Parent_map = taxdb.getParentMap(); } else { cerr << "TaxDB argument is required!" 
<< endl; @@ -226,12 +226,20 @@ int main(int argc, char **argv) { if (Print_kraken_report) { for (size_t i = 0; i < DB_filenames.size(); ++i) { - const auto& fname = DB_filenames[i]; - ifstream ifs(fname + ".counts"); + const auto fname = DB_filenames[i] + ".counts"; + ifstream ifs(fname); if (ifs.good()) { ifs.close(); - taxdb.readGenomeSizes(fname+".counts"); + } else { + ofstream ofs(fname); + cerr << "Writing kmer counts to " << fname << "... [only once for this database, may take a while] " << endl; + auto counts = KrakenDatabases[i]->count_taxons(); + for (auto it = counts.begin(); it != counts.end(); ++it) { + ofs << it->first << '\t' << it->second << '\n'; + } + ofs.close(); } + taxdb.readGenomeSizes(fname); } TaxReport rep = TaxReport(*Report_output, taxdb, taxon_counts, false); From 30a65384f600c2233ef7ec8b58422b34f6f0f522 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 8 Nov 2017 11:46:26 -0500 Subject: [PATCH 098/105] Fix indent --- src/hyperloglogplus.h | 823 +++++++++++++++++++++--------------------- 1 file changed, 421 insertions(+), 402 deletions(-) diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h index b4d9a81..9274451 100644 --- a/src/hyperloglogplus.h +++ b/src/hyperloglogplus.h @@ -32,51 +32,52 @@ using namespace std; // experimentally determined threshold values for p - 4 static const uint32_t threshold[] = {10, 20, 40, 80, 220, 400, 900, 1800, 3100, - 6500, 11500, 20000, 50000, 120000, 350000}; + 6500, 11500, 20000, 50000, 120000, 350000}; /////////////////////// // /** - * gives the estimated cardinality for m bins, v of which are non-zero + * Gives the estimated cardinality for m bins, v of which are non-zero + * using linear counting of Whang et al., 1990: n_hat = -m ln(v) * @param m number of bins in the matrix * @param v number of non-zero bins * @return */ double linearCounting(uint32_t m, uint32_t v) { - if (v > m) { - throw std::invalid_argument("number of v should not be greater than m"); - } - double fm = double(m); - return fm * log(fm/double(v)); + if (v > m) { + throw std::invalid_argument("number of v should not be greater than m"); + } + return double(m) * log(double(m)/double(v)); } /** - * from Numerical Recipes, 3rd Edition, p 352 - * Returns hash of u as a 64-bit integer. - * -*/ + * from Numerical Recipes, 3rd Edition, p 352 + * Returns hash of u as a 64-bit integer. + * + */ inline uint64_t ranhash (uint64_t u) { uint64_t v = u * 3935559000370003845 + 2691343689449507681; - v ^= v >> 21; v ^= v << 37; v ^= v >> 4; - v *= 4768777513237032717; - v ^= v << 20; v ^= v >> 41; v ^= v << 5; return v; } +/** + * Avalanche mixer/finalizer from MurMurHash3 + * https://github.com/aappleby/smhasher + */ inline uint64_t murmurhash3_finalizer (uint64_t key) { - key += 1; // murmurhash returns a hash value of 0 for the key 0 - avoid that. - key ^= key >> 33; - key *= 0xff51afd7ed558ccd; - key ^= key >> 33; - key *= 0xc4ceb9fe1a85ec53; - key ^= key >> 33; - return key; + key += 1; // murmurhash returns a hash value of 0 for the key 0 - avoid that. 
+  key ^= key >> 33;
+  key *= 0xff51afd7ed558ccd;
+  key ^= key >> 33;
+  key *= 0xc4ceb9fe1a85ec53;
+  key ^= key >> 33;
+  return key;
 }

 /**
@@ -85,14 +86,14 @@ inline uint64_t murmurhash3_finalizer (uint64_t key) {
  * @return
  */
 double alpha(uint32_t m) {
-  switch (m) {
-  case 16: return 0.673;
-  case 32: return 0.697;
-  case 64: return 0.709;
-  }
-
-  // m >= 128
-  return 0.7213 / (1 + 1.079/double(m));
+  switch (m) {
+    case 16: return 0.673;
+    case 32: return 0.697;
+    case 64: return 0.709;
+  }
+
+  // m >= 128
+  return 0.7213 / (1 + 1.079/double(m));
 }

 /**
@@ -101,16 +102,16 @@ double alpha(uint32_t m) {
  * @return
  */
 double calculateEstimate(vector<uint8_t> array) {
-  double inverseSum = 0.0;
-  for (size_t i = 0; i < array.size(); ++i) {
-    // TODO: pre-calculate the power calculation
-    inverseSum += pow(2,-array[i]);
-  }
-  return alpha(array.size()) * double(array.size() * array.size()) * 1 / inverseSum;
+  double inverseSum = 0.0;
+  for (size_t i = 0; i < array.size(); ++i) {
+    // TODO: pre-calculate the power calculation
+    inverseSum += pow(2,-array[i]);
+  }
+  return alpha(array.size()) * double(array.size() * array.size()) * 1 / inverseSum;
 }

 uint32_t countZeros(vector<uint8_t> s) {
-  return (uint32_t)count(s.begin(), s.end(), 0);
+  return (uint32_t)count(s.begin(), s.end(), 0);
 }

 /**
@@ -128,9 +129,9 @@ T extractBits(T value, uint8_t hi, uint8_t lo, bool shift_left = false) {
   // ((T(1) << (hi - lo) - 1)                 1's from position 0 to position (hi-lo-1)
   // (((T(1) << (hi - lo)) - 1) << lo)        1's from position lo to position hi
-  // The T(1) is required to not cause overflow on 32bit machines
-  // TODO: consider creating a bitmask only once in the beginning
-  T bitmask = (((T(1) << (hi - lo)) - 1) << lo);
+  // The T(1) is required to not cause overflow on 32bit machines
+  // TODO: consider creating a bitmask only once in the beginning
+  T bitmask = (((T(1) << (hi - lo)) - 1) << lo);
   T result = value & bitmask;

   if (!shift_left) {
@@ -140,15 +141,31 @@ T extractBits(T value, uint8_t hi, uint8_t lo, bool shift_left = false) {
     // shift resulting bits to the left
     result = result << (sizeof(T)*8 - hi);
   }
-  return result;
+  return result;
+}
+
+inline
+void insert_hash(vector<uint32_t>& vec, uint32_t val) {
+  auto it = std::lower_bound( vec.begin(), vec.end(), val); // find insertion position in the ascending-sorted vector
+  if (it == vec.end() || *it != val) {
+    vec.insert( it, val ); // insert before iterator it, unless val is already present
+  }
+}
+
+inline
+void merge_lists(vector<uint32_t>& vec1, const vector<uint32_t>& vec2) {
+  for (const auto val : vec2) {
+    insert_hash(vec1, val); // add each value of vec2 at its sorted position in vec1
+  }
+}

 template<typename T>
 T extractBits(T bits, uint8_t hi) {
   // create a bitmask for first hi bits (LSB 0 numbering)
-  T bitmask = T(-1) << (sizeof(T)*8 - hi);
+  T bitmask = T(-1) << (sizeof(T)*8 - hi);

-  return (bits & bitmask);
+  return (bits & bitmask);
 }

 // functions for counting the number of leading 0-bits (clz)
@@ -174,7 +191,7 @@ static int clz_manual(uint64_t x)
 #endif

 inline uint32_t clz(const uint32_t x) {
-  return __builtin_clz(x);
+  return __builtin_clz(x);
 }

 inline uint32_t clz(const uint64_t x) {
@@ -189,7 +206,7 @@ inline uint32_t clz(const uint64_t x) {
 //#else
 uint32_t clz_log2(const uint64_t w) {
-  return 63 - floor(log2(w));
+  return 63 - floor(log2(w));
 }
 //#endif

@@ -198,7 +215,7 @@ uint32_t clz_log2(const uint64_t w) {
 // see Heule et al., section 5.3.2
 // Also, using sets might give a larger overhead as each insertion costs more
 // consider using vector and sort/unique when merging.
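// Editor's sketch, not part of the patch: the sorted-vector idiom used by
// insert_hash()/merge_lists() above, shown standalone. A sorted,
// duplicate-free vector<uint32_t> gives the same membership semantics as
// std::set<uint32_t> with less per-element memory overhead. All names below
// are illustrative only.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

static void insert_sorted_unique(std::vector<uint32_t>& vec, uint32_t val) {
  // lower_bound finds the first element that is >= val in the sorted vector
  auto it = std::lower_bound(vec.begin(), vec.end(), val);
  if (it == vec.end() || *it != val)  // skip val if it is already present
    vec.insert(it, val);              // inserting at `it` keeps vec sorted
}

int main() {
  const uint32_t xs[] = {9, 2, 5, 5, 7};
  std::vector<uint32_t> v;
  for (uint32_t x : xs) insert_sorted_unique(v, x);
  for (uint32_t x : v) std::cout << x << ' ';  // prints: 2 5 7 9
  std::cout << '\n';
  return 0;
}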
-typedef set SparseListType; +typedef vector SparseListType; typedef uint64_t HashSize; /** @@ -212,233 +229,236 @@ class HyperLogLogPlusMinus { private: - vector M; // registers (M) of size m - uint8_t p; // precision - uint32_t m; // number of registers - bool sparse; // sparse representation of the data? - SparseListType sparseList; // TODO: use a compressed list instead + vector M; // registers (M) of size m + uint8_t p; // precision + uint32_t m; // number of registers + bool sparse; // sparse representation of the data? + SparseListType sparseList; // TODO: use a compressed list instead - // vectors containing data for bias correction - vector > rawEstimateData; // TODO: make this static - vector > biasData; + // vectors containing data for bias correction + vector > rawEstimateData; // TODO: make this static + vector > biasData; - // sparse versions of p and m - static const uint8_t pPrime = 25; // precision when using a sparse representation - // fixed to 25, because 25 + 6 bits for rank + 1 flag bit = 32 - static const uint32_t mPrime = 1 << (pPrime -1); // 2^pPrime + // sparse versions of p and m + static const uint8_t pPrime = 25; // precision when using a sparse representation + // fixed to 25, because 25 + 6 bits for rank + 1 flag bit = 32 + static const uint32_t mPrime = 1 << (pPrime -1); // 2^pPrime public: - ~HyperLogLogPlusMinus() {}; - - /** - * Create new HyperLogLogPlusMinus counter - * @param precision - * @param sparse - */ - HyperLogLogPlusMinus(uint8_t precision=12, bool sparse=true):p(precision),sparse(sparse) { - if (precision > 18 || precision < 4) { - throw std::invalid_argument("precision (number of register = 2^precision) must be between 4 and 18"); - } - - this->m = 1 << precision; - - if (sparse) { - this->sparseList = SparseListType(); // TODO: if SparseListType is changed, initialize with appropriate size - } else { - this->M = vector(m); - } - } - - /** - * Add a new item to the counter. - * @param item - */ - void add(T_KEY item) { - add(item, sizeof(T_KEY)); - } - - /** - * Add a new item to the counter. - * @param item - * @param size size of item - */ - void add(T_KEY item, size_t size) { - - // compute hash for item - HashSize hash_value = murmurhash3_finalizer(item); + ~HyperLogLogPlusMinus() {}; + + /** + * Create new HyperLogLogPlusMinus counter + * @param precision + * @param sparse + */ + HyperLogLogPlusMinus(uint8_t precision=12, bool sparse=true):p(precision),sparse(sparse) { + if (precision > 18 || precision < 4) { + throw std::invalid_argument("precision (number of register = 2^precision) must be between 4 and 18"); + } + + this->m = 1 << precision; + + if (sparse) { + this->sparseList = SparseListType(); // TODO: if SparseListType is changed, initialize with appropriate size + } else { + this->M = vector(m); + } + } + + /** + * Add a new item to the counter. + * @param item + */ + void add(T_KEY item) { + add(item, sizeof(T_KEY)); + } + + /** + * Add a new item to the counter. 
+ * @param item + * @param size size of item + */ + void add(T_KEY item, size_t size) { + + // compute hash for item + HashSize hash_value = murmurhash3_finalizer(item); #ifdef HLL_DEBUG - cerr << "Value: " << item << "; hash(value): " << hash_value << endl; - cerr << bitset<64>(hash_value) << endl; + cerr << "Value: " << item << "; hash(value): " << hash_value << endl; + cerr << bitset<64>(hash_value) << endl; #endif - if (sparse) { - // sparse mode: put the encoded hash into sparse list - uint32_t encoded_hash_value = encodeHashIn32Bit(hash_value); - this->sparseList.insert(encoded_hash_value); + if (sparse) { + // sparse mode: put the encoded hash into sparse list + uint32_t encoded_hash_value = encodeHashIn32Bit(hash_value); + insert_hash(sparseList, encoded_hash_value); #ifdef HLL_DEBUG - idx_n_rank ir = getIndexAndRankFromEncodedHash(encoded_hash_value); - assert_eq(ir.idx,get_index(hash_value, p)); - assert_eq(ir.rank, get_rank(hash_value, p)); + idx_n_rank ir = getIndexAndRankFromEncodedHash(encoded_hash_value); + assert_eq(ir.idx,get_index(hash_value, p)); + assert_eq(ir.rank, get_rank(hash_value, p)); #endif - // if the sparseList is too large, switch to normal (register) representation - if (this->sparseList.size() > this->m) { // TODO: is the size of m correct? - switchToNormalRepresentation(); - } - } else { - // normal mode - // take first p bits as index {x63,...,x64-p} - uint32_t idx = get_index(hash_value, p); - // shift those p values off, and count leading zeros of the remaining string {x63-p,...,x0} - uint8_t rank = get_rank(hash_value, p); - - // update the register if current rank is bigger - if (rank > this->M[idx]) { - this->M[idx] = rank; - } - } - } - - void add(vector words) { - for(size_t i = 0; i < words.size(); ++i) { - this->add(words[i]); - } - } - - /** - * Reset to its initial state. - */ - void reset() { - this->sparse = true; - this->sparseList.clear(); // - this->M.clear(); - } - - /** - * Convert from sparse representation (using tmpSet and sparseList) to normal (using register) - */ - void switchToNormalRepresentation() { + // if the sparseList is too large, switch to normal (register) representation + if (this->sparseList.size() > this->m) { // TODO: is the size of m correct? + switchToNormalRepresentation(); + } + } else { + // normal mode + // take first p bits as index {x63,...,x64-p} + uint32_t idx = get_index(hash_value, p); + // shift those p values off, and count leading zeros of the remaining string {x63-p,...,x0} + uint8_t rank = get_rank(hash_value, p); + + // update the register if current rank is bigger + if (rank > this->M[idx]) { + this->M[idx] = rank; + } + } + } + + void add(vector words) { + for(size_t i = 0; i < words.size(); ++i) { + this->add(words[i]); + } + } + + /** + * Reset to its initial state. + */ + void reset() { + this->sparse = true; + this->sparseList.clear(); // + this->M.clear(); + } + + /** + * Convert from sparse representation (using tmpSet and sparseList) to normal (using register) + */ + void switchToNormalRepresentation() { #ifdef HLL_DEBUG - cerr << "switching to normal representation" << endl; - cerr << " est before: " << cardinality(true) << endl; + cerr << "switching to normal representation" << endl; + cerr << " est before: " << cardinality(true) << endl; #endif - this->sparse = false; - this->M = vector(this->m); - if (sparseList.size() > 0) { //TDOD: do I need to check this, here? 
- addToRegisters(this->sparseList); - this->sparseList.clear(); - } + this->sparse = false; + this->M = vector(this->m); + if (sparseList.size() > 0) { //TDOD: do I need to check this, here? + addToRegisters(this->sparseList); + this->sparseList.clear(); + } #ifdef HLL_DEBUG - cerr << " est after: " << cardinality(true) << endl; + cerr << " est after: " << cardinality(true) << endl; #endif - } - - /** - * add sparseList to the registers of M - */ - void addToRegisters(const SparseListType &sparseList) { - if (sparseList.size() == 0) { - return; - } - for (SparseListType::const_iterator encoded_hash_value_ptr = sparseList.begin(); encoded_hash_value_ptr != sparseList.end(); ++encoded_hash_value_ptr) { - - idx_n_rank ir = getIndexAndRankFromEncodedHash(*encoded_hash_value_ptr); - - assert_lt(ir.idx,M.size()); - if (ir.rank > this->M[ir.idx]) { - this->M[ir.idx] = ir.rank; - } - } - } - - /** - * Merge another HyperLogLogPlusMinus into this. Converts to normal representation - * @param other - */ - void merge(const HyperLogLogPlusMinus* other) { - if (this->p != other->p) { - throw std::invalid_argument("precisions must be equal"); - } - - if (this->sparse && other->sparse) { - if (this->sparseList.size()+other->sparseList.size() > this->m) { - switchToNormalRepresentation(); - addToRegisters(other->sparseList); - } else { - this->sparseList.insert(other->sparseList.begin(),other->sparseList.end()); - } - } else if (other->sparse) { - // other is sparse, but this is not - addToRegisters(other->sparseList); - } else { - if (this->sparse) { - switchToNormalRepresentation(); - } - - // merge registers - for (size_t i = 0; i < other->M.size(); ++i) { - if (other->M[i] > this->M[i]) { - this->M[i] = other->M[i]; - } - } - } - } - - HyperLogLogPlusMinus & operator+=(const HyperLogLogPlusMinus* other) { - merge(other); - return *this; - } - - HyperLogLogPlusMinus & operator+=(const HyperLogLogPlusMinus& other) { - merge(&other); - return *this; - } - - /** - * - * @return cardinality estimate - */ - uint64_t cardinality(bool verbose=true) { - if (sparse) { - // if we are still 'sparse', then use linear counting, which is more - // accurate for low cardinalities, and use increased precision pPrime - return uint64_t(linearCounting(mPrime, mPrime-uint32_t(sparseList.size()))); - } - - // initialize bias correction data - if (rawEstimateData.empty()) { initRawEstimateData(); } - if (biasData.empty()) { initBiasData(); } - - // calculate raw estimate on registers - //double est = alpha(m) * harmonicMean(M, m); - double est = calculateEstimate(M); - - // correct for biases if estimate is smaller than 5m - if (est <= double(m)*5.0) { - est -= getEstimateBias(est); - } - - uint32_t v = countZeros(M); - if (v > 2) { - // calculate linear counting (lc) estimate if there are more than 2 zeros in the matrix - double lc_estimate = linearCounting(m, v); - - // check if the lc estimate is below the threshold - if (lc_estimate <= double(threshold[p-4])) { - if (lc_estimate < 0) { throw; } - // return lc estimate of cardinality - return lc_estimate; - } - return lc_estimate; // always use lc_estimate when available - } - - // return bias-corrected hyperloglog estimate of cardinality - return uint64_t(est); - } + } + + /** + * add sparseList to the registers of M + */ + void addToRegisters(const SparseListType &sparseList) { + if (sparseList.size() == 0) { + return; + } + for (SparseListType::const_iterator encoded_hash_value_ptr = sparseList.begin(); encoded_hash_value_ptr != sparseList.end(); 
++encoded_hash_value_ptr) { + + idx_n_rank ir = getIndexAndRankFromEncodedHash(*encoded_hash_value_ptr); + + assert_lt(ir.idx,M.size()); + if (ir.rank > this->M[ir.idx]) { + this->M[ir.idx] = ir.rank; + } + } + } + + /** + * Merge another HyperLogLogPlusMinus into this. Converts to normal representation + * @param other + */ + void merge(const HyperLogLogPlusMinus* other) { + if (this->p != other->p) { + throw std::invalid_argument("precisions must be equal"); + } + + if (this->sparse && other->sparse) { + if (this->sparseList.size()+other->sparseList.size() > this->m) { + // TODO: this switches to normal representation too soon if there is duplication + switchToNormalRepresentation(); + addToRegisters(other->sparseList); + } else { + + for (const auto val : other->sparseList) { + insert_hash(this->sparseList, val); + } + } + } else if (other->sparse) { + // other is sparse, but this is not + addToRegisters(other->sparseList); + } else { + if (this->sparse) { + switchToNormalRepresentation(); + } + // merge registers + for (size_t i = 0; i < other->M.size(); ++i) { + if (other->M[i] > this->M[i]) { + this->M[i] = other->M[i]; + } + } + } + } + + HyperLogLogPlusMinus & operator+=(const HyperLogLogPlusMinus* other) { + merge(other); + return *this; + } + + HyperLogLogPlusMinus & operator+=(const HyperLogLogPlusMinus& other) { + merge(&other); + return *this; + } + + /** + * + * @return cardinality estimate + */ + uint64_t cardinality(bool verbose=true) { + if (sparse) { + // if we are still 'sparse', then use linear counting, which is more + // accurate for low cardinalities, and use increased precision pPrime + return uint64_t(linearCounting(mPrime, mPrime-uint32_t(sparseList.size()))); + } + + // initialize bias correction data + if (rawEstimateData.empty()) { initRawEstimateData(); } + if (biasData.empty()) { initBiasData(); } + + // calculate raw estimate on registers + //double est = alpha(m) * harmonicMean(M, m); + double est = calculateEstimate(M); + + // correct for biases if estimate is smaller than 5m + if (est <= double(m)*5.0) { + est -= getEstimateBias(est); + } + + uint32_t v = countZeros(M); + if (v > 2) { + // calculate linear counting (lc) estimate if there are more than 2 zeros in the matrix + double lc_estimate = linearCounting(m, v); + + // check if the lc estimate is below the threshold + if (lc_estimate <= double(threshold[p-4])) { + if (lc_estimate < 0) { throw; } + // return lc estimate of cardinality + return lc_estimate; + } + return lc_estimate; // always use lc_estimate when available + } + + // return bias-corrected hyperloglog estimate of cardinality + return uint64_t(est); + } private: @@ -452,10 +472,10 @@ class HyperLogLogPlusMinus { } template inline uint32_t get_index(const T hash_value, const uint8_t p, const uint8_t size) const { - // take first p bits as index {x63,...,x64-p} - assert_lt(p,size); - uint32_t idx = hash_value >> (size - p); - return idx; + // take first p bits as index {x63,...,x64-p} + assert_lt(p,size); + uint32_t idx = hash_value >> (size - p); + return idx; } inline uint32_t get_index(const uint64_t hash_value, const uint8_t p) const { @@ -463,167 +483,166 @@ class HyperLogLogPlusMinus { } inline uint32_t get_index(const uint32_t hash_value, const uint8_t p) const { - return get_index(hash_value, p, 32); + return get_index(hash_value, p, 32); } template inline - T get_trailing_ones(const uint8_t p) const { - return (T(1) << p ) - 1; + T get_trailing_ones(const uint8_t p) const { + return (T(1) << p ) - 1; } template inline uint8_t 
get_rank(const T hash_value, const uint8_t p) const { - // shift p values off, and count leading zeros of the remaining string {x63-p,...,x0} - T_KEY rank_bits = (hash_value << p | get_trailing_ones(p)); + // shift p values off, and count leading zeros of the remaining string {x63-p,...,x0} + T_KEY rank_bits = (hash_value << p | get_trailing_ones(p)); #ifdef HLL_DEBUG - cerr << "rank bits: " << bitset<32>(rank_bits) << endl; + cerr << "rank bits: " << bitset<32>(rank_bits) << endl; #endif - uint8_t rank_val = (uint8_t) (clz(rank_bits)) + 1; - assert_leq(rank_val,64-p+1); - return rank_val; + uint8_t rank_val = (uint8_t) (clz(rank_bits)) + 1; + assert_leq(rank_val,64-p+1); + return rank_val; } - void initRawEstimateData() { - rawEstimateData = vector >(); - - rawEstimateData.push_back(vector(rawEstimateData_precision4,arr_len(rawEstimateData_precision4))); - rawEstimateData.push_back(vector(rawEstimateData_precision5,arr_len(rawEstimateData_precision5))); - rawEstimateData.push_back(vector(rawEstimateData_precision6,arr_len(rawEstimateData_precision6))); - rawEstimateData.push_back(vector(rawEstimateData_precision7,arr_len(rawEstimateData_precision7))); - rawEstimateData.push_back(vector(rawEstimateData_precision8,arr_len(rawEstimateData_precision8))); - rawEstimateData.push_back(vector(rawEstimateData_precision9,arr_len(rawEstimateData_precision9))); - rawEstimateData.push_back(vector(rawEstimateData_precision10,arr_len(rawEstimateData_precision10))); - rawEstimateData.push_back(vector(rawEstimateData_precision11,arr_len(rawEstimateData_precision11))); - rawEstimateData.push_back(vector(rawEstimateData_precision12,arr_len(rawEstimateData_precision12))); - rawEstimateData.push_back(vector(rawEstimateData_precision13,arr_len(rawEstimateData_precision13))); - rawEstimateData.push_back(vector(rawEstimateData_precision14,arr_len(rawEstimateData_precision14))); - rawEstimateData.push_back(vector(rawEstimateData_precision15,arr_len(rawEstimateData_precision15))); - rawEstimateData.push_back(vector(rawEstimateData_precision16,arr_len(rawEstimateData_precision16))); - rawEstimateData.push_back(vector(rawEstimateData_precision17,arr_len(rawEstimateData_precision17))); - rawEstimateData.push_back(vector(rawEstimateData_precision18,arr_len(rawEstimateData_precision18))); - - } - - void initBiasData() { - biasData = vector >(); - - biasData.push_back(vector(biasData_precision4,arr_len(biasData_precision4))); - biasData.push_back(vector(biasData_precision5,arr_len(biasData_precision5))); - biasData.push_back(vector(biasData_precision6,arr_len(biasData_precision6))); - biasData.push_back(vector(biasData_precision7,arr_len(biasData_precision7))); - biasData.push_back(vector(biasData_precision8,arr_len(biasData_precision8))); - biasData.push_back(vector(biasData_precision9,arr_len(biasData_precision9))); - biasData.push_back(vector(biasData_precision10,arr_len(biasData_precision10))); - biasData.push_back(vector(biasData_precision11,arr_len(biasData_precision11))); - biasData.push_back(vector(biasData_precision12,arr_len(biasData_precision12))); - biasData.push_back(vector(biasData_precision13,arr_len(biasData_precision13))); - biasData.push_back(vector(biasData_precision14,arr_len(biasData_precision14))); - biasData.push_back(vector(biasData_precision15,arr_len(biasData_precision15))); - biasData.push_back(vector(biasData_precision16,arr_len(biasData_precision16))); - biasData.push_back(vector(biasData_precision17,arr_len(biasData_precision17))); - 
biasData.push_back(vector(biasData_precision18,arr_len(biasData_precision18))); - } - - /** - * Estimate the bias using empirically determined values. - * Uses weighted average of the two cells between which the estimate falls. - * TODO: Check if nearest neighbor average gives better values, as proposed in the paper - * @param est - * @return correction value for - */ - double getEstimateBias(double estimate) { - vector rawEstimateTable = rawEstimateData[p-4]; - vector biasTable = biasData[p-4]; - - // check if estimate is lower than first entry, or larger than last - if (rawEstimateTable.front() >= estimate) { return rawEstimateTable.front() - biasTable.front(); } - if (rawEstimateTable.back() <= estimate) { return rawEstimateTable.back() - biasTable.back(); } - - // get iterator to first element that is not smaller than estimate - vector::const_iterator it = lower_bound(rawEstimateTable.begin(),rawEstimateTable.end(),estimate); - size_t pos = it - rawEstimateTable.begin(); - - double e1 = rawEstimateTable[pos-1]; - double e2 = rawEstimateTable[pos]; - - double c = (estimate - e1) / (e2 - e1); - - return biasTable[pos-1]*(1-c) + biasTable[pos]*c; - } - - - /** - * Encode the 64-bit hash code x as an 32-bit integer, to be used in the sparse representation. - * - * Difference from the algorithm described in the paper: - * The index always is in the p most significant bits - * - * see section 5.3 in Heule et al. - * @param x the hash bits - * @return encoded hash value - */ - uint32_t encodeHashIn32Bit(uint64_t hash_value) { - // extract first pPrime bits, and shift them onto a 32-bit integer - uint32_t idx = (uint32_t)(extractBits(hash_value,pPrime) >> 32); + void initRawEstimateData() { + rawEstimateData = vector >(); + + rawEstimateData.push_back(vector(rawEstimateData_precision4,arr_len(rawEstimateData_precision4))); + rawEstimateData.push_back(vector(rawEstimateData_precision5,arr_len(rawEstimateData_precision5))); + rawEstimateData.push_back(vector(rawEstimateData_precision6,arr_len(rawEstimateData_precision6))); + rawEstimateData.push_back(vector(rawEstimateData_precision7,arr_len(rawEstimateData_precision7))); + rawEstimateData.push_back(vector(rawEstimateData_precision8,arr_len(rawEstimateData_precision8))); + rawEstimateData.push_back(vector(rawEstimateData_precision9,arr_len(rawEstimateData_precision9))); + rawEstimateData.push_back(vector(rawEstimateData_precision10,arr_len(rawEstimateData_precision10))); + rawEstimateData.push_back(vector(rawEstimateData_precision11,arr_len(rawEstimateData_precision11))); + rawEstimateData.push_back(vector(rawEstimateData_precision12,arr_len(rawEstimateData_precision12))); + rawEstimateData.push_back(vector(rawEstimateData_precision13,arr_len(rawEstimateData_precision13))); + rawEstimateData.push_back(vector(rawEstimateData_precision14,arr_len(rawEstimateData_precision14))); + rawEstimateData.push_back(vector(rawEstimateData_precision15,arr_len(rawEstimateData_precision15))); + rawEstimateData.push_back(vector(rawEstimateData_precision16,arr_len(rawEstimateData_precision16))); + rawEstimateData.push_back(vector(rawEstimateData_precision17,arr_len(rawEstimateData_precision17))); + rawEstimateData.push_back(vector(rawEstimateData_precision18,arr_len(rawEstimateData_precision18))); + + } + + void initBiasData() { + biasData = vector >(); + + biasData.push_back(vector(biasData_precision4,arr_len(biasData_precision4))); + biasData.push_back(vector(biasData_precision5,arr_len(biasData_precision5))); + 
biasData.push_back(vector(biasData_precision6,arr_len(biasData_precision6))); + biasData.push_back(vector(biasData_precision7,arr_len(biasData_precision7))); + biasData.push_back(vector(biasData_precision8,arr_len(biasData_precision8))); + biasData.push_back(vector(biasData_precision9,arr_len(biasData_precision9))); + biasData.push_back(vector(biasData_precision10,arr_len(biasData_precision10))); + biasData.push_back(vector(biasData_precision11,arr_len(biasData_precision11))); + biasData.push_back(vector(biasData_precision12,arr_len(biasData_precision12))); + biasData.push_back(vector(biasData_precision13,arr_len(biasData_precision13))); + biasData.push_back(vector(biasData_precision14,arr_len(biasData_precision14))); + biasData.push_back(vector(biasData_precision15,arr_len(biasData_precision15))); + biasData.push_back(vector(biasData_precision16,arr_len(biasData_precision16))); + biasData.push_back(vector(biasData_precision17,arr_len(biasData_precision17))); + biasData.push_back(vector(biasData_precision18,arr_len(biasData_precision18))); + } + + /** + * Estimate the bias using empirically determined values. + * Uses weighted average of the two cells between which the estimate falls. + * TODO: Check if nearest neighbor average gives better values, as proposed in the paper + * @param est + * @return correction value for + */ + double getEstimateBias(double estimate) { + vector rawEstimateTable = rawEstimateData[p-4]; + vector biasTable = biasData[p-4]; + + // check if estimate is lower than first entry, or larger than last + if (rawEstimateTable.front() >= estimate) { return rawEstimateTable.front() - biasTable.front(); } + if (rawEstimateTable.back() <= estimate) { return rawEstimateTable.back() - biasTable.back(); } + + // get iterator to first element that is not smaller than estimate + vector::const_iterator it = lower_bound(rawEstimateTable.begin(),rawEstimateTable.end(),estimate); + size_t pos = it - rawEstimateTable.begin(); + + double e1 = rawEstimateTable[pos-1]; + double e2 = rawEstimateTable[pos]; + + double c = (estimate - e1) / (e2 - e1); + + return biasTable[pos-1]*(1-c) + biasTable[pos]*c; + } + + + /** + * Encode the 64-bit hash code x as an 32-bit integer, to be used in the sparse representation. + * + * Difference from the algorithm described in the paper: + * The index always is in the p most significant bits + * + * see section 5.3 in Heule et al. + * @param x the hash bits + * @return encoded hash value + */ + uint32_t encodeHashIn32Bit(uint64_t hash_value) { + // extract first pPrime bits, and shift them onto a 32-bit integer + uint32_t idx = (uint32_t)(extractBits(hash_value,pPrime) >> 32); #ifdef HLL_DEBUG - cerr << "value: " << bitset<64>(hash_value) << endl; + cerr << "value: " << bitset<64>(hash_value) << endl; cerr << "index: " << std::bitset<32>(idx) << " ( bits from 64 to " << 64-pPrime << "; " << idx << ")" << endl; #endif - // are the bits {63-p, ..., 63-p'} all 0? - if (extractBits(hash_value, 64-this->p, 64-pPrime) == 0) { - // compute the additional rank (minimum rank is already p'-p) - // the maximal size will be below 2^6=64. 
We thus combine the 25 bits of the index with 6 bits for the rank, and one bit as flag - uint8_t additional_rank = get_rank(hash_value, pPrime); // this is rank - (p'-p), as we know that positions p'...p are 0 - return idx | uint32_t(additional_rank<<1) | 1; - } else { - // else, return the idx, only - it has enough length to calculate the rank (left-shifted, last bit = 0) - assert_eq((idx & 1),0); - return idx; - } - } - - - /** - * struct holding the index and rank/rho of an entry - */ - struct idx_n_rank { - uint32_t idx; - uint8_t rank; - idx_n_rank(uint32_t _idx, uint8_t _rank) : idx(_idx), rank(_rank) {} - }; - - // - // - /** - * Decode a hash from the sparse representation. - * Returns the index and number of leading zeros (nlz) with precision p stored in k - * @param k the hash bits - * @return index and rank in non-sparse format - */ - idx_n_rank getIndexAndRankFromEncodedHash(const uint32_t encoded_hash_value) const { - - // difference to paper: Index can be recovered in the same way for pPrime and normally encoded hashes - uint32_t idx = get_index(encoded_hash_value, p); - uint8_t rank_val; - - // check if the last bit is 1 - if ( (encoded_hash_value & 1) == 1) { - // if yes: the hash was stored with higher precision, bits p to pPrime were 0 - uint8_t additional_rank = pPrime - p; - rank_val = additional_rank + extractBits(encoded_hash_value, 7, 1); - } else { - rank_val = get_rank(encoded_hash_value,p); - - // clz counts 64 bit only, it seems - if (rank_val > 32) - rank_val -= 32; - } - - return(idx_n_rank(idx,rank_val)); - } + // are the bits {63-p, ..., 63-p'} all 0? + if (extractBits(hash_value, 64-this->p, 64-pPrime) == 0) { + // compute the additional rank (minimum rank is already p'-p) + // the maximal size will be below 2^6=64. We thus combine the 25 bits of the index with 6 bits for the rank, and one bit as flag + uint8_t additional_rank = get_rank(hash_value, pPrime); // this is rank - (p'-p), as we know that positions p'...p are 0 + return idx | uint32_t(additional_rank<<1) | 1; + } else { + // else, return the idx, only - it has enough length to calculate the rank (left-shifted, last bit = 0) + assert_eq((idx & 1),0); + return idx; + } + } + + + /** + * struct holding the index and rank/rho of an entry + */ + struct idx_n_rank { + uint32_t idx; + uint8_t rank; + idx_n_rank(uint32_t _idx, uint8_t _rank) : idx(_idx), rank(_rank) {} + }; + + // + // + /** + * Decode hash from sparse representation. 
+ * Returns the index and number of leading zeros (nlz) with precision p stored in k + * @return index and rank in non-sparse format + */ + idx_n_rank getIndexAndRankFromEncodedHash(const uint32_t encoded_hash_value) const { + + // difference to paper: Index can be recovered in the same way for pPrime and normally encoded hashes + uint32_t idx = get_index(encoded_hash_value, p); + uint8_t rank_val; + + // check if the last bit is 1 + if ( (encoded_hash_value & 1) == 1) { + // if yes: the hash was stored with higher precision, bits p to pPrime were 0 + uint8_t additional_rank = pPrime - p; + rank_val = additional_rank + extractBits(encoded_hash_value, 7, 1); + } else { + rank_val = get_rank(encoded_hash_value,p); + + // clz counts 64 bit only, it seems + if (rank_val > 32) + rank_val -= 32; + } + + return(idx_n_rank(idx,rank_val)); + } }; From f34f8d4722aa8729b4ba3dca21c9a9ebc05893cd Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 8 Nov 2017 21:34:00 -0500 Subject: [PATCH 099/105] Fix gzstream compilation, update on HLL --- src/Makefile | 10 ++- src/classify.cpp | 60 +++++++-------- src/gzstream/Makefile | 2 +- src/gzstream/index.html | 145 ------------------------------------- src/gzstream/libgzstream.a | Bin 9648 -> 0 bytes src/gzstream/logo.gif | Bin 1651 -> 0 bytes src/hyperloglogbias.h | 4 +- src/hyperloglogplus.h | 127 +++++++++++++++----------------- 8 files changed, 98 insertions(+), 250 deletions(-) delete mode 100644 src/gzstream/index.html delete mode 100644 src/gzstream/libgzstream.a delete mode 100644 src/gzstream/logo.gif diff --git a/src/Makefile b/src/Makefile index cc84b11..d236de3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,9 +1,10 @@ CXX = g++ FOPENMP?=-fopenmp -CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -O2 -Wfatal-errors ${CPPFLAGS} +CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -I./gzstream -O2 -Wfatal-errors ${CPPFLAGS} #CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors PROGS = classify db_sort set_lcas make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping -LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream ${LDFLAGS} +#LIBFLAGS = -L. -lz -lgzstream ${LDFLAGS} +LIBFLAGS = -L. 
-lz ${LDFLAGS} .PHONY: all install clean @@ -25,7 +26,7 @@ grade_classification: taxdb.h report-cols.h read_uid_mapping: quickfile.o -classify: classify.cpp krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o hyperloglogplus.h taxdb.h report-cols.h +classify: classify.cpp krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o gzstream.o hyperloglogplus.h taxdb.h report-cols.h $(CXX) $(CXXFLAGS) -o classify $^ $(LIBFLAGS) build_taxdb: taxdb.h report-cols.h quickfile.o @@ -43,6 +44,9 @@ krakendb.o: krakendb.cpp krakendb.hpp quickfile.hpp seqreader.o: seqreader.cpp seqreader.hpp quickfile.hpp $(CXX) $(CXXFLAGS) -c seqreader.cpp +gzstream.o: gzstream/gzstream.C gzstream/gzstream.h + $(CXX) $(CXXFLAGS) -c -O gzstream/gzstream.C + quickfile.o: quickfile.cpp quickfile.hpp $(CXX) $(CXXFLAGS) -c quickfile.cpp diff --git a/src/classify.cpp b/src/classify.cpp index 3b8a03e..2ceca19 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -114,19 +114,19 @@ ostream* cout_or_file(string file) { } void loadKrakenDB(KrakenDB& database, string DB_filename, string Index_filename) { - QuickFile db_file; - db_file.open_file(DB_filename); - if (Populate_memory) { - db_file.load_file(); - } - database = KrakenDB(db_file.ptr()); - QuickFile idx_file; - idx_file.open_file(Index_filename); - if (Populate_memory) - idx_file.load_file(); - - KrakenDBIndex db_index(idx_file.ptr()); - database.set_index(&db_index); + QuickFile db_file; + db_file.open_file(DB_filename); + if (Populate_memory) { + db_file.load_file(); + } + database = KrakenDB(db_file.ptr()); + QuickFile idx_file; + idx_file.open_file(Index_filename); + if (Populate_memory) + idx_file.load_file(); + + KrakenDBIndex db_index(idx_file.ptr()); + database.set_index(&db_index); } int main(int argc, char **argv) { @@ -242,18 +242,18 @@ int main(int argc, char **argv) { taxdb.readGenomeSizes(fname); } - TaxReport rep = TaxReport(*Report_output, taxdb, taxon_counts, false); - rep.setReportCols(vector { - "%", - "reads", + TaxReport rep = TaxReport(*Report_output, taxdb, taxon_counts, false); + rep.setReportCols(vector { + "%", + "reads", "taxReads", "kmers", "dup", "cov", - "taxID", - "rank", - "taxName"}); - rep.printReport("kraken","blu"); + "taxID", + "rank", + "taxName"}); + rep.printReport("kraken","blu"); } for (size_t i = 0; i < Open_fstreams.size(); ++i) { @@ -367,11 +367,11 @@ void process_file(char *filename) { inline uint32_t get_taxon_for_kmer(KrakenDB& database, uint64_t* kmer_ptr, uint64_t& current_bin_key, - int64_t& current_min_pos, int64_t& current_max_pos) { - uint32_t* val_ptr = database.kmer_query( - database.canonical_representation(*kmer_ptr), ¤t_bin_key, - ¤t_min_pos, ¤t_max_pos); - return val_ptr ? *val_ptr : 0; + int64_t& current_min_pos, int64_t& current_max_pos) { + uint32_t* val_ptr = database.kmer_query( + database.canonical_representation(*kmer_ptr), ¤t_bin_key, + ¤t_min_pos, ¤t_max_pos); + return val_ptr ? *val_ptr : 0; } @@ -512,10 +512,10 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, taxon = get_taxon_for_kmer(*KrakenDatabases[i], kmer_ptr, db_statuses[i].current_bin_key, db_statuses[i].current_min_pos, db_statuses[i].current_max_pos); - //uint32_t* val_ptr = KrakenDatabases[i]->kmer_query( - // KrakenDatabases[i]->canonical_representation(*kmer_ptr), &db_statuses[i].current_bin_key, - // &db_statuses[i].current_min_pos, &db_statuses[i].current_max_pos); - //taxon = val_ptr ? 
*val_ptr : 0; + //uint32_t* val_ptr = KrakenDatabases[i]->kmer_query( + // KrakenDatabases[i]->canonical_representation(*kmer_ptr), &db_statuses[i].current_bin_key, + // &db_statuses[i].current_min_pos, &db_statuses[i].current_max_pos); + //taxon = val_ptr ? *val_ptr : 0; if (taxon) break; } diff --git a/src/gzstream/Makefile b/src/gzstream/Makefile index 8c21da1..4c32088 100644 --- a/src/gzstream/Makefile +++ b/src/gzstream/Makefile @@ -33,7 +33,7 @@ # ---------------------------------------------------------------------------- # CXX = CC -n32 -LANG:std # for SGI Irix 6.5, MIPSpro CC version 7.30 -CXX = g++-7 # for Linux RedHat 6.1, g++ version 2.95.2 +CXX = g++ # for Linux RedHat 6.1, g++ version 2.95.2 CPPFLAGS = -I. -O -fPIC LDFLAGS = -L. -lgzstream -lz diff --git a/src/gzstream/index.html b/src/gzstream/index.html deleted file mode 100644 index 8a9ef8e..0000000 --- a/src/gzstream/index.html +++ /dev/null @@ -1,145 +0,0 @@ - -Gzstream Library Home Page - - - -

 [... 145 lines of the gzstream library homepage (HTML) deleted along with the
  file: introduction, supported systems, installation, documentation, what's
  missing, download and release notes, acknowledgements, and links ...]
diff --git a/src/gzstream/libgzstream.a b/src/gzstream/libgzstream.a
deleted file mode 100644
index e0df77266a1f739fb8331743cfa8eb3717ebbf09..0000000000000000000000000000000000000000
Binary files a/src/gzstream/libgzstream.a and /dev/null differ
diff --git a/src/gzstream/logo.gif b/src/gzstream/logo.gif
deleted file mode 100644
index e259089fbb097573bdc3bab9eccc75e548cf7251..0000000000000000000000000000000000000000
Binary files a/src/gzstream/logo.gif and /dev/null differ
diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h
-double calculateEstimate(vector<uint8_t> array) {
+double calculateRawEstimate(vector<uint8_t> array) {
   double inverseSum = 0.0;
   for (size_t i = 0; i < array.size(); ++i) {
     // TODO: pre-calculate the power calculation
@@ -238,10 +237,6 @@ class HyperLogLogPlusMinus {
   bool sparse; // sparse representation of the data?
@@ -238,10 +237,6 @@ class HyperLogLogPlusMinus {
   bool sparse;                // sparse representation of the data?
   SparseListType sparseList;  // TODO: use a compressed list instead

-  // vectors containing data for bias correction
-  vector<vector<double> > rawEstimateData;  // TODO: make this static
-  vector<vector<double> > biasData;
-
   // sparse versions of p and m
   static const uint8_t pPrime = 25;  // precision when using a sparse representation
                                      // fixed to 25, because 25 + 6 bits for rank + 1 flag bit = 32
@@ -257,12 +252,12 @@ class HyperLogLogPlusMinus {
    * @param precision
    * @param sparse
    */
-  HyperLogLogPlusMinus(uint8_t precision=12, bool sparse=true):p(precision),sparse(sparse) {
+  HyperLogLogPlusMinus(uint8_t precision=12, bool sparse=true):p(precision),m(1<<precision),sparse(sparse) {
     if (precision > 18 || precision < 4) {
       throw std::invalid_argument("precision (number of register = 2^precision) must be between 4 and 18");
     }
-    this->m = 1 << precision;
+    //this->m = 1 << precision;
     if (sparse) {
       this->sparseList = SparseListType();  // TODO: if SparseListType is changed, initialize with appropriate size
@@ -427,39 +422,30 @@
    */
   uint64_t cardinality(bool verbose=true) {
     if (sparse) {
-      // if we are still 'sparse', then use linear counting, which is more
-      // accurate for low cardinalities, and use increased precision pPrime
+      // if we are 'sparse', then use linear counting with increased precision pPrime
       return uint64_t(linearCounting(mPrime, mPrime-uint32_t(sparseList.size())));
     }

-    // initialize bias correction data
-    if (rawEstimateData.empty()) { initRawEstimateData(); }
-    if (biasData.empty())        { initBiasData(); }
+    // use linear counting (lc) estimate if there are zeros in the matrix
+    // AND the lc estimate is smaller than an empirically defined threshold
+    uint32_t v = countZeros(M);
+    if (v != 0) {
+      uint64_t lc_estimate = linearCounting(m, v);
+      // check if the lc estimate is below the threshold
+      assert(lc_estimate >= 0);
+      if (lc_estimate <= double(threshold[p-4])) {
+        return lc_estimate;
+      }
+    }

     // calculate raw estimate on registers
     //double est = alpha(m) * harmonicMean(M, m);
-    double est = calculateEstimate(M);
-
+    double est = calculateRawEstimate(M);
     // correct for biases if estimate is smaller than 5m
     if (est <= double(m)*5.0) {
       est -= getEstimateBias(est);
     }

-    uint32_t v = countZeros(M);
-    if (v > 2) {
-      // calculate linear counting (lc) estimate if there are more than 2 zeros in the matrix
-      double lc_estimate = linearCounting(m, v);
-
-      // check if the lc estimate is below the threshold
-      if (lc_estimate <= double(threshold[p-4])) {
-        if (lc_estimate < 0) { throw; }
-        // return lc estimate of cardinality
-        return lc_estimate;
-      }
-      return lc_estimate; // always use lc_estimate when available
-    }
-
-    // return bias-corrected hyperloglog estimate of cardinality
     return uint64_t(est);
   }
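Read end-to-end, the reworked cardinality() now tries the three estimators in a fixed order: linear counting at precision pPrime while sparse, linear counting at precision p when there are zero registers and the estimate falls below the empirical threshold, and the bias-corrected raw estimate otherwise. The following compilable sketch mirrors that control flow under simplified assumptions (a free-standing function, an inlined alpha constant valid for m >= 128, and a caller-supplied threshold table); it is an illustration, not the class's actual implementation:

    #include <cstdint>
    #include <cmath>
    #include <vector>

    // Standard linear-counting formula: m * ln(m / V), V = zero registers.
    static double linearCounting(uint32_t m, uint32_t v) {
      return m * std::log(double(m) / v);
    }

    // Sketch of the estimator selection after this patch. The names mirror
    // the code above but are assumptions here, not the library's API.
    uint64_t estimateCardinality(const std::vector<uint8_t> &M, uint8_t p,
                                 const std::vector<double> &threshold) {
      uint32_t v = 0;                       // number of zero-valued registers
      for (size_t i = 0; i < M.size(); ++i) v += (M[i] == 0);

      if (v != 0) {
        double lc = linearCounting(M.size(), v);
        if (lc <= threshold[p - 4])         // empirically defined cut-off
          return uint64_t(lc);
      }

      double invSum = 0.0;                  // harmonic mean of register ranks
      for (size_t i = 0; i < M.size(); ++i) invSum += std::pow(2.0, -double(M[i]));
      double est = 0.7213 / (1 + 1.079 / M.size())  // alpha(m) for m >= 128
                 * double(M.size()) * M.size() / invSum;
      // Bias correction for small estimates (est <= 5m) would subtract
      // getEstimateBias(est) here, using the empirical tables below.
      return uint64_t(est);
    }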
@@ -507,57 +493,58 @@
     return rank_val;
   }

-  void initRawEstimateData() {
-    rawEstimateData = vector<vector<double> >();
-
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision4,arr_len(rawEstimateData_precision4)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision5,arr_len(rawEstimateData_precision5)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision6,arr_len(rawEstimateData_precision6)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision7,arr_len(rawEstimateData_precision7)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision8,arr_len(rawEstimateData_precision8)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision9,arr_len(rawEstimateData_precision9)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision10,arr_len(rawEstimateData_precision10)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision11,arr_len(rawEstimateData_precision11)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision12,arr_len(rawEstimateData_precision12)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision13,arr_len(rawEstimateData_precision13)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision14,arr_len(rawEstimateData_precision14)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision15,arr_len(rawEstimateData_precision15)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision16,arr_len(rawEstimateData_precision16)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision17,arr_len(rawEstimateData_precision17)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision18,arr_len(rawEstimateData_precision18)));
-
+  vector<double> rawEstimateData(size_t p) {
+    switch (p) {
+      case 4:  return vector<double>(rawEstimateData_precision4,arr_len(rawEstimateData_precision4));
+      case 5:  return vector<double>(rawEstimateData_precision5,arr_len(rawEstimateData_precision5));
+      case 6:  return vector<double>(rawEstimateData_precision6,arr_len(rawEstimateData_precision6));
+      case 7:  return vector<double>(rawEstimateData_precision7,arr_len(rawEstimateData_precision7));
+      case 8:  return vector<double>(rawEstimateData_precision8,arr_len(rawEstimateData_precision8));
+      case 9:  return vector<double>(rawEstimateData_precision9,arr_len(rawEstimateData_precision9));
+      case 10: return vector<double>(rawEstimateData_precision10,arr_len(rawEstimateData_precision10));
+      case 11: return vector<double>(rawEstimateData_precision11,arr_len(rawEstimateData_precision11));
+      case 12: return vector<double>(rawEstimateData_precision12,arr_len(rawEstimateData_precision12));
+      case 13: return vector<double>(rawEstimateData_precision13,arr_len(rawEstimateData_precision13));
+      case 14: return vector<double>(rawEstimateData_precision14,arr_len(rawEstimateData_precision14));
+      case 15: return vector<double>(rawEstimateData_precision15,arr_len(rawEstimateData_precision15));
+      case 16: return vector<double>(rawEstimateData_precision16,arr_len(rawEstimateData_precision16));
+      case 17: return vector<double>(rawEstimateData_precision17,arr_len(rawEstimateData_precision17));
+      case 18: return vector<double>(rawEstimateData_precision18,arr_len(rawEstimateData_precision18));
+    }
+    return vector<double>();
   }

-  void initBiasData() {
-    biasData = vector<vector<double> >();
-
-    biasData.push_back(vector<double>(biasData_precision4,arr_len(biasData_precision4)));
-    biasData.push_back(vector<double>(biasData_precision5,arr_len(biasData_precision5)));
-    biasData.push_back(vector<double>(biasData_precision6,arr_len(biasData_precision6)));
-    biasData.push_back(vector<double>(biasData_precision7,arr_len(biasData_precision7)));
-    biasData.push_back(vector<double>(biasData_precision8,arr_len(biasData_precision8)));
-    biasData.push_back(vector<double>(biasData_precision9,arr_len(biasData_precision9)));
-    biasData.push_back(vector<double>(biasData_precision10,arr_len(biasData_precision10)));
-    biasData.push_back(vector<double>(biasData_precision11,arr_len(biasData_precision11)));
-    biasData.push_back(vector<double>(biasData_precision12,arr_len(biasData_precision12)));
-    biasData.push_back(vector<double>(biasData_precision13,arr_len(biasData_precision13)));
-    biasData.push_back(vector<double>(biasData_precision14,arr_len(biasData_precision14)));
-    biasData.push_back(vector<double>(biasData_precision15,arr_len(biasData_precision15)));
-    biasData.push_back(vector<double>(biasData_precision16,arr_len(biasData_precision16)));
-    biasData.push_back(vector<double>(biasData_precision17,arr_len(biasData_precision17)));
-    biasData.push_back(vector<double>(biasData_precision18,arr_len(biasData_precision18)));
+  vector<double> biasData(size_t p) {
+    switch(p) {
+      case 4:  return vector<double>(biasData_precision4,arr_len(biasData_precision4));
+      case 5:  return vector<double>(biasData_precision5,arr_len(biasData_precision5));
+      case 6:  return vector<double>(biasData_precision6,arr_len(biasData_precision6));
+      case 7:  return vector<double>(biasData_precision7,arr_len(biasData_precision7));
+      case 8:  return vector<double>(biasData_precision8,arr_len(biasData_precision8));
+      case 9:  return vector<double>(biasData_precision9,arr_len(biasData_precision9));
+      case 10: return vector<double>(biasData_precision10,arr_len(biasData_precision10));
+      case 11: return vector<double>(biasData_precision11,arr_len(biasData_precision11));
+      case 12: return vector<double>(biasData_precision12,arr_len(biasData_precision12));
+      case 13: return vector<double>(biasData_precision13,arr_len(biasData_precision13));
+      case 14: return vector<double>(biasData_precision14,arr_len(biasData_precision14));
+      case 15: return vector<double>(biasData_precision15,arr_len(biasData_precision15));
+      case 16: return vector<double>(biasData_precision16,arr_len(biasData_precision16));
+      case 17: return vector<double>(biasData_precision17,arr_len(biasData_precision17));
+      case 18: return vector<double>(biasData_precision18,arr_len(biasData_precision18));
+    }
+    return vector<double>();
   }

   /**
-   * Estimate the bias using empirically determined values.
+   * Estimate the bias of the raw estimate using empirically determined values.
    * Uses weighted average of the two cells between which the estimate falls.
    * TODO: Check if nearest neighbor average gives better values, as proposed in the paper
    * @param est
    * @return correction value for
    */
   double getEstimateBias(double estimate) {
-    vector<double> rawEstimateTable = rawEstimateData[p-4];
-    vector<double> biasTable = biasData[p-4];
+    vector<double> rawEstimateTable = rawEstimateData(p);
+    vector<double> biasTable = biasData(p);
     // check if estimate is lower than first entry, or larger than last
     if (rawEstimateTable.front() >= estimate) { return rawEstimateTable.front() - biasTable.front(); }

From 93c155f5fe9eebd0a16c2ac37a38d3a3fd1391bf Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Wed, 8 Nov 2017 22:12:29 -0500
Subject: [PATCH 100/105] Fix gzstream building and licensing

---
 CHANGELOG                                  |  90 --------------
 install_kraken.sh => install_krakenhll.sh  |   0
 scripts/krakenhll                          |   3 +-
 scripts/krakenhll-build                    |   3 +-
 scripts/krakenhll-build_db.sh              |   3 +-
 scripts/krakenhll-check_for_jellyfish.sh   |   4 +-
 scripts/krakenhll-download                 |   1 +
 scripts/krakenhll-report                   |   3 +-
 scripts/krakenhll-standard_installation.sh |   3 +-
 scripts/krakenhll-verify_gi_numbers.pl     |  54 ---------
 src/Makefile                               |   2 +-
 src/gzstream/.Makefile.swp                 | Bin 12288 -> 0 bytes
 src/make_seqid_to_taxid_map.cpp            | 130 ---------------------
 tests/build-dbs.sh                         |   2 +-
 14 files changed, 15 insertions(+), 283 deletions(-)
 delete mode 100644 CHANGELOG
 rename install_kraken.sh => install_krakenhll.sh (100%)
 delete mode 100755 scripts/krakenhll-verify_gi_numbers.pl
 delete mode 100644 src/gzstream/.Makefile.swp
 delete mode 100644 src/make_seqid_to_taxid_map.cpp

diff --git a/CHANGELOG b/CHANGELOG
deleted file mode 100644
index b914174..0000000
--- a/CHANGELOG
+++ /dev/null
@@ -1,90 +0,0 @@
-v0.10.6-beta:
-* fixed overflow bug in command line parsing
-* fixed GRCh38.p2 bug in human genome downloads
-
-v0.10.5-beta:
-* fix bug in GRCh38 download to handle multi-fasta files
-* add --header-line and --intermediate-ranks options to kraken-mpa-report
-* improved support for adding multi-FASTA files with --add-to-library
-* allow assigning taxon IDs in reference sequences w/o GI numbers
-  using "kraken:taxid" code
-* included full sequence descriptions when using "--[un]classified-out"
-* reduced memory usage of db_shrink (Build step 2 / kraken-build --shrink)
-* reduced memory usage of db_sort (Build step 3)
-* reduced memory usage of set_lcas (Build step 6)
-* support added for KRAKEN_NUM_THREADS, KRAKEN_DB_PATH, and KRAKEN_DEFAULT_DB
-  env. variables
-* added kraken-translate for getting taxonomic names for each sequence
-* added a --rebuild option to kraken-build
-* turned off default name checking for PE reads; added --check-names option
-* added plasmids to --download-library options
-* added HTML manual, redirecting README to that
-
-v0.10.4-beta:
-* use GRCh38 for human genome library
-* enable input via stdin (via /dev/fd/0)
-* enable compressed (gzip/bzip2) input
-* enable auto-detection of fasta/fastq/gz/bz2
-* simplified add_to_library.sh code to speed up large additions
-* use RNA genomes for viral genome library
-* scan .ffn (RNA) files for genomic data when building databases
-* handle paired-end reads with --paired option
-* provide MetaPhlAn-compatible output with kraken-mpa-report
-* added domain/kingdom codes to kraken-report
-* added kraken-filter script for simple confidence scoring
-* added support for multi-FASTA files in custom DBs
-* fixed build_kraken_db.sh bug for k-mers w/ k < 31
-* updates to README file
-
-v0.10.3-beta:
-* remove Fatal.pm use in kraken-report
-* fixed false success message on make failure in installer
-* explicitly require g++ as C++ compiler in Makefile
-* change to quickfile.cpp to do proper syncing on close
-* fixed kraken-build bug w/ --work-on-disk (cause of some major build stalls)
-* changed hash size calculation to use Perl
-* close input files explicitly in db_sort/db_shrink to reduce reported memory
-* allow db_shrink to work in RAM
-* updates to README file
-
-v0.10.2-beta:
-* fixed kraken-report bug w/ --show-zeros
-* fixed kraken-report installation bug
-* updates to README file
-
-v0.10.1-beta:
-* fixed 2nd bug in build_kraken.sh in calculating hash size (thanks T. Antao)
-* fixed bug in add_to_library.sh for some bash versions (thanks T. Antao)
-* fixed issue where search window wasn't cached until a failure (query speedup)
-* added $KRAKEN_DIR fallback for kraken/kraken-build (thanks S. Koren)
-
-v0.10.0-beta:
-* added CHANGELOG
-* fixed quick mode hit list output
-* updated README citation
-* changed minimizer sort order (query speedup), changes database structure
-* use linear search with small windows (query speedup)
-* changed query procedure (query speedup); search w/o 1st calculating minimizer
-* changed readlink in installer to perl Cwd::abs_path (portability)
-* removed MAP_POPULATE for preloading, uses read loop instead (bugfix/port.)
-* added --work-on-disk switch to kraken-build
-* added kraken-report script
-* fixed bug in build_kraken.sh in calculating hash size (thanks T. Antao)
-
-v0.9.1b:
-* fixed bug to allow kraken-build --shrink
-
-v0.9.0b:
-* full rewrite
-* minimizers used to speed queries, prefix index removed
-
-v0.3:
-* DB build parallelized, Jellyfish removed from LCA assignment
-
-v0.2:
-* full rewrite, most progs. changed to C++
-* Jellyfish removed from classification step
-* prefix index used to speed queries
-
-v0.1:
-* initial version, mostly Perl
diff --git a/install_kraken.sh b/install_krakenhll.sh
similarity index 100%
rename from install_kraken.sh
rename to install_krakenhll.sh
diff --git a/scripts/krakenhll b/scripts/krakenhll
index 8a4aad9..69e3b56 100755
--- a/scripts/krakenhll
+++ b/scripts/krakenhll
@@ -1,6 +1,7 @@
 #!/usr/bin/env perl

-# Copyright 2013-2015, Derrick Wood
+# Original file Copyright 2013-2015, Derrick Wood
+# Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
diff --git a/scripts/krakenhll-build b/scripts/krakenhll-build
index 8888cd8..1ab41c9 100755
--- a/scripts/krakenhll-build
+++ b/scripts/krakenhll-build
@@ -1,6 +1,7 @@
 #!/usr/bin/env perl

-# Copyright 2013-2015, Derrick Wood
+# Original file Copyright 2013-2015, Derrick Wood
+# Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
diff --git a/scripts/krakenhll-build_db.sh b/scripts/krakenhll-build_db.sh
index 959f041..adc4345 100755
--- a/scripts/krakenhll-build_db.sh
+++ b/scripts/krakenhll-build_db.sh
@@ -1,7 +1,8 @@
 #!/bin/bash
 #vim: noai:ts=2:sw=2

-# Copyright 2013-2015, Derrick Wood
+# Original file Copyright 2013-2015, Derrick Wood
+# Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
diff --git a/scripts/krakenhll-check_for_jellyfish.sh b/scripts/krakenhll-check_for_jellyfish.sh
index c2aa2d7..50d9867 100755
--- a/scripts/krakenhll-check_for_jellyfish.sh
+++ b/scripts/krakenhll-check_for_jellyfish.sh
@@ -1,7 +1,7 @@
 #!/bin/bash

-# Copyright 2013-2015, Derrick Wood
-# modified by Florian Breitwieser, 2017
+# Original file Copyright 2013-2015, Derrick Wood
+# Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
diff --git a/scripts/krakenhll-download b/scripts/krakenhll-download
index c052463..e4e52c3 100755
--- a/scripts/krakenhll-download
+++ b/scripts/krakenhll-download
@@ -3,6 +3,7 @@

 # krakenhll-download.pl - based on centrifuge-download
 # (c) Florian Breitwieser, 2017
+# licensed under GPL-3

 use strict;
 use warnings;
diff --git a/scripts/krakenhll-report b/scripts/krakenhll-report
index e9cdaf5..bf950d4 100755
--- a/scripts/krakenhll-report
+++ b/scripts/krakenhll-report
@@ -1,6 +1,7 @@
 #!/usr/bin/env perl

-# Copyright 2013-2015, Derrick Wood
+# Original file Copyright 2013-2015, Derrick Wood
+# Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
diff --git a/scripts/krakenhll-standard_installation.sh b/scripts/krakenhll-standard_installation.sh
index b34dd44..98353e1 100755
--- a/scripts/krakenhll-standard_installation.sh
+++ b/scripts/krakenhll-standard_installation.sh
@@ -1,6 +1,7 @@
 #!/bin/bash

-# Copyright 2013-2015, Derrick Wood
+# Original file Copyright 2013-2015, Derrick Wood
+# Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
diff --git a/scripts/krakenhll-verify_gi_numbers.pl b/scripts/krakenhll-verify_gi_numbers.pl
deleted file mode 100755
index 0bb5cdf..0000000
--- a/scripts/krakenhll-verify_gi_numbers.pl
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env perl
-
-# Copyright 2013-2015, Derrick Wood
-#
-# This file is part of the Kraken taxonomic sequence classification system.
-#
-# Kraken is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Kraken is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
-
-# Checks each sequence header to ensure it has a GI number to
-# enable taxonomic ID lookup later.  Also has some (very basic)
-# FASTA-format checking.
-
-use strict;
-use warnings;
-use File::Basename;
-
-my $PROG = basename $0;
-
-die "$PROG: must specify one filename!\n" if @ARGV != 1;
-
-my $filename = shift;
-
-open FASTA, "<", $filename
-  or die "$PROG: can't open $filename: $!\n";
-my $seq_ct = 0;
-my $errors = 0;
-while (<FASTA>) {
-  next unless /^>/;
-  $seq_ct++;
-  if (! /^>(\S+)/) {
-    $errors++;
-    warn "file $filename, line $. lacks sequence ID\n";
-  }
-  if ($1 !~ /(^|\|)(gi|kraken:taxid)\|(\d+)/) {
-    $errors++;
-    warn "file $filename, line $.: sequence ID lacks GI number\n";
-  }
-}
-close FASTA;
-
-if ($errors) {
-  exit 1;
-}
diff --git a/src/Makefile b/src/Makefile
index d236de3..cfebf25 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -14,7 +14,7 @@ install: $(PROGS)
	cp $(PROGS) $(KRAKEN_DIR)/

 clean:
-	rm -f $(PROGS) *.o
+	rm -rf $(PROGS) *.o *.dSYM

 db_shrink: krakendb.o quickfile.o
diff --git a/src/gzstream/.Makefile.swp b/src/gzstream/.Makefile.swp
deleted file mode 100644
index f5e077d91d7cee7748a0835509de1d24b28f23d8..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

[12288 bytes of base85-encoded binary patch data omitted — deletion of a stray vim swap file]
z!yU5g#MKI5G!2)qR~s+{u{cGFh?(Q_fIKivuo0(sCRJf>dI+GSSRy>qP2#f*y40 zT^*|~P~$RVZYNcRXq~tsPqBH2vJc8Fhsr@#2e>_9S;^bcCPw!dL))8CtkdwNmvFm8 zgGf0pKEjus3VPCkKB+(Q#1)F<0T&yB#F>*@wR>$Fm> z)+)7XrTQ|}Zq%!{>MO6%)!g)1O<;PqzD_k~HJ!3$b6|1RX>ph~HSz+#->k!)ttbg$ zIUFcaGxVGi-MEE!RoZjfHq<5RCP-OYU9FbrcBGA+x#JR5S8BCdxwcxpNwnW}!PSzU z2$2Fjh>*JzX-2a|ocG=}k^>4OQ=*Boj7TP116xW6>P7=I;YxtzFiJ>|q9pKX#E;PV zgeOp2K5-OAJf&!mF5BMSrxrS_P_!)&3La29>7!aeVQqQX(85C7fyR*%ZZ0~p4d)-Zks0-t*7cd7L z{lw7SX7A44evjPUL%QpBI-opUOVx-lDvm^$F0gG~!Yg>KcxbF&An4h`o$E!o^(9VJ zOVDB_3f}HCiekGZ+NMM`OLPrXcA^EoOO(KXpRDbzIJHczd1y-3t7}-AR%R62xG$qq zO({Uus%)i|V-ZYEvDG#G+R~qzuxdc1nhT6T#VcwI>( lygkv5w{6<&kh - * - * This file is part of the Kraken taxonomic sequence classification system. - * - * Kraken is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Kraken is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Kraken. If not, see . - */ - -// Produce a mapping of sequence IDs to taxon IDs - -// This program's reason for being is that the gi_taxid_nucl.dmp file -// is monstrously huge, and the only efficient way to do this task is -// to use mmap to quickly access the file. Otherwise, I'd have just -// used a little Perl script instead of all these strchr() calls. - -#include "kraken_headers.hpp" -#include "quickfile.hpp" - -using namespace std; -using namespace kraken; - -#define USER_SPECIFIED_FLAG "TAXID" - -map user_specified_taxids; -map > requests; -uint64_t request_count = 0; - -void fill_request_map(char *filename); -void report_taxo_numbers(char *filename); - -int main(int argc, char **argv) { - if (argc < 3) { - cerr << "Usage: make_seqid_to_taxid_map [ ]\n" - << " If nodes.dmp and names.dmp files are provided, then each sequence header is added with a further link\n" - << " to the taxonomy." 
diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh
index a2f5438..5abbb66 100755
--- a/tests/build-dbs.sh
+++ b/tests/build-dbs.sh
@@ -106,7 +106,7 @@ for VAR in $@; do
       [[ -d $EUKD ]] || mkdir -p $EUKD
       [[ -f $EUKD/taxDB ]] || cp -v $DB_DIR/taxDB $EUKD
       build_db $K euk-oct2017 fungi protozoa ;;
-    *) echo "$USAGE"
+    *) echo -e "Unknown database $VAR!\n$USAGE"
       exit 1 ;;
   esac
done

From c6871c11e0e4150191d14dcbe3ca69c973bd2b55 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Wed, 8 Nov 2017 22:15:07 -0500
Subject: [PATCH 101/105] Fix licensing

---
 install_kraken.sh            |  3 +-
 src/build_taxdb.cpp          |  6 +++---
 src/classify.cpp             |  3 ++-
 src/dump_taxdb.cpp           | 19 +++++++++++++++++
 src/grade_classification.cpp | 16 +++++++++++++-
 src/hyperloglogplus.h        | 41 ++++++++++++++++++++++--------------
 src/krakenutil.cpp           |  3 ++-
 src/krakenutil.hpp           |  3 ++-
 src/query_taxdb.cpp          |  8 +++----
 src/quickfile.cpp            |  3 ++-
 src/read_uid_mapping.cpp     | 18 ++++++++++++++++
 src/readcounts.hpp           | 18 ++++++++++++++++
 src/report-cols.h            |  5 ++---
 src/set_lcas.cpp             |  4 ++--
 src/taxdb.h                  |  7 +++---
 src/uid_mapping.cpp          | 18 ++++++++++++++++
 src/uid_mapping.hpp          | 18 ++++++++++++++++
 17 files changed, 155 insertions(+), 38 deletions(-)

diff --git a/install_kraken.sh b/install_kraken.sh
index 3b12552..9655715 100755
--- a/install_kraken.sh
+++ b/install_kraken.sh
@@ -1,9 +1,8 @@
 #!/bin/bash

+# Portions (c) 2017, Florian Breitwieser
 # Copyright 2013-2015, Derrick Wood
 #
-# This file is part of the Kraken taxonomic classification system.
-#
 # Kraken is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp
index 263de5c..467ca98 100644
--- a/src/build_taxdb.cpp
+++ b/src/build_taxdb.cpp
@@ -1,14 +1,14 @@
 /*
  * Copyright 2017, Florian Breitwieser
  *
- * This file is part of the Kraken taxonomic sequence classification system.
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
  *
- * Kraken is free software: you can redistribute it and/or modify
+ * KrakenHLL is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
  *
- * Kraken is distributed in the hope that it will be useful,
+ * KrakenHLL is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
diff --git a/src/classify.cpp b/src/classify.cpp
index 3b8a03e..a940f18 100644
--- a/src/classify.cpp
+++ b/src/classify.cpp
@@ -1,5 +1,6 @@
 /*
- * Copyright 2013-2015, Derrick Wood
+ * Original file Copyright 2013-2015, Derrick Wood
+ * Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
  *
  * This file is part of the Kraken taxonomic sequence classification system.
  *
diff --git a/src/dump_taxdb.cpp b/src/dump_taxdb.cpp
index 76246e4..b2e8555 100644
--- a/src/dump_taxdb.cpp
+++ b/src/dump_taxdb.cpp
@@ -1,3 +1,22 @@
+/*
+ * Copyright 2017, Florian Breitwieser
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
 #include "taxdb.h"
 #include "quickfile.hpp"
 #include <...>
diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp
index 148c7e9..781b26e 100644
--- a/src/grade_classification.cpp
+++ b/src/grade_classification.cpp
@@ -1,6 +1,20 @@
 /*
  * Copyright 2017, Florian Breitwieser
- * licnsed under GPLv3
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
  */

 #include "taxdb.h"
diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h
index 5e27407..21d40eb 100644
--- a/src/hyperloglogplus.h
+++ b/src/hyperloglogplus.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright 2017, Florian Breitwieser
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
+ */
 /*
  * hyperloglogplus.h
  *
@@ -99,16 +117,14 @@ double alpha(uint32_t m) {

 /**
  * calculate the raw estimate as harmonic mean of the ranks in the register
- * @param array
- * @return
  */
-double calculateEstimate(vector<uint8_t> array) {
+double calculateRawEstimate(vector<uint8_t> M) {
   double inverseSum = 0.0;
-  for (size_t i = 0; i < array.size(); ++i) {
+  for (size_t i = 0; i < M.size(); ++i) {
     // TODO: pre-calculate the power calculation
-    inverseSum += pow(2,-array[i]);
+    inverseSum += pow(2,-M[i]);
   }
-  return alpha(array.size()) * double(array.size() * array.size()) * 1 / inverseSum;
+  return alpha(M.size()) * double(M.size() * M.size()) * 1 / inverseSum;
 }

 uint32_t countZeros(vector<uint8_t> s) {
@@ -117,10 +133,6 @@ uint32_t countZeros(vector<uint8_t> s) {

 /**
  * Extract bits (from uint32_t or uint64_t) using LSB 0 numbering from hi to lo, including lo
- * @param bits
- * @param hi
- * @param lo
- * @return
  */
 template<typename T>
 T extractBits(T value, uint8_t hi, uint8_t lo, bool shift_left = false) {
@@ -198,19 +210,16 @@ inline uint32_t clz(const uint32_t x) {
 }

 inline uint32_t clz(const uint64_t x) {
-  uint32_t u32 = (x >> 32);
+  return __builtin_clzl(x);
+/*  uint32_t u32 = (x >> 32);
   uint32_t result = u32 ? __builtin_clz(u32) : 32;
   if (result == 32) {
     u32 = x & 0xFFFFFFFFUL;
     result += (u32 ? __builtin_clz(u32) : 32);
   }
-  return result;
+  return result; */
 }
 //#else
-
-uint32_t clz_log2(const uint64_t w) {
-  return 63 - floor(log2(w));
-}
 //#endif
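The clz() rewrite above replaces the two-step 32-bit fallback with a single __builtin_clzl call. Note that __builtin_clzl counts leading zeros of an unsigned long, so this assumes a 64-bit long (on a 32-bit long platform, __builtin_clzll would be the safe spelling), and like all clz builtins it is undefined for a zero argument. A small sanity check that the retired fallback and the builtin agree on nonzero 64-bit inputs:

    #include <cassert>
    #include <cstdint>

    // The two-step version removed by the patch, kept here for comparison.
    static uint32_t clz_two_step(uint64_t x) {
      uint32_t u32 = (uint32_t)(x >> 32);
      uint32_t result = u32 ? __builtin_clz(u32) : 32;
      if (result == 32) {
        u32 = (uint32_t)(x & 0xFFFFFFFFUL);
        result += (u32 ? __builtin_clz(u32) : 32);
      }
      return result;
    }

    int main() {
      // Check every power of two; both sides are undefined for x == 0.
      for (uint64_t x = 1; x != 0; x <<= 1)
        assert(clz_two_step(x) == (uint32_t)__builtin_clzll(x));
      return 0;
    }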
diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp
index 46fd953..2e18cf0 100644
--- a/src/krakenutil.cpp
+++ b/src/krakenutil.cpp
@@ -1,5 +1,6 @@
 /*
- * Copyright 2013-2015, Derrick Wood
+ * Original file Copyright 2013-2015, Derrick Wood
+ * Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
  *
  * This file is part of the Kraken taxonomic sequence classification system.
  *
diff --git a/src/krakenutil.hpp b/src/krakenutil.hpp
index cbfd3d5..aff26bf 100644
--- a/src/krakenutil.hpp
+++ b/src/krakenutil.hpp
@@ -1,5 +1,6 @@
 /*
- * Copyright 2013-2015, Derrick Wood
+ * Original file Copyright 2013-2015, Derrick Wood
+ * Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
  *
  * This file is part of the Kraken taxonomic sequence classification system.
  *
diff --git a/src/query_taxdb.cpp b/src/query_taxdb.cpp
index 7412792..a45c117 100644
--- a/src/query_taxdb.cpp
+++ b/src/query_taxdb.cpp
@@ -1,20 +1,20 @@
 /*
  * Copyright 2017, Florian Breitwieser
  *
- * This file is part of the Kraken taxonomic sequence classification system.
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
  *
- * Kraken is free software: you can redistribute it and/or modify
+ * KrakenHLL is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
  *
- * Kraken is distributed in the hope that it will be useful,
+ * KrakenHLL is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
- * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
+ * along with KrakenHLL.  If not, see <http://www.gnu.org/licenses/>.
  */

 #include "taxdb.h"
diff --git a/src/quickfile.cpp b/src/quickfile.cpp
index c518dd9..39e257d 100644
--- a/src/quickfile.cpp
+++ b/src/quickfile.cpp
@@ -1,5 +1,6 @@
 /*
- * Copyright 2013-2015, Derrick Wood
+ * Original file Copyright 2013-2015, Derrick Wood
+ * Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
  *
  * This file is part of the Kraken taxonomic sequence classification system.
  *
diff --git a/src/read_uid_mapping.cpp b/src/read_uid_mapping.cpp
index 8f83742..3802924 100644
--- a/src/read_uid_mapping.cpp
+++ b/src/read_uid_mapping.cpp
@@ -1,3 +1,21 @@
+/*
+ * Copyright 2017, Florian Breitwieser
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
+ */
 #include "uid_mapping.hpp"
 #include "kraken_headers.hpp"
diff --git a/src/readcounts.hpp b/src/readcounts.hpp
index eddca78..afb45c7 100644
--- a/src/readcounts.hpp
+++ b/src/readcounts.hpp
@@ -1,3 +1,21 @@
+/*
+ * Copyright 2017, Florian Breitwieser
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
+ */
 #ifndef READCOUNTS_HPP
 #define READCOUNTS_HPP
diff --git a/src/report-cols.h b/src/report-cols.h
index e5fa0a5..9a37d56 100644
--- a/src/report-cols.h
+++ b/src/report-cols.h
@@ -1,8 +1,7 @@
 /*
  * report-cols.h
- * Copyright (C) 2017 fbreitwieser
- *
- * Distributed under terms of the MIT license.
+ * Copyright (C) 2017 fbreitwieser
+ * licensed under GPL3
  */

 #ifndef REPORT_COLS_H
diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp
index dc75c63..25cebf6 100644
--- a/src/set_lcas.cpp
+++ b/src/set_lcas.cpp
@@ -1,6 +1,6 @@
-// vim: noai:ts=2:sw=2:expandtab:smarttab
 /*
- * Copyright 2013-2015, Derrick Wood
+ * Original file Copyright 2013-2015, Derrick Wood
+ * Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
  *
  * This file is part of the Kraken taxonomic sequence classification system.
  *
diff --git a/src/taxdb.h b/src/taxdb.h
index aef1e50..38f71a5 100644
--- a/src/taxdb.h
+++ b/src/taxdb.h
@@ -1,8 +1,9 @@
-/* Original work Copyright 2013 David Ainsworth
- * Modified work copyright 2017 Florian Breitwieser
+/*
+ * Original work Copyright 2013 David Ainsworth
+ * Modified work copyright 2017 Florian Breitwieser
  *
  * The original file is part of SLAM
- * The modified file is part of a modified Kraken version
+ * The modified file is part of KrakenHLL
  *
  * SLAM is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Affero General Public License as published by
diff --git a/src/uid_mapping.cpp b/src/uid_mapping.cpp
index 2914468..e2fc4cd 100644
--- a/src/uid_mapping.cpp
+++ b/src/uid_mapping.cpp
@@ -1,3 +1,21 @@
+/*
+ * Copyright 2017, Florian Breitwieser
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
+ */
 #include <...>
 #include <...>
diff --git a/src/uid_mapping.hpp b/src/uid_mapping.hpp
index 93d1680..64e6193 100644
--- a/src/uid_mapping.hpp
+++ b/src/uid_mapping.hpp
@@ -1,3 +1,21 @@
+/*
+ * Copyright 2017, Florian Breitwieser
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
+ */
 #ifndef UID_MAPPING_H
 #define UID_MAPPING_H

From 53314ae85b8422649e7fb6f89664fc6f5fc1e255 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Thu, 9 Nov 2017 18:06:38 -0500
Subject: [PATCH 102/105] Fix taxonomy reporting order and percentage

---
 src/Makefile       |   2 +-
 src/readcounts.hpp |  10 +
 src/taxdb.h        | 898 ++++++++++++++++++++++-----------------------
 3 files changed, 451 insertions(+), 459 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index cfebf25..37e068d 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -2,7 +2,7 @@ CXX = g++
 FOPENMP?=-fopenmp
 CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -I./gzstream -O2 -Wfatal-errors ${CPPFLAGS}
 #CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors
-PROGS = classify db_sort set_lcas make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping
+PROGS = classify db_sort set_lcas db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping

 #LIBFLAGS = -L. -lz -lgzstream ${LDFLAGS}
 LIBFLAGS = -L. -lz ${LDFLAGS}
diff --git a/src/readcounts.hpp b/src/readcounts.hpp
index afb45c7..74a52a6 100644
--- a/src/readcounts.hpp
+++ b/src/readcounts.hpp
@@ -45,6 +45,16 @@ namespace kraken {
       kmers += b.kmers;
       return *this;
     }
+
+    bool operator<(const ReadCounts& rc) {
+      if (n_reads < rc.n_reads) {
+        return true;
+      }
+      if (n_reads == rc.n_reads && n_kmers < rc.n_kmers) {
+        return true;
+      }
+      return false;
+    }
   };

   uint64_t reads(const ReadCounts& read_count) {
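The new operator< is what lets report rows with equal read counts be ordered deterministically by k-mer count. A self-contained illustration of the same ordering rule (the struct and the sort call are stand-ins for this sketch, not the patch's actual call site; note that the member operator< above would normally also be declared const):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Mirror of the comparator added above: order by read count,
    // break ties by distinct-kmer count (both ascending).
    struct RC { uint64_t n_reads, n_kmers; };
    static bool rc_less(const RC& a, const RC& b) {
      if (a.n_reads < b.n_reads) return true;
      return a.n_reads == b.n_reads && a.n_kmers < b.n_kmers;
    }

    int main() {
      std::vector<RC> rows = { {10, 500}, {10, 200}, {3, 900} };
      std::sort(rows.begin(), rows.end(), rc_less);
      // rows is now {3,900}, {10,200}, {10,500}
      return 0;
    }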
diff --git a/src/taxdb.h b/src/taxdb.h
index 38f71a5..0416119 100644
--- a/src/taxdb.h
+++ b/src/taxdb.h
@@ -42,12 +42,12 @@ using namespace std;

 namespace patch {
-	template < typename T > std::string to_string( const T& n )
-	{
-		std::ostringstream stm ;
-		stm << n ;
-		return stm.str() ;
-	}
+  template < typename T > std::string to_string( const T& n )
+  {
+    std::ostringstream stm ;
+    stm << n ;
+    return stm.str() ;
+  }
 }

@@ -73,7 +73,7 @@ struct TaxRank {
   // plus 'sequence', 'assembly', and 'root'
   //static constexpr vector<string> rank_strings = {
   //  "no rank", "sequence", "assembly",
-  //  "subspecies", "species", "subgenus", "genus", "tribe", "subfamily",
+  //  "subspecies", "species", "subgenus", "genus", "tribe", "subfamily",
   //"family", "superfamily", "parvorder", "infraorder", "suborder",
   //"order", "superorder", "parvclass", "infraclass", "subclass",
   //"class", "superclass", "subphylum", "phylum", "kingdom",
@@ -130,11 +130,11 @@ struct TaxRank {
       case RANK::superkingdom: return "superkingdom";
       case RANK::root:         return "root";
       default:
-        log_msg("Invalid rank!\n");
+        log_msg("Invalid rank!\n");
     }
     return "NA";
   }
-
+
 };

 const unordered_map<string, TaxRank::RANK> TaxRank::string_to_rank = {
@@ -172,35 +172,35 @@ const unordered_map<string, TaxRank::RANK> TaxRank::string_to_rank = {

 template<typename TAXID>
 class TaxonomyEntry {
-  public:
-  TAXID taxonomyID;
-  TaxonomyEntry* parent;
-  std::vector<TaxonomyEntry*> children;
+  public:
+    TAXID taxonomyID;
+    TaxonomyEntry* parent;
+    std::vector<TaxonomyEntry*> children;

-  string rank;
-  std::string scientificName;
-  uint64_t genomeSize;
-  uint64_t genomeSizeOfChildren;
+    string rank;
+    std::string scientificName;
+    uint64_t genomeSize;
+    uint64_t genomeSizeOfChildren;

-  TaxonomyEntry() : taxonomyID(0), parent(NULL), genomeSize(0), genomeSizeOfChildren(0) {}
+    TaxonomyEntry() : taxonomyID(0), parent(NULL), genomeSize(0), genomeSizeOfChildren(0) {}

-  TaxonomyEntry(TAXID taxonomyID_, TaxonomyEntry* parent_, std::string rank_, std::string scientificName_, uint64_t genomeSize_ = 0, uint64_t genomeSizeOfChildren_ = 0) :
-    taxonomyID(taxonomyID_), parent(parent_), rank(rank_), scientificName(scientificName_),
+    TaxonomyEntry(TAXID taxonomyID_, TaxonomyEntry* parent_, std::string rank_, std::string scientificName_, uint64_t genomeSize_ = 0, uint64_t genomeSizeOfChildren_ = 0) :
+      taxonomyID(taxonomyID_), parent(parent_), rank(rank_), scientificName(scientificName_),
       genomeSize(genomeSize_), genomeSizeOfChildren(genomeSizeOfChildren_) {

-    if (parent_ != NULL) {
-      parent->children.push_back(this);
-    }
-  }
+      if (parent_ != NULL) {
+        parent->children.push_back(this);
+      }
+    }

-  inline bool operator==(const TaxonomyEntry& other) const;
+    inline bool operator==(const TaxonomyEntry& other) const;

-  friend std::ostream &operator<<(std::ostream &os, const TaxonomyEntry &m) {
-    TAXID parentTaxonomyID = (m.parent == NULL)? m.taxonomyID : m.parent->taxonomyID;
-    os << '[' << m.taxonomyID << ";parent="<< parentTaxonomyID << ";name=" << m.scientificName << ";rank=" << m.rank << ']';
-    return os;
-}
+    friend std::ostream &operator<<(std::ostream &os, const TaxonomyEntry &m) {
+      TAXID parentTaxonomyID = (m.parent == NULL)? m.taxonomyID : m.parent->taxonomyID;
+      os << '[' << m.taxonomyID << ";parent="<< parentTaxonomyID << ";name=" << m.scientificName << ";rank=" << m.rank << ']';
+      return os;
+    }
 };

@@ -214,91 +214,91 @@ class TaxonomyEntry {
 //}

 /*
-template<typename TAXID>
-struct TaxonomyEntryPtr_comp {
-  bool operator() ( const TaxonomyEntry<TAXID>* a, const TaxonomyEntry<TAXID>* b) const;
-};
-*/
+  template<typename TAXID>
+  struct TaxonomyEntryPtr_comp {
+    bool operator() ( const TaxonomyEntry<TAXID>* a, const TaxonomyEntry<TAXID>* b) const;
+  };
+  */

 template<typename TAXID>
 class TaxonomyDB {
   public:
   TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName);
   TaxonomyDB(const std::string inFileName, bool hasGenomeSizes = false);
   TaxonomyDB();

   TaxonomyDB(TaxonomyDB&& rhs) : entries(std::move(rhs.entries)) {
   }

   TaxonomyDB& operator=(TaxonomyDB&& rhs) {
     entries = std::move(rhs.entries);
     return *this;
   }

   void writeTaxonomyIndex(std::ostream & outs) const;
   void readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes);

   TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const;
   std::string getScientificName(const TAXID taxID) const;
   std::string getRank(const TAXID taxID) const;
   TAXID getLowestCommonAncestor(const std::vector<TAXID>& taxIDs) const;
   pair<TAXID, int> getLowestCommonAncestor(TAXID a, TAXID b) const;
   string getNextProperRank(TAXID a) const;
   TAXID getTaxIDAtNextProperRank(TAXID a) const;

   TAXID getParentTaxID(const TAXID taxID) const;
   std::unordered_map<TAXID, TAXID> getParentMap() const;
   TAXID getByScientificName(string name) const;
   std::unordered_map<std::string, TAXID> getScientificNameMap() const;
   std::string getLineage(TAXID taxonomyID) const;
   std::string getMetaPhlAnLineage(TAXID taxonomyID) const;
   TaxonomyEntry<TAXID> getEntry(TAXID taxID) const;

   bool insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_);
   bool hasTaxon(TAXID taxonomyID_);

   size_t distance(TAXID taxID1, TAXID taxID2) const;

   bool isSubSpecies(TAXID taxonomyID) const;
   int isBelowInTree(TAXID upper, TAXID lower) const;

   void setGenomeSizes(const std::unordered_map<TAXID, uint64_t> & genomeSizes);
   void readGenomeSizes(string file);
   void setGenomeSize(const TAXID taxid, const uint64_t genomeSize);

   void printReport();

   std::unordered_map<TAXID, TaxonomyEntry<TAXID> > entries;
   bool genomeSizes_are_set;

   private:

   std::unordered_map<TAXID, TaxonomyEntry<TAXID> >
   readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes);
 };

 template<typename TAXID, typename READCOUNTS>
 class TaxReport {
-private:
-	std::ostream& _reportOfb;
-	TaxonomyDB<TAXID> & _taxdb;
-	std::unordered_map<TAXID, READCOUNTS> _readCounts;
-	std::unordered_map<TAXID, READCOUNTS> _readCountsIncludingChildren;
-	uint64_t _total_n_reads;
-	bool _show_zeros;
-	void printLine(TaxonomyEntry<TAXID>& tax, unsigned depth);
-
-public:
-	TaxReport(std::ostream& _reportOfb, TaxonomyDB<TAXID> & taxdb, std::unordered_map<TAXID, READCOUNTS>, bool _show_zeros);
-	void printReport(std::string format, std::string rank);
-	void printReport(TaxonomyEntry<TAXID>& tax, unsigned depth);
-	void setReportCols(std::vector<std::string> names);
-
-	std::vector<std::string> _report_col_names;
-	std::vector<REPORTCOLS> _report_cols;
+  private:
+    std::ostream& _reportOfb;
+    TaxonomyDB<TAXID> & _taxdb;
+    std::unordered_map<TAXID, READCOUNTS> _readCounts;
+    std::unordered_map<TAXID, READCOUNTS> _readCountsIncludingChildren;
+    uint64_t _total_n_reads;
+    bool _show_zeros;
+    void printLine(TaxonomyEntry<TAXID>& tax, unsigned depth);

+  public:
+    TaxReport(std::ostream& _reportOfb, TaxonomyDB<TAXID> & taxdb, std::unordered_map<TAXID, READCOUNTS>, bool _show_zeros);
+    void printReport(std::string format, std::string rank);
+    void printReport(TaxonomyEntry<TAXID>& tax, unsigned depth);
+    void setReportCols(std::vector<std::string> names);

+    std::vector<std::string> _report_col_names;
+    std::vector<REPORTCOLS> _report_cols;
 };

 template<typename K, typename V>
 V find_or_use_default(const std::unordered_map<K, V>& my_map, const K& query, const V default_value);

//////////////////////////// DEFINITIONS

 void log_msg (const std::string& s) {
   std::cerr << s;
 }

 template<typename T>
 uint64_t string_to_T(string str) {

 template<typename T>
 inline uint64_t reads(const T read_count) {
   cerr << "No reads function for type!! " << endl;
   throw ;
   return(0);
 }
 inline uint64_t reads(const uint64_t read_count) {
   return(read_count);
 }

 std::vector<std::string> in_betweens(const std::string &s, const char start_char, const char end_char, size_t start_at) {
   std::vector<std::string> tokens;
   size_t i = 0;
   size_t next_end = start_at-1;

   for (size_t next_start = s.find(start_char, next_end + 1); \
        next_start != string::npos;
        next_start = s.find(start_char, next_end + 1), ++i) {

     next_end = s.find(end_char, next_start + 1);
     if (next_end == string::npos) {
       cerr << "unmatched start and end!";
       exit(1);
     }

     tokens.push_back(s.substr(next_start+1, next_end-1));
   }

   return tokens;
 }

 std::vector<std::string> tokenise(const std::string &s, const std::string& delimiter, size_t max_fields, size_t end_chars) {
   std::vector<std::string> tokens(max_fields);
   size_t delim_length = delimiter.length();
   size_t last = 0;
   size_t i = 0;

   for (size_t next = s.find(delimiter, last);
        (max_fields > 0 && i < max_fields) && next != string::npos;
        next = s.find(delimiter, last), ++i) {
     tokens[i] = s.substr(last, next-last);
     last = next + delim_length;
   }
   if (max_fields > 0 && i < max_fields) {
     tokens[max_fields-1] = s.substr(last, s.length()-last-end_chars);
   }

   return tokens;
 }

 std::vector<std::string> get_fields(const std::string &s, const std::string& delimiter, vector<size_t> fields) {
   std::vector<std::string> tokens;
   tokens.reserve(fields.size());
   size_t delim_length = delimiter.length();
   size_t last = 0;
   size_t i = 0;
   size_t current_field = 0;

   for (size_t next = s.find(delimiter, last);
        tokens.size() < fields.size() && next != string::npos;
        next = s.find(delimiter, last), ++i) {
     if (i == fields[current_field]) {
       tokens.push_back(s.substr(last, next-last));
       ++current_field;
     }
     last = next + delim_length;
   }

   return tokens;
 }
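For orientation, tokenise() fills exactly max_fields slots, splits on a multi-character delimiter, and trims end_chars characters off the final field. A hypothetical call on a nodes.dmp-style line (the sample input is invented for illustration, and the snippet assumes taxdb.h's declarations are visible when compiled):

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      // "<taxid>\t|\t<parent>\t|\t<rank>\t|" as in NCBI nodes.dmp
      std::string line = "9606\t|\t9605\t|\tspecies\t|";
      // 3 fields, delimiter "\t|\t", trim the 2 trailing chars "\t|"
      std::vector<std::string> parts = tokenise(line, "\t|\t", 3, 2);
      std::cout << parts[0] << " " << parts[1] << " " << parts[2] << "\n";
      // prints: 9606 9605 species
      return 0;
    }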
@@ -401,49 +401,49 @@ std::vector<std::string> get_fields(const std::string &s, const std::string& del
 //  readCountsOfChildren = 0;
 //}

 /*
-template<typename TAXID>
-bool TaxonomyEntryPtr_comp<TAXID>::operator() ( const TaxonomyEntry<TAXID>* a, const TaxonomyEntry<TAXID>* b) const {
-    return (
-    (reads(a->readCounts)+reads(a->readCountsOfChildren)) > (reads(b->readCounts)+reads(b->readCountsOfChildren)));
-  }
-*/
+  template<typename TAXID>
+  bool TaxonomyEntryPtr_comp<TAXID>::operator() ( const TaxonomyEntry<TAXID>* a, const TaxonomyEntry<TAXID>* b) const {
+    return (
+        (reads(a->readCounts)+reads(a->readCountsOfChildren)) > (reads(b->readCounts)+reads(b->readCountsOfChildren)));
+  }
+  */

 /*
-template<typename TAXID>
-TAXID TaxonomyDB<TAXID>::getByScientificName(string name) const {
-  for (const auto & tax : entries) {
-    if (tax.second.scientificName == name) {
-      return tax.first;
-    }
-  }
-  return 0;
-}
-
-template<typename TAXID>
-std::unordered_map<std::string, TAXID> TaxonomyDB<TAXID>::getScientificNameMap() const {
-  std::unordered_map<std::string, TAXID> scientificNameMap;
-  for (const auto & tax : entries) {
-    scientificNameMap[tax.second.scientificName] = tax.first;
-  }
-  return scientificNameMap;
-}
-*/
+  template<typename TAXID>
+  TAXID TaxonomyDB<TAXID>::getByScientificName(string name) const {
+    for (const auto & tax : entries) {
+      if (tax.second.scientificName == name) {
+        return tax.first;
+      }
+    }
+    return 0;
+  }
+
+  template<typename TAXID>
+  std::unordered_map<std::string, TAXID> TaxonomyDB<TAXID>::getScientificNameMap() const {
+    std::unordered_map<std::string, TAXID> scientificNameMap;
+    for (const auto & tax : entries) {
+      scientificNameMap[tax.second.scientificName] = tax.first;
+    }
+    return scientificNameMap;
+  }
+  */

 template<typename TAXID>
 unordered_map<TAXID, TAXID> TaxonomyDB<TAXID>::getParentMap() const {
-  unordered_map<TAXID, TAXID> Parent_map;
-  //for (const auto & tax : entries) {
-  for (auto tax_it = entries.begin(); tax_it != entries.end(); ++tax_it) {
-    if (tax_it->first == 0)
-      continue;
-    if (tax_it->second.parent == NULL) {
-      //cerr << "Parent for " << tax.first << " is 0\n";
-      Parent_map[tax_it->first] = 0; // for kraken::lca
-    } else {
-      Parent_map[tax_it->first] = tax_it->second.parent->taxonomyID;
-    }
+  unordered_map<TAXID, TAXID> Parent_map;
+  //for (const auto & tax : entries) {
+  for (auto tax_it = entries.begin(); tax_it != entries.end(); ++tax_it) {
+    if (tax_it->first == 0)
+      continue;
+    if (tax_it->second.parent == NULL) {
+      //cerr << "Parent for " << tax.first << " is 0\n";
+      Parent_map[tax_it->first] = 0; // for kraken::lca
+    } else {
+      Parent_map[tax_it->first] = tax_it->second.parent->taxonomyID;
     }
+  }
   return Parent_map;
 }

 template<typename TAXID>
@@ -459,25 +459,25 @@ TaxonomyEntry<TAXID> TaxonomyDB<TAXID>::getEntry(TAXID taxID) const {

 template<typename TAXID>
 void createPointers(
-    std::unordered_map<TAXID, TaxonomyEntry<TAXID> >& entries,
-    const std::unordered_map<TAXID, TAXID>& parentMap) {
+    std::unordered_map<TAXID, TaxonomyEntry<TAXID> >& entries,
+    const std::unordered_map<TAXID, TAXID>& parentMap) {

   for (auto entry_it = entries.begin(); entry_it != entries.end(); ++entry_it) {
-    TAXID taxonomyID = entry_it->first;
-    auto parent_it = parentMap.find(taxonomyID);
-    if (parent_it == parentMap.end()) {
-      cerr << "Cannot find parent for " << taxonomyID << endl;
-    } else {
-      TAXID parentTaxonomyID = parent_it->second;
+    TAXID taxonomyID = entry_it->first;
+    auto parent_it = parentMap.find(taxonomyID);
+    if (parent_it == parentMap.end()) {
+      cerr << "Cannot find parent for " << taxonomyID << endl;
+    } else {
+      TAXID parentTaxonomyID = parent_it->second;
       if (taxonomyID != parentTaxonomyID) {
-        auto parent_ptr = entries.find(parentTaxonomyID);
-        if (parent_ptr != entries.end()) {
-          entry_it->second.parent = &parent_ptr->second;
-          parent_ptr->second.children.push_back(&entry_it->second);
-        } else {
-          cerr << "Could not find parent with taxonomy ID " << parentTaxonomyID << " for taxonomy ID " << taxonomyID << endl;
-        }
-      }
+        auto parent_ptr = entries.find(parentTaxonomyID);
+        if (parent_ptr != entries.end()) {
+          entry_it->second.parent = &parent_ptr->second;
+          parent_ptr->second.children.push_back(&entry_it->second);
+        } else {
+          cerr << "Could not find parent with taxonomy ID " << parentTaxonomyID << " for taxonomy ID " << taxonomyID << endl;
+        }
+      }
     }
   }
 }

@@ -487,7 +487,7 @@ TaxonomyDB<TAXID>::TaxonomyDB() : genomeSizes_are_set(false) { }

 template<typename TAXID>
 TaxonomyDB<TAXID>::TaxonomyDB(const std::string inFileName, bool hasGenomeSizes) :
   entries( readTaxonomyIndex_(inFileName, hasGenomeSizes) ), genomeSizes_are_set(hasGenomeSizes)
-  { }
+{ }

 template<typename TAXID>
 unordered_map<TAXID, TaxonomyEntry<TAXID> > readDumps(const std::string namesDumpFileName, const std::string nodesDumpFileName) {
@@ -502,8 +502,8 @@ unordered_map<TAXID, TaxonomyEntry<TAXID> > readDumps(const std::string namesDump

 template<typename TAXID>
 TaxonomyDB<TAXID>::TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName) :
-  entries(readDumps(namesDumpFileName, nodesDumpFileName)) {
-}
+  entries(readDumps(namesDumpFileName, nodesDumpFileName)) {
+  }

 template<typename TAXID>
 std::unordered_map<TAXID, TAXID> parseNodesDump(const std::string nodesDumpFileName, std::unordered_map<TAXID, TaxonomyEntry<TAXID> >& entries) {
@@ -525,9 +525,9 @@ std::unordered_map<TAXID, TAXID> parseNodesDump(const std::string nodesDumpFileNa
     auto entryIt = entries.find(taxonomyID);
     if (entryIt == entries.end()) {
       entries[taxonomyID] = TaxonomyEntry<TAXID>(taxonomyID, NULL, rank, "");
-      parentMap[taxonomyID] = parentTaxonomyID;
     } else {
-      parentMap[taxonomyID] = parentTaxonomyID;
+      parentMap[taxonomyID] = parentTaxonomyID;
       entryIt->second.rank = rank;
     }

@@ -556,11 +556,11 @@ void parseNamesDump(const std::string namesDumpFileName, std::unordered_map<TAXID, TaxonomyEntry<TAXID> >& entries) {
-    if (entryIt == entries.end()) {
-      cerr << "Entry for " << taxonomyID << " does not exist - it should!" << '\n';
-      //entries[taxonomyID] = TaxonomyEntry<TAXID>(taxonomyID, NULL, "", scientificName);
+  if (entryIt == entries.end()) {
+    cerr << "Entry for " << taxonomyID << " does not exist - it should!" << '\n';
+    //entries[taxonomyID] = TaxonomyEntry<TAXID>(taxonomyID, NULL, "", scientificName);
     } else {
-      entryIt->second.scientificName = scientificName;
+    entryIt->second.scientificName = scientificName;
     }
   }
   namesDumpFile.ignore(2560, '\n');
@@ -572,7 +572,7 @@ std::vector<KeyType> getSortedKeys(const std::unordered_map<KeyType, ValueType>&
   std::vector<KeyType> keys;
   keys.reserve (my_unordered_map.size());
   for (auto it = my_unordered_map.begin(); it != my_unordered_map.end(); ++it) {
-    keys.push_back(it->first);
+    keys.push_back(it->first);
   }
   std::sort (keys.begin(), keys.end());
   return keys;
 }
@@ -582,15 +582,15 @@
 template<typename TAXID>
 void TaxonomyDB<TAXID>::writeTaxonomyIndex(std::ostream & outs) const {
   std::vector<TAXID> sorted_keys = getSortedKeys(entries);
   for (size_t i = 0; i < sorted_keys.size(); ++i) {
-    TAXID taxonomyID = sorted_keys[i];
-    const auto& entry = entries.at(taxonomyID);
-    TAXID parentTaxonomyID = (entry.parent==NULL? taxonomyID : entry.parent->taxonomyID);
taxonomyID : entry.parent->taxonomyID); outs << taxonomyID << '\t' << parentTaxonomyID << '\t' - << entry.scientificName << '\t' << entry.rank; + << entry.scientificName << '\t' << entry.rank; if (genomeSizes_are_set) { - outs << '\t' << entry.genomeSize << '\t' << entry.genomeSizeOfChildren; - } - outs << '\n'; + outs << '\t' << entry.genomeSize << '\t' << entry.genomeSizeOfChildren; + } + outs << '\n'; } outs.flush(); } @@ -598,7 +598,7 @@ void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { template void TaxonomyDB::setGenomeSizes(const std::unordered_map & genomeSizes) { for (auto it = genomeSizes.begin(); it != genomeSizes.end(); ++it) { - setGenomeSize(it->first, it->second); + setGenomeSize(it->first, it->second); } genomeSizes_are_set = true; } @@ -611,7 +611,7 @@ void TaxonomyDB::readTaxonomyIndex(const std::string inFileName, bool has template std::unordered_map > - TaxonomyDB::readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes) { +TaxonomyDB::readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes) { log_msg("Reading taxonomy index from " + inFileName); std::ifstream inFile(inFileName); if (!inFile.is_open()) @@ -634,10 +634,10 @@ std::unordered_map > inFile.get(); // read tab std::getline(inFile, scientificName, '\t'); if (hasGenomeSizes) { - std::getline(inFile, rank, '\t'); - inFile >> genomeSize >> genomeSizeOfChildren; + std::getline(inFile, rank, '\t'); + inFile >> genomeSize >> genomeSizeOfChildren; } else { - std::getline(inFile, rank, '\n'); + std::getline(inFile, rank, '\n'); } TaxonomyEntry newEntry(taxonomyID, NULL, rank, scientificName, genomeSize, genomeSizeOfChildren); @@ -679,76 +679,76 @@ TAXID TaxonomyDB::getTaxIDAtNextProperRank(TAXID a) const { template pair TaxonomyDB::getLowestCommonAncestor(TAXID a, TAXID b) const { - if (a == 0 || b == 0) { - return a ? pair(a,-1) : pair(b,-1); - } + if (a == 0 || b == 0) { + return a ? 
pair(a,-1) : pair(b,-1); + } - // create a path from a to the root - std::unordered_set a_path; - int distA = 0; - while (a > 0 && a != getParentTaxID(a)) { - if (a == b) - return pair{a, distA}; - a_path.insert(a); - a = getParentTaxID(a); - ++distA; - } + // create a path from a to the root + std::unordered_set a_path; + int distA = 0; + while (a > 0 && a != getParentTaxID(a)) { + if (a == b) + return pair{a, distA}; + a_path.insert(a); + a = getParentTaxID(a); + ++distA; + } - int distB = 0; - // search for b in the path from a to the root - while (b > 0 && b != getParentTaxID(b)) { - auto it = a_path.find(b); - if (it != a_path.end()) { - return pair(b, distB + std::distance(a_path.begin(), it)); - } - b = getParentTaxID(b); - ++distB; + int distB = 0; + // search for b in the path from a to the root + while (b > 0 && b != getParentTaxID(b)) { + auto it = a_path.find(b); + if (it != a_path.end()) { + return pair(b, distB + std::distance(a_path.begin(), it)); } - return pair(1, distA+distB); + b = getParentTaxID(b); + ++distB; + } + return pair(1, distA+distB); } /* -template -TAXID TaxonomyDB::getLowestCommonAncestor( - const std::vector& taxIDs) const { - if (taxIDs.size() == 0) { - return 0; - } - std::vector > paths; - for (auto& taxID : taxIDs) { - bool good = true; - std::vector path; - TAXID tempTaxID = taxID; - while (tempTaxID != 0) { - path.push_back(tempTaxID); - tempTaxID = getParentTaxID(tempTaxID); - } - if (good) paths.push_back(path); - } - if (paths.size() == 0) { - return 0; - } - for (auto& path : paths) - std::reverse(path.begin(), path.end()); - std::sort(paths.begin(), paths.end(), - [](std::vector i, std::vector j) { - return i.size() < j.size(); - }); - TAXID consensus = 0; - // assumes equal paths lengths?? - for (unsigned i = 0; i < paths[0].size(); i++) { - TAXID temp = 0; - for (auto& path : paths) { - if (temp == 0) - temp = path[i]; - else if (temp != path[i]) { - return consensus; - } - } - consensus = temp; - } - return consensus; + template + TAXID TaxonomyDB::getLowestCommonAncestor( + const std::vector& taxIDs) const { + if (taxIDs.size() == 0) { + return 0; + } + std::vector > paths; + for (auto& taxID : taxIDs) { + bool good = true; + std::vector path; + TAXID tempTaxID = taxID; + while (tempTaxID != 0) { + path.push_back(tempTaxID); + tempTaxID = getParentTaxID(tempTaxID); + } + if (good) paths.push_back(path); + } + if (paths.size() == 0) { + return 0; + } + for (auto& path : paths) + std::reverse(path.begin(), path.end()); + std::sort(paths.begin(), paths.end(), + [](std::vector i, std::vector j) { + return i.size() < j.size(); + }); + TAXID consensus = 0; +// assumes equal paths lengths?? 
+ for (unsigned i = 0; i < paths[0].size(); i++) {
+ TAXID temp = 0;
+ for (auto& path : paths) {
+ if (temp == 0)
+ temp = path[i];
+ else if (temp != path[i]) {
+ return consensus;
+ }
+ }
+ consensus = temp;
+ }
+ return consensus;
}
*/

@@ -837,26 +837,26 @@ std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const {
 if (taxonomyID != 131567) {
 std::string rank = getRank(taxonomyID);
 if (rank == "species") {
- lineage.insert(0, "|s__");
- lineage.insert(4, getScientificName(taxonomyID));
+ lineage.insert(0, "|s__");
+ lineage.insert(4, getScientificName(taxonomyID));
 } else if (rank == "genus") {
- lineage.insert(0, "|g__");
- lineage.insert(4, getScientificName(taxonomyID));
+ lineage.insert(0, "|g__");
+ lineage.insert(4, getScientificName(taxonomyID));
 } else if (rank == "family") {
- lineage.insert(0, "|f__");
- lineage.insert(4, getScientificName(taxonomyID));
+ lineage.insert(0, "|f__");
+ lineage.insert(4, getScientificName(taxonomyID));
 } else if (rank == "order") {
- lineage.insert(0, "|o__");
- lineage.insert(4, getScientificName(taxonomyID));
+ lineage.insert(0, "|o__");
+ lineage.insert(4, getScientificName(taxonomyID));
 } else if (rank == "class") {
- lineage.insert(0, "|c__");
- lineage.insert(4, getScientificName(taxonomyID));
+ lineage.insert(0, "|c__");
+ lineage.insert(4, getScientificName(taxonomyID));
 } else if (rank == "phylum") {
- lineage.insert(0, "|p__");
- lineage.insert(4, getScientificName(taxonomyID));
+ lineage.insert(0, "|p__");
+ lineage.insert(4, getScientificName(taxonomyID));
 } else if (rank == "superkingdom") {
- lineage.insert(0, "k__");
- lineage.insert(3, getScientificName(taxonomyID));
+ lineage.insert(0, "k__");
+ lineage.insert(3, getScientificName(taxonomyID));
 }
 }
 taxonomyID = getParentTaxID(taxonomyID);
@@ -870,42 +870,42 @@ std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const {
 
 template
 TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID,
- const std::string& rank) const {
+ const std::string& rank) const {
 if (taxID == 0 || taxID == 1) return 0;
 auto entry_it = entries.find(taxID);
 // cerr << "getTaxIDAtRank(" << taxID << "," << rank << ")" << endl;
 if (entry_it != entries.end()) {
- const TaxonomyEntry* entry_ptr = &entry_it->second;
- while (entry_ptr != NULL
- && entry_ptr->parent != NULL) {
- // cerr << "Checking rank of " << entry->second.taxonomyID << ": " << entry->second.rank << endl;
- if (entry_ptr->rank == rank) {
- return entry_ptr->taxonomyID;
- } else {
- entry_ptr = entry_ptr->parent;
+ const TaxonomyEntry* entry_ptr = &entry_it->second;
+ while (entry_ptr != NULL
+ && entry_ptr->parent != NULL) {
+ // cerr << "Checking rank of " << entry->second.taxonomyID << ": " << entry->second.rank << endl;
+ if (entry_ptr->rank == rank) {
+ return entry_ptr->taxonomyID;
+ } else {
+ entry_ptr = entry_ptr->parent;
+ }
 }
 }
- }
 return 0;
 }
 
 template
 void TaxonomyDB::setGenomeSize(const TAXID taxid, const uint64_t genomeSize) {
- auto it = entries.find(taxid);
- if (it == entries.end()) {
- cerr << "No taxonomy entry for " << taxid << "!!" << endl;
- return;
- }
- TaxonomyEntry* tax = &it->second;
- tax->genomeSize += genomeSize;
-
- while (tax->parent != NULL) {
- tax = tax->parent;
- //std::cerr << "setting genomeSizeOfChildren of parent" << std::endl;
- tax->genomeSizeOfChildren += genomeSize;
- }
+ auto it = entries.find(taxid);
+ if (it == entries.end()) {
+ cerr << "No taxonomy entry for " << taxid << "!!" 
<< endl; + return; + } + TaxonomyEntry* tax = &it->second; + tax->genomeSize += genomeSize; + + while (tax->parent != NULL) { + tax = tax->parent; + //std::cerr << "setting genomeSizeOfChildren of parent" << std::endl; + tax->genomeSizeOfChildren += genomeSize; + } } template @@ -927,31 +927,32 @@ void TaxonomyDB::readGenomeSizes(string file) { } /* -template -void TaxonomyDB::setReadCounts(const unordered_map& readCounts) { - for (auto& elem : readCounts) { - addReadCount(elem.first, elem.second); - } + template + void TaxonomyDB::setReadCounts(const unordered_map& readCounts) { + for (auto& elem : readCounts) { + addReadCount(elem.first, elem.second); + } - for (auto& tax : entries) { - std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); - } -} -*/ + for (auto& tax : entries) { + std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); + } + } + */ template - TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, - std::unordered_map readCounts, +TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, + std::unordered_map readCounts, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _readCounts(readCounts), _show_zeros(show_zeros) { - for (auto it = _readCounts.begin(); it != _readCounts.end(); ++it) { - TaxonomyEntry* tax = &taxdb.entries.at(it->first); - while (tax != NULL) { - _readCountsIncludingChildren[tax->taxonomyID] += it->second; - tax = tax->parent; - } - } + for (auto it = _readCounts.begin(); it != _readCounts.end(); ++it) { + TaxonomyEntry* tax = &taxdb.entries.at(it->first); + while (tax != NULL) { + _readCountsIncludingChildren[tax->taxonomyID] += it->second; + tax = tax->parent; + } + } + _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_KMERS_CLADE, REPORTCOLS::NUM_UNIQUE_KMERS_CLADE, @@ -962,94 +963,75 @@ template template void TaxReport::setReportCols(std::vector names) { - _report_cols.clear(); - for (size_t i = 0; i< names.size(); ++i) { - auto& s = names[i]; - auto it = report_col_name_map.find(s); - if (it == report_col_name_map.end()) { - throw std::runtime_error(s + " is not a valid report column name"); - } - _report_cols.push_back(it->second); - } - _report_col_names = names; + _report_cols.clear(); + for (size_t i = 0; i< names.size(); ++i) { + auto& s = names[i]; + auto it = report_col_name_map.find(s); + if (it == report_col_name_map.end()) { + throw std::runtime_error(s + " is not a valid report column name"); + } + _report_cols.push_back(it->second); + } + _report_col_names = names; } template void TaxReport::printReport(std::string format, std::string rank) { - uint64_t _total_n_reads = reads(_readCountsIncludingChildren[0]) + reads(_readCountsIncludingChildren[1]); - if (_total_n_reads == 0) { - std::cerr << "total number of reads is zero - not creating a report!" << endl; - return; - } - if (_report_cols.size() == _report_col_names.size()) { - // print header - bool first_one = true; - for (size_t i=0; i < _report_col_names.size(); ++i) { - const std::string& s = _report_col_names[i]; - if (first_one) { - first_one = false; - } else { - _reportOfb << '\t'; - } - _reportOfb << s; - } - _reportOfb << endl; - } + _total_n_reads = reads(_readCountsIncludingChildren[0]) + reads(_readCountsIncludingChildren[1]); + if (_total_n_reads == 0) { + std::cerr << "total number of reads is zero - not creating a report!" 
<< endl; + return; + } + if (_report_cols.size() == _report_col_names.size()) { + // print header + bool first_one = true; + for (size_t i=0; i < _report_col_names.size(); ++i) { + const std::string& s = _report_col_names[i]; + if (first_one) { + first_one = false; + } else { + _reportOfb << '\t'; + } + _reportOfb << s; + } + _reportOfb << endl; + } - if (format == "kraken") { - // A: print number of unidentified reads - printReport(_taxdb.entries.at(0),0u); - // B: print normal results - printReport(_taxdb.entries.at(1),0u); - // C: Print Unclassified stuff - auto it = _taxdb.entries.find(-1); - if (it != _taxdb.entries.end()) { - printReport(it->second,0u); - } - } else { - // print stuff at a certain level .. - //_uid_abundance; - //_taxinfo + if (format == "kraken") { + // A: print number of unidentified reads + printReport(_taxdb.entries.at(0),0u); + // B: print normal results + printReport(_taxdb.entries.at(1),0u); + // C: Print Unclassified stuff + auto it = _taxdb.entries.find(-1); + if (it != _taxdb.entries.end()) { + printReport(it->second,0u); + } + } else { + // print stuff at a certain level .. + //_uid_abundance; + //_taxinfo - } + } } -template -struct CompareReadCounts : std::binary_function { - CompareReadCounts(std::vector counts_) : counts(counts_) {} - - bool operator()(size_t a, size_t b) const { - if (counts[a]->n_reads == counts[b]->n_reads) { - return counts[a]->n_kmers < counts[b]->n_kmers; - } else { - return counts[a]->n_reads < counts[b]->n_reads; - } - } - - std::vector& counts; -}; - template void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { - if (_show_zeros || reads(_readCountsIncludingChildren[tax.taxonomyID]) > 0) { - printLine(tax, depth); - // TODO: Order children ... - - std::vector pos(tax.children.size()); - std::vector counts(tax.children.size()); - for (size_t i=0; i < tax.children.size(); ++i) { - pos[i] = i; - counts[i] = &_readCountsIncludingChildren[i]; - } - - std::sort(pos.begin(), pos.end(), CompareReadCounts(counts)); - - for (size_t i=0; i < tax.children.size(); ++i) { - auto child_it = tax.children[ pos[i] ]; - printReport(*child_it, depth+1); - } - } + if (_show_zeros || reads(_readCountsIncludingChildren[tax.taxonomyID]) > 0) { + printLine(tax, depth); + + // Sort children + std::vector pos(tax.children.size()); + for (size_t i=0; i < tax.children.size(); ++i) { pos[i] = i; } + std::sort(pos.begin(), pos.end(), + [&](size_t a, size_t b) { return _readCountsIncludingChildren[tax.children[b]->taxonomyID] < _readCountsIncludingChildren[tax.children[a]->taxonomyID] ;} ); + + for (size_t i=0; i < tax.children.size(); ++i) { + auto child_it = tax.children[ pos[i] ]; + printReport(*child_it, depth+1); + } + } } template @@ -1058,56 +1040,56 @@ void TaxReport::printLine(TaxonomyEntry& tax, unsigned long long unique_kmers_for_clade = _readCountsIncludingChildren[tax.taxonomyID].kmers.cardinality(); double genome_size = double(tax.genomeSize+tax.genomeSizeOfChildren); - for (size_t i = 0; i< _report_cols.size(); ++i) { - auto& col = _report_cols[i]; - switch (col) { - case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; - case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; - case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; - case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << setprecision(4) << 100.0*(reads(_readCountsIncludingChildren[tax.taxonomyID]))/_total_n_reads; break; - //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; - //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; - case REPORTCOLS::NUM_READS: _reportOfb << reads(_readCounts[tax.taxonomyID]); break; - case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(_readCountsIncludingChildren[tax.taxonomyID])); break; - case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << _readCounts[tax.taxonomyID].kmers.cardinality(); break; - case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << unique_kmers_for_clade; break; - case REPORTCOLS::NUM_KMERS: _reportOfb << _readCounts[tax.taxonomyID].n_kmers; break; - case REPORTCOLS::NUM_KMERS_CLADE: _reportOfb << _readCountsIncludingChildren[tax.taxonomyID].n_kmers; break; - case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break; - case REPORTCOLS::CLADE_KMER_COVERAGE: - if (genome_size == 0) { - _reportOfb << "NA"; - } else { - _reportOfb << setprecision(4) << (unique_kmers_for_clade / genome_size); - }; break; - case REPORTCOLS::CLADE_KMER_DUPLICITY: _reportOfb << setprecision(3) << ( double(_readCountsIncludingChildren[tax.taxonomyID].n_kmers) / unique_kmers_for_clade ); break; - case REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break; - //case REPORTCOLS::GENOME_SIZE: ; break; - //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; - //case REPORTCOLS::SUM_SCORE: ; break; - case REPORTCOLS::TAX_RANK: _reportOfb << tax.rank; break; - default: _reportOfb << "NA"; - } - if (&col == &_report_cols.back()) { - _reportOfb << '\n'; - } else { - _reportOfb << '\t'; - } - } + for (size_t i = 0; i< _report_cols.size(); ++i) { + auto& col = _report_cols[i]; + switch (col) { + case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; + case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; + case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break;
+ case REPORTCOLS::DEPTH: _reportOfb << depth; break;
+ case REPORTCOLS::PERCENTAGE: _reportOfb << setprecision(4) << 100.0*(reads(_readCountsIncludingChildren[tax.taxonomyID]))/_total_n_reads; break;
+ //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break;
+ //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break;
+ case REPORTCOLS::NUM_READS: _reportOfb << reads(_readCounts[tax.taxonomyID]); break;
+ case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(_readCountsIncludingChildren[tax.taxonomyID])); break;
+ case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << _readCounts[tax.taxonomyID].kmers.cardinality(); break;
+ case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << unique_kmers_for_clade; break;
+ case REPORTCOLS::NUM_KMERS: _reportOfb << _readCounts[tax.taxonomyID].n_kmers; break;
+ case REPORTCOLS::NUM_KMERS_CLADE: _reportOfb << _readCountsIncludingChildren[tax.taxonomyID].n_kmers; break;
+ case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break;
+ case REPORTCOLS::CLADE_KMER_COVERAGE:
+ if (genome_size == 0) {
+ _reportOfb << "NA";
+ } else {
+ _reportOfb << setprecision(4) << (unique_kmers_for_clade / genome_size);
+ } break;
+ case REPORTCOLS::CLADE_KMER_DUPLICITY: _reportOfb << setprecision(3) << ( double(_readCountsIncludingChildren[tax.taxonomyID].n_kmers) / unique_kmers_for_clade ); break;
+ case REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break;
+ //case REPORTCOLS::GENOME_SIZE: ; break;
+ //case REPORTCOLS::NUM_WEIGHTED_READS: ; break;
+ //case REPORTCOLS::SUM_SCORE: ; break;
+ case REPORTCOLS::TAX_RANK: _reportOfb << tax.rank; break;
+ default: _reportOfb << "NA";
+ }
+ if (&col == &_report_cols.back()) {
+ _reportOfb << '\n';
+ } else {
+ _reportOfb << '\t';
+ }
+ }
 }
 
 template
 inline V find_or_use_default(const std::unordered_map& my_map, const K& query, const V default_value) {
- auto itr = my_map.find(query);
+ auto itr = my_map.find(query);
 
- if (itr == my_map.end()) {
- return default_value;
- }
+ if (itr == my_map.end()) {
+ return default_value;
+ }
 
- return itr->second;
+ return itr->second;
 }

From 10f5998a54bf4642e20b9d1f3790ff9e23afcd96 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Thu, 9 Nov 2017 23:23:35 -0500
Subject: [PATCH 103/105] Fixed HLL bug introduced by using vector for sparse representation

---
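Notes: the sparse representation stores encoded hashes in a sorted
std::vector. The old guard in insert_hash only inserted when
std::lower_bound returned end(), i.e. when the new hash was larger than
every hash already stored, so all other hashes were silently dropped and
the estimate was corrupted. A minimal standalone sketch of the corrected
insert-if-absent logic (mirroring the hyperloglogplus.h hunk below):

  #include <algorithm>
  #include <cstdint>
  #include <vector>

  // Insert val at its sorted position unless it is already present.
  inline void insert_hash(std::vector<uint32_t>& vec, uint32_t val) {
    auto it = std::lower_bound(vec.begin(), vec.end(), val);
    if (it == vec.end() || *it != val) // skip values already stored
      vec.insert(it, val);             // keeps vec sorted and duplicate-free
  }
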
 src/Makefile | 11 ++++-
 src/count_unique.cpp | 56 ++++++++++++++++++++++++++
 src/hyperloglogplus.h | 12 +++---
 src/krakendb.cpp | 1 +
 src/test_hll_on_db.cpp | 91 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 163 insertions(+), 8 deletions(-)
 create mode 100644 src/count_unique.cpp
 create mode 100644 src/test_hll_on_db.cpp

diff --git a/src/Makefile b/src/Makefile
index 37e068d..0c8f50e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -2,7 +2,8 @@ CXX = g++
 FOPENMP?=-fopenmp
 CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -I./gzstream -O2 -Wfatal-errors ${CPPFLAGS}
 #CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors
-PROGS = classify db_sort set_lcas db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping
+PROGS = classify db_sort set_lcas db_shrink build_taxdb read_uid_mapping count_unique
+TEST_PROGS = grade_classification dump_taxdb test_hll_on_db
 
 #LIBFLAGS = -L. -lz -lgzstream ${LDFLAGS}
 LIBFLAGS = -L. -lz ${LDFLAGS}
@@ -10,11 +11,13 @@ LIBFLAGS = -L. -lz ${LDFLAGS}
 
 all: $(PROGS)
 
+allall: $(PROGS) $(TEST_PROGS)
+
 install: $(PROGS)
 cp $(PROGS) $(KRAKEN_DIR)/
 
 clean:
- rm -rf $(PROGS) *.o *.dSYM
+ rm -rf $(PROGS) $(TEST_PROGS) *.o *.dSYM *.gch
 
 db_shrink: krakendb.o quickfile.o
 
@@ -26,6 +29,10 @@ grade_classification: taxdb.h report-cols.h
 
 read_uid_mapping: quickfile.o
 
+count_unique: hyperloglogplus.h
+
+test_hll_on_db: krakendb.o hyperloglogplus.h quickfile.o
+
 classify: classify.cpp krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o gzstream.o hyperloglogplus.h taxdb.h report-cols.h
 $(CXX) $(CXXFLAGS) -o classify $^ $(LIBFLAGS)
diff --git a/src/count_unique.cpp b/src/count_unique.cpp
new file mode 100644
index 0000000..3299a5a
--- /dev/null
+++ b/src/count_unique.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2017, Florian Breitwieser
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see .
+ */
+
+#include "hyperloglogplus.h"
+#include
+#include
+
+using namespace std;
+
+int main(int argc, char **argv) {
+ if (argc != 4) {
+ std::cerr << "USAGE:\n"
+ << "count_unique PRECISION SPARSE TEST_MODE\n"
+ << "\n"
+ << "Valid precision values: 10-18. SPARSE can be 0 or 1. If TEST_MODE is 1, then an HLL estimate is given with each number. 
\n" + << "Returns the cardinality of the input stream (has to be uint64_t)\n"; + return 1; + } + + size_t p = stoi(argv[1]); + bool sparse = bool(stoi(argv[2])); + bool test_mode = bool(stoi(argv[3])); + HyperLogLogPlusMinus hll(p, sparse); // unique k-mer count per taxon + uint64_t nr; + uint64_t ctr = 0; + if (test_mode) { + cout << "observed\testimated\n"; + } + while (cin >> nr) { + hll.add(nr); + if (test_mode) { + cout << ++ctr << '\t' << hll.cardinality() << '\n'; + } + } + if (!test_mode) { + cout << hll.cardinality() << endl; + } + +} + diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h index 495ef11..21f7d19 100644 --- a/src/hyperloglogplus.h +++ b/src/hyperloglogplus.h @@ -117,7 +117,7 @@ double alpha(uint32_t m) { /** * calculate the raw estimate as harmonic mean of the ranks in the register */ -double calculateRawEstimate(vector M) { +inline double calculateRawEstimate(const vector& M) { double inverseSum = 0.0; for (size_t i = 0; i < M.size(); ++i) { // TODO: pre-calculate the power calculation @@ -159,7 +159,7 @@ T extractBits(T value, uint8_t hi, uint8_t lo, bool shift_left = false) { inline void insert_hash(vector& vec, uint32_t val) { auto it = std::lower_bound( vec.begin(), vec.end(), val); // find proper position in descending order - if (it == vec.end()) { + if (it == vec.end() || *it != val) { vec.insert( it, val ); // insert before iterator it } } @@ -429,7 +429,7 @@ class HyperLogLogPlusMinus { * * @return cardinality estimate */ - uint64_t cardinality(bool verbose=true) { + uint64_t cardinality(bool verbose=true) const { if (sparse) { // if we are 'sparse', then use linear counting with increased precision pPrime return uint64_t(linearCounting(mPrime, mPrime-uint32_t(sparseList.size()))); @@ -502,7 +502,7 @@ class HyperLogLogPlusMinus { return rank_val; } - vector rawEstimateData(size_t p) { + vector rawEstimateData(size_t p) const { switch (p) { case 4: return vector(rawEstimateData_precision4,arr_len(rawEstimateData_precision4)); case 5: return vector(rawEstimateData_precision5,arr_len(rawEstimateData_precision5)); @@ -523,7 +523,7 @@ class HyperLogLogPlusMinus { return vector(); } - vector biasData(size_t p) { + vector biasData(size_t p) const { switch(p) { case 4: return vector(biasData_precision4,arr_len(biasData_precision4)); case 5: return vector(biasData_precision5,arr_len(biasData_precision5)); @@ -551,7 +551,7 @@ class HyperLogLogPlusMinus { * @param est * @return correction value for */ - double getEstimateBias(double estimate) { + double getEstimateBias(double estimate) const { vector rawEstimateTable = rawEstimateData(p); vector biasTable = biasData(p); diff --git a/src/krakendb.cpp b/src/krakendb.cpp index cae738f..d49a66f 100644 --- a/src/krakendb.cpp +++ b/src/krakendb.cpp @@ -1,4 +1,5 @@ /* + * Portions (c) 2017, Florian Breitwieser as part of KrakenHLL * Copyright 2013-2015, Derrick Wood * * This file is part of the Kraken taxonomic sequence classification system. diff --git a/src/test_hll_on_db.cpp b/src/test_hll_on_db.cpp new file mode 100644 index 0000000..365ad5e --- /dev/null +++ b/src/test_hll_on_db.cpp @@ -0,0 +1,91 @@ +/* + * Copyright 2017, Florian Breitwieser + * + * This file is part of the KrakenHLL taxonomic sequence classification system. + * + * KrakenHLL is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see .
+ */
+
+#include "hyperloglogplus.h"
+#include "kraken_headers.hpp"
+#include "quickfile.hpp"
+#include "krakendb.hpp"
+#include
+#include
+
+using namespace std;
+using namespace kraken;
+
+//using std::map to have the keys sorted
+void count_n_random_kmers(size_t nr, char* ptr, size_t pair_sz, size_t key_ct, size_t key_len) {
+}
+
+int main(int argc, char **argv) {
+ if (argc != 5) {
+ std::cerr << "USAGE:\n"
+ << "test_hll_on_db DATABASE PRECISION SPARSE NR_KMERS\n"
+ << "\n"
+ << "Valid precision values: 10-18. SPARSE can be 0 or 1. \n";
+ return 1;
+ }
+
+ char *db_name = argv[1];
+ QuickFile db_file;
+ db_file.open_file(db_name);
+ //db_file.load_file();
+ KrakenDB db(db_file.ptr());
+
+ size_t p = stoi(argv[2]);
+ bool sparse = bool(stoi(argv[3]));
+ size_t nr = stoi(argv[4]);
+
+ HyperLogLogPlusMinus hll(p, sparse); // unique k-mer count per taxon
+
+ char* ptr = db.get_ptr();
+ //char* pair_ptr = db.get_pair_ptr();
+ uint64_t key_len = db.get_key_len(); // how many bytes does each key occupy?
+ //uint64_t val_len = db.get_val_len(); // how many bytes does each value occupy?
+ uint64_t key_ct = db.get_key_ct(); // how many key/value pairs are there?
+ uint64_t pair_sz = db.pair_size(); // how many bytes does each pair occupy?
+
+ if (nr > key_ct) {
+ cerr << nr << " is greater than " << key_ct << "!!!" << endl;
+ exit(1);
+ }
+
+ if (ptr == NULL) {
+ std::cerr << "Kraken database pointer is NULL [pair_sz: " << pair_sz << ", key_ct: " << key_ct << "]" << std::endl;
+ exit(1);
+ }
+
+ double prob = double(nr) / double(key_ct);
+ std::random_device rd;
+ std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd()
+ std::uniform_real_distribution<> dis(0.0, 1.0);
+
+ size_t ctr = 0;
+ for (uint64_t i = 0; i < key_ct; i++) {
+ if (dis(gen) < prob) {
+ uint64_t* kmer = (uint64_t *) (ptr + pair_sz * i);
+ //uint32_t* taxon = (uint32_t *) (ptr + pair_sz * i + key_len);
+ //if (taxon == NULL) {
+ // std::cerr << "taxon is NULL (i is " << i << " and key_ct is " << key_ct << ")" << std::endl;
+ hll.add(*kmer);
+ ++ctr;
+ if (ctr < 10 || floor(log10(ctr)) == log10(ctr)) {
+ cout << ctr << '\t' << hll.cardinality() << '\n';
+ }
+ }
+ }
+}
+

From 1d37b547a179e2d7b965d0241216c235fdcf2547 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Fri, 10 Nov 2017 15:09:18 -0500
Subject: [PATCH 104/105] Fix Makefile

---
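Notes: PROGS now expands to PROGS1 plus TEST_PROGS, so a plain `make`
builds the test binaries as well, and test_hll_on_db tracks all
precisions from 10 to 18 in a single pass; its PRECISION argument is
still parsed but no longer selects the sketch. A hypothetical
invocation (the database path is only an example):

  ./test_hll_on_db DB/database.kdb 12 1 1000000 > hll_eval.tsv

This samples roughly one million k-mers from the database and, at each
checkpoint, prints one precision/true_count/estimate row per tracked
precision.
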
 src/Makefile | 3 ++-
 src/test_hll_on_db.cpp | 34 ++++++++++++++++++++++++++++++----
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 0c8f50e..b067a9c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -2,8 +2,9 @@ CXX = g++
 FOPENMP?=-fopenmp
 CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -I./gzstream -O2 -Wfatal-errors ${CPPFLAGS}
 #CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors
-PROGS = classify db_sort set_lcas db_shrink build_taxdb read_uid_mapping count_unique
+PROGS1 = classify db_sort set_lcas db_shrink build_taxdb read_uid_mapping count_unique
 TEST_PROGS = grade_classification dump_taxdb test_hll_on_db
+PROGS = $(PROGS1) $(TEST_PROGS)
 
 #LIBFLAGS = -L. -lz -lgzstream ${LDFLAGS}
 LIBFLAGS = -L. -lz ${LDFLAGS}
diff --git a/src/test_hll_on_db.cpp b/src/test_hll_on_db.cpp
index 365ad5e..0ded629 100644
--- a/src/test_hll_on_db.cpp
+++ b/src/test_hll_on_db.cpp
@@ -43,14 +43,23 @@ int main(int argc, char **argv) {
 char *db_name = argv[1];
 QuickFile db_file;
 db_file.open_file(db_name);
- //db_file.load_file();
+ db_file.load_file();
+ cerr << "Fully loaded\n";
 KrakenDB db(db_file.ptr());
 
 size_t p = stoi(argv[2]);
 bool sparse = bool(stoi(argv[3]));
 size_t nr = stoi(argv[4]);
 
- HyperLogLogPlusMinus hll(p, sparse); // unique k-mer count per taxon
+ HyperLogLogPlusMinus hll10(10, sparse); // one sketch per precision 10..18, all fed the same k-mers
+ HyperLogLogPlusMinus hll11(11, sparse);
+ HyperLogLogPlusMinus hll12(12, sparse);
+ HyperLogLogPlusMinus hll13(13, sparse);
+ HyperLogLogPlusMinus hll14(14, sparse);
+ HyperLogLogPlusMinus hll15(15, sparse);
+ HyperLogLogPlusMinus hll16(16, sparse);
+ HyperLogLogPlusMinus hll17(17, sparse);
+ HyperLogLogPlusMinus hll18(18, sparse);
 
 char* ptr = db.get_ptr();
 //char* pair_ptr = db.get_pair_ptr();
@@ -73,6 +82,7 @@ int main(int argc, char **argv) {
 std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd()
 std::uniform_real_distribution<> dis(0.0, 1.0);
 
+ cout << "precision\ttrue_count\testimate\n";
 size_t ctr = 0;
 for (uint64_t i = 0; i < key_ct; i++) {
 if (dis(gen) < prob) {
@@ -80,10 +90,26 @@ int main(int argc, char **argv) {
 //uint32_t* taxon = (uint32_t *) (ptr + pair_sz * i + key_len);
 //if (taxon == NULL) {
 // std::cerr << "taxon is NULL (i is " << i << " and key_ct is " << key_ct << ")" << std::endl;
- hll.add(*kmer);
+ hll10.add(*kmer);
+ hll11.add(*kmer);
+ hll12.add(*kmer);
+ hll13.add(*kmer);
+ hll14.add(*kmer);
+ hll15.add(*kmer);
+ hll16.add(*kmer);
+ hll17.add(*kmer);
+ hll18.add(*kmer);
 ++ctr;
 if (ctr < 10 || floor(log10(ctr)) == log10(ctr)) {
- cout << ctr << '\t' << hll.cardinality() << '\n';
+ cout << 10 << '\t' << ctr << '\t' << hll10.cardinality() << '\n';
+ cout << 11 << '\t' << ctr << '\t' << hll11.cardinality() << '\n';
+ cout << 12 << '\t' << ctr << '\t' << hll12.cardinality() << '\n';
+ cout << 13 << '\t' << ctr << '\t' << hll13.cardinality() << '\n';
+ cout << 14 << '\t' << ctr << '\t' << hll14.cardinality() << '\n';
+ cout << 15 << '\t' << ctr << '\t' << hll15.cardinality() << '\n';
+ cout << 16 << '\t' << ctr << '\t' << hll16.cardinality() << '\n';
+ cout << 17 << '\t' << ctr << '\t' << hll17.cardinality() << '\n';
+ cout << 18 << '\t' << ctr << '\t' << hll18.cardinality() << '\n';
 }
 }
 }

From a95bd8554653f3a37ced6405495b32a941fe8b55 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Fri, 10 Nov 2017 21:15:00 -0500
Subject: [PATCH 105/105] Allow setting custom precision

---
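Notes: precision trades memory for accuracy. Assuming the standard
HyperLogLog error bound, the relative standard error is roughly
1.04 / sqrt(2^p), and a dense sketch holds 2^p registers, e.g.:

  p = 10: 1.04/sqrt(1024)   ~ 3.3% error,   1024 registers
  p = 12: 1.04/sqrt(4096)   ~ 1.6% error,   4096 registers (the default)
  p = 18: 1.04/sqrt(262144) ~ 0.2% error, 262144 registers

Higher precision therefore mainly costs memory per tracked taxon.
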
 scripts/krakenhll | 6 +++++-
 src/classify.cpp | 27 ++++++++++++++++-----------
 src/readcounts.hpp | 3 ++-
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/scripts/krakenhll b/scripts/krakenhll
index 69e3b56..6db00b2 100755
--- a/scripts/krakenhll
+++ b/scripts/krakenhll
@@ -61,6 +61,7 @@ my $outfile;
 my $report_file;
 my $print_sequence = 0;
 my $uid_mapping = 0;
+my $hll_precision = 12;
 
 GetOptions(
 "help" => \&display_help,
@@ -78,6 +79,7 @@ GetOptions(
 "report-file=s" => \$report_file,
 "preload" => \$preload,
 "paired" => \$paired,
+ "precision=i" => \$hll_precision,
 "check-names" => \$check_names,
 "gzip-compressed" => \$gunzip,
 "bzip2-compressed" => \$bunzip2,
@@ -94,7 +96,7 @@ if (! @ARGV) {
 usage();
 }
 
-if (!defined $report_file) {
+if (!defined $report_file && !$preload) {
 print STDERR
 "Need to specify a report file with --report-file! See --help for more details.\n";
 exit 1;
@@ -157,6 +159,7 @@ push @flags, "-M" if $preload;
 push @flags, "-r", $report_file if defined $report_file;
 push @flags, "-a", $db_prefix[0]."/taxDB";
 push @flags, "-s" if $print_sequence;
+push @flags, "-p", $hll_precision;
 if ($uid_mapping) {
 my $uid_mapping_file = "$db_prefix[0]/uid_to_taxid.map";
 if (!-f $uid_mapping_file) {
@@ -245,6 +248,7 @@ Options:
 --fastq-input Input is FASTQ format
 --gzip-compressed Input is gzip compressed
 --bzip2-compressed Input is bzip2 compressed
+ --precision INT Precision for unique k-mer counting, between 10 and 18 (default: $hll_precision)
 --quick Quick operation (use first hit or hits)
 --min-hits NUM In quick op., number of hits req'd for classification
 NOTE: this is ignored if --quick is not specified
diff --git a/src/classify.cpp b/src/classify.cpp
index 049955e..1d4e9e8 100644
--- a/src/classify.cpp
+++ b/src/classify.cpp
@@ -152,15 +152,6 @@ int main(int argc, char **argv) {
 //}
 }
 
- if (!TaxDB_file.empty()) {
- // TODO: Define if the taxDB has read counts or not!!
- taxdb = TaxonomyDB(TaxDB_file, false);
- Parent_map = taxdb.getParentMap();
- } else {
- cerr << "TaxDB argument is required!" << endl;
- return 1;
- }
-
 if (Populate_memory)
 cerr << "Loading database(s)... " << endl;
 
@@ -190,6 +181,16 @@ int main(int argc, char **argv) {
 if (Populate_memory)
 cerr << "\ncomplete." << endl;
 
+
+ if (!TaxDB_file.empty()) {
+ // TODO: Define if the taxDB has read counts or not!!
+ taxdb = TaxonomyDB(TaxDB_file, false);
+ Parent_map = taxdb.getParentMap();
+ } else {
+ cerr << "TaxDB argument is required!" << endl;
+ return 1;
+ }
+
 if (Print_classified) {
 Classified_output = cout_or_file(Classified_output_file);
 }
@@ -612,7 +613,7 @@ void parse_command_line(int argc, char **argv) {
 if (argc > 1 && strcmp(argv[1], "-h") == 0)
 usage(0);
 
- while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:Ma:r:sI:")) != -1) {
+ while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:Ma:r:sI:p:")) != -1) {
 switch (opt) {
 case 'd' :
 DB_filenames.push_back(optarg);
@@ -631,6 +632,9 @@ void parse_command_line(int argc, char **argv) {
 omp_set_num_threads(Num_threads);
 #endif
 break;
+ case 'p' :
+ ReadCounts::HLL_PRECISION() = stoi(optarg);
+ break;
 case 'q' :
 Quick_mode = true;
 break;
@@ -693,7 +697,7 @@ void parse_command_line(int argc, char **argv) {
 cerr << "Missing mandatory option -i" << endl;
 usage();
 }
- if (optind == argc) {
+ if (optind == argc && !Populate_memory) {
 cerr << "No sequence data files specified" << endl;
 }
 }
@@ -708,6 +712,7 @@ void usage(int exit_code) {
 << " -r filename Output file for Kraken report output" << endl
 << " -a filename TaxDB" << endl
 << " -I filename UID to TaxId map" << endl
+ << " -p # Precision for unique k-mer counting, between 10 and 18" << endl
 << " -t # Number of threads" << endl
 << " -u # Thread work unit size (in bp)" << endl
 << " -q Quick operation" << endl
diff --git a/src/readcounts.hpp b/src/readcounts.hpp
index 74a52a6..c46d3e0 100644
--- a/src/readcounts.hpp
+++ b/src/readcounts.hpp
@@ -28,8 +28,12 @@ namespace kraken {
 uint64_t n_reads;
 uint64_t n_kmers;
 HyperLogLogPlusMinus kmers; // unique k-mer count per taxon
+ // Default 12, settable at runtime via -p. A function-local static keeps
+ // this header-only; a non-const static data member would also need an
+ // out-of-line definition in a .cpp file.
+ static size_t& HLL_PRECISION() { static size_t p = 12; return p; }
 
- ReadCounts() : n_reads(0), n_kmers(0) { }
- ReadCounts(size_t precision) : kmers(HyperLogLogPlusMinus(precision)) { }
+ ReadCounts() : n_reads(0), n_kmers(0), kmers(HyperLogLogPlusMinus(HLL_PRECISION())) { }
+ ReadCounts(size_t precision) : n_reads(0), n_kmers(0), kmers(HyperLogLogPlusMinus(precision)) { }