From 4487ec115903ef9e1e30cc775536dc296a18ba0b Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 10 Jul 2015 10:41:05 -0400 Subject: [PATCH 001/105] Added .gitignore files --- .gitignore | 1 + src/.gitignore | 6 ++++++ 2 files changed, 7 insertions(+) create mode 100644 .gitignore create mode 100644 src/.gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..500b4a0 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/install/ diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..f30e916 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,6 @@ +*.o +/db_sort +/classify +/db_shrink +/set_lcas +/make_seqid_to_taxid_map From a07b9a2917689c9145785cc6a64d33c009a41255 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 10 Jul 2015 10:55:34 -0400 Subject: [PATCH 002/105] Skip empty FASTA sequences instead of exiting the program --- src/seqreader.cpp | 4 +--- src/set_lcas.cpp | 8 ++++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/seqreader.cpp b/src/seqreader.cpp index 78c1442..6903c0d 100644 --- a/src/seqreader.cpp +++ b/src/seqreader.cpp @@ -72,9 +72,7 @@ namespace kraken { dna.seq = seq_ss.str(); if (dna.seq.empty()) { - warnx("malformed fasta file - zero-length record (%s)", dna.id.c_str()); - valid = false; - return dna; + valid = true; // set_lcas handles empty sequences } return dna; diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 46b2e09..c02307d 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -109,11 +109,19 @@ void process_single_file() { FastaReader reader(Multi_fasta_filename); DNASequence dna; uint32_t seqs_processed = 0; + uint32_t seqs_skipped = 0; + uint32_t seqs_no_taxid = 0; while (reader.is_valid()) { dna = reader.next_sequence(); if (! reader.is_valid()) break; + + if ( dna.seq.empty() ) { + ++seq_skipped; + continue; + } + uint32_t taxid = ID_to_taxon_map[dna.id]; if (taxid) { #pragma omp parallel for schedule(dynamic) From d6071dabe604f9faa98f29534dd7fa9caef9a379 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 10 Jul 2015 11:06:05 -0400 Subject: [PATCH 003/105] Added options -T to force taxid of the sequences, and -v for verbose output When -T is set, for each observed k-mer the taxid of the sequence is set - instead of the lowest common ancestor of the sequence taxid and the currently set taxid. This is useful for setting the taxid of contaminant sequences, which may also be observed in database genomes, to the contaminant taxid.
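The two update rules are easiest to see side by side. Below is a minimal standalone sketch, assuming a toy parent_map and illustrative taxids; the lca() here mirrors the path-walk added to src/krakenutil.cpp in a later patch, and is not the actual library code:

#include <cstdint>
#include <iostream>
#include <map>
#include <set>

// Walk a up to the root, collecting the path; then walk b up until it
// first lands on that path. Mirrors the logic in src/krakenutil.cpp.
uint32_t lca(std::map<uint32_t, uint32_t> &parent_map, uint32_t a, uint32_t b) {
  if (a == 0 || b == 0)
    return a ? a : b;
  std::set<uint32_t> a_path;               // path from a up to the root
  while (a > 0) { a_path.insert(a); a = parent_map[a]; }
  while (b > 0) {                          // first ancestor of b on that path
    if (a_path.count(b) > 0) return b;
    b = parent_map[b];
  }
  return 1;                                // fell off the tree: return the root
}

int main() {
  // Toy taxonomy: 1 is the root, 2 and 3 are children of 1, 4 and 5 of 2.
  std::map<uint32_t, uint32_t> parent_map{{2, 1}, {3, 1}, {4, 2}, {5, 2}};
  uint32_t stored_taxid = 4;               // value already in the DB for this k-mer
  uint32_t seq_taxid = 5;                  // taxid of the sequence being processed
  bool force_taxid = false;                // true when -T is given
  uint32_t new_val = force_taxid ? seq_taxid
                                 : lca(parent_map, seq_taxid, stored_taxid);
  std::cout << new_val << "\n";            // prints 2 (the LCA); with -T it would print 5
  return 0;
}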
-v gives more verbose output --- src/set_lcas.cpp | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index c02307d..6d12533 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -39,7 +39,10 @@ int Num_threads = 1; string DB_filename, Index_filename, Nodes_filename, File_to_taxon_map_filename, ID_to_taxon_map_filename, Multi_fasta_filename; +bool force_taxid = false; + bool Allow_extra_kmers = false; +bool verbose = false; bool Operate_in_RAM = false; bool One_FASTA_file = false; map Parent_map; @@ -52,11 +55,12 @@ int main(int argc, char **argv) { #endif parse_command_line(argc, argv); - Parent_map = build_parent_map(Nodes_filename); + + if (!force_taxid) { + Parent_map = build_parent_map(Nodes_filename); + } QuickFile db_file(DB_filename, "rw"); - Database = KrakenDB(db_file.ptr()); - KmerScanner::set_k(Database.get_k()); char *temp_ptr = NULL; size_t db_file_size = db_file.size(); @@ -67,8 +71,12 @@ int main(int argc, char **argv) { ifs.read(temp_ptr, db_file_size); ifs.close(); Database = KrakenDB(temp_ptr); + } else { + Database = KrakenDB(db_file.ptr()); } + KmerScanner::set_k(Database.get_k()); + QuickFile idx_file(Index_filename); KrakenDBIndex db_index(idx_file.ptr()); Database.set_index(&db_index); @@ -127,11 +135,18 @@ void process_single_file() { #pragma omp parallel for schedule(dynamic) for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN) set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1); + + ++seqs_processed; + } else { + if (verbose) + cerr << "Skipping sequence with header [" << dna.header_line << "] - no taxid" << endl; + + ++seqs_no_taxid } - cerr << "\rProcessed " << ++seqs_processed << " sequences"; + cerr << "\rProcessed " << seqs_processed << " sequences"; } - cerr << "\r "; - cerr << "\rFinished processing " << seqs_processed << " sequences" << endl; + cerr << "\r "; + cerr << "\rFinished processing " << seqs_processed << " sequences (skipping "<< skipped_seqs <<" empty sequences, and " << seqs_no_taxid<<" sequences with no taxonomy mapping)" << endl; } void process_files() { @@ -186,9 +201,13 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { if (! 
Allow_extra_kmers) errx(EX_DATAERR, "kmer found in sequence that is not in database"); else + cerr << "kmer found in sequence w/ taxid " << taxid << " that is not in database" << endl; continue; } - *val_ptr = lca(Parent_map, taxid, *val_ptr); + if (!force_taxid) + *val_ptr = lca(Parent_map, taxid, *val_ptr); + else + *val_ptr = taxid; } } @@ -198,7 +217,7 @@ void parse_command_line(int argc, char **argv) { if (argc > 1 && strcmp(argv[1], "-h") == 0) usage(0); - while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xM")) != -1) { + while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTv")) != -1) { switch (opt) { case 'f' : File_to_taxon_map_filename = optarg; @@ -226,9 +245,15 @@ void parse_command_line(int argc, char **argv) { omp_set_num_threads(Num_threads); #endif break; + case 'T' : + force_taxid = true; + break; case 'n' : Nodes_filename = optarg; break; + case 'v' : + verbose = true; + break; case 'x' : Allow_extra_kmers = true; break; @@ -267,6 +292,8 @@ void usage(int exit_code) { << " -f filename File to taxon map" << endl << " -F filename Multi-FASTA file with sequence data" << endl << " -m filename Sequence ID to taxon map" << endl + << " -T Do not set LCA as taxid for kmers, but the taxid of the sequence" << endl + << " -v Verbose output" << endl << " -h Print this message" << endl << endl << "-F and -m must be specified together. If -f is given, " From 14b74e2a1380d70582423f6d46c6493baf5d72b6 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 10 Jul 2015 11:06:37 -0400 Subject: [PATCH 004/105] added comments --- src/krakenutil.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index a00e6bb..0c424c4 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -53,11 +53,14 @@ namespace kraken { if (a == 0 || b == 0) return a ? a : b; + // create a path from a to the root set a_path; while (a > 0) { a_path.insert(a); a = parent_map[a]; } + + // search for b in the path from a to the root while (b > 0) { if (a_path.count(b) > 0) return b; From 567ae7bd2bc710f483614e5a34b604e6a411e3b5 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 17 Sep 2015 13:02:21 -0400 Subject: [PATCH 005/105] update --- scripts/build_kraken_db.sh | 9 +++++---- src/set_lcas.cpp | 10 +++++++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/scripts/build_kraken_db.sh b/scripts/build_kraken_db.sh index 7df4d0b..9d090e0 100755 --- a/scripts/build_kraken_db.sh +++ b/scripts/build_kraken_db.sh @@ -39,6 +39,7 @@ function report_time_elapsed() { start_time=$(date "+%s.%N") DATABASE_DIR="$KRAKEN_DB_NAME" +FIND_OPTS=-L if [ ! -d "$DATABASE_DIR" ] then @@ -72,11 +73,11 @@ else # Estimate hash size as 1.15 * chars in library FASTA files if [ -z "$KRAKEN_HASH_SIZE" ] then - KRAKEN_HASH_SIZE=$(find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -printf '%s\n' | perl -nle '$sum += $_; END {print int(1.15 * $sum)}') + KRAKEN_HASH_SIZE=$(find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -printf '%s\n' | perl -nle '$sum += $_; END {print int(1.15 * $sum)}') echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'" fi - find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ + find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ xargs -0 cat | \ jellyfish count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \ -o database /dev/fd/0 @@ -160,7 +161,7 @@ then else echo "Creating GI number to seqID map (step 4 of 6)..." 
start_time1=$(date "+%s.%N") - find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ + find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ xargs -0 cat | report_gi_numbers.pl > gi2seqid.map.tmp mv gi2seqid.map.tmp gi2seqid.map @@ -187,7 +188,7 @@ then else echo "Setting LCAs in database (step 6 of 6)..." start_time1=$(date "+%s.%N") - find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ + find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ xargs -0 cat | \ set_lcas $MEMFLAG -x -d database.kdb -i database.idx \ -n taxonomy/nodes.dmp -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 6d12533..1fc333e 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -65,12 +65,14 @@ int main(int argc, char **argv) { char *temp_ptr = NULL; size_t db_file_size = db_file.size(); if (Operate_in_RAM) { + cerr << "Getting " << DB_filename << " into memory ... "; db_file.close_file(); temp_ptr = new char[ db_file_size ]; ifstream ifs(DB_filename.c_str(), ifstream::binary); ifs.read(temp_ptr, db_file_size); ifs.close(); Database = KrakenDB(temp_ptr); + cerr << "done" << endl; } else { Database = KrakenDB(db_file.ptr()); } @@ -97,6 +99,7 @@ int main(int argc, char **argv) { } void process_single_file() { + cerr << "Processing multiple FASTA files" << endl; ifstream map_file(ID_to_taxon_map_filename.c_str()); if (map_file.rdstate() & ifstream::failbit) { err(EX_NOINPUT, "can't open %s", ID_to_taxon_map_filename.c_str()); @@ -126,7 +129,7 @@ void process_single_file() { break; if ( dna.seq.empty() ) { - ++seq_skipped; + ++seqs_skipped; continue; } @@ -141,15 +144,16 @@ void process_single_file() { if (verbose) cerr << "Skipping sequence with header [" << dna.header_line << "] - no taxid" << endl; - ++seqs_no_taxid + ++seqs_no_taxid; } cerr << "\rProcessed " << seqs_processed << " sequences"; } cerr << "\r "; - cerr << "\rFinished processing " << seqs_processed << " sequences (skipping "<< skipped_seqs <<" empty sequences, and " << seqs_no_taxid<<" sequences with no taxonomy mapping)" << endl; + cerr << "\rFinished processing " << seqs_processed << " sequences (skipping "<< seqs_skipped <<" empty sequences, and " << seqs_no_taxid<<" sequences with no taxonomy mapping)" << endl; } void process_files() { + cerr << "Processing files in " << File_to_taxon_map_filename.c_str() << endl; ifstream map_file(File_to_taxon_map_filename.c_str()); if (map_file.rdstate() & ifstream::failbit) { err(EX_NOINPUT, "can't open %s", File_to_taxon_map_filename.c_str()); From dfecb3138dcdb49d7e838ab98921a80f56371781 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 4 Dec 2015 10:32:09 -0500 Subject: [PATCH 006/105] Added '>kraken:taxid|' header parsing to set_lcas - makes it possible to run set_lcas on sequences that were not in the DB build originally --- src/set_lcas.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 1fc333e..e769132 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -133,7 +133,15 @@ void process_single_file() { continue; } - uint32_t taxid = ID_to_taxon_map[dna.id]; + // Get the taxid. 
If the header specifies kraken:taxid, use that + uint32_t taxid; + string prefix = "kraken:taxid|"; + if (dna.id.substr(0,prefix.size()) == prefix) { + taxid = std::atoi(dna.id.substr(prefix.size()).c_str()); + } else { + taxid = ID_to_taxon_map[dna.id]; + } + if (taxid) { #pragma omp parallel for schedule(dynamic) for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN) From 9fc61e36ec05539cd6d22fad17d714dccb083de2 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 20 Dec 2015 12:53:59 -0500 Subject: [PATCH 007/105] Only report missing kmers when verbose --- src/set_lcas.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index e769132..a0d601a 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -210,11 +210,13 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { Database.canonical_representation(*kmer_ptr) ); if (val_ptr == NULL) { - if (! Allow_extra_kmers) + if (! Allow_extra_kmers) { errx(EX_DATAERR, "kmer found in sequence that is not in database"); - else + } + else if (verbose) { cerr << "kmer found in sequence w/ taxid " << taxid << " that is not in database" << endl; - continue; + } + continue; } if (!force_taxid) *val_ptr = lca(Parent_map, taxid, *val_ptr); From c71ffbc5a0c49f1b9f9425473710d46440c752e3 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 9 Dec 2016 16:11:25 -0500 Subject: [PATCH 008/105] Use 'find ... -exec cat' instead 'find .. -print0 | xargs -0 cat' --- scripts/build_kraken_db.sh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/build_kraken_db.sh b/scripts/build_kraken_db.sh index 9d090e0..d0b49a3 100755 --- a/scripts/build_kraken_db.sh +++ b/scripts/build_kraken_db.sh @@ -77,8 +77,7 @@ else echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'" fi - find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ - xargs -0 cat | \ + find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \ jellyfish count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \ -o database /dev/fd/0 @@ -114,9 +113,10 @@ else else echo "Reducing database size (step 2 of 6)..." max_kdb_size=$(echo "$KRAKEN_MAX_DB_SIZE*2^30 - $idx_size" | bc) + idx_size_gb=$(printf %.2f $(echo "$idx_size/2^30" | bc) ) if (( $(echo "$max_kdb_size < 0" | bc) == 1 )) then - echo "Maximum database size too small, aborting reduction." + echo "Maximum database size too small - index alone needs $idx_size_gb GB. Aborting reduction." exit 1 fi # Key ct is 8 byte int stored 48 bytes from start of file @@ -161,8 +161,8 @@ then else echo "Creating GI number to seqID map (step 4 of 6)..." start_time1=$(date "+%s.%N") - find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ - xargs -0 cat | report_gi_numbers.pl > gi2seqid.map.tmp + find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \ + report_gi_numbers.pl > gi2seqid.map.tmp mv gi2seqid.map.tmp gi2seqid.map echo "GI number to seqID map created. [$(report_time_elapsed $start_time1)]" @@ -188,8 +188,7 @@ then else echo "Setting LCAs in database (step 6 of 6)..." 
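Stepping back from the build script for a moment, the '>kraken:taxid|' convention introduced in patch 006 can be made concrete with a small standalone sketch (the main() driver and example IDs are illustrative, not set_lcas code; note that atoi stopping at the first non-digit is what lets a trailing '|sequence_name' be ignored):

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>

// Return the taxid embedded in a "kraken:taxid|NNN..." sequence ID,
// or 0 if the ID does not carry one (caller falls back to the seqid map).
uint32_t taxid_from_header(const std::string &id) {
  const std::string prefix = "kraken:taxid|";
  if (id.compare(0, prefix.size(), prefix) == 0)
    return std::atoi(id.substr(prefix.size()).c_str()); // atoi stops at the next '|'
  return 0;
}

int main() {
  std::cout << taxid_from_header("kraken:taxid|562|some_contig") << "\n"; // 562
  std::cout << taxid_from_header("gi|49175990|ref|NC_000913.2|") << "\n"; // 0
  return 0;
}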
start_time1=$(date "+%s.%N") - find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ - xargs -0 cat | \ + find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \ set_lcas $MEMFLAG -x -d database.kdb -i database.idx \ -n taxonomy/nodes.dmp -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 touch "lca.complete" From 8259c6af049d5f368ded746cd5f6e891f5d45897 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 9 Dec 2016 16:12:28 -0500 Subject: [PATCH 009/105] Allow multiple --db arguments --- scripts/kraken | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/scripts/kraken b/scripts/kraken index 57cc717..c81ed38 100755 --- a/scripts/kraken +++ b/scripts/kraken @@ -45,7 +45,7 @@ my $quick = 0; my $min_hits = 1; my $fasta_input = 0; my $fastq_input = 0; -my $db_prefix; +my @db_prefix; my $threads; my $preload = 0; my $gunzip = 0; @@ -56,11 +56,12 @@ my $only_classified_output = 0; my $unclassified_out; my $classified_out; my $outfile; +my $report_file; GetOptions( "help" => \&display_help, "version" => \&display_version, - "db=s" => \$db_prefix, + "db=s" => \@db_prefix, "threads=i" => \$threads, "fasta-input" => \$fasta_input, "fastq-input" => \$fastq_input, @@ -69,6 +70,7 @@ GetOptions( "unclassified-out=s" => \$unclassified_out, "classified-out=s" => \$classified_out, "output=s" => \$outfile, + "report-file=s" => \$report_file, "preload" => \$preload, "paired" => \$paired, "check-names" => \$check_names, @@ -85,23 +87,23 @@ if (! @ARGV) { print STDERR "Need to specify input filenames!\n"; usage(); } -eval { $db_prefix = krakenlib::find_db($db_prefix); }; + +eval { @db_prefix = map { krakenlib::find_db($_) } @db_prefix }; if ($@) { die "$PROG: $@"; } -my $taxonomy = "$db_prefix/taxonomy/nodes.dmp"; +my $taxonomy = $db_prefix[0]."/taxonomy/nodes.dmp"; if ($quick) { undef $taxonomy; # Skip loading nodes file, not needed in quick mode } -my $kdb_file = "$db_prefix/database.kdb"; -my $idx_file = "$db_prefix/database.idx"; -if (! -e $kdb_file) { - die "$PROG: $kdb_file does not exist!\n"; -} -if (! -e $idx_file) { - die "$PROG: $idx_file does not exist!\n"; + +my @kdb_files = map { "$_/database.kdb" } @db_prefix; +my @idx_files = map { "$_/database.idx" } @db_prefix; + +foreach my $file (@kdb_files,@idx_files) { + die "$PROG: $file does not exist!\n" if (! -e $file); } if ($min_hits > 1 && ! 
$quick) { @@ -133,8 +135,8 @@ if ($auto_detect) { # set flags for classifier my @flags; -push @flags, "-d", $kdb_file; -push @flags, "-i", $idx_file; +push @flags, map { ("-d", $_) } @kdb_files; +push @flags, map { ("-i", $_) } @idx_files; push @flags, "-t", $threads if $threads > 1; push @flags, "-n", $taxonomy if defined $taxonomy; push @flags, "-q" if $quick; @@ -193,6 +195,7 @@ if (@pipe_argv) { } } +print STDERR "$CLASSIFY, @flags, @ARGV\n"; exec $CLASSIFY, @flags, @ARGV; die "$PROG: exec error: $!\n"; From 7c678d699c3b52c7f120785bb4ec2c7c04740fc6 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 9 Feb 2017 11:12:52 -0500 Subject: [PATCH 010/105] Count unique k-mers for each taxid --- scripts/kraken-report | 18 +- scripts/read_merger.pl | 21 +- scripts/report_gi_numbers.pl | 8 +- src/Makefile | 6 +- src/assert_helpers.h | 283 +++++++++++++++ src/classify.cpp | 127 +++++-- src/get_kmers.cpp | 309 ++++++++++++++++ src/hyperloglogbias.h | 133 +++++++ src/hyperloglogplus.h | 623 ++++++++++++++++++++++++++++++++ src/make_seqid_to_taxid_map.cpp | 16 +- src/third_party/MurmurHash3.cpp | 335 +++++++++++++++++ src/third_party/MurmurHash3.h | 37 ++ 12 files changed, 1869 insertions(+), 47 deletions(-) create mode 100644 src/assert_helpers.h create mode 100644 src/get_kmers.cpp create mode 100644 src/hyperloglogbias.h create mode 100644 src/hyperloglogplus.h create mode 100644 src/third_party/MurmurHash3.cpp create mode 100644 src/third_party/MurmurHash3.h diff --git a/scripts/kraken-report b/scripts/kraken-report index 8351593..99cab1b 100755 --- a/scripts/kraken-report +++ b/scripts/kraken-report @@ -37,11 +37,13 @@ require "$KRAKEN_DIR/krakenlib.pm"; my $show_zeros = 0; my $db_prefix; +my $is_cnts_table = 0; GetOptions( "help" => \&display_help, "version" => \&display_version, "show-zeros" => \$show_zeros, + "cnts-table" => \$is_cnts_table, "db=s" => \$db_prefix, ); @@ -77,10 +79,18 @@ load_taxonomy($db_prefix); my %taxo_counts; my $seq_count = 0; $taxo_counts{0} = 0; -while (<>) { - my @fields = split; - $taxo_counts{$fields[2]}++; - $seq_count++; +if ($is_cnts_table) { + while (<>) { + my ($taxid,$count) = split; + $taxo_counts{$taxid} = $count; + $seq_count += $count; + } +} else { + while (<>) { + my (undef,$taxid) = split; + $taxo_counts{$taxid}++; + $seq_count++; + } } my $classified_count = $seq_count - $taxo_counts{0}; diff --git a/scripts/read_merger.pl b/scripts/read_merger.pl index 2d32477..6e97099 100755 --- a/scripts/read_merger.pl +++ b/scripts/read_merger.pl @@ -88,7 +88,12 @@ while (defined($seq1 = read_sequence($fh1))) { $seq2 = read_sequence($fh2); if (! defined $seq2) { - die "$PROG: mismatched sequence counts\n"; + print STDERR "$PROG: mismatched sequence counts - file 1 has more reads\n + Outputting the further reads unpaired\n"; + print_sequence($seq1); + while (defined($seq1 = read_sequence($fh1))) { + print_sequence($seq1); + } } if ($check_names && $seq1->{id} ne $seq2->{id}) { die "$PROG: mismatched mate pair names ('$seq1->{id}' & '$seq2->{id}')\n"; @@ -96,7 +101,13 @@ print_merged_sequence($seq1, $seq2); } if (defined($seq2 = read_sequence($fh2))) { - die "$PROG: mismatched sequence counts\n"; + print STDERR "$PROG: mismatched sequence counts - file 2 has more reads\n + Outputting the further reads unpaired\n"; + print_sequence($seq2); + while (defined($seq2 = read_sequence($fh2))) { + print_sequence($seq2); + } + } close $fh1; close $fh2; @@ -162,3 +173,9 @@ sub print_merged_sequence { print ">" . $seq1->{id} . "\n"; print $seq1->{seq} . "N" . 
$seq2->{seq} . "\n"; } + +sub print_sequence { + my ($seq1) = @_; + print ">" . $seq1->{id} . "\n"; + print $seq1->{seq} . "\n"; +} diff --git a/scripts/report_gi_numbers.pl b/scripts/report_gi_numbers.pl index ce6a0bc..88a24f0 100755 --- a/scripts/report_gi_numbers.pl +++ b/scripts/report_gi_numbers.pl @@ -19,11 +19,11 @@ # Reads multi-FASTA input and for each sequence ID reports a # tab-delimited line: -# +# # # or in the case of a sequence with Kraken taxid information: # -# TAXID +# TAXID # # Assumes all sequence IDs actually have GI numbers or Kraken # taxid information. @@ -38,12 +38,12 @@ next unless /^>(\S+)/; my $seq_id = $1; if ($seq_id =~ /(^|\|)kraken:taxid\|(\d+)/) { - print "TAXID\t$2\t$seq_id\n"; + print "TAXID\t$2\t$seq_id\t$_\n"; next; } if ($seq_id !~ /(^|\|)gi\|(\d+)/) { die "$PROG: sequence ID $seq_id lacks GI number, aborting.\n"; } - print "$2\t$seq_id\n"; + print "$2\t$seq_id\t$_\n"; } diff --git a/src/Makefile b/src/Makefile index 2f927f6..6e2c938 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,6 @@ CXX = g++ -CXXFLAGS = -Wall -fopenmp -O3 -PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink +CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 -fsyntax-only +PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink get_kmers .PHONY: all install clean @@ -18,6 +18,8 @@ db_sort: krakendb.o quickfile.o set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o +get_kmers: krakendb.o quickfile.o krakenutil.o seqreader.o + classify: krakendb.o quickfile.o krakenutil.o seqreader.o make_seqid_to_taxid_map: quickfile.o diff --git a/src/assert_helpers.h b/src/assert_helpers.h new file mode 100644 index 0000000..6a2fe97 --- /dev/null +++ b/src/assert_helpers.h @@ -0,0 +1,283 @@ +/* + * Copyright 2011, Ben Langmead + * + * This file is part of Bowtie 2. + * + * Bowtie 2 is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Bowtie 2 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Bowtie 2. If not, see . + */ + +#ifndef ASSERT_HELPERS_H_ +#define ASSERT_HELPERS_H_ + +#include +#include +#include +#include + +/** + * Assertion for release-enabled assertions + */ +class ReleaseAssertException : public std::runtime_error { +public: + ReleaseAssertException(const std::string& msg = "") : std::runtime_error(msg) {} +}; + +/** + * Macros for release-enabled assertions, and helper macros to make + * all assertion error messages more helpful. + */ +#ifndef NDEBUG +#define ASSERT_ONLY(...) __VA_ARGS__ +#else +#define ASSERT_ONLY(...) 
+#endif + +#define rt_assert(b) \ + if(!(b)) { \ + std::cerr << "rt_assert at " << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(); \ + } +#define rt_assert_msg(b,msg) \ + if(!(b)) { \ + std::cerr << msg << " at " << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(msg); \ + } + +#define rt_assert_eq(ex,ac) \ + if(!((ex) == (ac))) { \ + std::cerr << "rt_assert_eq: expected (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(); \ + } +#define rt_assert_eq_msg(ex,ac,msg) \ + if(!((ex) == (ac))) { \ + std::cerr << "rt_assert_eq: " << msg << ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(msg); \ + } + +#ifndef NDEBUG +#define assert_eq(ex,ac) \ + if(!((ex) == (ac))) { \ + std::cerr << "assert_eq: expected (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#define assert_eq_msg(ex,ac,msg) \ + if(!((ex) == (ac))) { \ + std::cerr << "assert_eq: " << msg << ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#else +#define assert_eq(ex,ac) +#define assert_eq_msg(ex,ac,msg) +#endif + +#define rt_assert_neq(ex,ac) \ + if(!((ex) != (ac))) { \ + std::cerr << "rt_assert_neq: expected not (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(); \ + } +#define rt_assert_neq_msg(ex,ac,msg) \ + if(!((ex) != (ac))) { \ + std::cerr << "rt_assert_neq: " << msg << ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(msg); \ + } + +#ifndef NDEBUG +#define assert_neq(ex,ac) \ + if(!((ex) != (ac))) { \ + std::cerr << "assert_neq: expected not (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#define assert_neq_msg(ex,ac,msg) \ + if(!((ex) != (ac))) { \ + std::cerr << "assert_neq: " << msg << ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#else +#define assert_neq(ex,ac) +#define assert_neq_msg(ex,ac,msg) +#endif + +#define rt_assert_gt(a,b) \ + if(!((a) > (b))) { \ + std::cerr << "rt_assert_gt: expected (" << (a) << ") > (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(); \ + } +#define rt_assert_gt_msg(a,b,msg) \ + if(!((a) > (b))) { \ + std::cerr << "rt_assert_gt: " << msg << ": (" << (a) << ") > (" << (b) << ")" << 
std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(msg); \ + } + +#ifndef NDEBUG +#define assert_gt(a,b) \ + if(!((a) > (b))) { \ + std::cerr << "assert_gt: expected (" << (a) << ") > (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#define assert_gt_msg(a,b,msg) \ + if(!((a) > (b))) { \ + std::cerr << "assert_gt: " << msg << ": (" << (a) << ") > (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#else +#define assert_gt(a,b) +#define assert_gt_msg(a,b,msg) +#endif + +#define rt_assert_geq(a,b) \ + if(!((a) >= (b))) { \ + std::cerr << "rt_assert_geq: expected (" << (a) << ") >= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(); \ + } +#define rt_assert_geq_msg(a,b,msg) \ + if(!((a) >= (b))) { \ + std::cerr << "rt_assert_geq: " << msg << ": (" << (a) << ") >= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(msg); \ + } + +#ifndef NDEBUG +#define assert_geq(a,b) \ + if(!((a) >= (b))) { \ + std::cerr << "assert_geq: expected (" << (a) << ") >= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#define assert_geq_msg(a,b,msg) \ + if(!((a) >= (b))) { \ + std::cerr << "assert_geq: " << msg << ": (" << (a) << ") >= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#else +#define assert_geq(a,b) +#define assert_geq_msg(a,b,msg) +#endif + +#define rt_assert_lt(a,b) \ + if(!(a < b)) { \ + std::cerr << "rt_assert_lt: expected (" << a << ") < (" << b << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(); \ + } +#define rt_assert_lt_msg(a,b,msg) \ + if(!(a < b)) { \ + std::cerr << "rt_assert_lt: " << msg << ": (" << a << ") < (" << b << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(msg); \ + } + +#ifndef NDEBUG +#define assert_lt(a,b) \ + if(!(a < b)) { \ + std::cerr << "assert_lt: expected (" << a << ") < (" << b << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#define assert_lt_msg(a,b,msg) \ + if(!(a < b)) { \ + std::cerr << "assert_lt: " << msg << ": (" << a << ") < (" << b << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#else +#define assert_lt(a,b) +#define assert_lt_msg(a,b,msg) +#endif + +#define rt_assert_leq(a,b) \ + if(!((a) <= (b))) { \ + std::cerr << "rt_assert_leq: expected (" << (a) << ") <= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(); \ + } +#define rt_assert_leq_msg(a,b,msg) \ + if(!((a) <= (b))) { \ + std::cerr << "rt_assert_leq: " << msg << ": (" << (a) << ") <= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + throw ReleaseAssertException(msg); \ + } + +#ifndef NDEBUG +#define assert_leq(a,b) \ + if(!((a) <= (b))) { \ + std::cerr << "assert_leq: expected (" << (a) << ") <= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#define assert_leq_msg(a,b,msg) \ + if(!((a) <= (b))) { \ + std::cerr << "assert_leq: " << msg << ": (" << (a) 
<< ") <= (" << (b) << ")" << std::endl; \ + std::cerr << __FILE__ << ":" << __LINE__ << std::endl; \ + assert(0); \ + } +#else +#define assert_leq(a,b) +#define assert_leq_msg(a,b,msg) +#endif + +#ifndef NDEBUG +#define assert_in(c, s) assert_in2(c, s, __FILE__, __LINE__) +static inline void assert_in2(char c, const char *str, const char *file, int line) { + const char *s = str; + while(*s != '\0') { + if(c == *s) return; + s++; + } + std::cerr << "assert_in: (" << c << ") not in (" << str << ")" << std::endl; + std::cerr << file << ":" << line << std::endl; + assert(0); +} +#else +#define assert_in(c, s) +#endif + +#ifndef NDEBUG +#define assert_range(b, e, v) assert_range_helper(b, e, v, __FILE__, __LINE__) +template +inline static void assert_range_helper(const T& begin, + const T& end, + const T& val, + const char *file, + int line) +{ + if(val < begin || val > end) { + std::cerr << "assert_range: (" << val << ") not in [" + << begin << ", " << end << "]" << std::endl; + std::cerr << file << ":" << line << std::endl; + assert(0); + } +} +#else +#define assert_range(b, e, v) +#endif + +// define a macro to indicate variables that are only required for asserts +// used to make production build happy, i.e. disable "warning: variable ‘x’ set but not used [-Wunused-but-set-variable]" +#define _unused(x) ((void)x) + +#endif /*ASSERT_HELPERS_H_*/ diff --git a/src/classify.cpp b/src/classify.cpp index 3fb9416..5909a85 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -22,6 +22,7 @@ #include "krakenutil.hpp" #include "quickfile.hpp" #include "seqreader.hpp" +#include "hyperloglogplus.h" const size_t DEF_WORK_UNIT_SIZE = 500000; @@ -37,8 +38,18 @@ string hitlist_string(vector &taxa, vector &ambig); set get_ancestry(uint32_t taxon); void report_stats(struct timeval time1, struct timeval time2); +struct ReadCounts { + uint32_t n_reads; + uint32_t n_kmers; + HyperLogLogPlusMinus kmers; // unique k-mer count per taxon +}; + +map taxon_counts; // stats per taxon + int Num_threads = 1; -string DB_filename, Index_filename, Nodes_filename; +vector DB_filenames; +vector Index_filenames; +string Nodes_filename; bool Quick_mode = false; bool Fastq_input = false; bool Print_classified = false; @@ -46,9 +57,10 @@ bool Print_unclassified = false; bool Print_kraken = true; bool Populate_memory = false; bool Only_classified_kraken_output = false; +bool Print_sequence = true; uint32_t Minimum_hit_count = 1; map Parent_map; -KrakenDB Database; +vector KrakenDatabases; string Classified_output_file, Unclassified_output_file, Kraken_output_file; ostream *Classified_output; ostream *Unclassified_output; @@ -59,34 +71,62 @@ uint64_t total_classified = 0; uint64_t total_sequences = 0; uint64_t total_bases = 0; +void loadKrakenDB(KrakenDB& database, string DB_filename, string Index_filename) { + QuickFile db_file; + db_file.open_file(DB_filename); + if (Populate_memory) { + db_file.load_file(); + } + database = KrakenDB(db_file.ptr()); + QuickFile idx_file; + idx_file.open_file(Index_filename); + if (Populate_memory) + idx_file.load_file(); + + KrakenDBIndex db_index(idx_file.ptr()); + database.set_index(&db_index); +} + int main(int argc, char **argv) { #ifdef _OPENMP omp_set_num_threads(1); #endif parse_command_line(argc, argv); - if (! Nodes_filename.empty()) + if (! Nodes_filename.empty()) { + cerr << "Building parent node map " << endl; Parent_map = build_parent_map(Nodes_filename); + } if (Populate_memory) - cerr << "Loading database... 
"; - - QuickFile db_file; - db_file.open_file(DB_filename); - if (Populate_memory) - db_file.load_file(); - Database = KrakenDB(db_file.ptr()); - KmerScanner::set_k(Database.get_k()); + cerr << "Loading database(s)... " << endl; + + // TODO: Check DB_filenames and Index_filesnames have the same length + for (size_t i=0; i < DB_filenames.size(); ++i) { + cerr << "\t " << DB_filenames[i] << endl; + static QuickFile db_file; + db_file.open_file(DB_filenames[i]); + if (Populate_memory) + db_file.load_file(); + static KrakenDB Database = KrakenDB(db_file.ptr()); + KmerScanner::set_k(Database.get_k()); + + static QuickFile idx_file; + idx_file.open_file(Index_filenames[i]); + if (Populate_memory) + idx_file.load_file(); + static KrakenDBIndex db_index(idx_file.ptr()); + Database.set_index(&db_index); + + + KrakenDatabases.push_back(&Database); + } - QuickFile idx_file; - idx_file.open_file(Index_filename); - if (Populate_memory) - idx_file.load_file(); - KrakenDBIndex db_index(idx_file.ptr()); - Database.set_index(&db_index); + // TODO: Check all databases have the same k + KmerScanner::set_k(KrakenDatabases[0]->get_k()); if (Populate_memory) - cerr << "complete." << endl; + cerr << "\ncomplete." << endl; if (Print_classified) { if (Classified_output_file == "-") @@ -147,6 +187,7 @@ void report_stats(struct timeval time1, struct timeval time2) { } void process_file(char *filename) { + cerr << "k: " << uint32_t(KrakenDatabases[0]->get_k()) << endl; string file_str(filename); DNASequenceReader *reader; DNASequence dna; @@ -199,9 +240,26 @@ void process_file(char *filename) { } } // end parallel section + // Write out report - print k-mers and read numbers + for (auto& elem : taxon_counts) { + //elem.first gives you the key (int) + //elem.second gives you the mapped element (vector) + cerr << elem.first << "\t" << elem.second.n_reads << "\t" << + elem.second.n_kmers << "\t" << elem.second.kmers.cardinality() << "\n"; + } + delete reader; } +uint32_t get_taxon_for_kmer(KrakenDB& database, uint64_t* kmer_ptr, uint64_t& current_bin_key, + int64_t& current_min_pos, int64_t& current_max_pos) { + uint32_t* val_ptr = database.kmer_query( + database.canonical_representation(*kmer_ptr), ¤t_bin_key, + ¤t_min_pos, ¤t_max_pos); + uint32_t taxon = val_ptr ? *val_ptr : 0; + return taxon; +} + void classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss) { vector taxa; @@ -211,11 +269,9 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, uint32_t taxon = 0; uint32_t hits = 0; // only maintained if in quick mode - uint64_t current_bin_key; - int64_t current_min_pos = 1; - int64_t current_max_pos = 0; + uint64_t current_bin_key; int64_t current_min_pos = 1; int64_t current_max_pos = 0; - if (dna.seq.size() >= Database.get_k()) { + if (dna.seq.size() >= KrakenDatabases[0]->get_k()) { KmerScanner scanner(dna.seq); while ((kmer_ptr = scanner.next_kmer()) != NULL) { taxon = 0; @@ -224,13 +280,15 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, } else { ambig_list.push_back(0); - uint32_t *val_ptr = Database.kmer_query( - Database.canonical_representation(*kmer_ptr), - ¤t_bin_key, - ¤t_min_pos, ¤t_max_pos - ); - taxon = val_ptr ? 
*val_ptr : 0; + + for (auto& db : KrakenDatabases) { + taxon = get_taxon_for_kmer(*db, kmer_ptr, current_bin_key, current_min_pos, current_max_pos); + if (taxon) break; + } + if (taxon) { + taxon_counts[taxon].kmers.add(*kmer_ptr); + ++taxon_counts[taxon].n_kmers; hit_counts[taxon]++; if (Quick_mode && ++hits >= Minimum_hit_count) break; @@ -249,6 +307,7 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, if (call) #pragma omp atomic total_classified++; + ++(taxon_counts[call].n_reads); if (Print_unclassified || Print_classified) { ostringstream *oss_ptr = call ? &coss : &uoss; @@ -290,6 +349,9 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, koss << hitlist_string(taxa, ambig_list); } + if (Print_sequence) + koss << "\t" << dna.seq; + koss << endl; } @@ -349,10 +411,10 @@ void parse_command_line(int argc, char **argv) { while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:M")) != -1) { switch (opt) { case 'd' : - DB_filename = optarg; + DB_filenames.push_back(optarg); break; case 'i' : - Index_filename = optarg; + Index_filenames.push_back(optarg); break; case 't' : sig = atoll(optarg); @@ -409,11 +471,11 @@ void parse_command_line(int argc, char **argv) { } } - if (DB_filename.empty()) { + if (DB_filenames.empty()) { cerr << "Missing mandatory option -d" << endl; usage(); } - if (Index_filename.empty()) { + if (Index_filenames.empty()) { cerr << "Missing mandatory option -i" << endl; usage(); } @@ -443,6 +505,7 @@ void usage(int exit_code) { << " -f Input is in FASTQ format" << endl << " -c Only include classified reads in output" << endl << " -M Preload database files" << endl + << " -s Print sequence in Kraken output" << endl << " -h Print this message" << endl << endl << "At least one FASTA or FASTQ file must be specified." << endl diff --git a/src/get_kmers.cpp b/src/get_kmers.cpp new file mode 100644 index 0000000..9288078 --- /dev/null +++ b/src/get_kmers.cpp @@ -0,0 +1,309 @@ +/* + * Copyright 2013-2015, Derrick Wood + * + * This file is part of the Kraken taxonomic sequence classification system. + * + * Kraken is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Kraken is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Kraken. If not, see . 
+ */ + +#include "kraken_headers.hpp" +#include "quickfile.hpp" +#include "krakendb.hpp" +#include "krakenutil.hpp" +#include "seqreader.hpp" +#include + +#define SKIP_LEN 50000 + +using namespace std; +using namespace kraken; + +void parse_command_line(int argc, char **argv); +void usage(int exit_code=EX_USAGE); +void process_files(); +void process_single_file(); +void process_file(string filename, uint32_t taxid); +void get_kmers(uint32_t taxid, string &seq, size_t start, size_t finish); + +int Num_threads = 1; +string DB_filename, Index_filename, Nodes_filename, + File_to_taxon_map_filename, + ID_to_taxon_map_filename, Multi_fasta_filename; +bool force_taxid = false; + +bool Allow_extra_kmers = false; +bool verbose = false; +bool Operate_in_RAM = false; +bool One_FASTA_file = false; +map Parent_map; +map ID_to_taxon_map; +set All_taxon_ids; +unordered_multimap Kmer_taxa_map; +map, uint32_t > TaxidPair_counts; +KrakenDB Database; + +int main(int argc, char **argv) { + #ifdef _OPENMP + omp_set_num_threads(1); + #endif + + parse_command_line(argc, argv); + + if (!force_taxid) { + Parent_map = build_parent_map(Nodes_filename); + } + + QuickFile db_file(DB_filename, "rw"); + + char *temp_ptr = NULL; + size_t db_file_size = db_file.size(); + if (Operate_in_RAM) { + cerr << "Getting " << DB_filename << " into memory ... "; + db_file.close_file(); + temp_ptr = new char[ db_file_size ]; + ifstream ifs(DB_filename.c_str(), ifstream::binary); + ifs.read(temp_ptr, db_file_size); + ifs.close(); + Database = KrakenDB(temp_ptr); + cerr << "done" << endl; + } else { + Database = KrakenDB(db_file.ptr()); + } + + KmerScanner::set_k(Database.get_k()); + + QuickFile idx_file(Index_filename); + KrakenDBIndex db_index(idx_file.ptr()); + Database.set_index(&db_index); + + if (One_FASTA_file) + process_single_file(); + else + process_files(); + + + + if (Operate_in_RAM) { + ofstream ofs(DB_filename.c_str(), ofstream::binary); + ofs.write(temp_ptr, db_file_size); + ofs.close(); + delete temp_ptr; + } + + return 0; +} + +void process_single_file() { + cerr << "Processing multiple FASTA files" << endl; + ifstream map_file(ID_to_taxon_map_filename.c_str()); + if (map_file.rdstate() & ifstream::failbit) { + err(EX_NOINPUT, "can't open %s", ID_to_taxon_map_filename.c_str()); + } + string line; + while (map_file.good()) { + getline(map_file, line); + if (line.empty()) + break; + string seq_id; + uint32_t taxid; + istringstream iss(line); + iss >> seq_id; + iss >> taxid; + ID_to_taxon_map[seq_id] = taxid; + } + + FastaReader reader(Multi_fasta_filename); + DNASequence dna; + uint32_t seqs_processed = 0; + uint32_t seqs_skipped = 0; + uint32_t seqs_no_taxid = 0; + + while (reader.is_valid()) { + dna = reader.next_sequence(); + if (! reader.is_valid()) + break; + + if ( dna.seq.empty() ) { + ++seqs_skipped; + continue; + } + + // Get the taxid. 
If the header specifies kraken:taxid, use that + uint32_t taxid; + string prefix = "kraken:taxid|"; + if (dna.id.substr(0,prefix.size()) == prefix) { + taxid = std::atoi(dna.id.substr(prefix.size()).c_str()); + } else { + taxid = ID_to_taxon_map[dna.id]; + } + + if (taxid) { + #pragma omp parallel for schedule(dynamic) + for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN) + get_kmers(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1); + + ++seqs_processed; + } else { + if (verbose) + cerr << "Skipping sequence with header [" << dna.header_line << "] - no taxid" << endl; + + ++seqs_no_taxid; + } + cerr << "\rProcessed " << seqs_processed << " sequences"; + } + cerr << "\r "; + cerr << "\rFinished processing " << seqs_processed << " sequences (skipping "<< seqs_skipped <<" empty sequences, and " << seqs_no_taxid<<" sequences with no taxonomy mapping)" << endl; +} + +void process_files() { + cerr << "Processing files in " << File_to_taxon_map_filename.c_str() << endl; + ifstream map_file(File_to_taxon_map_filename.c_str()); + if (map_file.rdstate() & ifstream::failbit) { + err(EX_NOINPUT, "can't open %s", File_to_taxon_map_filename.c_str()); + } + string line; + uint32_t seqs_processed = 0; + + while (map_file.good()) { + getline(map_file, line); + if (line.empty()) + break; + string filename; + uint32_t taxid; + istringstream iss(line); + iss >> filename; + iss >> taxid; + process_file(filename, taxid); + cerr << "\rProcessed " << ++seqs_processed << " sequences"; + } + cerr << "\r "; + cerr << "\rFinished processing " << seqs_processed << " sequences" << endl; +} + +void process_file(string filename, uint32_t taxid) { + FastaReader reader(filename); + DNASequence dna; + + // For the purposes of this program, we assume these files are + // single-fasta files. 
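+ // Note: each chunk handed to a thread below is extended by k-1 bases
+ // (finish = i + SKIP_LEN + Database.get_k() - 1), so k-mers spanning a
+ // chunk boundary are still scanned, and each k-mer start position falls
+ // in exactly one chunk.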
+ dna = reader.next_sequence(); + + #pragma omp parallel for schedule(dynamic) + for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN) + get_kmers(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1); +} + +void get_kmers(uint32_t taxid, string &seq, size_t start, size_t finish) { + + All_taxon_ids.insert(taxid); + KmerScanner scanner(seq, start, finish); + uint64_t *kmer_ptr; + + while ((kmer_ptr = scanner.next_kmer()) != NULL) { + if (scanner.ambig_kmer()) + continue; + + Kmer_taxa_map.insert({*kmer_ptr, taxid}); + } +} + +void parse_command_line(int argc, char **argv) { + int opt; + long long sig; + + if (argc > 1 && strcmp(argv[1], "-h") == 0) + usage(0); + while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTv")) != -1) { + switch (opt) { + case 'f' : + File_to_taxon_map_filename = optarg; + break; + case 'd' : + DB_filename = optarg; + break; + case 'i' : + Index_filename = optarg; + break; + case 'F' : + Multi_fasta_filename = optarg; + break; + case 'm' : + ID_to_taxon_map_filename = optarg; + break; + case 't' : + sig = atoll(optarg); + if (sig <= 0) + errx(EX_USAGE, "can't use nonpositive thread count"); + #ifdef _OPENMP + if (sig > omp_get_num_procs()) + errx(EX_USAGE, "thread count exceeds number of processors"); + Num_threads = sig; + omp_set_num_threads(Num_threads); + #endif + break; + case 'T' : + force_taxid = true; + break; + case 'n' : + Nodes_filename = optarg; + break; + case 'v' : + verbose = true; + break; + case 'x' : + Allow_extra_kmers = true; + break; + case 'M' : + Operate_in_RAM = true; + break; + default: + usage(); + break; + } + } + + if (DB_filename.empty() || Index_filename.empty() || + Nodes_filename.empty()) + usage(); + if (File_to_taxon_map_filename.empty() && + (Multi_fasta_filename.empty() || ID_to_taxon_map_filename.empty())) + usage(); + + if (! File_to_taxon_map_filename.empty()) + One_FASTA_file = false; + else + One_FASTA_file = true; +} + +void usage(int exit_code) { + cerr << "Usage: get_kmers [options]" << endl + << endl + << "Options: (*mandatory)" << endl + << "* -d filename Kraken DB filename" << endl + << "* -i filename Kraken DB index filename" << endl + << "* -n filename NCBI Taxonomy nodes file" << endl + << " -t # Number of threads" << endl + << " -M Copy DB to RAM during operation" << endl + << " -x K-mers not found in DB do not cause errors" << endl + << " -f filename File to taxon map" << endl + << " -F filename Multi-FASTA file with sequence data" << endl + << " -m filename Sequence ID to taxon map" << endl + << " -T Do not set LCA as taxid for kmers, but the taxid of the sequence" << endl + << " -v Verbose output" << endl + << " -h Print this message" << endl + << endl + << "-F and -m must be specified together. If -f is given, " + << "-F/-m are ignored." 
<< endl; + exit(exit_code); +} diff --git a/src/hyperloglogbias.h b/src/hyperloglogbias.h new file mode 100644 index 0000000..013bd5b --- /dev/null +++ b/src/hyperloglogbias.h @@ -0,0 +1,133 @@ +/* + * hyperloglogbias.h + * + * Created on: Apr 25, 2015 + * Author: fbreitwieser + */ + +#ifndef HYPERLOGLOGBIAS_H_ +#define HYPERLOGLOGBIAS_H_ + +const double rawEstimateData_precision4[] = { + 11, 11.717, 12.207, 12.7896, 13.2882, 13.8204, 14.3772, 14.9342, 15.5202, 16.161, 16.7722, 17.4636, 18.0396, 18.6766, 19.3566, 20.0454, 20.7936, 21.4856, 22.2666, 22.9946, 23.766, 24.4692, 25.3638, 26.0764, 26.7864, 27.7602, 28.4814, 29.433, 30.2926, 31.0664, 31.9996, 32.7956, 33.5366, 34.5894, 35.5738, 36.2698, 37.3682, 38.0544, 39.2342, 40.0108, 40.7966, 41.9298, 42.8704, 43.6358, 44.5194, 45.773, 46.6772, 47.6174, 48.4888, 49.3304, 50.2506, 51.4996, 52.3824, 53.3078, 54.3984, 55.5838, 56.6618, 57.2174, 58.3514, 59.0802, 60.1482, 61.0376, 62.3598, 62.8078, 63.9744, 64.914, 65.781, 67.1806, 68.0594, 68.8446, 69.7928, 70.8248, 71.8324, 72.8598, 73.6246, 74.7014, 75.393, 76.6708, 77.2394 +}; + +const double rawEstimateData_precision5[] = { + 23, 23.1194, 23.8208, 24.2318, 24.77, 25.2436, 25.7774, 26.2848, 26.8224, 27.3742, 27.9336, 28.503, 29.0494, 29.6292, 30.2124, 30.798, 31.367, 31.9728, 32.5944, 33.217, 33.8438, 34.3696, 35.0956, 35.7044, 36.324, 37.0668, 37.6698, 38.3644, 39.049, 39.6918, 40.4146, 41.082, 41.687, 42.5398, 43.2462, 43.857, 44.6606, 45.4168, 46.1248, 46.9222, 47.6804, 48.447, 49.3454, 49.9594, 50.7636, 51.5776, 52.331, 53.19, 53.9676, 54.7564, 55.5314, 56.4442, 57.3708, 57.9774, 58.9624, 59.8796, 60.755, 61.472, 62.2076, 63.1024, 63.8908, 64.7338, 65.7728, 66.629, 67.413, 68.3266, 69.1524, 70.2642, 71.1806, 72.0566, 72.9192, 73.7598, 74.3516, 75.5802, 76.4386, 77.4916, 78.1524, 79.1892, 79.8414, 80.8798, 81.8376, 82.4698, 83.7656, 84.331, 85.5914, 86.6012, 87.7016, 88.5582, 89.3394, 90.3544, 91.4912, 92.308, 93.3552, 93.9746, 95.2052, 95.727, 97.1322, 98.3944, 98.7588, 100.242, 101.1914, 102.2538, 102.8776, 103.6292, 105.1932, 105.9152, 107.0868, 107.6728, 108.7144, 110.3114, 110.8716, 111.245, 112.7908, 113.7064, 114.636, 115.7464, 116.1788, 117.7464, 118.4896, 119.6166, 120.5082, 121.7798, 122.9028, 123.4426, 124.8854, 125.705, 126.4652, 128.3464, 128.3462, 130.0398, 131.0342, 131.0042, 132.4766, 133.511, 134.7252, 135.425, 136.5172, 138.0572, 138.6694, 139.3712, 140.8598, 141.4594, 142.554, 143.4006, 144.7374, 146.1634, 146.8994, 147.605, 147.9304, 149.1636, 150.2468, 151.5876, 152.2096, 153.7032, 154.7146, 155.807, 156.9228, 157.0372, 158.5852 +}; + +const double rawEstimateData_precision6[] = { + 46, 46.1902, 47.271, 47.8358, 48.8142, 49.2854, 50.317, 51.354, 51.8924, 52.9436, 53.4596, 54.5262, 55.6248, 56.1574, 57.2822, 57.837, 58.9636, 60.074, 60.7042, 61.7976, 62.4772, 63.6564, 64.7942, 65.5004, 66.686, 67.291, 68.5672, 69.8556, 70.4982, 71.8204, 72.4252, 73.7744, 75.0786, 75.8344, 77.0294, 77.8098, 79.0794, 80.5732, 81.1878, 82.5648, 83.2902, 84.6784, 85.3352, 86.8946, 88.3712, 89.0852, 90.499, 91.2686, 92.6844, 94.2234, 94.9732, 96.3356, 97.2286, 98.7262, 100.3284, 101.1048, 102.5962, 103.3562, 105.1272, 106.4184, 107.4974, 109.0822, 109.856, 111.48, 113.2834, 114.0208, 115.637, 116.5174, 118.0576, 119.7476, 120.427, 122.1326, 123.2372, 125.2788, 126.6776, 127.7926, 129.1952, 129.9564, 131.6454, 133.87, 134.5428, 136.2, 137.0294, 138.6278, 139.6782, 141.792, 143.3516, 144.2832, 146.0394, 147.0748, 148.4912, 150.849, 151.696, 153.5404, 154.073, 156.3714, 157.7216, 158.7328, 
160.4208, 161.4184, 163.9424, 165.2772, 166.411, 168.1308, 168.769, 170.9258, 172.6828, 173.7502, 175.706, 176.3886, 179.0186, 180.4518, 181.927, 183.4172, 184.4114, 186.033, 188.5124, 189.5564, 191.6008, 192.4172, 193.8044, 194.997, 197.4548, 198.8948, 200.2346, 202.3086, 203.1548, 204.8842, 206.6508, 206.6772, 209.7254, 210.4752, 212.7228, 214.6614, 215.1676, 217.793, 218.0006, 219.9052, 221.66, 223.5588, 225.1636, 225.6882, 227.7126, 229.4502, 231.1978, 232.9756, 233.1654, 236.727, 238.1974, 237.7474, 241.1346, 242.3048, 244.1948, 245.3134, 246.879, 249.1204, 249.853, 252.6792, 253.857, 254.4486, 257.2362, 257.9534, 260.0286, 260.5632, 262.663, 264.723, 265.7566, 267.2566, 267.1624, 270.62, 272.8216, 273.2166, 275.2056, 276.2202, 278.3726, 280.3344, 281.9284, 283.9728, 284.1924, 286.4872, 287.587, 289.807, 291.1206, 292.769, 294.8708, 296.665, 297.1182, 299.4012, 300.6352, 302.1354, 304.1756, 306.1606, 307.3462, 308.5214, 309.4134, 310.8352, 313.9684, 315.837, 316.7796, 318.9858 +}; + +const double rawEstimateData_precision7[] = { + 92, 93.4934, 94.9758, 96.4574, 97.9718, 99.4954, 101.5302, 103.0756, 104.6374, 106.1782, 107.7888, 109.9522, 111.592, 113.2532, 114.9086, 116.5938, 118.9474, 120.6796, 122.4394, 124.2176, 125.9768, 128.4214, 130.2528, 132.0102, 133.8658, 135.7278, 138.3044, 140.1316, 142.093, 144.0032, 145.9092, 148.6306, 150.5294, 152.5756, 154.6508, 156.662, 159.552, 161.3724, 163.617, 165.5754, 167.7872, 169.8444, 172.7988, 174.8606, 177.2118, 179.3566, 181.4476, 184.5882, 186.6816, 189.0824, 191.0258, 193.6048, 196.4436, 198.7274, 200.957, 203.147, 205.4364, 208.7592, 211.3386, 213.781, 215.8028, 218.656, 221.6544, 223.996, 226.4718, 229.1544, 231.6098, 234.5956, 237.0616, 239.5758, 242.4878, 244.5244, 248.2146, 250.724, 252.8722, 255.5198, 258.0414, 261.941, 264.9048, 266.87, 269.4304, 272.028, 274.4708, 278.37, 281.0624, 283.4668, 286.5532, 289.4352, 293.2564, 295.2744, 298.2118, 300.7472, 304.1456, 307.2928, 309.7504, 312.5528, 315.979, 318.2102, 322.1834, 324.3494, 327.325, 330.6614, 332.903, 337.2544, 339.9042, 343.215, 345.2864, 348.0814, 352.6764, 355.301, 357.139, 360.658, 363.1732, 366.5902, 369.9538, 373.0828, 375.922, 378.9902, 382.7328, 386.4538, 388.1136, 391.2234, 394.0878, 396.708, 401.1556, 404.1852, 406.6372, 409.6822, 412.7796, 416.6078, 418.4916, 422.131, 424.5376, 428.1988, 432.211, 434.4502, 438.5282, 440.912, 444.0448, 447.7432, 450.8524, 453.7988, 456.7858, 458.8868, 463.9886, 466.5064, 468.9124, 472.6616, 475.4682, 478.582, 481.304, 485.2738, 488.6894, 490.329, 496.106, 497.6908, 501.1374, 504.5322, 506.8848, 510.3324, 513.4512, 516.179, 520.4412, 522.6066, 526.167, 528.7794, 533.379, 536.067, 538.46, 542.9116, 545.692, 547.9546, 552.493, 555.2722, 557.335, 562.449, 564.2014, 569.0738, 571.0974, 574.8564, 578.2996, 581.409, 583.9704, 585.8098, 589.6528, 594.5998, 595.958, 600.068, 603.3278, 608.2016, 609.9632, 612.864, 615.43, 620.7794, 621.272, 625.8644, 629.206, 633.219, 634.5154, 638.6102 +}; + +const double rawEstimateData_precision8[] = { + 184.2152, 187.2454, 190.2096, 193.6652, 196.6312, 199.6822, 203.249, 206.3296, 210.0038, 213.2074, 216.4612, 220.27, 223.5178, 227.4412, 230.8032, 234.1634, 238.1688, 241.6074, 245.6946, 249.2664, 252.8228, 257.0432, 260.6824, 264.9464, 268.6268, 272.2626, 276.8376, 280.4034, 284.8956, 288.8522, 292.7638, 297.3552, 301.3556, 305.7526, 309.9292, 313.8954, 318.8198, 322.7668, 327.298, 331.6688, 335.9466, 340.9746, 345.1672, 349.3474, 354.3028, 358.8912, 364.114, 368.4646, 372.9744, 378.4092, 382.6022, 
387.843, 392.5684, 397.1652, 402.5426, 407.4152, 412.5388, 417.3592, 422.1366, 427.486, 432.3918, 437.5076, 442.509, 447.3834, 453.3498, 458.0668, 463.7346, 469.1228, 473.4528, 479.7, 484.644, 491.0518, 495.5774, 500.9068, 506.432, 512.1666, 517.434, 522.6644, 527.4894, 533.6312, 538.3804, 544.292, 550.5496, 556.0234, 562.8206, 566.6146, 572.4188, 579.117, 583.6762, 590.6576, 595.7864, 601.509, 607.5334, 612.9204, 619.772, 624.2924, 630.8654, 636.1836, 642.745, 649.1316, 655.0386, 660.0136, 666.6342, 671.6196, 678.1866, 684.4282, 689.3324, 695.4794, 702.5038, 708.129, 713.528, 720.3204, 726.463, 732.7928, 739.123, 744.7418, 751.2192, 756.5102, 762.6066, 769.0184, 775.2224, 781.4014, 787.7618, 794.1436, 798.6506, 805.6378, 811.766, 819.7514, 824.5776, 828.7322, 837.8048, 843.6302, 849.9336, 854.4798, 861.3388, 867.9894, 873.8196, 880.3136, 886.2308, 892.4588, 899.0816, 905.4076, 912.0064, 917.3878, 923.619, 929.998, 937.3482, 943.9506, 947.991, 955.1144, 962.203, 968.8222, 975.7324, 981.7826, 988.7666, 994.2648, 1000.3128, 1007.4082, 1013.7536, 1020.3376, 1026.7156, 1031.7478, 1037.4292, 1045.393, 1051.2278, 1058.3434, 1062.8726, 1071.884, 1076.806, 1082.9176, 1089.1678, 1095.5032, 1102.525, 1107.2264, 1115.315, 1120.93, 1127.252, 1134.1496, 1139.0408, 1147.5448, 1153.3296, 1158.1974, 1166.5262, 1174.3328, 1175.657, 1184.4222, 1190.9172, 1197.1292, 1204.4606, 1210.4578, 1218.8728, 1225.3336, 1226.6592, 1236.5768, 1241.363, 1249.4074, 1254.6566, 1260.8014, 1266.5454, 1274.5192 +}; + +const double rawEstimateData_precision9[] = { + 369, 374.8294, 381.2452, 387.6698, 394.1464, 400.2024, 406.8782, 413.6598, 420.462, 427.2826, 433.7102, 440.7416, 447.9366, 455.1046, 462.285, 469.0668, 476.306, 483.8448, 491.301, 498.9886, 506.2422, 513.8138, 521.7074, 529.7428, 537.8402, 545.1664, 553.3534, 561.594, 569.6886, 577.7876, 585.65, 594.228, 602.8036, 611.1666, 620.0818, 628.0824, 637.2574, 646.302, 655.1644, 664.0056, 672.3802, 681.7192, 690.5234, 700.2084, 708.831, 718.485, 728.1112, 737.4764, 746.76, 756.3368, 766.5538, 775.5058, 785.2646, 795.5902, 804.3818, 814.8998, 824.9532, 835.2062, 845.2798, 854.4728, 864.9582, 875.3292, 886.171, 896.781, 906.5716, 916.7048, 927.5322, 937.875, 949.3972, 958.3464, 969.7274, 980.2834, 992.1444, 1003.4264, 1013.0166, 1024.018, 1035.0438, 1046.34, 1057.6856, 1068.9836, 1079.0312, 1091.677, 1102.3188, 1113.4846, 1124.4424, 1135.739, 1147.1488, 1158.9202, 1169.406, 1181.5342, 1193.2834, 1203.8954, 1216.3286, 1226.2146, 1239.6684, 1251.9946, 1262.123, 1275.4338, 1285.7378, 1296.076, 1308.9692, 1320.4964, 1333.0998, 1343.9864, 1357.7754, 1368.3208, 1380.4838, 1392.7388, 1406.0758, 1416.9098, 1428.9728, 1440.9228, 1453.9292, 1462.617, 1476.05, 1490.2996, 1500.6128, 1513.7392, 1524.5174, 1536.6322, 1548.2584, 1562.3766, 1572.423, 1587.1232, 1596.5164, 1610.5938, 1622.5972, 1633.1222, 1647.7674, 1658.5044, 1671.57, 1683.7044, 1695.4142, 1708.7102, 1720.6094, 1732.6522, 1747.841, 1756.4072, 1769.9786, 1782.3276, 1797.5216, 1808.3186, 1819.0694, 1834.354, 1844.575, 1856.2808, 1871.1288, 1880.7852, 1893.9622, 1906.3418, 1920.6548, 1932.9302, 1945.8584, 1955.473, 1968.8248, 1980.6446, 1995.9598, 2008.349, 2019.8556, 2033.0334, 2044.0206, 2059.3956, 2069.9174, 2082.6084, 2093.7036, 2106.6108, 2118.9124, 2132.301, 2144.7628, 2159.8422, 2171.0212, 2183.101, 2193.5112, 2208.052, 2221.3194, 2233.3282, 2247.295, 2257.7222, 2273.342, 2286.5638, 2299.6786, 2310.8114, 2322.3312, 2335.516, 2349.874, 2363.5968, 2373.865, 2387.1918, 2401.8328, 2414.8496, 2424.544, 2436.7592, 
2447.1682, 2464.1958, 2474.3438, 2489.0006, 2497.4526, 2513.6586, 2527.19, 2540.7028, 2553.768 +}; + +const double rawEstimateData_precision10[] = { + 738.1256, 750.4234, 763.1064, 775.4732, 788.4636, 801.0644, 814.488, 827.9654, 841.0832, 854.7864, 868.1992, 882.2176, 896.5228, 910.1716, 924.7752, 938.899, 953.6126, 968.6492, 982.9474, 998.5214, 1013.1064, 1028.6364, 1044.2468, 1059.4588, 1075.3832, 1091.0584, 1106.8606, 1123.3868, 1139.5062, 1156.1862, 1172.463, 1189.339, 1206.1936, 1223.1292, 1240.1854, 1257.2908, 1275.3324, 1292.8518, 1310.5204, 1328.4854, 1345.9318, 1364.552, 1381.4658, 1400.4256, 1419.849, 1438.152, 1456.8956, 1474.8792, 1494.118, 1513.62, 1532.5132, 1551.9322, 1570.7726, 1590.6086, 1610.5332, 1630.5918, 1650.4294, 1669.7662, 1690.4106, 1710.7338, 1730.9012, 1750.4486, 1770.1556, 1791.6338, 1812.7312, 1833.6264, 1853.9526, 1874.8742, 1896.8326, 1918.1966, 1939.5594, 1961.07, 1983.037, 2003.1804, 2026.071, 2047.4884, 2070.0848, 2091.2944, 2114.333, 2135.9626, 2158.2902, 2181.0814, 2202.0334, 2224.4832, 2246.39, 2269.7202, 2292.1714, 2314.2358, 2338.9346, 2360.891, 2384.0264, 2408.3834, 2430.1544, 2454.8684, 2476.9896, 2501.4368, 2522.8702, 2548.0408, 2570.6738, 2593.5208, 2617.0158, 2640.2302, 2664.0962, 2687.4986, 2714.2588, 2735.3914, 2759.6244, 2781.8378, 2808.0072, 2830.6516, 2856.2454, 2877.2136, 2903.4546, 2926.785, 2951.2294, 2976.468, 3000.867, 3023.6508, 3049.91, 3073.5984, 3098.162, 3121.5564, 3146.2328, 3170.9484, 3195.5902, 3221.3346, 3242.7032, 3271.6112, 3296.5546, 3317.7376, 3345.072, 3369.9518, 3394.326, 3418.1818, 3444.6926, 3469.086, 3494.2754, 3517.8698, 3544.248, 3565.3768, 3588.7234, 3616.979, 3643.7504, 3668.6812, 3695.72, 3719.7392, 3742.6224, 3770.4456, 3795.6602, 3819.9058, 3844.002, 3869.517, 3895.6824, 3920.8622, 3947.1364, 3973.985, 3995.4772, 4021.62, 4046.628, 4074.65, 4096.2256, 4121.831, 4146.6406, 4173.276, 4195.0744, 4223.9696, 4251.3708, 4272.9966, 4300.8046, 4326.302, 4353.1248, 4374.312, 4403.0322, 4426.819, 4450.0598, 4478.5206, 4504.8116, 4528.8928, 4553.9584, 4578.8712, 4603.8384, 4632.3872, 4655.5128, 4675.821, 4704.6222, 4731.9862, 4755.4174, 4781.2628, 4804.332, 4832.3048, 4862.8752, 4883.4148, 4906.9544, 4935.3516, 4954.3532, 4984.0248, 5011.217, 5035.3258, 5057.3672, 5084.1828 +}; + +const double rawEstimateData_precision11[] = { + 1477, 1501.6014, 1526.5802, 1551.7942, 1577.3042, 1603.2062, 1629.8402, 1656.2292, 1682.9462, 1709.9926, 1737.3026, 1765.4252, 1793.0578, 1821.6092, 1849.626, 1878.5568, 1908.527, 1937.5154, 1967.1874, 1997.3878, 2027.37, 2058.1972, 2089.5728, 2120.1012, 2151.9668, 2183.292, 2216.0772, 2247.8578, 2280.6562, 2313.041, 2345.714, 2380.3112, 2414.1806, 2447.9854, 2481.656, 2516.346, 2551.5154, 2586.8378, 2621.7448, 2656.6722, 2693.5722, 2729.1462, 2765.4124, 2802.8728, 2838.898, 2876.408, 2913.4926, 2951.4938, 2989.6776, 3026.282, 3065.7704, 3104.1012, 3143.7388, 3181.6876, 3221.1872, 3261.5048, 3300.0214, 3339.806, 3381.409, 3421.4144, 3461.4294, 3502.2286, 3544.651, 3586.6156, 3627.337, 3670.083, 3711.1538, 3753.5094, 3797.01, 3838.6686, 3882.1678, 3922.8116, 3967.9978, 4009.9204, 4054.3286, 4097.5706, 4140.6014, 4185.544, 4229.5976, 4274.583, 4316.9438, 4361.672, 4406.2786, 4451.8628, 4496.1834, 4543.505, 4589.1816, 4632.5188, 4678.2294, 4724.8908, 4769.0194, 4817.052, 4861.4588, 4910.1596, 4956.4344, 5002.5238, 5048.13, 5093.6374, 5142.8162, 5187.7894, 5237.3984, 5285.6078, 5331.0858, 5379.1036, 5428.6258, 5474.6018, 5522.7618, 5571.5822, 5618.59, 5667.9992, 5714.88, 5763.454, 5808.6982, 
5860.3644, 5910.2914, 5953.571, 6005.9232, 6055.1914, 6104.5882, 6154.5702, 6199.7036, 6251.1764, 6298.7596, 6350.0302, 6398.061, 6448.4694, 6495.933, 6548.0474, 6597.7166, 6646.9416, 6695.9208, 6742.6328, 6793.5276, 6842.1934, 6894.2372, 6945.3864, 6996.9228, 7044.2372, 7094.1374, 7142.2272, 7192.2942, 7238.8338, 7288.9006, 7344.0908, 7394.8544, 7443.5176, 7490.4148, 7542.9314, 7595.6738, 7641.9878, 7694.3688, 7743.0448, 7797.522, 7845.53, 7899.594, 7950.3132, 7996.455, 8050.9442, 8092.9114, 8153.1374, 8197.4472, 8252.8278, 8301.8728, 8348.6776, 8401.4698, 8453.551, 8504.6598, 8553.8944, 8604.1276, 8657.6514, 8710.3062, 8758.908, 8807.8706, 8862.1702, 8910.4668, 8960.77, 9007.2766, 9063.164, 9121.0534, 9164.1354, 9218.1594, 9267.767, 9319.0594, 9372.155, 9419.7126, 9474.3722, 9520.1338, 9572.368, 9622.7702, 9675.8448, 9726.5396, 9778.7378, 9827.6554, 9878.1922, 9928.7782, 9978.3984, 10026.578, 10076.5626, 10137.1618, 10177.5244, 10229.9176 +}; + +const double rawEstimateData_precision12[] = { + 2954, 3003.4782, 3053.3568, 3104.3666, 3155.324, 3206.9598, 3259.648, 3312.539, 3366.1474, 3420.2576, 3474.8376, 3530.6076, 3586.451, 3643.38, 3700.4104, 3757.5638, 3815.9676, 3875.193, 3934.838, 3994.8548, 4055.018, 4117.1742, 4178.4482, 4241.1294, 4304.4776, 4367.4044, 4431.8724, 4496.3732, 4561.4304, 4627.5326, 4693.949, 4761.5532, 4828.7256, 4897.6182, 4965.5186, 5034.4528, 5104.865, 5174.7164, 5244.6828, 5316.6708, 5387.8312, 5459.9036, 5532.476, 5604.8652, 5679.6718, 5753.757, 5830.2072, 5905.2828, 5980.0434, 6056.6264, 6134.3192, 6211.5746, 6290.0816, 6367.1176, 6447.9796, 6526.5576, 6606.1858, 6686.9144, 6766.1142, 6847.0818, 6927.9664, 7010.9096, 7091.0816, 7175.3962, 7260.3454, 7344.018, 7426.4214, 7511.3106, 7596.0686, 7679.8094, 7765.818, 7852.4248, 7936.834, 8022.363, 8109.5066, 8200.4554, 8288.5832, 8373.366, 8463.4808, 8549.7682, 8642.0522, 8728.3288, 8820.9528, 8907.727, 9001.0794, 9091.2522, 9179.988, 9269.852, 9362.6394, 9453.642, 9546.9024, 9640.6616, 9732.6622, 9824.3254, 9917.7484, 10007.9392, 10106.7508, 10196.2152, 10289.8114, 10383.5494, 10482.3064, 10576.8734, 10668.7872, 10764.7156, 10862.0196, 10952.793, 11049.9748, 11146.0702, 11241.4492, 11339.2772, 11434.2336, 11530.741, 11627.6136, 11726.311, 11821.5964, 11918.837, 12015.3724, 12113.0162, 12213.0424, 12306.9804, 12408.4518, 12504.8968, 12604.586, 12700.9332, 12798.705, 12898.5142, 12997.0488, 13094.788, 13198.475, 13292.7764, 13392.9698, 13486.8574, 13590.1616, 13686.5838, 13783.6264, 13887.2638, 13992.0978, 14081.0844, 14189.9956, 14280.0912, 14382.4956, 14486.4384, 14588.1082, 14686.2392, 14782.276, 14888.0284, 14985.1864, 15088.8596, 15187.0998, 15285.027, 15383.6694, 15495.8266, 15591.3736, 15694.2008, 15790.3246, 15898.4116, 15997.4522, 16095.5014, 16198.8514, 16291.7492, 16402.6424, 16499.1266, 16606.2436, 16697.7186, 16796.3946, 16902.3376, 17005.7672, 17100.814, 17206.8282, 17305.8262, 17416.0744, 17508.4092, 17617.0178, 17715.4554, 17816.758, 17920.1748, 18012.9236, 18119.7984, 18223.2248, 18324.2482, 18426.6276, 18525.0932, 18629.8976, 18733.2588, 18831.0466, 18940.1366, 19032.2696, 19131.729, 19243.4864, 19349.6932, 19442.866, 19547.9448, 19653.2798, 19754.4034, 19854.0692, 19965.1224, 20065.1774, 20158.2212, 20253.353, 20366.3264, 20463.22 +}; + +const double rawEstimateData_precision13[] = { + 5908.5052, 6007.2672, 6107.347, 6208.5794, 6311.2622, 6414.5514, 6519.3376, 6625.6952, 6732.5988, 6841.3552, 6950.5972, 7061.3082, 7173.5646, 7287.109, 7401.8216, 7516.4344, 7633.3802, 7751.2962, 7870.3784, 
7990.292, 8110.79, 8233.4574, 8356.6036, 8482.2712, 8607.7708, 8735.099, 8863.1858, 8993.4746, 9123.8496, 9255.6794, 9388.5448, 9522.7516, 9657.3106, 9792.6094, 9930.5642, 10068.794, 10206.7256, 10347.81, 10490.3196, 10632.0778, 10775.9916, 10920.4662, 11066.124, 11213.073, 11358.0362, 11508.1006, 11659.1716, 11808.7514, 11959.4884, 12112.1314, 12265.037, 12420.3756, 12578.933, 12734.311, 12890.0006, 13047.2144, 13207.3096, 13368.5144, 13528.024, 13689.847, 13852.7528, 14018.3168, 14180.5372, 14346.9668, 14513.5074, 14677.867, 14846.2186, 15017.4186, 15184.9716, 15356.339, 15529.2972, 15697.3578, 15871.8686, 16042.187, 16216.4094, 16389.4188, 16565.9126, 16742.3272, 16919.0042, 17094.7592, 17273.965, 17451.8342, 17634.4254, 17810.5984, 17988.9242, 18171.051, 18354.7938, 18539.466, 18721.0408, 18904.9972, 19081.867, 19271.9118, 19451.8694, 19637.9816, 19821.2922, 20013.1292, 20199.3858, 20387.8726, 20572.9514, 20770.7764, 20955.1714, 21144.751, 21329.9952, 21520.709, 21712.7016, 21906.3868, 22096.2626, 22286.0524, 22475.051, 22665.5098, 22862.8492, 23055.5294, 23249.6138, 23437.848, 23636.273, 23826.093, 24020.3296, 24213.3896, 24411.7392, 24602.9614, 24805.7952, 24998.1552, 25193.9588, 25389.0166, 25585.8392, 25780.6976, 25981.2728, 26175.977, 26376.5252, 26570.1964, 26773.387, 26962.9812, 27163.0586, 27368.164, 27565.0534, 27758.7428, 27961.1276, 28163.2324, 28362.3816, 28565.7668, 28758.644, 28956.9768, 29163.4722, 29354.7026, 29561.1186, 29767.9948, 29959.9986, 30164.0492, 30366.9818, 30562.5338, 30762.9928, 30976.1592, 31166.274, 31376.722, 31570.3734, 31770.809, 31974.8934, 32179.5286, 32387.5442, 32582.3504, 32794.076, 32989.9528, 33191.842, 33392.4684, 33595.659, 33801.8672, 34000.3414, 34200.0922, 34402.6792, 34610.0638, 34804.0084, 35011.13, 35218.669, 35418.6634, 35619.0792, 35830.6534, 36028.4966, 36229.7902, 36438.6422, 36630.7764, 36833.3102, 37048.6728, 37247.3916, 37453.5904, 37669.3614, 37854.5526, 38059.305, 38268.0936, 38470.2516, 38674.7064, 38876.167, 39068.3794, 39281.9144, 39492.8566, 39684.8628, 39898.4108, 40093.1836, 40297.6858, 40489.7086, 40717.2424 +}; + +const double rawEstimateData_precision14[] = { + 11817.475, 12015.0046, 12215.3792, 12417.7504, 12623.1814, 12830.0086, 13040.0072, 13252.503, 13466.178, 13683.2738, 13902.0344, 14123.9798, 14347.394, 14573.7784, 14802.6894, 15033.6824, 15266.9134, 15502.8624, 15741.4944, 15980.7956, 16223.8916, 16468.6316, 16715.733, 16965.5726, 17217.204, 17470.666, 17727.8516, 17986.7886, 18247.6902, 18510.9632, 18775.304, 19044.7486, 19314.4408, 19587.202, 19862.2576, 20135.924, 20417.0324, 20697.9788, 20979.6112, 21265.0274, 21550.723, 21841.6906, 22132.162, 22428.1406, 22722.127, 23020.5606, 23319.7394, 23620.4014, 23925.2728, 24226.9224, 24535.581, 24845.505, 25155.9618, 25470.3828, 25785.9702, 26103.7764, 26420.4132, 26742.0186, 27062.8852, 27388.415, 27714.6024, 28042.296, 28365.4494, 28701.1526, 29031.8008, 29364.2156, 29704.497, 30037.1458, 30380.111, 30723.8168, 31059.5114, 31404.9498, 31751.6752, 32095.2686, 32444.7792, 32794.767, 33145.204, 33498.4226, 33847.6502, 34209.006, 34560.849, 34919.4838, 35274.9778, 35635.1322, 35996.3266, 36359.1394, 36722.8266, 37082.8516, 37447.7354, 37815.9606, 38191.0692, 38559.4106, 38924.8112, 39294.6726, 39663.973, 40042.261, 40416.2036, 40779.2036, 41161.6436, 41540.9014, 41921.1998, 42294.7698, 42678.5264, 43061.3464, 43432.375, 43818.432, 44198.6598, 44583.0138, 44970.4794, 45353.924, 45729.858, 46118.2224, 46511.5724, 46900.7386, 47280.6964, 47668.1472, 48055.6796, 
48446.9436, 48838.7146, 49217.7296, 49613.7796, 50010.7508, 50410.0208, 50793.7886, 51190.2456, 51583.1882, 51971.0796, 52376.5338, 52763.319, 53165.5534, 53556.5594, 53948.2702, 54346.352, 54748.7914, 55138.577, 55543.4824, 55941.1748, 56333.7746, 56745.1552, 57142.7944, 57545.2236, 57935.9956, 58348.5268, 58737.5474, 59158.5962, 59542.6896, 59958.8004, 60349.3788, 60755.0212, 61147.6144, 61548.194, 61946.0696, 62348.6042, 62763.603, 63162.781, 63560.635, 63974.3482, 64366.4908, 64771.5876, 65176.7346, 65597.3916, 65995.915, 66394.0384, 66822.9396, 67203.6336, 67612.2032, 68019.0078, 68420.0388, 68821.22, 69235.8388, 69640.0724, 70055.155, 70466.357, 70863.4266, 71276.2482, 71677.0306, 72080.2006, 72493.0214, 72893.5952, 73314.5856, 73714.9852, 74125.3022, 74521.2122, 74933.6814, 75341.5904, 75743.0244, 76166.0278, 76572.1322, 76973.1028, 77381.6284, 77800.6092, 78189.328, 78607.0962, 79012.2508, 79407.8358, 79825.725, 80238.701, 80646.891, 81035.6436, 81460.0448, 81876.3884 +}; + +const double rawEstimateData_precision15[] = { + 23635.0036, 24030.8034, 24431.4744, 24837.1524, 25246.7928, 25661.326, 26081.3532, 26505.2806, 26933.9892, 27367.7098, 27805.318, 28248.799, 28696.4382, 29148.8244, 29605.5138, 30066.8668, 30534.2344, 31006.32, 31480.778, 31962.2418, 32447.3324, 32938.0232, 33432.731, 33930.728, 34433.9896, 34944.1402, 35457.5588, 35974.5958, 36497.3296, 37021.9096, 37554.326, 38088.0826, 38628.8816, 39171.3192, 39723.2326, 40274.5554, 40832.3142, 41390.613, 41959.5908, 42532.5466, 43102.0344, 43683.5072, 44266.694, 44851.2822, 45440.7862, 46038.0586, 46640.3164, 47241.064, 47846.155, 48454.7396, 49076.9168, 49692.542, 50317.4778, 50939.65, 51572.5596, 52210.2906, 52843.7396, 53481.3996, 54127.236, 54770.406, 55422.6598, 56078.7958, 56736.7174, 57397.6784, 58064.5784, 58730.308, 59404.9784, 60077.0864, 60751.9158, 61444.1386, 62115.817, 62808.7742, 63501.4774, 64187.5454, 64883.6622, 65582.7468, 66274.5318, 66976.9276, 67688.7764, 68402.138, 69109.6274, 69822.9706, 70543.6108, 71265.5202, 71983.3848, 72708.4656, 73433.384, 74158.4664, 74896.4868, 75620.9564, 76362.1434, 77098.3204, 77835.7662, 78582.6114, 79323.9902, 80067.8658, 80814.9246, 81567.0136, 82310.8536, 83061.9952, 83821.4096, 84580.8608, 85335.547, 86092.5802, 86851.6506, 87612.311, 88381.2016, 89146.3296, 89907.8974, 90676.846, 91451.4152, 92224.5518, 92995.8686, 93763.5066, 94551.2796, 95315.1944, 96096.1806, 96881.0918, 97665.679, 98442.68, 99229.3002, 100011.0994, 100790.6386, 101580.1564, 102377.7484, 103152.1392, 103944.2712, 104730.216, 105528.6336, 106324.9398, 107117.6706, 107890.3988, 108695.2266, 109485.238, 110294.7876, 111075.0958, 111878.0496, 112695.2864, 113464.5486, 114270.0474, 115068.608, 115884.3626, 116673.2588, 117483.3716, 118275.097, 119085.4092, 119879.2808, 120687.5868, 121499.9944, 122284.916, 123095.9254, 123912.5038, 124709.0454, 125503.7182, 126323.259, 127138.9412, 127943.8294, 128755.646, 129556.5354, 130375.3298, 131161.4734, 131971.1962, 132787.5458, 133588.1056, 134431.351, 135220.2906, 136023.398, 136846.6558, 137667.0004, 138463.663, 139283.7154, 140074.6146, 140901.3072, 141721.8548, 142543.2322, 143356.1096, 144173.7412, 144973.0948, 145794.3162, 146609.5714, 147420.003, 148237.9784, 149050.5696, 149854.761, 150663.1966, 151494.0754, 152313.1416, 153112.6902, 153935.7206, 154746.9262, 155559.547, 156401.9746, 157228.7036, 158008.7254, 158820.75, 159646.9184, 160470.4458, 161279.5348, 162093.3114, 162918.542, 163729.2842 +}; + +const double rawEstimateData_precision16[] = { + 
47271, 48062.3584, 48862.7074, 49673.152, 50492.8416, 51322.9514, 52161.03, 53009.407, 53867.6348, 54734.206, 55610.5144, 56496.2096, 57390.795, 58297.268, 59210.6448, 60134.665, 61068.0248, 62010.4472, 62962.5204, 63923.5742, 64895.0194, 65876.4182, 66862.6136, 67862.6968, 68868.8908, 69882.8544, 70911.271, 71944.0924, 72990.0326, 74040.692, 75100.6336, 76174.7826, 77252.5998, 78340.2974, 79438.2572, 80545.4976, 81657.2796, 82784.6336, 83915.515, 85059.7362, 86205.9368, 87364.4424, 88530.3358, 89707.3744, 90885.9638, 92080.197, 93275.5738, 94479.391, 95695.918, 96919.2236, 98148.4602, 99382.3474, 100625.6974, 101878.0284, 103141.6278, 104409.4588, 105686.2882, 106967.5402, 108261.6032, 109548.1578, 110852.0728, 112162.231, 113479.0072, 114806.2626, 116137.9072, 117469.5048, 118813.5186, 120165.4876, 121516.2556, 122875.766, 124250.5444, 125621.2222, 127003.2352, 128387.848, 129775.2644, 131181.7776, 132577.3086, 133979.9458, 135394.1132, 136800.9078, 138233.217, 139668.5308, 141085.212, 142535.2122, 143969.0684, 145420.2872, 146878.1542, 148332.7572, 149800.3202, 151269.66, 152743.6104, 154213.0948, 155690.288, 157169.4246, 158672.1756, 160160.059, 161650.6854, 163145.7772, 164645.6726, 166159.1952, 167682.1578, 169177.3328, 170700.0118, 172228.8964, 173732.6664, 175265.5556, 176787.799, 178317.111, 179856.6914, 181400.865, 182943.4612, 184486.742, 186033.4698, 187583.7886, 189148.1868, 190688.4526, 192250.1926, 193810.9042, 195354.2972, 196938.7682, 198493.5898, 200079.2824, 201618.912, 203205.5492, 204765.5798, 206356.1124, 207929.3064, 209498.7196, 211086.229, 212675.1324, 214256.7892, 215826.2392, 217412.8474, 218995.6724, 220618.6038, 222207.1166, 223781.0364, 225387.4332, 227005.7928, 228590.4336, 230217.8738, 231805.1054, 233408.9, 234995.3432, 236601.4956, 238190.7904, 239817.2548, 241411.2832, 243002.4066, 244640.1884, 246255.3128, 247849.3508, 249479.9734, 251106.8822, 252705.027, 254332.9242, 255935.129, 257526.9014, 259154.772, 260777.625, 262390.253, 264004.4906, 265643.59, 267255.4076, 268873.426, 270470.7252, 272106.4804, 273722.4456, 275337.794, 276945.7038, 278592.9154, 280204.3726, 281841.1606, 283489.171, 285130.1716, 286735.3362, 288364.7164, 289961.1814, 291595.5524, 293285.683, 294899.6668, 296499.3434, 298128.0462, 299761.8946, 301394.2424, 302997.6748, 304615.1478, 306269.7724, 307886.114, 309543.1028, 311153.2862, 312782.8546, 314421.2008, 316033.2438, 317692.9636, 319305.2648, 320948.7406, 322566.3364, 324228.4224, 325847.1542 +}; + +const double rawEstimateData_precision17[] = { + 94542, 96125.811, 97728.019, 99348.558, 100987.9705, 102646.7565, 104324.5125, 106021.7435, 107736.7865, 109469.272, 111223.9465, 112995.219, 114787.432, 116593.152, 118422.71, 120267.2345, 122134.6765, 124020.937, 125927.2705, 127851.255, 129788.9485, 131751.016, 133726.8225, 135722.592, 137736.789, 139770.568, 141821.518, 143891.343, 145982.1415, 148095.387, 150207.526, 152355.649, 154515.6415, 156696.05, 158887.7575, 161098.159, 163329.852, 165569.053, 167837.4005, 170121.6165, 172420.4595, 174732.6265, 177062.77, 179412.502, 181774.035, 184151.939, 186551.6895, 188965.691, 191402.8095, 193857.949, 196305.0775, 198774.6715, 201271.2585, 203764.78, 206299.3695, 208818.1365, 211373.115, 213946.7465, 216532.076, 219105.541, 221714.5375, 224337.5135, 226977.5125, 229613.0655, 232270.2685, 234952.2065, 237645.3555, 240331.1925, 243034.517, 245756.0725, 248517.6865, 251232.737, 254011.3955, 256785.995, 259556.44, 262368.335, 265156.911, 267965.266, 270785.583, 273616.0495, 276487.4835, 
279346.639, 282202.509, 285074.3885, 287942.2855, 290856.018, 293774.0345, 296678.5145, 299603.6355, 302552.6575, 305492.9785, 308466.8605, 311392.581, 314347.538, 317319.4295, 320285.9785, 323301.7325, 326298.3235, 329301.3105, 332301.987, 335309.791, 338370.762, 341382.923, 344431.1265, 347464.1545, 350507.28, 353619.2345, 356631.2005, 359685.203, 362776.7845, 365886.488, 368958.2255, 372060.6825, 375165.4335, 378237.935, 381328.311, 384430.5225, 387576.425, 390683.242, 393839.648, 396977.8425, 400101.9805, 403271.296, 406409.8425, 409529.5485, 412678.7, 415847.423, 419020.8035, 422157.081, 425337.749, 428479.6165, 431700.902, 434893.1915, 438049.582, 441210.5415, 444379.2545, 447577.356, 450741.931, 453959.548, 457137.0935, 460329.846, 463537.4815, 466732.3345, 469960.5615, 473164.681, 476347.6345, 479496.173, 482813.1645, 486025.6995, 489249.4885, 492460.1945, 495675.8805, 498908.0075, 502131.802, 505374.3855, 508550.9915, 511806.7305, 515026.776, 518217.0005, 521523.9855, 524705.9855, 527950.997, 531210.0265, 534472.497, 537750.7315, 540926.922, 544207.094, 547429.4345, 550666.3745, 553975.3475, 557150.7185, 560399.6165, 563662.697, 566916.7395, 570146.1215, 573447.425, 576689.6245, 579874.5745, 583202.337, 586503.0255, 589715.635, 592910.161, 596214.3885, 599488.035, 602740.92, 605983.0685, 609248.67, 612491.3605, 615787.912, 619107.5245, 622307.9555, 625577.333, 628840.4385, 632085.2155, 635317.6135, 638691.7195, 641887.467, 645139.9405, 648441.546, 651666.252, 654941.845 +}; + +const double rawEstimateData_precision18[] = { + 189084, 192250.913, 195456.774, 198696.946, 201977.762, 205294.444, 208651.754, 212042.099, 215472.269, 218941.91, 222443.912, 225996.845, 229568.199, 233193.568, 236844.457, 240543.233, 244279.475, 248044.27, 251854.588, 255693.2, 259583.619, 263494.621, 267445.385, 271454.061, 275468.769, 279549.456, 283646.446, 287788.198, 291966.099, 296181.164, 300431.469, 304718.618, 309024.004, 313393.508, 317760.803, 322209.731, 326675.061, 331160.627, 335654.47, 340241.442, 344841.833, 349467.132, 354130.629, 358819.432, 363574.626, 368296.587, 373118.482, 377914.93, 382782.301, 387680.669, 392601.981, 397544.323, 402529.115, 407546.018, 412593.658, 417638.657, 422762.865, 427886.169, 433017.167, 438213.273, 443441.254, 448692.421, 453937.533, 459239.049, 464529.569, 469910.083, 475274.03, 480684.473, 486070.26, 491515.237, 496995.651, 502476.617, 507973.609, 513497.19, 519083.233, 524726.509, 530305.505, 535945.728, 541584.404, 547274.055, 552967.236, 558667.862, 564360.216, 570128.148, 575965.08, 581701.952, 587532.523, 593361.144, 599246.128, 605033.418, 610958.779, 616837.117, 622772.818, 628672.04, 634675.369, 640574.831, 646585.739, 652574.547, 658611.217, 664642.684, 670713.914, 676737.681, 682797.313, 688837.897, 694917.874, 701009.882, 707173.648, 713257.254, 719415.392, 725636.761, 731710.697, 737906.209, 744103.074, 750313.39, 756504.185, 762712.579, 768876.985, 775167.859, 781359, 787615.959, 793863.597, 800245.477, 806464.582, 812785.294, 819005.925, 825403.057, 831676.197, 837936.284, 844266.968, 850642.711, 856959.756, 863322.774, 869699.931, 876102.478, 882355.787, 888694.463, 895159.952, 901536.143, 907872.631, 914293.672, 920615.14, 927130.974, 933409.404, 939922.178, 946331.47, 952745.93, 959209.264, 965590.224, 972077.284, 978501.961, 984953.19, 991413.271, 997817.479, 1004222.658, 1010725.676, 1017177.138, 1023612.529, 1030098.236, 1036493.719, 1043112.207, 1049537.036, 1056008.096, 1062476.184, 1068942.337, 1075524.95, 1081932.864, 1088426.025, 
1094776.005, 1101327.448, 1107901.673, 1114423.639, 1120884.602, 1127324.923, 1133794.24, 1140328.886, 1146849.376, 1153346.682, 1159836.502, 1166478.703, 1172953.304, 1179391.502, 1185950.982, 1192544.052, 1198913.41, 1205430.994, 1212015.525, 1218674.042, 1225121.683, 1231551.101, 1238126.379, 1244673.795, 1251260.649, 1257697.86, 1264320.983, 1270736.319, 1277274.694, 1283804.95, 1290211.514, 1296858.568, 1303455.691 +}; + + +const double biasData_precision4[] = { + 10, 9.717, 9.207, 8.7896, 8.2882, 7.8204, 7.3772, 6.9342, 6.5202, 6.161, 5.7722, 5.4636, 5.0396, 4.6766, 4.3566, 4.0454, 3.7936, 3.4856, 3.2666, 2.9946, 2.766, 2.4692, 2.3638, 2.0764, 1.7864, 1.7602, 1.4814, 1.433, 1.2926, 1.0664, 0.999600000000001, 0.7956, 0.5366, 0.589399999999998, 0.573799999999999, 0.269799999999996, 0.368200000000002, 0.0544000000000011, 0.234200000000001, 0.0108000000000033, -0.203400000000002, -0.0701999999999998, -0.129600000000003, -0.364199999999997, -0.480600000000003, -0.226999999999997, -0.322800000000001, -0.382599999999996, -0.511200000000002, -0.669600000000003, -0.749400000000001, -0.500399999999999, -0.617600000000003, -0.6922, -0.601599999999998, -0.416200000000003, -0.338200000000001, -0.782600000000002, -0.648600000000002, -0.919800000000002, -0.851799999999997, -0.962400000000002, -0.6402, -1.1922, -1.0256, -1.086, -1.21899999999999, -0.819400000000002, -0.940600000000003, -1.1554, -1.2072, -1.1752, -1.16759999999999, -1.14019999999999, -1.3754, -1.29859999999999, -1.607, -1.3292, -1.7606 +}; + +const double biasData_precision5[] = { + 22, 21.1194, 20.8208, 20.2318, 19.77, 19.2436, 18.7774, 18.2848, 17.8224, 17.3742, 16.9336, 16.503, 16.0494, 15.6292, 15.2124, 14.798, 14.367, 13.9728, 13.5944, 13.217, 12.8438, 12.3696, 12.0956, 11.7044, 11.324, 11.0668, 10.6698, 10.3644, 10.049, 9.6918, 9.4146, 9.082, 8.687, 8.5398, 8.2462, 7.857, 7.6606, 7.4168, 7.1248, 6.9222, 6.6804, 6.447, 6.3454, 5.9594, 5.7636, 5.5776, 5.331, 5.19, 4.9676, 4.7564, 4.5314, 4.4442, 4.3708, 3.9774, 3.9624, 3.8796, 3.755, 3.472, 3.2076, 3.1024, 2.8908, 2.7338, 2.7728, 2.629, 2.413, 2.3266, 2.1524, 2.2642, 2.1806, 2.0566, 1.9192, 1.7598, 1.3516, 1.5802, 1.43859999999999, 1.49160000000001, 1.1524, 1.1892, 0.841399999999993, 0.879800000000003, 0.837599999999995, 0.469800000000006, 0.765600000000006, 0.331000000000003, 0.591399999999993, 0.601200000000006, 0.701599999999999, 0.558199999999999, 0.339399999999998, 0.354399999999998, 0.491200000000006, 0.308000000000007, 0.355199999999996, -0.0254000000000048, 0.205200000000005, -0.272999999999996, 0.132199999999997, 0.394400000000005, -0.241200000000006, 0.242000000000004, 0.191400000000002, 0.253799999999998, -0.122399999999999, -0.370800000000003, 0.193200000000004, -0.0848000000000013, 0.0867999999999967, -0.327200000000005, -0.285600000000002, 0.311400000000006, -0.128399999999999, -0.754999999999995, -0.209199999999996, -0.293599999999998, -0.364000000000004, -0.253600000000006, -0.821200000000005, -0.253600000000006, -0.510400000000004, -0.383399999999995, -0.491799999999998, -0.220200000000006, -0.0972000000000008, -0.557400000000001, -0.114599999999996, -0.295000000000002, -0.534800000000004, 0.346399999999988, -0.65379999999999, 0.0398000000000138, 0.0341999999999985, -0.995800000000003, -0.523400000000009, -0.489000000000004, -0.274799999999999, -0.574999999999989, -0.482799999999997, 0.0571999999999946, -0.330600000000004, -0.628800000000012, -0.140199999999993, -0.540600000000012, -0.445999999999998, -0.599400000000003, -0.262599999999992, 0.163399999999996, 
-0.100599999999986, -0.39500000000001, -1.06960000000001, -0.836399999999998, -0.753199999999993, -0.412399999999991, -0.790400000000005, -0.29679999999999, -0.28540000000001, -0.193000000000012, -0.0772000000000048, -0.962799999999987, -0.414800000000014 +}; + +const double biasData_precision6[] = { + 45, 44.1902, 43.271, 42.8358, 41.8142, 41.2854, 40.317, 39.354, 38.8924, 37.9436, 37.4596, 36.5262, 35.6248, 35.1574, 34.2822, 33.837, 32.9636, 32.074, 31.7042, 30.7976, 30.4772, 29.6564, 28.7942, 28.5004, 27.686, 27.291, 26.5672, 25.8556, 25.4982, 24.8204, 24.4252, 23.7744, 23.0786, 22.8344, 22.0294, 21.8098, 21.0794, 20.5732, 20.1878, 19.5648, 19.2902, 18.6784, 18.3352, 17.8946, 17.3712, 17.0852, 16.499, 16.2686, 15.6844, 15.2234, 14.9732, 14.3356, 14.2286, 13.7262, 13.3284, 13.1048, 12.5962, 12.3562, 12.1272, 11.4184, 11.4974, 11.0822, 10.856, 10.48, 10.2834, 10.0208, 9.637, 9.51739999999999, 9.05759999999999, 8.74760000000001, 8.42700000000001, 8.1326, 8.2372, 8.2788, 7.6776, 7.79259999999999, 7.1952, 6.9564, 6.6454, 6.87, 6.5428, 6.19999999999999, 6.02940000000001, 5.62780000000001, 5.6782, 5.792, 5.35159999999999, 5.28319999999999, 5.0394, 5.07480000000001, 4.49119999999999, 4.84899999999999, 4.696, 4.54040000000001, 4.07300000000001, 4.37139999999999, 3.7216, 3.7328, 3.42080000000001, 3.41839999999999, 3.94239999999999, 3.27719999999999, 3.411, 3.13079999999999, 2.76900000000001, 2.92580000000001, 2.68279999999999, 2.75020000000001, 2.70599999999999, 2.3886, 3.01859999999999, 2.45179999999999, 2.92699999999999, 2.41720000000001, 2.41139999999999, 2.03299999999999, 2.51240000000001, 2.5564, 2.60079999999999, 2.41720000000001, 1.80439999999999, 1.99700000000001, 2.45480000000001, 1.8948, 2.2346, 2.30860000000001, 2.15479999999999, 1.88419999999999, 1.6508, 0.677199999999999, 1.72540000000001, 1.4752, 1.72280000000001, 1.66139999999999, 1.16759999999999, 1.79300000000001, 1.00059999999999, 0.905200000000008, 0.659999999999997, 1.55879999999999, 1.1636, 0.688199999999995, 0.712600000000009, 0.450199999999995, 1.1978, 0.975599999999986, 0.165400000000005, 1.727, 1.19739999999999, -0.252600000000001, 1.13460000000001, 1.3048, 1.19479999999999, 0.313400000000001, 0.878999999999991, 1.12039999999999, 0.853000000000009, 1.67920000000001, 0.856999999999999, 0.448599999999999, 1.2362, 0.953399999999988, 1.02859999999998, 0.563199999999995, 0.663000000000011, 0.723000000000013, 0.756599999999992, 0.256599999999992, -0.837600000000009, 0.620000000000005, 0.821599999999989, 0.216600000000028, 0.205600000000004, 0.220199999999977, 0.372599999999977, 0.334400000000016, 0.928400000000011, 0.972800000000007, 0.192400000000021, 0.487199999999973, -0.413000000000011, 0.807000000000016, 0.120600000000024, 0.769000000000005, 0.870799999999974, 0.66500000000002, 0.118200000000002, 0.401200000000017, 0.635199999999998, 0.135400000000004, 0.175599999999974, 1.16059999999999, 0.34620000000001, 0.521400000000028, -0.586599999999976, -1.16480000000001, 0.968399999999974, 0.836999999999989, 0.779600000000016, 0.985799999999983 +}; + +const double biasData_precision7[] = { + 91, 89.4934, 87.9758, 86.4574, 84.9718, 83.4954, 81.5302, 80.0756, 78.6374, 77.1782, 75.7888, 73.9522, 72.592, 71.2532, 69.9086, 68.5938, 66.9474, 65.6796, 64.4394, 63.2176, 61.9768, 60.4214, 59.2528, 58.0102, 56.8658, 55.7278, 54.3044, 53.1316, 52.093, 51.0032, 49.9092, 48.6306, 47.5294, 46.5756, 45.6508, 44.662, 43.552, 42.3724, 41.617, 40.5754, 39.7872, 38.8444, 37.7988, 36.8606, 36.2118, 35.3566, 34.4476, 33.5882, 32.6816, 32.0824, 
31.0258, 30.6048, 29.4436, 28.7274, 27.957, 27.147, 26.4364, 25.7592, 25.3386, 24.781, 23.8028, 23.656, 22.6544, 21.996, 21.4718, 21.1544, 20.6098, 19.5956, 19.0616, 18.5758, 18.4878, 17.5244, 17.2146, 16.724, 15.8722, 15.5198, 15.0414, 14.941, 14.9048, 13.87, 13.4304, 13.028, 12.4708, 12.37, 12.0624, 11.4668, 11.5532, 11.4352, 11.2564, 10.2744, 10.2118, 9.74720000000002, 10.1456, 9.2928, 8.75040000000001, 8.55279999999999, 8.97899999999998, 8.21019999999999, 8.18340000000001, 7.3494, 7.32499999999999, 7.66140000000001, 6.90300000000002, 7.25439999999998, 6.9042, 7.21499999999997, 6.28640000000001, 6.08139999999997, 6.6764, 6.30099999999999, 5.13900000000001, 5.65800000000002, 5.17320000000001, 4.59019999999998, 4.9538, 5.08280000000002, 4.92200000000003, 4.99020000000002, 4.7328, 5.4538, 4.11360000000002, 4.22340000000003, 4.08780000000002, 3.70800000000003, 4.15559999999999, 4.18520000000001, 3.63720000000001, 3.68220000000002, 3.77960000000002, 3.6078, 2.49160000000001, 3.13099999999997, 2.5376, 3.19880000000001, 3.21100000000001, 2.4502, 3.52820000000003, 2.91199999999998, 3.04480000000001, 2.7432, 2.85239999999999, 2.79880000000003, 2.78579999999999, 1.88679999999999, 2.98860000000002, 2.50639999999999, 1.91239999999999, 2.66160000000002, 2.46820000000002, 1.58199999999999, 1.30399999999997, 2.27379999999999, 2.68939999999998, 1.32900000000001, 3.10599999999999, 1.69080000000002, 2.13740000000001, 2.53219999999999, 1.88479999999998, 1.33240000000001, 1.45119999999997, 1.17899999999997, 2.44119999999998, 1.60659999999996, 2.16700000000003, 0.77940000000001, 2.37900000000002, 2.06700000000001, 1.46000000000004, 2.91160000000002, 1.69200000000001, 0.954600000000028, 2.49300000000005, 2.2722, 1.33500000000004, 2.44899999999996, 1.20140000000004, 3.07380000000001, 2.09739999999999, 2.85640000000001, 2.29960000000005, 2.40899999999999, 1.97040000000004, 0.809799999999996, 1.65279999999996, 2.59979999999996, 0.95799999999997, 2.06799999999998, 2.32780000000002, 4.20159999999998, 1.96320000000003, 1.86400000000003, 1.42999999999995, 3.77940000000001, 1.27200000000005, 1.86440000000005, 2.20600000000002, 3.21900000000005, 1.5154, 2.61019999999996 +}; + +const double biasData_precision8[] = { + 183.2152, 180.2454, 177.2096, 173.6652, 170.6312, 167.6822, 164.249, 161.3296, 158.0038, 155.2074, 152.4612, 149.27, 146.5178, 143.4412, 140.8032, 138.1634, 135.1688, 132.6074, 129.6946, 127.2664, 124.8228, 122.0432, 119.6824, 116.9464, 114.6268, 112.2626, 109.8376, 107.4034, 104.8956, 102.8522, 100.7638, 98.3552, 96.3556, 93.7526, 91.9292, 89.8954, 87.8198, 85.7668, 83.298, 81.6688, 79.9466, 77.9746, 76.1672, 74.3474, 72.3028, 70.8912, 69.114, 67.4646, 65.9744, 64.4092, 62.6022, 60.843, 59.5684, 58.1652, 56.5426, 55.4152, 53.5388, 52.3592, 51.1366, 49.486, 48.3918, 46.5076, 45.509, 44.3834, 43.3498, 42.0668, 40.7346, 40.1228, 38.4528, 37.7, 36.644, 36.0518, 34.5774, 33.9068, 32.432, 32.1666, 30.434, 29.6644, 28.4894, 27.6312, 26.3804, 26.292, 25.5496000000001, 25.0234, 24.8206, 22.6146, 22.4188, 22.117, 20.6762, 20.6576, 19.7864, 19.509, 18.5334, 17.9204, 17.772, 16.2924, 16.8654, 15.1836, 15.745, 15.1316, 15.0386, 14.0136, 13.6342, 12.6196, 12.1866, 12.4281999999999, 11.3324, 10.4794000000001, 11.5038, 10.129, 9.52800000000002, 10.3203999999999, 9.46299999999997, 9.79280000000006, 9.12300000000005, 8.74180000000001, 9.2192, 7.51020000000005, 7.60659999999996, 7.01840000000004, 7.22239999999999, 7.40139999999997, 6.76179999999999, 7.14359999999999, 5.65060000000005, 5.63779999999997, 5.76599999999996, 
6.75139999999999, 5.57759999999996, 3.73220000000003, 5.8048, 5.63019999999995, 4.93359999999996, 3.47979999999995, 4.33879999999999, 3.98940000000005, 3.81960000000004, 3.31359999999995, 3.23080000000004, 3.4588, 3.08159999999998, 3.4076, 3.00639999999999, 2.38779999999997, 2.61900000000003, 1.99800000000005, 3.34820000000002, 2.95060000000001, 0.990999999999985, 2.11440000000005, 2.20299999999997, 2.82219999999995, 2.73239999999998, 2.7826, 3.76660000000004, 2.26480000000004, 2.31280000000004, 2.40819999999997, 2.75360000000001, 3.33759999999995, 2.71559999999999, 1.7478000000001, 1.42920000000004, 2.39300000000003, 2.22779999999989, 2.34339999999997, 0.87259999999992, 3.88400000000001, 1.80600000000004, 1.91759999999999, 1.16779999999994, 1.50320000000011, 2.52500000000009, 0.226400000000012, 2.31500000000005, 0.930000000000064, 1.25199999999995, 2.14959999999996, 0.0407999999999902, 2.5447999999999, 1.32960000000003, 0.197400000000016, 2.52620000000002, 3.33279999999991, -1.34300000000007, 0.422199999999975, 0.917200000000093, 1.12920000000008, 1.46060000000011, 1.45779999999991, 2.8728000000001, 3.33359999999993, -1.34079999999994, 1.57680000000005, 0.363000000000056, 1.40740000000005, 0.656600000000026, 0.801400000000058, -0.454600000000028, 1.51919999999996 +}; + +const double biasData_precision9[] = { + 368, 361.8294, 355.2452, 348.6698, 342.1464, 336.2024, 329.8782, 323.6598, 317.462, 311.2826, 305.7102, 299.7416, 293.9366, 288.1046, 282.285, 277.0668, 271.306, 265.8448, 260.301, 254.9886, 250.2422, 244.8138, 239.7074, 234.7428, 229.8402, 225.1664, 220.3534, 215.594, 210.6886, 205.7876, 201.65, 197.228, 192.8036, 188.1666, 184.0818, 180.0824, 176.2574, 172.302, 168.1644, 164.0056, 160.3802, 156.7192, 152.5234, 149.2084, 145.831, 142.485, 139.1112, 135.4764, 131.76, 129.3368, 126.5538, 122.5058, 119.2646, 116.5902, 113.3818, 110.8998, 107.9532, 105.2062, 102.2798, 99.4728, 96.9582, 94.3292, 92.171, 89.7809999999999, 87.5716, 84.7048, 82.5322, 79.875, 78.3972, 75.3464, 73.7274, 71.2834, 70.1444, 68.4263999999999, 66.0166, 64.018, 62.0437999999999, 60.3399999999999, 58.6856, 57.9836, 55.0311999999999, 54.6769999999999, 52.3188, 51.4846, 49.4423999999999, 47.739, 46.1487999999999, 44.9202, 43.4059999999999, 42.5342000000001, 41.2834, 38.8954000000001, 38.3286000000001, 36.2146, 36.6684, 35.9946, 33.123, 33.4338, 31.7378000000001, 29.076, 28.9692, 27.4964, 27.0998, 25.9864, 26.7754, 24.3208, 23.4838, 22.7388000000001, 24.0758000000001, 21.9097999999999, 20.9728, 19.9228000000001, 19.9292, 16.617, 17.05, 18.2996000000001, 15.6128000000001, 15.7392, 14.5174, 13.6322, 12.2583999999999, 13.3766000000001, 11.423, 13.1232, 9.51639999999998, 10.5938000000001, 9.59719999999993, 8.12220000000002, 9.76739999999995, 7.50440000000003, 7.56999999999994, 6.70440000000008, 6.41419999999994, 6.71019999999999, 5.60940000000005, 4.65219999999999, 6.84099999999989, 3.4072000000001, 3.97859999999991, 3.32760000000007, 5.52160000000003, 3.31860000000006, 2.06940000000009, 4.35400000000004, 1.57500000000005, 0.280799999999999, 2.12879999999996, -0.214799999999968, -0.0378000000000611, -0.658200000000079, 0.654800000000023, -0.0697999999999865, 0.858400000000074, -2.52700000000004, -2.1751999999999, -3.35539999999992, -1.04019999999991, -0.651000000000067, -2.14439999999991, -1.96659999999997, -3.97939999999994, -0.604400000000169, -3.08260000000018, -3.39159999999993, -5.29640000000018, -5.38920000000007, -5.08759999999984, -4.69900000000007, -5.23720000000003, -3.15779999999995, -4.97879999999986, 
-4.89899999999989, -7.48880000000008, -5.94799999999987, -5.68060000000014, -6.67180000000008, -4.70499999999993, -7.27779999999984, -4.6579999999999, -4.4362000000001, -4.32139999999981, -5.18859999999995, -6.66879999999992, -6.48399999999992, -5.1260000000002, -4.4032000000002, -6.13500000000022, -5.80819999999994, -4.16719999999987, -4.15039999999999, -7.45600000000013, -7.24080000000004, -9.83179999999993, -5.80420000000004, -8.6561999999999, -6.99940000000015, -10.5473999999999, -7.34139999999979, -6.80999999999995, -6.29719999999998, -6.23199999999997 +}; + +const double biasData_precision10[] = { + 737.1256, 724.4234, 711.1064, 698.4732, 685.4636, 673.0644, 660.488, 647.9654, 636.0832, 623.7864, 612.1992, 600.2176, 588.5228, 577.1716, 565.7752, 554.899, 543.6126, 532.6492, 521.9474, 511.5214, 501.1064, 490.6364, 480.2468, 470.4588, 460.3832, 451.0584, 440.8606, 431.3868, 422.5062, 413.1862, 404.463, 395.339, 386.1936, 378.1292, 369.1854, 361.2908, 353.3324, 344.8518, 337.5204, 329.4854, 321.9318, 314.552, 306.4658, 299.4256, 292.849, 286.152, 278.8956, 271.8792, 265.118, 258.62, 252.5132, 245.9322, 239.7726, 233.6086, 227.5332, 222.5918, 216.4294, 210.7662, 205.4106, 199.7338, 194.9012, 188.4486, 183.1556, 178.6338, 173.7312, 169.6264, 163.9526, 159.8742, 155.8326, 151.1966, 147.5594, 143.07, 140.037, 134.1804, 131.071, 127.4884, 124.0848, 120.2944, 117.333, 112.9626, 110.2902, 107.0814, 103.0334, 99.4832000000001, 96.3899999999999, 93.7202000000002, 90.1714000000002, 87.2357999999999, 85.9346, 82.8910000000001, 80.0264000000002, 78.3834000000002, 75.1543999999999, 73.8683999999998, 70.9895999999999, 69.4367999999999, 64.8701999999998, 65.0408000000002, 61.6738, 59.5207999999998, 57.0158000000001, 54.2302, 53.0962, 50.4985999999999, 52.2588000000001, 47.3914, 45.6244000000002, 42.8377999999998, 43.0072, 40.6516000000001, 40.2453999999998, 35.2136, 36.4546, 33.7849999999999, 33.2294000000002, 32.4679999999998, 30.8670000000002, 28.6507999999999, 28.9099999999999, 27.5983999999999, 26.1619999999998, 24.5563999999999, 23.2328000000002, 21.9484000000002, 21.5902000000001, 21.3346000000001, 17.7031999999999, 20.6111999999998, 19.5545999999999, 15.7375999999999, 17.0720000000001, 16.9517999999998, 15.326, 13.1817999999998, 14.6925999999999, 13.0859999999998, 13.2754, 10.8697999999999, 11.248, 7.3768, 4.72339999999986, 7.97899999999981, 8.7503999999999, 7.68119999999999, 9.7199999999998, 7.73919999999998, 5.6224000000002, 7.44560000000001, 6.6601999999998, 5.9058, 4.00199999999995, 4.51699999999983, 4.68240000000014, 3.86220000000003, 5.13639999999987, 5.98500000000013, 2.47719999999981, 2.61999999999989, 1.62800000000016, 4.65000000000009, 0.225599999999758, 0.831000000000131, -0.359400000000278, 1.27599999999984, -2.92559999999958, -0.0303999999996449, 2.37079999999969, -2.0033999999996, 0.804600000000391, 0.30199999999968, 1.1247999999996, -2.6880000000001, 0.0321999999996478, -1.18099999999959, -3.9402, -1.47940000000017, -0.188400000000001, -2.10720000000038, -2.04159999999956, -3.12880000000041, -4.16160000000036, -0.612799999999879, -3.48719999999958, -8.17900000000009, -5.37780000000021, -4.01379999999972, -5.58259999999973, -5.73719999999958, -7.66799999999967, -5.69520000000011, -1.1247999999996, -5.58520000000044, -8.04560000000038, -4.64840000000004, -11.6468000000004, -7.97519999999986, -5.78300000000036, -7.67420000000038, -10.6328000000003, -9.81720000000041 +}; + +const double biasData_precision11[] = { + 1476, 1449.6014, 1423.5802, 1397.7942, 1372.3042, 1347.2062, 
1321.8402, 1297.2292, 1272.9462, 1248.9926, 1225.3026, 1201.4252, 1178.0578, 1155.6092, 1132.626, 1110.5568, 1088.527, 1066.5154, 1045.1874, 1024.3878, 1003.37, 982.1972, 962.5728, 942.1012, 922.9668, 903.292, 884.0772, 864.8578, 846.6562, 828.041, 809.714, 792.3112, 775.1806, 757.9854, 740.656, 724.346, 707.5154, 691.8378, 675.7448, 659.6722, 645.5722, 630.1462, 614.4124, 600.8728, 585.898, 572.408, 558.4926, 544.4938, 531.6776, 517.282, 505.7704, 493.1012, 480.7388, 467.6876, 456.1872, 445.5048, 433.0214, 420.806, 411.409, 400.4144, 389.4294, 379.2286, 369.651, 360.6156, 350.337, 342.083, 332.1538, 322.5094, 315.01, 305.6686, 298.1678, 287.8116, 280.9978, 271.9204, 265.3286, 257.5706, 249.6014, 242.544, 235.5976, 229.583, 220.9438, 214.672, 208.2786, 201.8628, 195.1834, 191.505, 186.1816, 178.5188, 172.2294, 167.8908, 161.0194, 158.052, 151.4588, 148.1596, 143.4344, 138.5238, 133.13, 127.6374, 124.8162, 118.7894, 117.3984, 114.6078, 109.0858, 105.1036, 103.6258, 98.6018000000004, 95.7618000000002, 93.5821999999998, 88.5900000000001, 86.9992000000002, 82.8800000000001, 80.4539999999997, 74.6981999999998, 74.3644000000004, 73.2914000000001, 65.5709999999999, 66.9232000000002, 65.1913999999997, 62.5882000000001, 61.5702000000001, 55.7035999999998, 56.1764000000003, 52.7596000000003, 53.0302000000001, 49.0609999999997, 48.4694, 44.933, 46.0474000000004, 44.7165999999997, 41.9416000000001, 39.9207999999999, 35.6328000000003, 35.5276000000003, 33.1934000000001, 33.2371999999996, 33.3864000000003, 33.9228000000003, 30.2371999999996, 29.1373999999996, 25.2272000000003, 24.2942000000003, 19.8338000000003, 18.9005999999999, 23.0907999999999, 21.8544000000002, 19.5176000000001, 15.4147999999996, 16.9314000000004, 18.6737999999996, 12.9877999999999, 14.3688000000002, 12.0447999999997, 15.5219999999999, 12.5299999999997, 14.5940000000001, 14.3131999999996, 9.45499999999993, 12.9441999999999, 3.91139999999996, 13.1373999999996, 5.44720000000052, 9.82779999999912, 7.87279999999919, 3.67760000000089, 5.46980000000076, 5.55099999999948, 5.65979999999945, 3.89439999999922, 3.1275999999998, 5.65140000000065, 6.3062000000009, 3.90799999999945, 1.87060000000019, 5.17020000000048, 2.46680000000015, 0.770000000000437, -3.72340000000077, 1.16400000000067, 8.05340000000069, 0.135399999999208, 2.15940000000046, 0.766999999999825, 1.0594000000001, 3.15500000000065, -0.287399999999252, 2.37219999999979, -2.86620000000039, -1.63199999999961, -2.22979999999916, -0.15519999999924, -1.46039999999994, -0.262199999999211, -2.34460000000036, -2.8078000000005, -3.22179999999935, -5.60159999999996, -8.42200000000048, -9.43740000000071, 0.161799999999857, -10.4755999999998, -10.0823999999993 +}; + +const double biasData_precision12[] = { + 2953, 2900.4782, 2848.3568, 2796.3666, 2745.324, 2694.9598, 2644.648, 2595.539, 2546.1474, 2498.2576, 2450.8376, 2403.6076, 2357.451, 2311.38, 2266.4104, 2221.5638, 2176.9676, 2134.193, 2090.838, 2048.8548, 2007.018, 1966.1742, 1925.4482, 1885.1294, 1846.4776, 1807.4044, 1768.8724, 1731.3732, 1693.4304, 1657.5326, 1621.949, 1586.5532, 1551.7256, 1517.6182, 1483.5186, 1450.4528, 1417.865, 1385.7164, 1352.6828, 1322.6708, 1291.8312, 1260.9036, 1231.476, 1201.8652, 1173.6718, 1145.757, 1119.2072, 1092.2828, 1065.0434, 1038.6264, 1014.3192, 988.5746, 965.0816, 940.1176, 917.9796, 894.5576, 871.1858, 849.9144, 827.1142, 805.0818, 783.9664, 763.9096, 742.0816, 724.3962, 706.3454, 688.018, 667.4214, 650.3106, 633.0686, 613.8094, 597.818, 581.4248, 563.834, 547.363, 531.5066, 520.455400000001, 
505.583199999999, 488.366, 476.480799999999, 459.7682, 450.0522, 434.328799999999, 423.952799999999, 408.727000000001, 399.079400000001, 387.252200000001, 373.987999999999, 360.852000000001, 351.6394, 339.642, 330.902400000001, 322.661599999999, 311.662200000001, 301.3254, 291.7484, 279.939200000001, 276.7508, 263.215200000001, 254.811400000001, 245.5494, 242.306399999999, 234.8734, 223.787200000001, 217.7156, 212.0196, 200.793, 195.9748, 189.0702, 182.449199999999, 177.2772, 170.2336, 164.741, 158.613600000001, 155.311, 147.5964, 142.837, 137.3724, 132.0162, 130.0424, 121.9804, 120.451800000001, 114.8968, 111.585999999999, 105.933199999999, 101.705, 98.5141999999996, 95.0488000000005, 89.7880000000005, 91.4750000000004, 83.7764000000006, 80.9698000000008, 72.8574000000008, 73.1615999999995, 67.5838000000003, 62.6263999999992, 63.2638000000006, 66.0977999999996, 52.0843999999997, 58.9956000000002, 47.0912000000008, 46.4956000000002, 48.4383999999991, 47.1082000000006, 43.2392, 37.2759999999998, 40.0283999999992, 35.1864000000005, 35.8595999999998, 32.0998, 28.027, 23.6694000000007, 33.8266000000003, 26.3736000000008, 27.2008000000005, 21.3245999999999, 26.4115999999995, 23.4521999999997, 19.5013999999992, 19.8513999999996, 10.7492000000002, 18.6424000000006, 13.1265999999996, 18.2436000000016, 6.71860000000015, 3.39459999999963, 6.33759999999893, 7.76719999999841, 0.813999999998487, 3.82819999999992, 0.826199999999517, 8.07440000000133, -1.59080000000176, 5.01780000000144, 0.455399999998917, -0.24199999999837, 0.174800000000687, -9.07640000000174, -4.20160000000033, -3.77520000000004, -4.75179999999818, -5.3724000000002, -8.90680000000066, -6.10239999999976, -5.74120000000039, -9.95339999999851, -3.86339999999836, -13.7304000000004, -16.2710000000006, -7.51359999999841, -3.30679999999847, -13.1339999999982, -10.0551999999989, -6.72019999999975, -8.59660000000076, -10.9307999999983, -1.8775999999998, -4.82259999999951, -13.7788, -21.6470000000008, -10.6735999999983, -15.7799999999988 +}; + +const double biasData_precision13[] = { + 5907.5052, 5802.2672, 5697.347, 5593.5794, 5491.2622, 5390.5514, 5290.3376, 5191.6952, 5093.5988, 4997.3552, 4902.5972, 4808.3082, 4715.5646, 4624.109, 4533.8216, 4444.4344, 4356.3802, 4269.2962, 4183.3784, 4098.292, 4014.79, 3932.4574, 3850.6036, 3771.2712, 3691.7708, 3615.099, 3538.1858, 3463.4746, 3388.8496, 3315.6794, 3244.5448, 3173.7516, 3103.3106, 3033.6094, 2966.5642, 2900.794, 2833.7256, 2769.81, 2707.3196, 2644.0778, 2583.9916, 2523.4662, 2464.124, 2406.073, 2347.0362, 2292.1006, 2238.1716, 2182.7514, 2128.4884, 2077.1314, 2025.037, 1975.3756, 1928.933, 1879.311, 1831.0006, 1783.2144, 1738.3096, 1694.5144, 1649.024, 1606.847, 1564.7528, 1525.3168, 1482.5372, 1443.9668, 1406.5074, 1365.867, 1329.2186, 1295.4186, 1257.9716, 1225.339, 1193.2972, 1156.3578, 1125.8686, 1091.187, 1061.4094, 1029.4188, 1000.9126, 972.3272, 944.004199999999, 915.7592, 889.965, 862.834200000001, 840.4254, 812.598399999999, 785.924200000001, 763.050999999999, 741.793799999999, 721.466, 699.040799999999, 677.997200000002, 649.866999999998, 634.911800000002, 609.8694, 591.981599999999, 570.2922, 557.129199999999, 538.3858, 521.872599999999, 502.951400000002, 495.776399999999, 475.171399999999, 459.751, 439.995200000001, 426.708999999999, 413.7016, 402.3868, 387.262599999998, 372.0524, 357.050999999999, 342.5098, 334.849200000001, 322.529399999999, 311.613799999999, 295.848000000002, 289.273000000001, 274.093000000001, 263.329600000001, 251.389599999999, 245.7392, 231.9614, 
229.7952, 217.155200000001, 208.9588, 199.016599999999, 190.839199999999, 180.6976, 176.272799999999, 166.976999999999, 162.5252, 151.196400000001, 149.386999999999, 133.981199999998, 130.0586, 130.164000000001, 122.053400000001, 110.7428, 108.1276, 106.232400000001, 100.381600000001, 98.7668000000012, 86.6440000000002, 79.9768000000004, 82.4722000000002, 68.7026000000005, 70.1186000000016, 71.9948000000004, 58.998599999999, 59.0492000000013, 56.9818000000014, 47.5338000000011, 42.9928, 51.1591999999982, 37.2740000000013, 42.7220000000016, 31.3734000000004, 26.8090000000011, 25.8934000000008, 26.5286000000015, 29.5442000000003, 19.3503999999994, 26.0760000000009, 17.9527999999991, 14.8419999999969, 10.4683999999979, 8.65899999999965, 9.86720000000059, 4.34139999999752, -0.907800000000861, -3.32080000000133, -0.936199999996461, -11.9916000000012, -8.87000000000262, -6.33099999999831, -11.3366000000024, -15.9207999999999, -9.34659999999712, -15.5034000000014, -19.2097999999969, -15.357799999998, -28.2235999999975, -30.6898000000001, -19.3271999999997, -25.6083999999973, -24.409599999999, -13.6385999999984, -33.4473999999973, -32.6949999999997, -28.9063999999998, -31.7483999999968, -32.2935999999972, -35.8329999999987, -47.620600000002, -39.0855999999985, -33.1434000000008, -46.1371999999974, -37.5892000000022, -46.8164000000033, -47.3142000000007, -60.2914000000019, -37.7575999999972 +}; + +const double biasData_precision14[] = { + 11816.475, 11605.0046, 11395.3792, 11188.7504, 10984.1814, 10782.0086, 10582.0072, 10384.503, 10189.178, 9996.2738, 9806.0344, 9617.9798, 9431.394, 9248.7784, 9067.6894, 8889.6824, 8712.9134, 8538.8624, 8368.4944, 8197.7956, 8031.8916, 7866.6316, 7703.733, 7544.5726, 7386.204, 7230.666, 7077.8516, 6926.7886, 6778.6902, 6631.9632, 6487.304, 6346.7486, 6206.4408, 6070.202, 5935.2576, 5799.924, 5671.0324, 5541.9788, 5414.6112, 5290.0274, 5166.723, 5047.6906, 4929.162, 4815.1406, 4699.127, 4588.5606, 4477.7394, 4369.4014, 4264.2728, 4155.9224, 4055.581, 3955.505, 3856.9618, 3761.3828, 3666.9702, 3575.7764, 3482.4132, 3395.0186, 3305.8852, 3221.415, 3138.6024, 3056.296, 2970.4494, 2896.1526, 2816.8008, 2740.2156, 2670.497, 2594.1458, 2527.111, 2460.8168, 2387.5114, 2322.9498, 2260.6752, 2194.2686, 2133.7792, 2074.767, 2015.204, 1959.4226, 1898.6502, 1850.006, 1792.849, 1741.4838, 1687.9778, 1638.1322, 1589.3266, 1543.1394, 1496.8266, 1447.8516, 1402.7354, 1361.9606, 1327.0692, 1285.4106, 1241.8112, 1201.6726, 1161.973, 1130.261, 1094.2036, 1048.2036, 1020.6436, 990.901400000002, 961.199800000002, 924.769800000002, 899.526400000002, 872.346400000002, 834.375, 810.432000000001, 780.659800000001, 756.013800000001, 733.479399999997, 707.923999999999, 673.858, 652.222399999999, 636.572399999997, 615.738599999997, 586.696400000001, 564.147199999999, 541.679600000003, 523.943599999999, 505.714599999999, 475.729599999999, 461.779600000002, 449.750800000002, 439.020799999998, 412.7886, 400.245600000002, 383.188199999997, 362.079599999997, 357.533799999997, 334.319000000003, 327.553399999997, 308.559399999998, 291.270199999999, 279.351999999999, 271.791400000002, 252.576999999997, 247.482400000001, 236.174800000001, 218.774599999997, 220.155200000001, 208.794399999999, 201.223599999998, 182.995600000002, 185.5268, 164.547400000003, 176.5962, 150.689599999998, 157.8004, 138.378799999999, 134.021200000003, 117.614399999999, 108.194000000003, 97.0696000000025, 89.6042000000016, 95.6030000000028, 84.7810000000027, 72.635000000002, 77.3482000000004, 59.4907999999996, 
55.5875999999989, 50.7346000000034, 61.3916000000027, 50.9149999999936, 39.0384000000049, 58.9395999999979, 29.633600000001, 28.2032000000036, 26.0078000000067, 17.0387999999948, 9.22000000000116, 13.8387999999977, 8.07240000000456, 14.1549999999988, 15.3570000000036, 3.42660000000615, 6.24820000000182, -2.96940000000177, -8.79940000000352, -5.97860000000219, -14.4048000000039, -3.4143999999942, -13.0148000000045, -11.6977999999945, -25.7878000000055, -22.3185999999987, -24.409599999999, -31.9756000000052, -18.9722000000038, -22.8678000000073, -30.8972000000067, -32.3715999999986, -22.3907999999938, -43.6720000000059, -35.9038, -39.7492000000057, -54.1641999999993, -45.2749999999942, -42.2989999999991, -44.1089999999967, -64.3564000000042, -49.9551999999967, -42.6116000000038 +}; + +const double biasData_precision15[] = { + 23634.0036, 23210.8034, 22792.4744, 22379.1524, 21969.7928, 21565.326, 21165.3532, 20770.2806, 20379.9892, 19994.7098, 19613.318, 19236.799, 18865.4382, 18498.8244, 18136.5138, 17778.8668, 17426.2344, 17079.32, 16734.778, 16397.2418, 16063.3324, 15734.0232, 15409.731, 15088.728, 14772.9896, 14464.1402, 14157.5588, 13855.5958, 13559.3296, 13264.9096, 12978.326, 12692.0826, 12413.8816, 12137.3192, 11870.2326, 11602.5554, 11340.3142, 11079.613, 10829.5908, 10583.5466, 10334.0344, 10095.5072, 9859.694, 9625.2822, 9395.7862, 9174.0586, 8957.3164, 8738.064, 8524.155, 8313.7396, 8116.9168, 7913.542, 7718.4778, 7521.65, 7335.5596, 7154.2906, 6968.7396, 6786.3996, 6613.236, 6437.406, 6270.6598, 6107.7958, 5945.7174, 5787.6784, 5635.5784, 5482.308, 5337.9784, 5190.0864, 5045.9158, 4919.1386, 4771.817, 4645.7742, 4518.4774, 4385.5454, 4262.6622, 4142.74679999999, 4015.5318, 3897.9276, 3790.7764, 3685.13800000001, 3573.6274, 3467.9706, 3368.61079999999, 3271.5202, 3170.3848, 3076.4656, 2982.38400000001, 2888.4664, 2806.4868, 2711.9564, 2634.1434, 2551.3204, 2469.7662, 2396.61139999999, 2318.9902, 2243.8658, 2171.9246, 2105.01360000001, 2028.8536, 1960.9952, 1901.4096, 1841.86079999999, 1777.54700000001, 1714.5802, 1654.65059999999, 1596.311, 1546.2016, 1492.3296, 1433.8974, 1383.84600000001, 1339.4152, 1293.5518, 1245.8686, 1193.50659999999, 1162.27959999999, 1107.19439999999, 1069.18060000001, 1035.09179999999, 999.679000000004, 957.679999999993, 925.300199999998, 888.099400000006, 848.638600000006, 818.156400000007, 796.748399999997, 752.139200000005, 725.271200000003, 692.216, 671.633600000001, 647.939799999993, 621.670599999998, 575.398799999995, 561.226599999995, 532.237999999998, 521.787599999996, 483.095799999996, 467.049599999998, 465.286399999997, 415.548599999995, 401.047399999996, 380.607999999993, 377.362599999993, 347.258799999996, 338.371599999999, 310.096999999994, 301.409199999995, 276.280799999993, 265.586800000005, 258.994399999996, 223.915999999997, 215.925399999993, 213.503800000006, 191.045400000003, 166.718200000003, 166.259000000005, 162.941200000001, 148.829400000002, 141.645999999993, 123.535399999993, 122.329800000007, 89.473399999988, 80.1962000000058, 77.5457999999926, 59.1056000000099, 83.3509999999951, 52.2906000000075, 36.3979999999865, 40.6558000000077, 42.0003999999899, 19.6630000000005, 19.7153999999864, -8.38539999999921, -0.692799999989802, 0.854800000000978, 3.23219999999856, -3.89040000000386, -5.25880000001052, -24.9052000000083, -22.6837999999989, -26.4286000000138, -34.997000000003, -37.0216000000073, -43.430400000012, -58.2390000000014, -68.8034000000043, -56.9245999999985, -57.8583999999973, -77.3097999999882, -73.2793999999994, 
-81.0738000000129, -87.4530000000086, -65.0254000000132, -57.296399999992, -96.2746000000043, -103.25, -96.081600000005, -91.5542000000132, -102.465200000006, -107.688599999994, -101.458000000013, -109.715800000005 +}; + +const double biasData_precision16[] = { + 47270, 46423.3584, 45585.7074, 44757.152, 43938.8416, 43130.9514, 42330.03, 41540.407, 40759.6348, 39988.206, 39226.5144, 38473.2096, 37729.795, 36997.268, 36272.6448, 35558.665, 34853.0248, 34157.4472, 33470.5204, 32793.5742, 32127.0194, 31469.4182, 30817.6136, 30178.6968, 29546.8908, 28922.8544, 28312.271, 27707.0924, 27114.0326, 26526.692, 25948.6336, 25383.7826, 24823.5998, 24272.2974, 23732.2572, 23201.4976, 22674.2796, 22163.6336, 21656.515, 21161.7362, 20669.9368, 20189.4424, 19717.3358, 19256.3744, 18795.9638, 18352.197, 17908.5738, 17474.391, 17052.918, 16637.2236, 16228.4602, 15823.3474, 15428.6974, 15043.0284, 14667.6278, 14297.4588, 13935.2882, 13578.5402, 13234.6032, 12882.1578, 12548.0728, 12219.231, 11898.0072, 11587.2626, 11279.9072, 10973.5048, 10678.5186, 10392.4876, 10105.2556, 9825.766, 9562.5444, 9294.2222, 9038.2352, 8784.848, 8533.2644, 8301.7776, 8058.30859999999, 7822.94579999999, 7599.11319999999, 7366.90779999999, 7161.217, 6957.53080000001, 6736.212, 6548.21220000001, 6343.06839999999, 6156.28719999999, 5975.15419999999, 5791.75719999999, 5621.32019999999, 5451.66, 5287.61040000001, 5118.09479999999, 4957.288, 4798.4246, 4662.17559999999, 4512.05900000001, 4364.68539999999, 4220.77720000001, 4082.67259999999, 3957.19519999999, 3842.15779999999, 3699.3328, 3583.01180000001, 3473.8964, 3338.66639999999, 3233.55559999999, 3117.799, 3008.111, 2909.69140000001, 2814.86499999999, 2719.46119999999, 2624.742, 2532.46979999999, 2444.7886, 2370.1868, 2272.45259999999, 2196.19260000001, 2117.90419999999, 2023.2972, 1969.76819999999, 1885.58979999999, 1833.2824, 1733.91200000001, 1682.54920000001, 1604.57980000001, 1556.11240000001, 1491.3064, 1421.71960000001, 1371.22899999999, 1322.1324, 1264.7892, 1196.23920000001, 1143.8474, 1088.67240000001, 1073.60380000001, 1023.11660000001, 959.036400000012, 927.433199999999, 906.792799999996, 853.433599999989, 841.873800000001, 791.1054, 756.899999999994, 704.343200000003, 672.495599999995, 622.790399999998, 611.254799999995, 567.283200000005, 519.406599999988, 519.188400000014, 495.312800000014, 451.350799999986, 443.973399999988, 431.882199999993, 392.027000000002, 380.924200000009, 345.128999999986, 298.901400000002, 287.771999999997, 272.625, 247.253000000026, 222.490600000019, 223.590000000026, 196.407599999977, 176.425999999978, 134.725199999986, 132.4804, 110.445599999977, 86.7939999999944, 56.7038000000175, 64.915399999998, 38.3726000000024, 37.1606000000029, 46.170999999973, 49.1716000000015, 15.3362000000197, 6.71639999997569, -34.8185999999987, -39.4476000000141, 12.6830000000191, -12.3331999999937, -50.6565999999875, -59.9538000000175, -65.1054000000004, -70.7576000000117, -106.325200000021, -126.852200000023, -110.227599999984, -132.885999999999, -113.897200000007, -142.713800000027, -151.145399999979, -150.799200000009, -177.756200000003, -156.036399999983, -182.735199999996, -177.259399999981, -198.663600000029, -174.577600000019, -193.84580000001 +}; + +const double biasData_precision17[] = { + 94541, 92848.811, 91174.019, 89517.558, 87879.9705, 86262.7565, 84663.5125, 83083.7435, 81521.7865, 79977.272, 78455.9465, 76950.219, 75465.432, 73994.152, 72546.71, 71115.2345, 69705.6765, 68314.937, 66944.2705, 65591.255, 64252.9485, 62938.016, 61636.8225, 
60355.592, 59092.789, 57850.568, 56624.518, 55417.343, 54231.1415, 53067.387, 51903.526, 50774.649, 49657.6415, 48561.05, 47475.7575, 46410.159, 45364.852, 44327.053, 43318.4005, 42325.6165, 41348.4595, 40383.6265, 39436.77, 38509.502, 37594.035, 36695.939, 35818.6895, 34955.691, 34115.8095, 33293.949, 32465.0775, 31657.6715, 30877.2585, 30093.78, 29351.3695, 28594.1365, 27872.115, 27168.7465, 26477.076, 25774.541, 25106.5375, 24452.5135, 23815.5125, 23174.0655, 22555.2685, 21960.2065, 21376.3555, 20785.1925, 20211.517, 19657.0725, 19141.6865, 18579.737, 18081.3955, 17578.995, 17073.44, 16608.335, 16119.911, 15651.266, 15194.583, 14749.0495, 14343.4835, 13925.639, 13504.509, 13099.3885, 12691.2855, 12328.018, 11969.0345, 11596.5145, 11245.6355, 10917.6575, 10580.9785, 10277.8605, 9926.58100000001, 9605.538, 9300.42950000003, 8989.97850000003, 8728.73249999998, 8448.3235, 8175.31050000002, 7898.98700000002, 7629.79100000003, 7413.76199999999, 7149.92300000001, 6921.12650000001, 6677.1545, 6443.28000000003, 6278.23450000002, 6014.20049999998, 5791.20299999998, 5605.78450000001, 5438.48800000001, 5234.2255, 5059.6825, 4887.43349999998, 4682.935, 4496.31099999999, 4322.52250000002, 4191.42499999999, 4021.24200000003, 3900.64799999999, 3762.84250000003, 3609.98050000001, 3502.29599999997, 3363.84250000003, 3206.54849999998, 3079.70000000001, 2971.42300000001, 2867.80349999998, 2727.08100000001, 2630.74900000001, 2496.6165, 2440.902, 2356.19150000002, 2235.58199999999, 2120.54149999999, 2012.25449999998, 1933.35600000003, 1820.93099999998, 1761.54800000001, 1663.09350000002, 1578.84600000002, 1509.48149999999, 1427.3345, 1379.56150000001, 1306.68099999998, 1212.63449999999, 1084.17300000001, 1124.16450000001, 1060.69949999999, 1007.48849999998, 941.194499999983, 879.880500000028, 836.007500000007, 782.802000000025, 748.385499999975, 647.991500000004, 626.730500000005, 570.776000000013, 484.000500000024, 513.98550000001, 418.985499999952, 386.996999999974, 370.026500000036, 355.496999999974, 356.731499999994, 255.92200000002, 259.094000000041, 205.434499999974, 165.374500000034, 197.347500000033, 95.718499999959, 67.6165000000037, 54.6970000000438, 31.7395000000251, -15.8784999999916, 8.42500000004657, -26.3754999999655, -118.425500000012, -66.6629999999423, -42.9745000000112, -107.364999999991, -189.839000000036, -162.611499999999, -164.964999999967, -189.079999999958, -223.931499999948, -235.329999999958, -269.639500000048, -249.087999999989, -206.475499999942, -283.04449999996, -290.667000000016, -304.561499999953, -336.784499999951, -380.386500000022, -283.280499999993, -364.533000000054, -389.059499999974, -364.454000000027, -415.748000000021, -417.155000000028 +}; + +const double biasData_precision18[] = { + 189083, 185696.913, 182348.774, 179035.946, 175762.762, 172526.444, 169329.754, 166166.099, 163043.269, 159958.91, 156907.912, 153906.845, 150924.199, 147996.568, 145093.457, 142239.233, 139421.475, 136632.27, 133889.588, 131174.2, 128511.619, 125868.621, 123265.385, 120721.061, 118181.769, 115709.456, 113252.446, 110840.198, 108465.099, 106126.164, 103823.469, 101556.618, 99308.004, 97124.508, 94937.803, 92833.731, 90745.061, 88677.627, 86617.47, 84650.442, 82697.833, 80769.132, 78879.629, 77014.432, 75215.626, 73384.587, 71652.482, 69895.93, 68209.301, 66553.669, 64921.981, 63310.323, 61742.115, 60205.018, 58698.658, 57190.657, 55760.865, 54331.169, 52908.167, 51550.273, 50225.254, 48922.421, 47614.533, 46362.049, 45098.569, 43926.083, 42736.03, 41593.473, 40425.26, 39316.237, 
38243.651, 37170.617, 36114.609, 35084.19, 34117.233, 33206.509, 32231.505, 31318.728, 30403.404, 29540.0550000001, 28679.236, 27825.862, 26965.216, 26179.148, 25462.08, 24645.952, 23922.523, 23198.144, 22529.128, 21762.4179999999, 21134.779, 20459.117, 19840.818, 19187.04, 18636.3689999999, 17982.831, 17439.7389999999, 16874.547, 16358.2169999999, 15835.684, 15352.914, 14823.681, 14329.313, 13816.897, 13342.874, 12880.882, 12491.648, 12021.254, 11625.392, 11293.7610000001, 10813.697, 10456.209, 10099.074, 9755.39000000001, 9393.18500000006, 9047.57900000003, 8657.98499999999, 8395.85900000005, 8033, 7736.95900000003, 7430.59699999995, 7258.47699999996, 6924.58200000005, 6691.29399999999, 6357.92500000005, 6202.05700000003, 5921.19700000004, 5628.28399999999, 5404.96799999999, 5226.71100000001, 4990.75600000005, 4799.77399999998, 4622.93099999998, 4472.478, 4171.78700000001, 3957.46299999999, 3868.95200000005, 3691.14300000004, 3474.63100000005, 3341.67200000002, 3109.14000000001, 3071.97400000005, 2796.40399999998, 2756.17799999996, 2611.46999999997, 2471.93000000005, 2382.26399999997, 2209.22400000005, 2142.28399999999, 2013.96100000001, 1911.18999999994, 1818.27099999995, 1668.47900000005, 1519.65800000005, 1469.67599999998, 1367.13800000004, 1248.52899999998, 1181.23600000003, 1022.71900000004, 1088.20700000005, 959.03600000008, 876.095999999903, 791.183999999892, 703.337000000058, 731.949999999953, 586.86400000006, 526.024999999907, 323.004999999888, 320.448000000091, 340.672999999952, 309.638999999966, 216.601999999955, 102.922999999952, 19.2399999999907, -0.114000000059605, -32.6240000000689, -89.3179999999702, -153.497999999905, -64.2970000000205, -143.695999999996, -259.497999999905, -253.017999999924, -213.948000000091, -397.590000000084, -434.006000000052, -403.475000000093, -297.958000000101, -404.317000000039, -528.898999999976, -506.621000000043, -513.205000000075, -479.351000000024, -596.139999999898, -527.016999999993, -664.681000000099, -680.306000000099, -704.050000000047, -850.486000000034, -757.43200000003, -713.308999999892 +}; + + +#endif /* HYPERLOGLOGBIAS_H_ */ diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h new file mode 100644 index 0000000..33f5dc1 --- /dev/null +++ b/src/hyperloglogplus.h @@ -0,0 +1,623 @@ +/* + * hyperloglogplus.h + * + * Implementation of HyperLogLog++ algorithm described by Stefan Heule et al. + * + * Created on: Apr 25, 2015 + * Author: fbreitwieser + */ + +#ifndef HYPERLOGLOGPLUS_H_ +#define HYPERLOGLOGPLUS_H_ + +#include +#include +#include +#include +#include +#include //log +#include //vector.count +#include + +#include "hyperloglogbias.h" +#include "third_party/MurmurHash3.cpp" +#include "assert_helpers.h" + +using namespace std; + +//#define HLL_DEBUG +//#define NDEBUG +//#define NDEBUG2 +#define arr_len(a) (a + sizeof a / sizeof a[0]) + +// experimentally determined threshold values for p - 4 +static const uint32_t threshold[] = {10, 20, 40, 80, 220, 400, 900, 1800, 3100, + 6500, 11500, 20000, 50000, 120000, 350000}; + + +/////////////////////// + +// +/** + * gives the estimated cardinality for m bins, v of which are non-zero + * @param m number of bins in the matrix + * @param v number of non-zero bins + * @return + */ +double linearCounting(uint32_t m, uint32_t v) { + if (v > m) { + throw std::invalid_argument("number of v should not be greater than m"); + } + double fm = double(m); + return fm * log(fm/double(v)); +} + +/** + * from Numerical Recipes, 3rd Edition, p 352 + * Returns hash of u as a 64-bit integer. 
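+ * Structure: a single 64-bit LCG step (v = u*a + c, with the two large
+ * constants below) followed by two xorshift rounds. Kept here for
+ * reference; note that add() below mixes items with murmurhash3_finalizer.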
+ * +*/ +inline uint64_t ranhash (uint64_t u) { + uint64_t v = u * 3935559000370003845 + 2691343689449507681; + + v ^= v >> 21; v ^= v << 37; v ^= v >> 4; + + v *= 4768777513237032717; + + v ^= v << 20; v ^= v >> 41; v ^= v << 5; + + return v; +} + +inline uint64_t murmurhash3_finalizer (uint64_t key) { + key += 1; // murmurhash returns a hash value of 0 for the key 0 - avoid that. + key ^= key >> 33; + key *= 0xff51afd7ed558ccd; + key ^= key >> 33; + key *= 0xc4ceb9fe1a85ec53; + key ^= key >> 33; + return key; +} + +/** + * Bias correction factors for specific m's + * @param m + * @return + */ +double alpha(uint32_t m) { + switch (m) { + case 16: return 0.673; + case 32: return 0.697; + case 64: return 0.709; + } + + // m >= 128 + return 0.7213 / (1 + 1.079/double(m)); +} + +/** + * calculate the raw estimate as harmonic mean of the ranks in the register + * @param array + * @return + */ +double calculateEstimate(vector array) { + double inverseSum = 0.0; + for (size_t i = 0; i < array.size(); ++i) { + // TODO: pre-calculate the power calculation + inverseSum += pow(2,-array[i]); + } + return alpha(array.size()) * double(array.size() * array.size()) * 1 / inverseSum; +} + +uint32_t countZeros(vector s) { + return (uint32_t)count(s.begin(), s.end(), 0); +} + +/** + * Extract bits (from uint32_t or uint64_t) using LSB 0 numbering from hi to lo, including lo + * @param bits + * @param hi + * @param lo + * @return + */ +template +T extractBits(T value, uint8_t hi, uint8_t lo, bool shift_left = false) { + + // create a bitmask: + // (T(1) << (hi - lo) a 1 at the position (hi - lo) + // ((T(1) << (hi - lo) - 1) 1's from position 0 to position (hi-lo-1) + // (((T(1) << (hi - lo)) - 1) << lo) 1's from position lo to position hi + + // The T(1) is required to not cause overflow on 32bit machines + // TODO: consider creating a bitmask only once in the beginning + T bitmask = (((T(1) << (hi - lo)) - 1) << lo); + T result = value & bitmask; + + if (!shift_left) { + // shift resulting bits to the right + result = result >> lo; + } else { + // shift resulting bits to the left + result = result << (sizeof(T)*8 - hi); + } + return result; +} + +template +T extractBits(T bits, uint8_t hi) { + // create a bitmask for first hi bits (LSB 0 numbering) + T bitmask = T(-1) << (sizeof(T)*8 - hi); + + return (bits & bitmask); +} + +// functions for counting the number of leading 0-bits (clz) +// and counting the number of trailing 0-bits (ctz) +//#ifdef __GNUC__ + +// TODO: switch between builtin clz and 64_clz based on architecture +//#define clz(x) __builtin_clz(x) +#if 0 +static int clz_manual(uint64_t x) +{ + // This uses a binary search (counting down) algorithm from Hacker's Delight. + uint64_t y; + int n = 64; + y = x >>32; if (y != 0) {n -= 32; x = y;} + y = x >>16; if (y != 0) {n -= 16; x = y;} + y = x >> 8; if (y != 0) {n -= 8; x = y;} + y = x >> 4; if (y != 0) {n -= 4; x = y;} + y = x >> 2; if (y != 0) {n -= 2; x = y;} + y = x >> 1; if (y != 0) return n - 2; + return n - x; +} +#endif + +inline uint32_t clz(const uint32_t x) { + return __builtin_clz(x); +} + +inline uint32_t clz(const uint64_t x) { + uint32_t u32 = (x >> 32); + uint32_t result = u32 ? __builtin_clz(u32) : 32; + if (result == 32) { + u32 = x & 0xFFFFFFFFUL; + result += (u32 ? 
__builtin_clz(u32) : 32); + } + return result; +} +//#else + +uint32_t clz_log2(const uint64_t w) { + return 63 - floor(log2(w)); +} +//#endif + + +// TODO: the sparse list may be encoded with variable length encoding +// see Heule et al., section 5.3.2 +// Also, using sets might give a larger overhead as each insertion costs more +// consider using vector and sort/unique when merging. +typedef set SparseListType; +typedef uint64_t HashSize; + +/** + * HyperLogLogPlusMinus class + * typename T corresponds to the hash size - usually either uint32_t or uint64_t (implemented for uint64_t) + */ + +typedef uint64_t T_KEY; +template +class HyperLogLogPlusMinus { + +private: + + vector M; // registers (M) of size m + uint8_t p; // precision + uint32_t m; // number of registers + bool sparse; // sparse representation of the data? + SparseListType sparseList; // TODO: use a compressed list instead + + // vectors containing data for bias correction + vector > rawEstimateData; // TODO: make this static + vector > biasData; + + // sparse versions of p and m + static const uint8_t pPrime = 25; // precision when using a sparse representation + // fixed to 25, because 25 + 6 bits for rank + 1 flag bit = 32 + static const uint32_t mPrime = 1 << (pPrime -1); // 2^pPrime + + +public: + + ~HyperLogLogPlusMinus() {}; + + /** + * Create new HyperLogLogPlusMinus counter + * @param precision + * @param sparse + */ + HyperLogLogPlusMinus(uint8_t precision=10, bool sparse=true):p(precision),sparse(sparse) { + if (precision > 18 || precision < 4) { + throw std::invalid_argument("precision (number of register = 2^precision) must be between 4 and 18"); + } + + this->m = 1 << precision; + + if (sparse) { + this->sparseList = SparseListType(); // TODO: if SparseListType is changed, initialize with appropriate size + } else { + this->M = vector(m); + } + } + + /** + * Add a new item to the counter. + * @param item + */ + void add(T_KEY item) { + add(item, sizeof(T_KEY)); + } + + /** + * Add a new item to the counter. + * @param item + * @param size size of item + */ + void add(T_KEY item, size_t size) { + + // compute hash for item + HashSize hash_value = murmurhash3_finalizer(item); + +#ifdef HLL_DEBUG + cerr << "Value: " << item << "; hash(value): " << hash_value << endl; + cerr << bitset<64>(hash_value) << endl; +#endif + + if (sparse) { + // sparse mode: put the encoded hash into sparse list + uint32_t encoded_hash_value = encodeHashIn32Bit(hash_value); + this->sparseList.insert(encoded_hash_value); + +#ifdef HLL_DEBUG + idx_n_rank ir = getIndexAndRankFromEncodedHash(encoded_hash_value); + assert_eq(ir.idx,get_index(hash_value, p)); + assert_eq(ir.rank, get_rank(hash_value, p)); +#endif + + // if the sparseList is too large, switch to normal (register) representation + if (this->sparseList.size() > this->m) { // TODO: is the size of m correct? + switchToNormalRepresentation(); + } + } else { + // normal mode + // take first p bits as index {x63,...,x64-p} + uint32_t idx = get_index(hash_value, p); + // shift those p values off, and count leading zeros of the remaining string {x63-p,...,x0} + uint8_t rank = get_rank(hash_value, p); + + // update the register if current rank is bigger + if (rank > this->M[idx]) { + this->M[idx] = rank; + } + } + } + + void add(vector words) { + for(size_t i = 0; i < words.size(); ++i) { + this->add(words[i]); + } + } + + /** + * Reset to its initial state. 
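+ * Clears both the sparse list and the dense registers and switches the
+ * counter back to sparse mode, so the object behaves like a freshly
+ * constructed one. A minimal usage sketch (hypothetical values):
+ *   HyperLogLogPlusMinus<uint64_t> hll(14);   // 2^14 registers
+ *   for (uint64_t i = 0; i < 1000000; ++i) hll.add(i);
+ *   uint64_t est = hll.cardinality(); // std. error ~1.04/sqrt(2^14), i.e. <1%
+ *   hll.reset();                      // empty again, back in sparse mode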
+ */ + void reset() { + this->sparse = true; + this->sparseList.clear(); // + this->M.clear(); + } + + /** + * Convert from sparse representation (using tmpSet and sparseList) to normal (using register) + */ + void switchToNormalRepresentation() { +#ifdef HLL_DEBUG + cerr << "switching to normal representation" << endl; + cerr << " est before: " << cardinality(true) << endl; +#endif + this->sparse = false; + this->M = vector(this->m); + if (sparseList.size() > 0) { //TDOD: do I need to check this, here? + addToRegisters(this->sparseList); + this->sparseList.clear(); + } +#ifdef HLL_DEBUG + cerr << " est after: " << cardinality(true) << endl; +#endif + } + + /** + * add sparseList to the registers of M + */ + void addToRegisters(const SparseListType &sparseList) { + if (sparseList.size() == 0) { + return; + } + for (SparseListType::const_iterator encoded_hash_value_ptr = sparseList.begin(); encoded_hash_value_ptr != sparseList.end(); ++encoded_hash_value_ptr) { + + idx_n_rank ir = getIndexAndRankFromEncodedHash(*encoded_hash_value_ptr); + + assert_lt(ir.idx,M.size()); + if (ir.rank > this->M[ir.idx]) { + this->M[ir.idx] = ir.rank; + } + } + } + + /** + * Merge another HyperLogLogPlusMinus into this. Converts to normal representation + * @param other + */ + void merge(const HyperLogLogPlusMinus* other) { + if (this->p != other->p) { + throw std::invalid_argument("precisions must be equal"); + } + + if (this->sparse && other->sparse) { + if (this->sparseList.size()+other->sparseList.size() > this->m) { + switchToNormalRepresentation(); + addToRegisters(other->sparseList); + } else { + this->sparseList.insert(other->sparseList.begin(),other->sparseList.end()); + } + } else if (other->sparse) { + // other is sparse, but this is not + addToRegisters(other->sparseList); + } else { + if (this->sparse) { + switchToNormalRepresentation(); + } + + // merge registers + for (size_t i = 0; i < other->M.size(); ++i) { + if (other->M[i] > this->M[i]) { + this->M[i] = other->M[i]; + } + } + } + } + + /** + * + * @return cardinality estimate + */ + uint64_t cardinality(bool verbose=true) { + if (sparse) { + // if we are still 'sparse', then use linear counting, which is more + // accurate for low cardinalities, and use increased precision pPrime + return uint64_t(linearCounting(mPrime, mPrime-uint32_t(sparseList.size()))); + } + + // initialize bias correction data + if (rawEstimateData.empty()) { initRawEstimateData(); } + if (biasData.empty()) { initBiasData(); } + + // calculate raw estimate on registers + //double est = alpha(m) * harmonicMean(M, m); + double est = calculateEstimate(M); + + // correct for biases if estimate is smaller than 5m + if (est <= double(m)*5.0) { + est -= getEstimateBias(est); + } + + uint32_t v = countZeros(M); + if (v > 2) { + // calculate linear counting (lc) estimate if there are more than 2 zeros in the matrix + double lc_estimate = linearCounting(m, v); + + // check if the lc estimate is below the threshold + if (lc_estimate <= double(threshold[p-4])) { + if (lc_estimate < 0) { throw; } + // return lc estimate of cardinality + return lc_estimate; + } + return lc_estimate; // always use lc_estimate when available + } + + // return bias-corrected hyperloglog estimate of cardinality + return uint64_t(est); + } + +private: + + uint8_t rank(HashSize x, uint8_t b) { + uint8_t v = 1; + while (v <= b && !(x & 0x80000000)) { + v++; + x <<= 1; + } + return v; + } + + template inline uint32_t get_index(const T hash_value, const uint8_t p, const uint8_t size) const { + // take first 
p bits as index {x63,...,x64-p} + assert_lt(p,size); + uint32_t idx = hash_value >> (size - p); + return idx; + } + + inline uint32_t get_index(const uint64_t hash_value, const uint8_t p) const { + return get_index(hash_value, p, 64); + } + + inline uint32_t get_index(const uint32_t hash_value, const uint8_t p) const { + return get_index(hash_value, p, 32); + } + + template inline + T get_trailing_ones(const uint8_t p) const { + return (T(1) << p ) - 1; + } + + template inline + uint8_t get_rank(const T hash_value, const uint8_t p) const { + // shift p values off, and count leading zeros of the remaining string {x63-p,...,x0} + T_KEY rank_bits = (hash_value << p | get_trailing_ones(p)); +#ifdef HLL_DEBUG + cerr << "rank bits: " << bitset<32>(rank_bits) << endl; +#endif + + uint8_t rank_val = (uint8_t) (clz(rank_bits)) + 1; + assert_leq(rank_val,64-p+1); + return rank_val; + } + + void initRawEstimateData() { + rawEstimateData = vector >(); + + rawEstimateData.push_back(vector(rawEstimateData_precision4,arr_len(rawEstimateData_precision4))); + rawEstimateData.push_back(vector(rawEstimateData_precision5,arr_len(rawEstimateData_precision5))); + rawEstimateData.push_back(vector(rawEstimateData_precision6,arr_len(rawEstimateData_precision6))); + rawEstimateData.push_back(vector(rawEstimateData_precision7,arr_len(rawEstimateData_precision7))); + rawEstimateData.push_back(vector(rawEstimateData_precision8,arr_len(rawEstimateData_precision8))); + rawEstimateData.push_back(vector(rawEstimateData_precision9,arr_len(rawEstimateData_precision9))); + rawEstimateData.push_back(vector(rawEstimateData_precision10,arr_len(rawEstimateData_precision10))); + rawEstimateData.push_back(vector(rawEstimateData_precision11,arr_len(rawEstimateData_precision11))); + rawEstimateData.push_back(vector(rawEstimateData_precision12,arr_len(rawEstimateData_precision12))); + rawEstimateData.push_back(vector(rawEstimateData_precision13,arr_len(rawEstimateData_precision13))); + rawEstimateData.push_back(vector(rawEstimateData_precision14,arr_len(rawEstimateData_precision14))); + rawEstimateData.push_back(vector(rawEstimateData_precision15,arr_len(rawEstimateData_precision15))); + rawEstimateData.push_back(vector(rawEstimateData_precision16,arr_len(rawEstimateData_precision16))); + rawEstimateData.push_back(vector(rawEstimateData_precision17,arr_len(rawEstimateData_precision17))); + rawEstimateData.push_back(vector(rawEstimateData_precision18,arr_len(rawEstimateData_precision18))); + + } + + void initBiasData() { + biasData = vector >(); + + biasData.push_back(vector(biasData_precision4,arr_len(biasData_precision4))); + biasData.push_back(vector(biasData_precision5,arr_len(biasData_precision5))); + biasData.push_back(vector(biasData_precision6,arr_len(biasData_precision6))); + biasData.push_back(vector(biasData_precision7,arr_len(biasData_precision7))); + biasData.push_back(vector(biasData_precision8,arr_len(biasData_precision8))); + biasData.push_back(vector(biasData_precision9,arr_len(biasData_precision9))); + biasData.push_back(vector(biasData_precision10,arr_len(biasData_precision10))); + biasData.push_back(vector(biasData_precision11,arr_len(biasData_precision11))); + biasData.push_back(vector(biasData_precision12,arr_len(biasData_precision12))); + biasData.push_back(vector(biasData_precision13,arr_len(biasData_precision13))); + biasData.push_back(vector(biasData_precision14,arr_len(biasData_precision14))); + biasData.push_back(vector(biasData_precision15,arr_len(biasData_precision15))); + 
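+ // order matters: getEstimateBias() below indexes both tables by (p - 4),
+ // so the push_back sequence must stay aligned with precisions 4..18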
biasData.push_back(vector(biasData_precision16,arr_len(biasData_precision16))); + biasData.push_back(vector(biasData_precision17,arr_len(biasData_precision17))); + biasData.push_back(vector(biasData_precision18,arr_len(biasData_precision18))); + } + + /** + * Estimate the bias using empirically determined values. + * Uses weighted average of the two cells between which the estimate falls. + * TODO: Check if nearest neighbor average gives better values, as proposed in the paper + * @param est + * @return correction value for + */ + double getEstimateBias(double estimate) { + vector rawEstimateTable = rawEstimateData[p-4]; + vector biasTable = biasData[p-4]; + + // check if estimate is lower than first entry, or larger than last + if (rawEstimateTable.front() >= estimate) { return rawEstimateTable.front() - biasTable.front(); } + if (rawEstimateTable.back() <= estimate) { return rawEstimateTable.back() - biasTable.back(); } + + // get iterator to first element that is not smaller than estimate + vector::const_iterator it = lower_bound(rawEstimateTable.begin(),rawEstimateTable.end(),estimate); + size_t pos = it - rawEstimateTable.begin(); + + double e1 = rawEstimateTable[pos-1]; + double e2 = rawEstimateTable[pos]; + + double c = (estimate - e1) / (e2 - e1); + + return biasTable[pos-1]*(1-c) + biasTable[pos]*c; + } + + + /** + * Encode the 64-bit hash code x as an 32-bit integer, to be used in the sparse representation. + * + * Difference from the algorithm described in the paper: + * The index always is in the p most significant bits + * + * see section 5.3 in Heule et al. + * @param x the hash bits + * @return encoded hash value + */ + uint32_t encodeHashIn32Bit(uint64_t hash_value) { + // extract first pPrime bits, and shift them onto a 32-bit integer + uint32_t idx = (uint32_t)(extractBits(hash_value,pPrime) >> 32); + +#ifdef HLL_DEBUG + cerr << "value: " << bitset<64>(hash_value) << endl; + cerr << "index: " << std::bitset<32>(idx) << " ( bits from 64 to " << 64-pPrime << "; " << idx << ")" << endl; +#endif + + // are the bits {63-p, ..., 63-p'} all 0? + if (extractBits(hash_value, 64-this->p, 64-pPrime) == 0) { + // compute the additional rank (minimum rank is already p'-p) + // the maximal size will be below 2^6=64. We thus combine the 25 bits of the index with 6 bits for the rank, and one bit as flag + uint8_t additional_rank = get_rank(hash_value, pPrime); // this is rank - (p'-p), as we know that positions p'...p are 0 + return idx | uint32_t(additional_rank<<1) | 1; + } else { + // else, return the idx, only - it has enough length to calculate the rank (left-shifted, last bit = 0) + assert_eq((idx & 1),0); + return idx; + } + } + + + /** + * struct holding the index and rank/rho of an entry + */ + struct idx_n_rank { + uint32_t idx; + uint8_t rank; + idx_n_rank(uint32_t _idx, uint8_t _rank) : idx(_idx), rank(_rank) {} + }; + + // + // + /** + * Decode a hash from the sparse representation. 
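+ * Encoded layout (cf. encodeHashIn32Bit above): if the flag bit (LSB) is 1,
+ * bits 1..6 carry the additional rank beyond the implied (pPrime - p);
+ * if it is 0, the rank is recomputed from the stored pPrime-bit index.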
+ * Returns the index and number of leading zeros (nlz) with precision p stored in k + * @param k the hash bits + * @return index and rank in non-sparse format + */ + idx_n_rank getIndexAndRankFromEncodedHash(const uint32_t encoded_hash_value) const { + + // difference to paper: Index can be recovered in the same way for pPrime and normally encoded hashes + uint32_t idx = get_index(encoded_hash_value, p); + uint8_t rank_val; + + // check if the last bit is 1 + if ( (encoded_hash_value & 1) == 1) { + // if yes: the hash was stored with higher precision, bits p to pPrime were 0 + uint8_t additional_rank = pPrime - p; + rank_val = additional_rank + extractBits(encoded_hash_value, 7, 1); + } else { + rank_val = get_rank(encoded_hash_value,p); + + // clz counts 64 bit only, it seems + if (rank_val > 32) + rank_val -= 32; + } + + return(idx_n_rank(idx,rank_val)); + } + +}; + + + + +#endif /* HYPERLOGLOGPLUS_H_ */ diff --git a/src/make_seqid_to_taxid_map.cpp b/src/make_seqid_to_taxid_map.cpp index 30b3091..8b968aa 100644 --- a/src/make_seqid_to_taxid_map.cpp +++ b/src/make_seqid_to_taxid_map.cpp @@ -41,12 +41,22 @@ void report_taxo_numbers(char *filename); int main(int argc, char **argv) { if (argc < 3) { - cerr << "Usage: make_seqid_to_taxid_map " + cerr << "Usage: make_seqid_to_taxid_map [ ]\n" + << " If nodes.dmp and names.dmp files are provided, then each sequence header is added with a further link\n" + << " to the taxonomy." << endl; return 1; } char *map_filename = argv[1]; char *list_filename = argv[2]; + + char *nodes_filename; + char *names_filename; + if (argc == 5) { + nodes_filename = argv[3]; + names_filename = argv[4]; + } + fill_request_map(list_filename); report_taxo_numbers(map_filename); @@ -96,8 +106,8 @@ void fill_request_map(char *filename) { fptr_start = fptr = file.ptr(); size_t file_size = file.size(); - // Line format: - // OR: TAXID (user spec'ed) + // Line format: + // OR: TAXID (user spec'ed) while ((size_t)(fptr - fptr_start) < file_size) { char *nl_ptr = strchr(fptr, '\n'); char *sep_ptr = strchr(fptr, '\t'); diff --git a/src/third_party/MurmurHash3.cpp b/src/third_party/MurmurHash3.cpp new file mode 100644 index 0000000..aa7982d --- /dev/null +++ b/src/third_party/MurmurHash3.cpp @@ -0,0 +1,335 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. 
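+// Example call for the x64 128-bit variant (hypothetical buffer and seed;
+// the declarations live in MurmurHash3.h):
+//   uint64_t out[2];
+//   MurmurHash3_x64_128(data_ptr, data_len, /*seed=*/42, out);
+//   // out[0] and out[1] together form the 128-bit hash of the input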
+ +#include "MurmurHash3.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define FORCE_INLINE inline __attribute__((always_inline)) + +inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) +{ + return p[i]; +} + +FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) +{ + return p[i]; +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix32 ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix64 ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock32(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + const uint32_t c1 = 0x239b961b; + const uint32_t c2 = 0xab0e9789; + const uint32_t c3 = 0x38b34ae5; + const uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock32(blocks,i*4+0); + uint32_t k2 = getblock32(blocks,i*4+1); + uint32_t k3 = getblock32(blocks,i*4+2); + uint32_t k4 = getblock32(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = 
ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix32(h1); + h2 = fmix32(h2); + h3 = fmix32(h3); + h4 = fmix32(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(int i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock64(blocks,i*2+0); + uint64_t k2 = getblock64(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= ((uint64_t)tail[14]) << 48; + case 14: k2 ^= ((uint64_t)tail[13]) << 40; + case 13: k2 ^= ((uint64_t)tail[12]) << 32; + case 12: k2 ^= ((uint64_t)tail[11]) << 24; + case 11: k2 ^= ((uint64_t)tail[10]) << 16; + case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; + case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; + case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; + case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; + case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; + case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; + case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; + case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; + case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + 
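+  // finalization: avalanche each 64-bit half with fmix64, then cross-add
+  // once more so both output words depend on every input bit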
+ h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + +//----------------------------------------------------------------------------- + diff --git a/src/third_party/MurmurHash3.h b/src/third_party/MurmurHash3.h new file mode 100644 index 0000000..e1c6d34 --- /dev/null +++ b/src/third_party/MurmurHash3.h @@ -0,0 +1,37 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) && (_MSC_VER < 1600) + +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned __int64 uint64_t; + +// Other compilers + +#else // defined(_MSC_VER) + +#include + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); + +//----------------------------------------------------------------------------- + +#endif // _MURMURHASH3_H_ From 3e5a0090b2ff4997b8b869be537924bb68d1fe13 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 12 Feb 2017 15:19:13 -0500 Subject: [PATCH 011/105] Skip two gi2seqid and seqid2taxid map creation for now --- scripts/build_kraken_db.sh | 43 +++++++++++++----------------------- scripts/report_gi_numbers.pl | 2 ++ 2 files changed, 17 insertions(+), 28 deletions(-) diff --git a/scripts/build_kraken_db.sh b/scripts/build_kraken_db.sh index d0b49a3..5d42fca 100755 --- a/scripts/build_kraken_db.sh +++ b/scripts/build_kraken_db.sh @@ -66,7 +66,7 @@ if [ -e "database.jdb" ] then echo "Skipping step 1, k-mer set already exists." else - echo "Creating k-mer set (step 1 of 6)..." + echo "Creating k-mer set (step 1 of 5)..." start_time1=$(date "+%s.%N") check_for_jellyfish.sh @@ -111,7 +111,7 @@ else then echo "Skipping step 2, database reduction unnecessary." else - echo "Reducing database size (step 2 of 6)..." + echo "Reducing database size (step 2 of 5)..." max_kdb_size=$(echo "$KRAKEN_MAX_DB_SIZE*2^30 - $idx_size" | bc) idx_size_gb=$(printf %.2f $(echo "$idx_size/2^30" | bc) ) if (( $(echo "$max_kdb_size < 0" | bc) == 1 )) @@ -143,7 +143,7 @@ if [ -e "database.kdb" ] then echo "Skipping step 3, k-mer set already sorted." else - echo "Sorting k-mer set (step 3 of 6)..." + echo "Sorting k-mer set (step 3 of 5)..." start_time1=$(date "+%s.%N") db_sort -z $MEMFLAG -t $KRAKEN_THREAD_CT -n $KRAKEN_MINIMIZER_LEN \ -d database.jdb -o database.kdb.tmp \ @@ -155,41 +155,28 @@ else echo "K-mer set sorted. [$(report_time_elapsed $start_time1)]" fi -if [ -e "gi2seqid.map" ] -then - echo "Skipping step 4, GI number to seqID map already complete." -else - echo "Creating GI number to seqID map (step 4 of 6)..." - start_time1=$(date "+%s.%N") - find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \ - report_gi_numbers.pl > gi2seqid.map.tmp - mv gi2seqid.map.tmp gi2seqid.map - - echo "GI number to seqID map created. 
[$(report_time_elapsed $start_time1)]" -fi - if [ -e "seqid2taxid.map" ] then - echo "Skipping step 5, seqID to taxID map already complete." + echo "Skipping step 4, seqID to taxID map already complete." else - echo "Creating seqID to taxID map (step 5 of 6)..." - start_time1=$(date "+%s.%N") - make_seqid_to_taxid_map taxonomy/gi_taxid_nucl.dmp gi2seqid.map \ - > seqid2taxid.map.tmp - mv seqid2taxid.map.tmp seqid2taxid.map - line_ct=$(wc -l seqid2taxid.map | awk '{print $1}') - - echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" + echo "Creating seqID to taxID map (step 4 of 5)... [blu]" +# start_time1=$(date "+%s.%N") +# make_seqid_to_taxid_map taxonomy/gi_taxid_nucl.dmp gi2seqid.map \ +# > seqid2taxid.map.tmp +# mv seqid2taxid.map.tmp seqid2taxid.map +# line_ct=$(wc -l seqid2taxid.map | awk '{print $1}') + +# echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" fi if [ -e "lca.complete" ] then - echo "Skipping step 6, LCAs already set." + echo "Skipping step 5, LCAs already set." else - echo "Setting LCAs in database (step 6 of 6)..." + echo "Setting LCAs in database (step 5 of 5)..." start_time1=$(date "+%s.%N") find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \ - set_lcas $MEMFLAG -x -d database.kdb -i database.idx \ + set_lcas $MEMFLAG -x -d database.kdb -i database.idx -v \ -n taxonomy/nodes.dmp -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 touch "lca.complete" diff --git a/scripts/report_gi_numbers.pl b/scripts/report_gi_numbers.pl index 88a24f0..0d07b85 100755 --- a/scripts/report_gi_numbers.pl +++ b/scripts/report_gi_numbers.pl @@ -38,6 +38,7 @@ next unless /^>(\S+)/; my $seq_id = $1; if ($seq_id =~ /(^|\|)kraken:taxid\|(\d+)/) { + print "TAXID\t$2\t$seq_id\t$_\n"; next; } @@ -45,5 +46,6 @@ if ($seq_id !~ /(^|\|)gi\|(\d+)/) { die "$PROG: sequence ID $seq_id lacks GI number, aborting.\n"; } + print "$2\t$seq_id\t$_\n"; } From 93904c7397b9baac187d346ff477f59e38b3ddad Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 14 Feb 2017 20:54:57 -0500 Subject: [PATCH 012/105] Don't display counts --- src/classify.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/classify.cpp b/src/classify.cpp index 5909a85..9dbe48b 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -58,6 +58,7 @@ bool Print_kraken = true; bool Populate_memory = false; bool Only_classified_kraken_output = false; bool Print_sequence = true; +bool Print_Progress = false; uint32_t Minimum_hit_count = 1; map Parent_map; vector KrakenDatabases; @@ -235,7 +236,8 @@ void process_file(char *filename) { (*Unclassified_output) << unclassified_output_ss.str(); total_sequences += work_unit.size(); total_bases += total_nt; - cerr << "\rProcessed " << total_sequences << " sequences (" << total_bases << " bp) ..."; + if (Print_Progress) + cerr << "\rProcessed " << total_sequences << " sequences (" << total_bases << " bp) ..."; } } } // end parallel section From 905fa088a2355b50491016b5c63291f5a935e533 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 14 Feb 2017 20:55:10 -0500 Subject: [PATCH 013/105] Fix Makefile --- src/Makefile | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/Makefile b/src/Makefile index 6e2c938..48debd2 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,6 @@ CXX = g++ -CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 -fsyntax-only -PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink get_kmers +CXXFLAGS = -Wall 
-std=c++11 -fopenmp -O3 +PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink .PHONY: all install clean @@ -18,8 +18,6 @@ db_sort: krakendb.o quickfile.o set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o -get_kmers: krakendb.o quickfile.o krakenutil.o seqreader.o - classify: krakendb.o quickfile.o krakenutil.o seqreader.o make_seqid_to_taxid_map: quickfile.o From 7b4530441a4a1b7591f76ff0a9ee4a70089b9bba Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 15 Feb 2017 12:50:36 -0500 Subject: [PATCH 014/105] Added taxdb from k-SLAM for writing report after classification --- src/build_taxdb.cpp | 33 ++++ src/taxdb.h | 372 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 405 insertions(+) create mode 100644 src/build_taxdb.cpp create mode 100644 src/taxdb.h diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp new file mode 100644 index 0000000..a802aa2 --- /dev/null +++ b/src/build_taxdb.cpp @@ -0,0 +1,33 @@ +/* + * Copyright 2017, Florian Breitwieser + * + * This file is part of the Kraken taxonomic sequence classification system. + * + * Kraken is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Kraken is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Kraken. If not, see . + */ + +#include "taxdb.h" + +#include +using namespace std; + +int main(int argc, char **argv) { + std::string database_dir = argv[0]; + TaxonomyDB taxdb; + taxdb.writeTaxonomyIndex( + std::cout, + database_dir + "/taxonomy/nodes.dmp", + database_dir + "/taxonomy/names.dmp"); + +} diff --git a/src/taxdb.h b/src/taxdb.h new file mode 100644 index 0000000..da975e3 --- /dev/null +++ b/src/taxdb.h @@ -0,0 +1,372 @@ +/* Original work Copyright 2013 David Ainsworth + * Modified work copyright 2017 Florian Breitwieser + * + * The original file is part of SLAM + * + * SLAM is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * SLAM is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + + * You should have received a copy of the GNU Affero General Public License + * along with SLAM. If not, see . + */ + +#ifndef TAXD_DB_H_ +#define TAXD_DB_H_ + +#include +#include +#include +#include +#include +#include +#include + +void log (const std::string& s) { + std::cerr << s << "\n"; +} + +std::vector tokenise(const std::string &line, const std::string& delimiters) { + std::vector tokens; + // Skip delimiters at beginning. + std::string::size_type lastPos = line.find_first_not_of(delimiters, 0); + std::string::size_type pos = line.find_first_of(delimiters, lastPos); + while (std::string::npos != pos || std::string::npos != lastPos) { + tokens.push_back(line.substr(lastPos, pos - lastPos)); + // Skip delimiters. 
Note the "not_of" + lastPos = line.find_first_not_of(delimiters, pos); + pos = line.find_first_of(delimiters, lastPos); + } + return tokens; +} + +class TaxonomyEntry { + public: + uint32_t taxonomyID = 0; + uint32_t parentTaxonomyID = 0; + std::string rank; + std::string scientificName; + + TaxonomyEntry() {} + TaxonomyEntry(uint32_t taxonomyID_, uint32_t parentTaxonomyID_, std::string rank_, std::string scientificName_) : + taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_), scientificName(scientificName_) {} + + inline bool operator==(const TaxonomyEntry& other) const { + return this->taxonomyID == other.taxonomyID && + this->parentTaxonomyID == other.parentTaxonomyID && + this->scientificName == other.scientificName; + } + TaxonomyEntry* parent = nullptr; + std::vector children; + unsigned numReadsAligned = 0; + unsigned numReadsAlignedToChildren = 0; + bool used = false; + uint64_t genomeSize = 0; + uint64_t genomeSizeOfChildren = 0; + uint64_t numBelow = 0; +}; + +class TaxonomyDB { + public: + TaxonomyDB(const std::string inFileName); + TaxonomyDB() {}; + std::unordered_map taxIDsAndEntries; + void parseNamesDump(const std::string namesDumpFileName); + void parseNodesDump(const std::string nodesDumpFileName); + uint32_t getTaxIDAtRank(const uint32_t taxID, const std::string& rank) const; + std::string getScientificName(const uint32_t taxID) const; + std::string getRank(const uint32_t taxID) const; + uint32_t getLowestCommonAncestor(const std::vector& taxIDs) const; + uint32_t getParentTaxID(const uint32_t taxID) const; + std::string getLineage(uint32_t taxonomyID) const; + std::string getMetaPhlAnLineage(uint32_t taxonomyID) const; + char* getIndexFileName(const uint32_t hostTaxID) const; + void readTaxonomyIndex(const std::string inFileName); + void writeTaxonomyIndex(std::ostream & outs, + const std::string namesDumpFileName, + const std::string nodesDumpFileName); + bool isSubSpecies(uint32_t taxonomyID) const; + int isBelowInTree(uint32_t upper, uint32_t lower) const; + void createPointers(); +}; + + +void TaxonomyDB::createPointers() { + for (auto& tax : taxIDsAndEntries) { + auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); + if (parentIt != taxIDsAndEntries.end()) { + tax.second.parent = &(parentIt->second); + parentIt->second.children.push_back(&tax.second); + } + } +} +TaxonomyDB::TaxonomyDB(const std::string inFileName) { + log("Building taxonomy index"); + readTaxonomyIndex(inFileName); + createPointers(); + log("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + + " nodes"); +} + +void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { + std::ifstream nodesDumpFile(nodesDumpFileName); + if (!nodesDumpFile.is_open()) + throw std::runtime_error("unable to open nodes file"); + std::string line; + while (nodesDumpFile.good()) { + getline(nodesDumpFile, line); + std::vector tokens = tokenise(line, "\t|"); + if (tokens.size() > 2) { + TaxonomyEntry newEntry; + newEntry.taxonomyID = stoi(tokens[0]); + newEntry.parentTaxonomyID = stoi(tokens[1]); + newEntry.rank = tokens[2]; + auto entryIt = taxIDsAndEntries.insert({ + newEntry.taxonomyID, newEntry + }); + if (!entryIt.second) { + entryIt.first->second.taxonomyID = newEntry.taxonomyID; + newEntry.parentTaxonomyID = stoi(tokens[1]); + } + } + } +} + +void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { + std::ifstream namesDumpFile(namesDumpFileName); + if (!namesDumpFile.is_open()) + throw std::runtime_error("unable to open names 
file"); + std::string line; + while (namesDumpFile.good()) { + getline(namesDumpFile, line); + std::vector tokens = tokenise(line, "|"); + for (auto& token : tokens) { + if (token.size() > 1) { + if (token[0] == '\t') token.erase(0, 1); + if (token[token.size() - 1] == '\t') token.erase(token.size() - 1, 1); + } + } + if (tokens.size() > 3) { + TaxonomyEntry newEntry; + newEntry.taxonomyID = stoi(tokens[0]); + // for(auto & token : tokens) + // std::cout<second.scientificName = newEntry.scientificName; + } + } + } +} + +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, + const std::string namesDumpFileName, + const std::string nodesDumpFileName) { + parseNodesDump(nodesDumpFileName); + parseNamesDump(namesDumpFileName); + for (auto& entry : taxIDsAndEntries) { + outs << entry.first << "\t" << entry.second.parentTaxonomyID << "\t" + << entry.second.scientificName << "\t" << entry.second.rank << "\n"; + } +} + +void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { + std::ifstream inFile(inFileName); + if (!inFile.is_open()) + throw std::runtime_error("unable to open taxonomy index file"); + + uint32_t taxonomyID, parentTaxonomyID; + std::string scientificName, rank; + + while (inFile >> taxonomyID >> parentTaxonomyID >> rank >> scientificName) { + TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); + + taxIDsAndEntries.insert({ + taxonomyID, newEntry + }); + } +} + +uint32_t TaxonomyDB::getLowestCommonAncestor( + const std::vector& taxIDs) const { + if (taxIDs.size() == 0) { + return 0; + } + std::vector > paths; + for (auto& taxID : taxIDs) { + bool good = true; + std::vector path; + uint32_t tempTaxID = taxID; + while (tempTaxID != 0) { + path.push_back(tempTaxID); + tempTaxID = getParentTaxID(tempTaxID); + } + if (good) paths.push_back(path); + } + if (paths.size() == 0) { + return 0; + } + for (auto& path : paths) + std::reverse(path.begin(), path.end()); + std::sort(paths.begin(), paths.end(), + [](std::vector i, std::vector j) { + return i.size() < j.size(); + }); + uint32_t consensus = 0; + for (unsigned i = 0; i < paths[0].size(); i++) { + uint32_t temp = 0; + for (auto& path : paths) { + if (temp == 0) + temp = path[i]; + else if (temp != path[i]) { + return consensus; + } + } + consensus = temp; + } + return consensus; +} + +uint32_t TaxonomyDB::getParentTaxID(const uint32_t taxID) const { + auto entry = taxIDsAndEntries.find(taxID); + if (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) + return entry->second.parentTaxonomyID; + else + return 0; +} + +std::string TaxonomyDB::getScientificName(const uint32_t taxID) const { + auto entry = taxIDsAndEntries.find(taxID); + if (entry != taxIDsAndEntries.end()) { + return entry->second.scientificName; + } else + return std::string(); +} + +std::string TaxonomyDB::getRank(const uint32_t taxID) const { + auto entry = taxIDsAndEntries.find(taxID); + if (entry != taxIDsAndEntries.end()) { + return entry->second.rank; + } else + return std::string(); +} + +std::string TaxonomyDB::getLineage(uint32_t taxonomyID) const { + std::string lineage; + while (true) { + // 131567 = Cellular organisms + if (taxonomyID != 131567) { + if (lineage.size()) lineage.insert(0, "; "); + lineage.insert(0, getScientificName(taxonomyID)); + if (getRank(taxonomyID) == "species") lineage.clear(); + } + taxonomyID = getParentTaxID(taxonomyID); + if (taxonomyID == 0) { + if (lineage.size()) lineage.append("."); + break; + } + } + return lineage; +} +std::string TaxonomyDB::getMetaPhlAnLineage(uint32_t 
taxonomyID) const { + std::string rank = getRank(taxonomyID); + if (rank == "superphylum") return std::string(); + std::string lineage; + while (true) { + // 131567 = Cellular organisms + if (taxonomyID != 131567) { + std::string rank = getRank(taxonomyID); + if (rank == "species") { + lineage.insert(0, "|s__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "genus") { + lineage.insert(0, "|g__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "family") { + lineage.insert(0, "|f__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "order") { + lineage.insert(0, "|o__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "class") { + lineage.insert(0, "|c__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "phylum") { + lineage.insert(0, "|p__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "superkingdom") { + lineage.insert(0, "k__"); + lineage.insert(3, getScientificName(taxonomyID)); + } + } + taxonomyID = getParentTaxID(taxonomyID); + if (taxonomyID == 0) { + break; + } + } + std::replace(lineage.begin(), lineage.end(), ' ', '_'); + return lineage; +} + +uint32_t TaxonomyDB::getTaxIDAtRank(const uint32_t taxID, + const std::string& rank) const { + auto entry = taxIDsAndEntries.find(taxID); + while (entry != taxIDsAndEntries.end() && + entry->second.parentTaxonomyID != 1) { + if (entry->second.rank == rank) { + return entry->second.taxonomyID; + } else + entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + } + return 0; +} +int TaxonomyDB::isBelowInTree(uint32_t upper, uint32_t lower) const { + auto entry = taxIDsAndEntries.find(lower); + unsigned level = 0; + while (entry != taxIDsAndEntries.end() && + entry->second.parentTaxonomyID != 1) { + if (entry->first == upper) { + return level; + } else { + entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + level++; + } + } + return -1; +} +bool TaxonomyDB::isSubSpecies(uint32_t taxonomyID) const { + bool isSubSpecies = false; + auto entry = taxIDsAndEntries.find(taxonomyID); + int numLevels = 0; + while (entry != taxIDsAndEntries.end() && + entry->second.parentTaxonomyID != 1) { + if (entry->second.rank == "species") { + if (numLevels > 0) { + isSubSpecies = true; + } + break; + } else + entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + numLevels++; + } + return isSubSpecies; +} + + + + +#endif /* TAXD_DB_H_ */ From 330e186976fe66a55bdfd6e4591c62ca2c8ef45e Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 16 Feb 2017 13:22:13 -0500 Subject: [PATCH 015/105] Build report and taxDB in Kraken --- scripts/build_kraken_db.sh | 10 ++++ scripts/kraken | 2 + src/Makefile | 7 ++- src/build_taxdb.cpp | 9 +-- src/classify.cpp | 47 ++++++++++----- src/hyperloglogplus.h | 10 ++++ src/report-cols.h | 53 +++++++++++++++++ src/taxdb.h | 118 +++++++++++++++++++++++++++++++++++++ 8 files changed, 235 insertions(+), 21 deletions(-) create mode 100644 src/report-cols.h diff --git a/scripts/build_kraken_db.sh b/scripts/build_kraken_db.sh index 5d42fca..4f64c14 100755 --- a/scripts/build_kraken_db.sh +++ b/scripts/build_kraken_db.sh @@ -169,6 +169,16 @@ else # echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" fi +if [ -e "taxDB" ] +then + echo "Skipping step 4.5, taxDB exists." +else + echo "Creating taxDB (step 4.5 of 5)... 
" + build_taxdb taxonomy/nodes.dmp taxonomy/names.dmp > taxDB +fi + + + if [ -e "lca.complete" ] then echo "Skipping step 5, LCAs already set." diff --git a/scripts/kraken b/scripts/kraken index c81ed38..1119868 100755 --- a/scripts/kraken +++ b/scripts/kraken @@ -147,6 +147,8 @@ push @flags, "-C", $classified_out if defined $classified_out; push @flags, "-o", $outfile if defined $outfile; push @flags, "-c", if $only_classified_output; push @flags, "-M" if $preload; +push @flags, "-r", $report_file; +push @flags, "-a", $db_prefix[0]."/taxDB"; # handle piping for decompression/merging my @pipe_argv; diff --git a/src/Makefile b/src/Makefile index 48debd2..98bdd00 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,6 @@ CXX = g++ -CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 -PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink +CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 -fmax-errors=3 +PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb .PHONY: all install clean @@ -22,6 +22,9 @@ classify: krakendb.o quickfile.o krakenutil.o seqreader.o make_seqid_to_taxid_map: quickfile.o +build_taxdb: taxdb.h + $(CXX) $(CXXFLAGS) -o build_taxdb build_taxdb.cpp + krakenutil.o: krakenutil.cpp krakenutil.hpp $(CXX) $(CXXFLAGS) -c krakenutil.cpp diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index a802aa2..08e649a 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -23,11 +23,12 @@ using namespace std; int main(int argc, char **argv) { - std::string database_dir = argv[0]; + if (argc != 3) { + std::cout << "Provide names.dmp and nodes.dmp\n"; + return 1; + } TaxonomyDB taxdb; taxdb.writeTaxonomyIndex( - std::cout, - database_dir + "/taxonomy/nodes.dmp", - database_dir + "/taxonomy/names.dmp"); + std::cout, argv[1], argv[2]); } diff --git a/src/classify.cpp b/src/classify.cpp index 9dbe48b..2f09ff7 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -23,6 +23,7 @@ #include "quickfile.hpp" #include "seqreader.hpp" #include "hyperloglogplus.h" +#include "taxdb.h" const size_t DEF_WORK_UNIT_SIZE = 500000; @@ -38,13 +39,7 @@ string hitlist_string(vector &taxa, vector &ambig); set get_ancestry(uint32_t taxon); void report_stats(struct timeval time1, struct timeval time2); -struct ReadCounts { - uint32_t n_reads; - uint32_t n_kmers; - HyperLogLogPlusMinus kmers; // unique k-mer count per taxon -}; - -map taxon_counts; // stats per taxon +unordered_map taxon_counts; // stats per taxon int Num_threads = 1; vector DB_filenames; @@ -55,6 +50,7 @@ bool Fastq_input = false; bool Print_classified = false; bool Print_unclassified = false; bool Print_kraken = true; +bool Print_kraken_report = true; bool Populate_memory = false; bool Only_classified_kraken_output = false; bool Print_sequence = true; @@ -62,11 +58,13 @@ bool Print_Progress = false; uint32_t Minimum_hit_count = 1; map Parent_map; vector KrakenDatabases; -string Classified_output_file, Unclassified_output_file, Kraken_output_file; +string Classified_output_file, Unclassified_output_file, Kraken_output_file, Report_output_file, TaxDB_file; ostream *Classified_output; ostream *Unclassified_output; ostream *Kraken_output; +ostream *Report_output; size_t Work_unit_size = DEF_WORK_UNIT_SIZE; +TaxonomyDB taxdb; uint64_t total_classified = 0; uint64_t total_sequences = 0; @@ -152,6 +150,18 @@ int main(int argc, char **argv) { else Kraken_output = &cout; + if (Report_output_file.empty() || Report_output_file == "-") { + Print_kraken_report = false; + } else { + Report_output = new 
ofstream(Report_output_file.c_str()); + } + + if (!TaxDB_file.empty() && Print_kraken_report) { + taxdb.readTaxonomyIndex(TaxDB_file); + } else { + Print_kraken_report = false; + } + struct timeval tv1, tv2; gettimeofday(&tv1, NULL); for (int i = optind; i < argc; i++) @@ -242,12 +252,11 @@ void process_file(char *filename) { } } // end parallel section - // Write out report - print k-mers and read numbers - for (auto& elem : taxon_counts) { - //elem.first gives you the key (int) - //elem.second gives you the mapped element (vector) - cerr << elem.first << "\t" << elem.second.n_reads << "\t" << - elem.second.n_kmers << "\t" << elem.second.kmers.cardinality() << "\n"; + if (Print_kraken_report) { + // Fill TaxDB with counts + taxdb.fillCounts(taxon_counts); + TaxReport rep = TaxReport(*Report_output, taxdb, false); + rep.printReport("kraken","blu"); } delete reader; @@ -410,7 +419,7 @@ void parse_command_line(int argc, char **argv) { if (argc > 1 && strcmp(argv[1], "-h") == 0) usage(0); - while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:M")) != -1) { + while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:Ma:r:")) != -1) { switch (opt) { case 'd' : DB_filenames.push_back(optarg); @@ -458,6 +467,12 @@ void parse_command_line(int argc, char **argv) { case 'o' : Kraken_output_file = optarg; break; + case 'r' : + Report_output_file = optarg; + break; + case 'a' : + TaxDB_file = optarg; + break; case 'u' : sig = atoll(optarg); if (sig <= 0) @@ -498,6 +513,8 @@ void usage(int exit_code) { << "* -i filename Kraken DB index filename" << endl << " -n filename NCBI Taxonomy nodes file" << endl << " -o filename Output file for Kraken output" << endl + << " -r filename Output file for Kraken report output" << endl + << " -a filename TaxDB" << endl << " -t # Number of threads" << endl << " -u # Thread work unit size (in bp)" << endl << " -q Quick operation" << endl diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h index 33f5dc1..8cd2bdc 100644 --- a/src/hyperloglogplus.h +++ b/src/hyperloglogplus.h @@ -388,6 +388,16 @@ class HyperLogLogPlusMinus { } } + HyperLogLogPlusMinus & operator+=(const HyperLogLogPlusMinus* other) { + merge(other); + return *this; + } + + HyperLogLogPlusMinus & operator+=(const HyperLogLogPlusMinus& other) { + merge(&other); + return *this; + } + /** * * @return cardinality estimate diff --git a/src/report-cols.h b/src/report-cols.h new file mode 100644 index 0000000..7087a82 --- /dev/null +++ b/src/report-cols.h @@ -0,0 +1,53 @@ +/* + * report-cols.h + * Copyright (C) 2017 fbreitwieser + * + * Distributed under terms of the MIT license. 
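+ * Declares the REPORTCOLS column identifiers and the name-to-column
+ * mapping used by the report writer to choose its output fields.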
+ */ + +#ifndef REPORT_COLS_H +#define REPORT_COLS_H + +#include + +enum class REPORTCOLS : uint8_t { + SPACED_NAME, + NAME, + TAX_ID, + TAX_RANK, + DEPTH, + GENOME_SIZE, + NUM_READS, + NUM_READS_CLADE, + NUM_UNIQUE_KMERS, + TOTAL_SCORE, + TOTAL_HIT_LENGTH, + ABUNDANCE, + ABUNDANCE_LEN, + PERCENTAGE +}; + + +static const std::map report_col_name_map = { + {"name", REPORTCOLS::NAME}, + {"spaced_name", REPORTCOLS::SPACED_NAME}, + {"taxID", REPORTCOLS::TAX_ID}, + {"taxRank", REPORTCOLS::TAX_RANK}, + {"depth", REPORTCOLS::DEPTH}, + {"genomeSize", REPORTCOLS::GENOME_SIZE}, + {"numReads", REPORTCOLS::NUM_READS}, + {"numReadsClade", REPORTCOLS::NUM_READS_CLADE}, + {"numUniqueKmers", REPORTCOLS::NUM_UNIQUE_KMERS}, + {"totalHitLen", REPORTCOLS::TOTAL_HIT_LENGTH}, + {"totalScore", REPORTCOLS::TOTAL_SCORE}, + {"abundance", REPORTCOLS::ABUNDANCE}, + {"abundance_len", REPORTCOLS::ABUNDANCE_LEN}, + + {"percent", REPORTCOLS::PERCENTAGE}, + {"taxId", REPORTCOLS::TAX_ID}, + {"reads_clade", REPORTCOLS::NUM_READS_CLADE}, // Change to clade reads! + {"reads_stay", REPORTCOLS::NUM_READS}, // Change to clade reads! + +}; + +#endif /* !REPORT_COLS_H */ diff --git a/src/taxdb.h b/src/taxdb.h index da975e3..da11c96 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -2,6 +2,7 @@ * Modified work copyright 2017 Florian Breitwieser * * The original file is part of SLAM + * The modified file is part of a modified Kraken version * * SLAM is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -27,6 +28,17 @@ #include #include #include +#include "hyperloglogplus.h" +#include "report-cols.h" + +typedef uint32_t TaxId; + +struct ReadCounts { + uint32_t n_reads; + uint32_t n_kmers; + HyperLogLogPlusMinus kmers; // unique k-mer count per taxon +}; + void log (const std::string& s) { std::cerr << s << "\n"; @@ -64,12 +76,15 @@ class TaxonomyEntry { } TaxonomyEntry* parent = nullptr; std::vector children; + unsigned numReadsAligned = 0; unsigned numReadsAlignedToChildren = 0; bool used = false; uint64_t genomeSize = 0; uint64_t genomeSizeOfChildren = 0; uint64_t numBelow = 0; + uint64_t numKmers; + HyperLogLogPlusMinus kmers; }; class TaxonomyDB { @@ -93,7 +108,9 @@ class TaxonomyDB { const std::string nodesDumpFileName); bool isSubSpecies(uint32_t taxonomyID) const; int isBelowInTree(uint32_t upper, uint32_t lower) const; + void fillCounts(const unordered_map& taxon_counts); void createPointers(); + void printReport(); }; @@ -366,6 +383,107 @@ bool TaxonomyDB::isSubSpecies(uint32_t taxonomyID) const { return isSubSpecies; } +void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { + for (auto& elem : taxon_counts) { + TaxonomyEntry* tax = &taxIDsAndEntries.at(elem.first); + tax->numReadsAligned += elem.second.n_reads; + tax->numKmers += elem.second.n_kmers; + tax->kmers += elem.second.kmers; + + while (tax->parent != nullptr) { + tax = tax->parent; + tax->numReadsAlignedToChildren += elem.second.n_reads; + tax->numKmers += elem.second.n_kmers; + tax->kmers += elem.second.kmers; + } + } +} + + +class TaxReport { +private: + std::ostream& _reportOfb; + TaxonomyDB & _taxdb; + std::vector _report_cols; + uint64_t _total_n_reads; + bool _show_zeros; + + void printLine(TaxonomyEntry& tax, unsigned depth); + +public: + TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); + + void printReport(std::string format, std::string rank); + void printReport(TaxonomyEntry& tax, unsigned depth); +}; + +TaxReport::TaxReport(std::ostream& 
reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { + _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; +} + +void TaxReport::printReport(std::string format, std::string rank) { + _total_n_reads = + _taxdb.taxIDsAndEntries.at(0).numReadsAligned + + _taxdb.taxIDsAndEntries.at(0).numReadsAlignedToChildren + + _taxdb.taxIDsAndEntries.at(1).numReadsAligned + + _taxdb.taxIDsAndEntries.at(1).numReadsAlignedToChildren + + _taxdb.taxIDsAndEntries.at(-1).numReadsAligned + + _taxdb.taxIDsAndEntries.at(-1).numReadsAlignedToChildren; // -1 is a magic number in centrifuge for reads not matched to the taxonomy tree + + if (format == "kraken") { + // A: print number of unidentified reads + printReport(_taxdb.taxIDsAndEntries.at(0),0u); + // B: print normal results + printReport(_taxdb.taxIDsAndEntries.at(1),0u); + // C: Print Unclassified stuff + printReport(_taxdb.taxIDsAndEntries.at(-1),0u); + } else { + // print stuff at a certain level .. + //_uid_abundance; + //_taxinfo + + } +} + +void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { + + if (_show_zeros || (tax.numReadsAligned+tax.numReadsAlignedToChildren) > 0) { + printLine(tax, depth); + + for (auto child : tax.children) { + printReport(*child, depth+1); + } + } + +} + +void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { + for (auto& col : _report_cols) { + switch (col) { + case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; + case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; + case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; + case REPORTCOLS::DEPTH: _reportOfb << depth; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << 100*(tax.numReadsAligned + tax.numReadsAlignedToChildren)/_total_n_reads; break; + //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; + //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; + case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (tax.numReadsAligned + tax.numReadsAlignedToChildren); break; + case REPORTCOLS::NUM_READS: _reportOfb << tax.numReadsAligned; break; + case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break; + //case REPORTCOLS::GENOME_SIZE: ; break; + //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; + //case REPORTCOLS::SUM_SCORE: ; break; + case REPORTCOLS::TAX_RANK: _reportOfb << tax.rank; break; + default: _reportOfb << "NA"; + } + if (&col == &_report_cols.back()) { + _reportOfb << '\n'; + } else { + _reportOfb << '\t'; + } + } +} + From c60b8a2d4e9d628802a2377332896fbd700b88db Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sat, 18 Feb 2017 10:19:03 -0500 Subject: [PATCH 016/105] Allow compressed output, and fix report file generation --- install_kraken.sh | 1 + scripts/kraken | 9 +- src/Makefile | 6 +- src/classify.cpp | 136 ++++++--- src/gzstream/.Makefile.swp | Bin 0 -> 12288 bytes src/gzstream/COPYING.LIB | 504 ++++++++++++++++++++++++++++++++ src/gzstream/Makefile | 88 ++++++ src/gzstream/README | 6 + src/gzstream/gzstream.C | 165 +++++++++++ src/gzstream/gzstream.h | 121 ++++++++ src/gzstream/index.html | 145 +++++++++ src/gzstream/libgzstream.a | Bin 0 -> 14254 bytes src/gzstream/logo.gif | Bin 0 -> 1651 bytes src/gzstream/test_gunzip.C | 78 +++++ src/gzstream/test_gzip.C | 78 +++++ src/gzstream/version | 1 + src/krakenutil.cpp | 14 +- src/krakenutil.hpp | 9 +- src/make_seqid_to_taxid_map.cpp | 12 +- src/report-cols.h | 1 + src/set_lcas.cpp | 5 +- src/taxdb.h | 52 +++- 22 files changed, 1352 insertions(+), 79 deletions(-) create mode 100644 src/gzstream/.Makefile.swp create mode 100644 src/gzstream/COPYING.LIB create mode 100644 src/gzstream/Makefile create mode 100644 src/gzstream/README create mode 100644 src/gzstream/gzstream.C create mode 100644 src/gzstream/gzstream.h create mode 100644 src/gzstream/index.html create mode 100644 src/gzstream/libgzstream.a create mode 100644 src/gzstream/logo.gif create mode 100644 src/gzstream/test_gunzip.C create mode 100644 src/gzstream/test_gzip.C create mode 100644 src/gzstream/version diff --git a/install_kraken.sh b/install_kraken.sh index f0673a2..e7af3d7 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -39,6 +39,7 @@ fi export KRAKEN_DIR=$(perl -MCwd=abs_path -le 'print abs_path(shift)' "$1") mkdir -p "$KRAKEN_DIR" +make -C src clean make -C src install for file in scripts/* do diff --git a/scripts/kraken b/scripts/kraken index 1119868..29cce0d 100755 --- a/scripts/kraken +++ b/scripts/kraken @@ -57,6 +57,7 @@ my $unclassified_out; my $classified_out; my $outfile; my $report_file; +my $print_sequence = 0; GetOptions( "help" => \&display_help, @@ -69,6 +70,7 @@ GetOptions( "min-hits=i" => \$min_hits, "unclassified-out=s" => \$unclassified_out, "classified-out=s" => \$classified_out, + "print-sequence=s" => \$print_sequence, "output=s" => \$outfile, "report-file=s" => \$report_file, "preload" => \$preload, @@ -77,7 +79,7 @@ GetOptions( "gzip-compressed" => \$gunzip, "bzip2-compressed" => \$bunzip2, "only-classified-output" => \$only_classified_output, -); +) or die $!; if (! 
defined $threads) { $threads = $ENV{"KRAKEN_NUM_THREADS"} || 1; @@ -147,8 +149,9 @@ push @flags, "-C", $classified_out if defined $classified_out; push @flags, "-o", $outfile if defined $outfile; push @flags, "-c", if $only_classified_output; push @flags, "-M" if $preload; -push @flags, "-r", $report_file; +push @flags, "-r", $report_file if defined $report_file; push @flags, "-a", $db_prefix[0]."/taxDB"; +push @flags, "-s" if $print_sequence; # handle piping for decompression/merging my @pipe_argv; @@ -197,7 +200,7 @@ if (@pipe_argv) { } } -print STDERR "$CLASSIFY, @flags, @ARGV\n"; +print STDERR "$CLASSIFY @flags @ARGV\n"; exec $CLASSIFY, @flags, @ARGV; die "$PROG: exec error: $!\n"; diff --git a/src/Makefile b/src/Makefile index 98bdd00..73b6b9c 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,7 @@ CXX = g++ -CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 -fmax-errors=3 +CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 -fmax-errors=3 -g PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb +LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream .PHONY: all install clean @@ -18,7 +19,8 @@ db_sort: krakendb.o quickfile.o set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o -classify: krakendb.o quickfile.o krakenutil.o seqreader.o +classify: krakendb.o quickfile.o krakenutil.o seqreader.o taxdb.h + $(CXX) $(CXXFLAGS) -o classify classify.cpp $^ $(LIBFLAGS) make_seqid_to_taxid_map: quickfile.o diff --git a/src/classify.cpp b/src/classify.cpp index 2f09ff7..9f7933e 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -24,6 +24,7 @@ #include "seqreader.hpp" #include "hyperloglogplus.h" #include "taxdb.h" +#include "gzstream.h" const size_t DEF_WORK_UNIT_SIZE = 500000; @@ -38,7 +39,6 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, string hitlist_string(vector &taxa, vector &ambig); set get_ancestry(uint32_t taxon); void report_stats(struct timeval time1, struct timeval time2); - unordered_map taxon_counts; // stats per taxon int Num_threads = 1; @@ -53,16 +53,18 @@ bool Print_kraken = true; bool Print_kraken_report = true; bool Populate_memory = false; bool Only_classified_kraken_output = false; -bool Print_sequence = true; -bool Print_Progress = false; +bool Print_sequence = false; +bool Print_Progress = true; uint32_t Minimum_hit_count = 1; -map Parent_map; +unordered_map Parent_map; vector KrakenDatabases; string Classified_output_file, Unclassified_output_file, Kraken_output_file, Report_output_file, TaxDB_file; ostream *Classified_output; ostream *Unclassified_output; ostream *Kraken_output; ostream *Report_output; +vector Open_fstreams; +vector Open_gzstreams; size_t Work_unit_size = DEF_WORK_UNIT_SIZE; TaxonomyDB taxdb; @@ -70,6 +72,29 @@ uint64_t total_classified = 0; uint64_t total_sequences = 0; uint64_t total_bases = 0; +inline bool ends_with(std::string const & value, std::string const & ending) +{ + if (ending.size() > value.size()) return false; + return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); +} + +ostream* cout_or_file(string file) { + if (file == "-") + return &cout; + + if (ends_with(file, ".gz")) { + ogzstream* ogzs = new ogzstream(file.c_str()); + Open_gzstreams.push_back(ogzs); + return ogzs; + } else { + ofstream* ofs = new ofstream(file.c_str()); + Open_fstreams.push_back(ofs); + return ofs; + } +} + + + void loadKrakenDB(KrakenDB& database, string DB_filename, string Index_filename) { QuickFile db_file; db_file.open_file(DB_filename); @@ -92,9 +117,21 @@ int main(int argc, char **argv) { #endif 
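  // Illustrative sketch, not part of this patch: the code just below derives
  // Parent_map from the taxDB file instead of NCBI nodes.dmp. A parent map
  // like this is what the LCA helpers in krakenutil consume; a minimal,
  // hedged version of such a lookup (semantics per krakenutil.hpp:
  // LCA(0,x) = x, default ancestor is the root, taxon 1) could look like:
  //
  //   #include <cstdint>
  //   #include <set>
  //   #include <unordered_map>
  //
  //   uint32_t lca_sketch(const std::unordered_map<uint32_t, uint32_t>& parents,
  //                       uint32_t a, uint32_t b) {
  //     if (a == 0 || b == 0)
  //       return a ? a : b;           // LCA(0,x) = LCA(x,0) = x
  //     std::set<uint32_t> a_path;    // ancestors of a, up to the root
  //     while (a > 1) {
  //       a_path.insert(a);
  //       a = parents.at(a);          // assumes every taxon has a parent entry
  //     }
  //     while (b > 1) {               // first ancestor of b on a's path is the LCA
  //       if (a_path.count(b)) return b;
  //       b = parents.at(b);
  //     }
  //     return 1;                     // fall back to the root
  //   }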
parse_command_line(argc, argv); - if (! Nodes_filename.empty()) { - cerr << "Building parent node map " << endl; - Parent_map = build_parent_map(Nodes_filename); + //if (! Nodes_filename.empty()) { + // cerr << "Building parent node map " << endl; + // Parent_map = build_parent_map(Nodes_filename); + //} + + if (!TaxDB_file.empty()) { + taxdb = TaxonomyDB(TaxDB_file); + for (const auto & tax : taxdb.taxIDsAndEntries) { + if (tax.first != 0) + Parent_map[tax.first] = tax.second.parentTaxonomyID; + } + Parent_map[1] = 0; + } else { + cerr << "TaxDB argument is required!" << endl; + return 1; } if (Populate_memory) @@ -102,7 +139,7 @@ int main(int argc, char **argv) { // TODO: Check DB_filenames and Index_filesnames have the same length for (size_t i=0; i < DB_filenames.size(); ++i) { - cerr << "\t " << DB_filenames[i] << endl; + //cerr << "\t " << DB_filenames[i] << endl; static QuickFile db_file; db_file.open_file(DB_filenames[i]); if (Populate_memory) @@ -128,46 +165,54 @@ int main(int argc, char **argv) { cerr << "\ncomplete." << endl; if (Print_classified) { - if (Classified_output_file == "-") - Classified_output = &cout; - else - Classified_output = new ofstream(Classified_output_file.c_str()); + Classified_output = cout_or_file(Classified_output_file); } if (Print_unclassified) { - if (Unclassified_output_file == "-") - Unclassified_output = &cout; - else - Unclassified_output = new ofstream(Unclassified_output_file.c_str()); + Unclassified_output = cout_or_file(Unclassified_output_file); } if (! Kraken_output_file.empty()) { - if (Kraken_output_file == "-") + if (Kraken_output_file == "off") Print_kraken = false; - else - Kraken_output = new ofstream(Kraken_output_file.c_str()); - } - else - Kraken_output = &cout; - - if (Report_output_file.empty() || Report_output_file == "-") { - Print_kraken_report = false; + else { + cerr << "Writing Kraken output to " << Kraken_output_file << endl; + Kraken_output = cout_or_file(Kraken_output_file); + } } else { - Report_output = new ofstream(Report_output_file.c_str()); + Kraken_output = &cout; } - if (!TaxDB_file.empty() && Print_kraken_report) { - taxdb.readTaxonomyIndex(TaxDB_file); - } else { - Print_kraken_report = false; + if (!Report_output_file.empty()) { + Print_kraken_report = true; + cerr << "Writing Kraken report output to " << Report_output_file << endl; + Report_output = cout_or_file(Report_output_file); } + cerr << "Print_kraken: " << Print_kraken << "; Print_kraken_report: " << Print_kraken_report << "; k: " << uint32_t(KrakenDatabases[0]->get_k()) << endl; + struct timeval tv1, tv2; gettimeofday(&tv1, NULL); for (int i = optind; i < argc; i++) process_file(argv[i]); gettimeofday(&tv2, NULL); + std::cerr << "Finishing up ..\n"; + + if (Print_kraken_report) { + taxdb.fillCounts(taxon_counts); + TaxReport rep = TaxReport(*Report_output, taxdb, false); + rep.printReport("kraken","blu"); + } + + for (ofstream* ofs : Open_fstreams) { + ofs->close(); + } + for (ogzstream* ogzs : Open_gzstreams) { + ogzs->close(); + } + + report_stats(tv1, tv2); return 0; @@ -198,7 +243,6 @@ void report_stats(struct timeval time1, struct timeval time2) { } void process_file(char *filename) { - cerr << "k: " << uint32_t(KrakenDatabases[0]->get_k()) << endl; string file_str(filename); DNASequenceReader *reader; DNASequence dna; @@ -246,19 +290,12 @@ void process_file(char *filename) { (*Unclassified_output) << unclassified_output_ss.str(); total_sequences += work_unit.size(); total_bases += total_nt; - if (Print_Progress) + if (Print_Progress && 
total_sequences % 100000 < work_unit.size()) cerr << "\rProcessed " << total_sequences << " sequences (" << total_bases << " bp) ..."; } } } // end parallel section - if (Print_kraken_report) { - // Fill TaxDB with counts - taxdb.fillCounts(taxon_counts); - TaxReport rep = TaxReport(*Report_output, taxdb, false); - rep.printReport("kraken","blu"); - } - delete reader; } @@ -275,7 +312,7 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss) { vector taxa; vector ambig_list; - map hit_counts; + unordered_map hit_counts; uint64_t *kmer_ptr; uint32_t taxon = 0; uint32_t hits = 0; // only maintained if in quick mode @@ -297,9 +334,13 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, if (taxon) break; } + #pragma omp critical + { + taxon_counts[taxon].kmers.add(*kmer_ptr); + ++taxon_counts[taxon].n_kmers; + } + if (taxon) { - taxon_counts[taxon].kmers.add(*kmer_ptr); - ++taxon_counts[taxon].n_kmers; hit_counts[taxon]++; if (Quick_mode && ++hits >= Minimum_hit_count) break; @@ -318,7 +359,9 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, if (call) #pragma omp atomic total_classified++; - ++(taxon_counts[call].n_reads); + + #pragma omp critical + ++(taxon_counts[call].n_reads); if (Print_unclassified || Print_classified) { ostringstream *oss_ptr = call ? &coss : &uoss; @@ -419,7 +462,7 @@ void parse_command_line(int argc, char **argv) { if (argc > 1 && strcmp(argv[1], "-h") == 0) usage(0); - while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:Ma:r:")) != -1) { + while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:Ma:r:s")) != -1) { switch (opt) { case 'd' : DB_filenames.push_back(optarg); @@ -470,6 +513,9 @@ void parse_command_line(int argc, char **argv) { case 'r' : Report_output_file = optarg; break; + case 's' : + Print_sequence = true; + break; case 'a' : TaxDB_file = optarg; break; @@ -524,7 +570,7 @@ void usage(int exit_code) { << " -f Input is in FASTQ format" << endl << " -c Only include classified reads in output" << endl << " -M Preload database files" << endl - << " -s Print sequence in Kraken output" << endl + << " -s Print read sequence in Kraken output" << endl << " -h Print this message" << endl << endl << "At least one FASTA or FASTQ file must be specified." << endl diff --git a/src/gzstream/.Makefile.swp b/src/gzstream/.Makefile.swp new file mode 100644 index 0000000000000000000000000000000000000000..f5e077d91d7cee7748a0835509de1d24b28f23d8 GIT binary patch literal 12288 zcmeHNJ8UDz86M|x$=PudpUb%TZ5`m#nOu^RFXy8m1AElt2vZV4QaQ=ZVYoXa2e;ha z%*=|Sj6;BwhJh3|V7N%<1bG2*nj}b?BzBP^RXUeOiU0{x1Syg--z>S*O1XotP~%`1 z`oPuB%s2o4{g0)f=~uti+@YJ!%M91o8GHAY=U#p5(wkR4{xM^Nz7o<*q|idy;YVG8 zjKW8a<)2ryr?~M()k>bZG8TppMHr2xr^~U7MIb{_(UZ!3#qW!-qLo(}o@k>4A6Itp zXdnZT=Pm9F1quZoM}eo=>PoF@XD+?)JUx45|8W)-a|#6t1quZU1quZU1quZU1quZU z1^!PIF!2-YJMiUmbAJxzpHH0ooc~&U6bcjy6bcjy6bcjy6bcjy6bcjy6bcjy6bcjy zd(MU|JmRFzxWzs5~u=Y;2&RQ>;vH2KnYj|{`xdyKLq+f6(|FL z`wC<40|&rm;1clnrx^Pq@E+g;9Jm5J1N`Ah#=Z}{4qO5Le3`L#fK6Zn`1h9?`z`Pr z;77olz!W$JmVtkLiLpNc-vthUXMv9{G4>(wGvKGdTYv=a0JP4@1 zST}o?KDhcu-@_Bu~}Y8dVwMr$Wz zq_ZjiOX+qVob!4iK*Y{W2UTupVGgWnw`bl^**oVrtjc-lH1hspEHf_k-)Wrkk51iv z6#vYjMTi#Mv*#JJ{q1piPrW-eq``7ea{`>>T%zmOuhX9MnnOJ#4Gm-{^+@=n6VDS` z50W65I(AZ;YQq!9jALhbf{pAQyX?!sfEs)4!{+X`(`w#c3^eh&q7)M7! 
z!yU5g#MKI5G!2)qR~s+{u{cGFh?(Q_fIKivuo0(sCRJf>dI+GSSRy>qP2#f*y40 zT^*|~P~$RVZYNcRXq~tsPqBH2vJc8Fhsr@#2e>_9S;^bcCPw!dL))8CtkdwNmvFm8 zgGf0pKEjus3VPCkKB+(Q#1)F<0T&yB#F>*@wR>$Fm> z)+)7XrTQ|}Zq%!{>MO6%)!g)1O<;PqzD_k~HJ!3$b6|1RX>ph~HSz+#->k!)ttbg$ zIUFcaGxVGi-MEE!RoZjfHq<5RCP-OYU9FbrcBGA+x#JR5S8BCdxwcxpNwnW}!PSzU z2$2Fjh>*JzX-2a|ocG=}k^>4OQ=*Boj7TP116xW6>P7=I;YxtzFiJ>|q9pKX#E;PV zgeOp2K5-OAJf&!mF5BMSrxrS_P_!)&3La29>7!aeVQqQX(85C7fyR*%ZZ0~p4d)-Zks0-t*7cd7L z{lw7SX7A44evjPUL%QpBI-opUOVx-lDvm^$F0gG~!Yg>KcxbF&An4h`o$E!o^(9VJ zOVDB_3f}HCiekGZ+NMM`OLPrXcA^EoOO(KXpRDbzIJHczd1y-3t7}-AR%R62xG$qq zO({Uus%)i|V-ZYEvDG#G+R~qzuxdc1nhT6T#VcwI>( lygkv5w{6<&kh + Copyright (C) + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + , 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! + + diff --git a/src/gzstream/Makefile b/src/gzstream/Makefile new file mode 100644 index 0000000..9884a9e --- /dev/null +++ b/src/gzstream/Makefile @@ -0,0 +1,88 @@ +# ============================================================================ +# gzstream, C++ iostream classes wrapping the zlib compression library. +# Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# ============================================================================ +# +# File : Makefile +# Revision : $Revision: 1.3 $ +# Revision_date : $Date: 2001/10/04 15:09:28 $ +# Author(s) : Deepak Bandyopadhyay, Lutz Kettner +# +# ============================================================================ + +# ---------------------------------------------------------------------------- +# adapt these settings to your need: +# add '-DGZSTREAM_NAMESPACE=name' to CPPFLAGS to place the classes +# in its own namespace. 
Note, this macro needs to be set while creating +# the library as well while compiling applications based on it. +# As an alternative, gzstream.C and gzstream.h can be edited. +# ---------------------------------------------------------------------------- + +# CXX = CC -n32 -LANG:std # for SGI Irix 6.5, MIPSpro CC version 7.30 +CXX = g++ # for Linux RedHat 6.1, g++ version 2.95.2 + +CPPFLAGS = -I. -O +LDFLAGS = -L. -lgzstream -lz +AR = ar cr + +# ---------------------------------------------------------------------------- +# plain simple rules to make and cleanup the library: +# make default; compiles the library +# make test; compiles and executes test. O.K. message marks success. +# make clean; removes temporary files +# make cleanall; removes temporary files, the library, and programs +# ---------------------------------------------------------------------------- + +default: libgzstream.a + +test: test_gzip test_gunzip + ./test_gzip COPYING.LIB gz.tmp.gz + gunzip gz.tmp.gz + diff COPYING.LIB gz.tmp + gzip gz.tmp + ./test_gunzip gz.tmp.gz gz.tmp + diff COPYING.LIB gz.tmp + rm gz.tmp.gz gz.tmp + # *** O.K. Test finished successfully. *** + +gzstream.o : gzstream.C gzstream.h + ${CXX} ${CPPFLAGS} -c -o gzstream.o gzstream.C + +test_gzip.o : test_gzip.C gzstream.h + ${CXX} ${CPPFLAGS} -c -o test_gzip.o test_gzip.C + +test_gunzip.o : test_gunzip.C gzstream.h + ${CXX} ${CPPFLAGS} -c -o test_gunzip.o test_gunzip.C + +libgzstream.a : gzstream.o + ${AR} libgzstream.a gzstream.o + +test_gzip : test_gzip.o libgzstream.a + ${CXX} -o test_gzip test_gzip.o ${LDFLAGS} + +test_gunzip : test_gunzip.o libgzstream.a + ${CXX} -o test_gunzip test_gunzip.o ${LDFLAGS} + +clean : + rm *.o + +cleanall : + rm *.o libgzstream.a test_gzip test_gunzip + +# ============================================================================ +# EOF + diff --git a/src/gzstream/README b/src/gzstream/README new file mode 100644 index 0000000..5fb78b2 --- /dev/null +++ b/src/gzstream/README @@ -0,0 +1,6 @@ + + gzstream + C++ iostream classes wrapping the zlib compression library. +=========================================================================== + + See index.html for documentation and installation instructions. diff --git a/src/gzstream/gzstream.C b/src/gzstream/gzstream.C new file mode 100644 index 0000000..8cb4590 --- /dev/null +++ b/src/gzstream/gzstream.C @@ -0,0 +1,165 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.C +// Revision : $Revision: 1.7 $ +// Revision_date : $Date: 2003/01/08 14:41:27 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#include +#include +#include // for memcpy + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See header file for user classes. +// ---------------------------------------------------------------------------- + +// -------------------------------------- +// class gzstreambuf: +// -------------------------------------- + +gzstreambuf* gzstreambuf::open( const char* name, int open_mode) { + if ( is_open()) + return (gzstreambuf*)0; + mode = open_mode; + // no append nor read/write mode + if ((mode & std::ios::ate) || (mode & std::ios::app) + || ((mode & std::ios::in) && (mode & std::ios::out))) + return (gzstreambuf*)0; + char fmode[10]; + char* fmodeptr = fmode; + if ( mode & std::ios::in) + *fmodeptr++ = 'r'; + else if ( mode & std::ios::out) + *fmodeptr++ = 'w'; + *fmodeptr++ = 'b'; + *fmodeptr = '\0'; + file = gzopen( name, fmode); + if (file == 0) + return (gzstreambuf*)0; + opened = 1; + return this; +} + +gzstreambuf * gzstreambuf::close() { + if ( is_open()) { + sync(); + opened = 0; + if ( gzclose( file) == Z_OK) + return this; + } + return (gzstreambuf*)0; +} + +int gzstreambuf::underflow() { // used for input buffer only + if ( gptr() && ( gptr() < egptr())) + return * reinterpret_cast( gptr()); + + if ( ! (mode & std::ios::in) || ! opened) + return EOF; + // Josuttis' implementation of inbuf + int n_putback = gptr() - eback(); + if ( n_putback > 4) + n_putback = 4; + memcpy( buffer + (4 - n_putback), gptr() - n_putback, n_putback); + + int num = gzread( file, buffer+4, bufferSize-4); + if (num <= 0) // ERROR or EOF + return EOF; + + // reset buffer pointers + setg( buffer + (4 - n_putback), // beginning of putback area + buffer + 4, // read position + buffer + 4 + num); // end of buffer + + // return next character + return * reinterpret_cast( gptr()); +} + +int gzstreambuf::flush_buffer() { + // Separate the writing of the buffer from overflow() and + // sync() operation. + int w = pptr() - pbase(); + if ( gzwrite( file, pbase(), w) != w) + return EOF; + pbump( -w); + return w; +} + +int gzstreambuf::overflow( int c) { // used for output buffer only + if ( ! ( mode & std::ios::out) || ! opened) + return EOF; + if (c != EOF) { + *pptr() = c; + pbump(1); + } + if ( flush_buffer() == EOF) + return EOF; + return c; +} + +int gzstreambuf::sync() { + // Changed to use flush_buffer() instead of overflow( EOF) + // which caused improper behavior with std::endl and flush(), + // bug reported by Vincent Ricard. 
+ if ( pptr() && pptr() > pbase()) { + if ( flush_buffer() == EOF) + return -1; + } + return 0; +} + +// -------------------------------------- +// class gzstreambase: +// -------------------------------------- + +gzstreambase::gzstreambase( const char* name, int mode) { + init( &buf); + open( name, mode); +} + +gzstreambase::~gzstreambase() { + buf.close(); +} + +void gzstreambase::open( const char* name, int open_mode) { + if ( ! buf.open( name, open_mode)) + clear( rdstate() | std::ios::badbit); +} + +void gzstreambase::close() { + if ( buf.is_open()) + if ( ! buf.close()) + clear( rdstate() | std::ios::badbit); +} + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +// ============================================================================ +// EOF // diff --git a/src/gzstream/gzstream.h b/src/gzstream/gzstream.h new file mode 100644 index 0000000..861653f --- /dev/null +++ b/src/gzstream/gzstream.h @@ -0,0 +1,121 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.h +// Revision : $Revision: 1.5 $ +// Revision_date : $Date: 2002/04/26 23:30:15 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#ifndef GZSTREAM_H +#define GZSTREAM_H 1 + +// standard C++ with new header file names and std:: namespace +#include +#include +#include + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See below for user classes. +// ---------------------------------------------------------------------------- + +class gzstreambuf : public std::streambuf { +private: + static const int bufferSize = 47+256; // size of data buff + // totals 512 bytes under g++ for igzstream at the end. 
+ + gzFile file; // file handle for compressed file + char buffer[bufferSize]; // data buffer + char opened; // open/close state of stream + int mode; // I/O mode + + int flush_buffer(); +public: + gzstreambuf() : opened(0) { + setp( buffer, buffer + (bufferSize-1)); + setg( buffer + 4, // beginning of putback area + buffer + 4, // read position + buffer + 4); // end position + // ASSERT: both input & output capabilities will not be used together + } + int is_open() { return opened; } + gzstreambuf* open( const char* name, int open_mode); + gzstreambuf* close(); + ~gzstreambuf() { close(); } + + virtual int overflow( int c = EOF); + virtual int underflow(); + virtual int sync(); +}; + +class gzstreambase : virtual public std::ios { +protected: + gzstreambuf buf; +public: + gzstreambase() { init(&buf); } + gzstreambase( const char* name, int open_mode); + ~gzstreambase(); + void open( const char* name, int open_mode); + void close(); + gzstreambuf* rdbuf() { return &buf; } +}; + +// ---------------------------------------------------------------------------- +// User classes. Use igzstream and ogzstream analogously to ifstream and +// ofstream respectively. They read and write files based on the gz* +// function interface of the zlib. Files are compatible with gzip compression. +// ---------------------------------------------------------------------------- + +class igzstream : public gzstreambase, public std::istream { +public: + igzstream() : std::istream( &buf) {} + igzstream( const char* name, int open_mode = std::ios::in) + : gzstreambase( name, open_mode), std::istream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::in) { + gzstreambase::open( name, open_mode); + } +}; + +class ogzstream : public gzstreambase, public std::ostream { +public: + ogzstream() : std::ostream( &buf) {} + ogzstream( const char* name, int mode = std::ios::out) + : gzstreambase( name, mode), std::ostream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::out) { + gzstreambase::open( name, open_mode); + } +}; + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +#endif // GZSTREAM_H +// ============================================================================ +// EOF // + diff --git a/src/gzstream/index.html b/src/gzstream/index.html new file mode 100644 index 0000000..8a9ef8e --- /dev/null +++ b/src/gzstream/index.html @@ -0,0 +1,145 @@ + +Gzstream Library Home Page + + + +

+
+Introduction
+
+Gzstream is a small C++ library, basically just a wrapper, that provides the
+functionality of the zlib C-library in a C++ iostream. It is freely available
+under the LGPL license.
+
+Gzstream has been written by Deepak Bandyopadhyay and Lutz Kettner at the
+Computational Geometry Group at UNC Chapel Hill.
+
+Supported Systems
+
+Gzstream requires a standard compliant C++ compiler (we use the new header
+file conventions and the new iostream in the std:: name space) and, of
+course, zlib. We used zlib 1.1.3 so far, but see the zlib home page for why
+you should upgrade to zlib 1.1.4. So, in theory, the provided sources could
+run on many platforms. However, we used only the following few platforms.
+
+  • PC Linux, RedHat 6.1, g++ version 2.95.2
+  • PC Linux, Debian, g++ version 2.95.2 and 3.1
+  • SGI Irix 6.5, MIPSpro CC version 7.30
+
+Installation
+
+Either compile gzstream.C by hand, place it in some library, and move
+gzstream.h into the include search path of your compiler. Or use the provided
+Makefile, adapt its variables, and follow the remarks in the Makefile. Two
+test programs are provided, test_gzip.C and test_gunzip.C. The Makefile
+contains a rule that performs a small test with these programs.
+
+Documentation
+
+The library provides two classes, igzstream and ogzstream, that can be used
+analogously to ifstream and ofstream respectively.
+
+The classes are by default in the global name space. This can be changed by
+setting the macro GZSTREAM_NAMESPACE to the desired name space, e.g., by
+setting the option -DGZSTREAM_NAMESPACE=gz in the Makefile. However, this
+needs to be consistent for both the library compilation and the application
+that uses the library.
+
+What's Missing
+
+  • Seek. The zlib library provides the necessary functionality, but we have
+    not realized that in the wrapper (yet? ;-).
+  • Both streams are based on the same streambuffer. So, they cannot be used
+    to derive an iogzstream class that would allow simultaneous reading and
+    writing to the same file.
+
+Download and Release Notes
+
+  • Gzstream library 1.5 (08 Apr 2003): gzstream.tgz
+    Fixed bug that did not set the state correctly on failure to open or
+    close a file. Fixed bug in the indexing of the write buffer that caused
+    the write buffer to shrink continuously and finally caused wrong results
+    when writing compressed files (only observed on some platforms).
+  • Gzstream library 1.4 (27 Apr 2002):
+    Fixed a bug that stopped stream output after calling flush() or using
+    std::endl.
+  • Gzstream library 1.3 (06 Nov 2001):
+    Fixed unsigned char -- signed char bug. Increased buffer size for better
+    performance.
+  • Gzstream library 1.2 (04 Oct 2001):
+    Initial release as gzstream, renamed from zipstream.
+  • Zipstream library 1.1 (09 Sep 2001):
+    Initial release.
+
+Acknowledgements
+
+Credits for finding bugs and improving this software go to:
+Vincent Ricard, Peter Milley, Peter J. Torelli, and Ares Lagae.
+
+Links
+ The Computational Geometry Group at UNC Chapel Hill, Jan. 08, 2003. +
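The GZSTREAM_NAMESPACE mechanism described in the Documentation section above
can be exercised as follows; a hedged sketch, assuming the library itself was
also compiled with -DGZSTREAM_NAMESPACE=gz:

    #define GZSTREAM_NAMESPACE gz  // must match the flag used to build libgzstream.a
    #include "gzstream.h"

    int main() {
        gz::ogzstream out("namespaced.txt.gz");  // illustrative file name
        out << "42\n";
        out.close();
        return 0;
    }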
+ + + diff --git a/src/gzstream/libgzstream.a b/src/gzstream/libgzstream.a new file mode 100644 index 0000000000000000000000000000000000000000..92861086535726f3d31314ecb9e4bfe8ba8724b8 GIT binary patch literal 14254 zcmdT~4{RJ&dY|>!#3AW=L%8B2Z8e4f4hhb#V;hHmU1#lhOqO6`CqyK<$$GtxSFyd8 zcN{xTOI-F67#X8TTwB!AJFcn(=+s_0wTQwAd@RIwRRuY@F}58rp~R->F^E$w%1YiW-~TMZ*B zs`%Y7+P1flz~O5(P5ZK@RekzJU)CeLW3eMMd3z$08XcG%+CDipn3)(F$xU?~)6~kA z+<0cJt9MsATT#`T9?9i1J~Exn0_@*cW~M^EGv+57C9v7^FqO{`D`+f(L1q83s5REy zV(l)k@pPB*ce=`hWTa^xoLKYF$Yg%lf;bIjCVU)j&mE)SRYs^Ke|#+M+a5PUiqlH0 z%Un-1mpY-mA{J1=x~IQS?TP-L3Z$<*f*9MLN~m9fG}%o}Id|;y5g&_2w;2Z9W8&V{ z27^7etz}!Y81sj(J=vAqSrrPwz;`L6-KUjG64kUwxRj5JY+T!<-70v04a#+JpnQ*0l-@uoOfA84Bc;DV>2E=5z}3;ex6gDwGv}rv=BZ0{Up)<&dG}O(-D7i*Tbh}D zZ3<1;2TkWgb918hZ>phGxLjTL;P=e*xlM5UL2`a@jcIOrFX{a21*p`_`J+j5?nM06 z%^+zy{~^Fr3y&H}CtYhgMuJleKPABNDUlR^&htgC~mQj@@9+ty=Dlx;iwB1GV23s_QHsD4vl~UtA>uw)prH zU{GY|a}c8cbp2UO)Cx>3&y!Abu~3qG4YNXSDPcNqn^IUZFNJ%Yx1{Q%^V0xf$suB$ zoV%e%pSkdF4?qt<+?~%{znmY7^gxm7mp*_oRQNeKj^oHM=W>narccbdlX2&w;CvPP zcl+CuYn=~Vt7UtwIoI77eHGwyJ%A$e zOY0#DNjZiIbfeI5u;Wm%V;{B9`K-tJq{Dg7oc}hCWiN1xg~L_or>31P3}fzWA&Uu& z;Bl&e>~twRTgpD+Wpz}*27PiK79%-H784Y)1Vt=ixa>HxD?3>Ms;pBD;Ov%!4!U^#P zwoUZStC)KoZzW7j53Mvm+%-zDn~QD~=TEe|UAsXH{&G+@jRyMia;P>^bDMT=4Om%` zRxcVW*1lEi0_|XFol!HXMQbc=Ka~F@T zxXWnl-o1B|(bCl3)M_+GW3AD)=50pfp3INQMP7EUs)QUGzpGQ&8MrfutDLhyuW{A%H*oZPfJ#gVG70RDRUkW-Ji3?A_P9s$H zVm;79aQX0>l6MQ#{|+(&Jr9@QRonQDQX6s2+d>0xEeZi)yWpKj(0aU!c0DB8h5QGp zZnfahG;GH>VfP|d)(%^`rWsK64DrPwuicOF%!X@=J2A7f2o|v4zDqnU$YX^g{eWW( z`U#(fyiW|Kq#?0>)1og1gYf@|@E_OiXGza_sR!aSU$rA#|0Irq)72f}#zIYJI9l~? zxDf#85MY22aS!mnPqOzwDr5T;%FgRM!p4&*d%C7094%aHr!(WW1^PxZ0c@Y620s>8pvISt{L7AIZ7Ve~kva}Q4!s9A)NeJn zDWz)V!>vrP?8A_ZaLLcFS3cKO!|IoxTCaYaH>Go%E>gyzU&=t|ADKJpZ2kjYm&4p(;)k^ zB+vUV<|A(ahD*~maB+H)3O6Vf(yrv)02vh%b&W2^1$ryO{=;=&#zU;NP`(u|v@f1= zkV^V9#jeCvTU($4bq2|!ZceIv8&rz+(;6yyrF|SToD?>V$CUG3xWg^IKp?gs_g&QX z<8vA9S1F4Bo!T$^4_E}L?fG3m{rvxw@*8{KIFs>i;u< z`T4(Vl`D89|MA;F$h+*`{b7>t_7FFtw*NhVajQ%1|K~_v`YY9_((Kk-2|G1UsC%-`Q(vH-DeZc9LQ1J7AnEW@18U4q7J=OmW z0ruY~`?ycYN%jBk0C{PskqblKP|9r}@8|zt`uLCgkgER&0_?|$aaOTsDCKyK_~HA%YJVoc{sJ+o{iT*C1MDBhg#{?;Cc-}MyQ=p8D8T-CV(cY(nHVxd>F)>F zKSTD<5FfVRfJ?Ri*JNM%N$gq_7keOD2HgVOfnAVHUnhBjz2p$ddjMsmoW%d0@OfQ(0w$I-BKz3vK*A6pViub`R*g=;z?d9scUO0K|pj(eG0g2 zx_Ba1is<6+gdZNM1Od^-AB@T>T@zoSWfka275M)ldoMu-_dTHGC+{cS+Z8BN1$Oa` zxRP|S()!`}N2QW<4Og~G(!Je)@;Y6^m7tPzF+%+Cn=3&;bTOjKtGYyyx4u-bYd4h& zET?<3nt_njA<}-NLyaHlzB3W^lx?kfqI$(!t6QaO%Ah9+-J3n03hfcdUdP7sOH?%l zso!XOGyp$EdcH&QPf_|Dq<(sy2ORCjDb!*g2(X0!{_g{R6PqtRP)7O@Tz>jL0=$8Z zpIIm)Rf9c0{#XFMBLII0a6f;B1YWZ6kZV>u5uoQWz%dTTAVtEsAmM&bgJy_4-g+tb zEx=3Ky=v`;0XVL|sQ))IuJ>aQPT&BYY>}V!sN^KMZ9*e=Oqv zD)Gn3)f0rjMY!0jg1V;(zfAZn@xMs;G)8O;vh)YA*-yzk@in)z$3Y|0#g~;{Y6&j0Uz2%lGZ;1deaK zF3IQhM!@~z@IU}QBlMKkZTUVtFYwa3jn8MK=K}QnZ2}*G$oou(P zYz{X7E04vr`kXQ;JEJq28BLELhm#j@fM>8mo>@9FgQs^YIJ}b~M`pwko(cx{0FQNK zdfFP#OyqN8sgbOG+&UHo&dO6g{rmdtSnEJ4pG{ldp`xBNfU)#&YQnN7Qdv9SlkVz5 zLw$DJNG_ckp));u$ELDlgVvr*eiHTq)u?&Lc=(YVI>j?M((4{qD%;ikoRdP}=62W1 zY!2KB^sgPNEnv*Zb4>r&M~sflOig6%%4mFHM~$u+7#J~`)I=AAtl8>;=)h&C$iuSm zvtvqLMct@Rm|J?lsOzFN zn9o_msj8UOX#2F7y6SGdI5fovtC%snV^Is_rc(piW3gt)=d)IN zB$dxw_VMwIl^q+(VdMIH{G0AOk-&0U@ofBMfgr@UHGB%vdyt)Xr zPL|GjuGG}_@d5a4GnC4XbcrL5E0t@T&W}%`i@Su6{pt@SO?%LN%@AK!4Uqq(`x|_D zkl{Bmd=CZP9Kfbfd=l}Bo z{NHB$w=@1%82@IbzsUIUT~q3Rm+^07{Qu1Ox&EIrey;x`hT|TVY?u6`#rSZ$Yw2AU z6K?lx4 z|2D?Y<9QF`=ke?b;NKI#{|Li*+%gR3{W8jM&i{K1zZu#g?%xq@JS;=k3ZfoVV-i4Bx@@TqGR*;q&(s1&>2L z5*}hgQrmkM;c9z#Fn&HxPB8vB=#%Yo7|z@K2Mp)!{U+1T=husjpUch9zs&}tY-L5mqpyegkzk$;gb5- 
z5|28r))9{Uw=jOWPOJUc!1y;aevCPi%8&b?NT{Fl-zQKP=W+g`{gnG1=nr2H<$js! zKlhXSFJq2+xc}5Fv4e?qa$lI^_d`KGU*!HfJ|0ZT#Oe{>#c-KLd^cQvdXka}QrVBl zFQ4^T&F%IeKqU1czwA@2;(D5h7ZZc>$lptOOyvFr;&RUp@hIMa5Z@zm?hT(oNQn1| zoO_E?9`U;nD#t;&Eb*P>TEyP#%+d$|Ejgh>(FdAjIVu#&SE}fK)wkfx5WdD@2ILQ_fk$4ZH#2 zSjJ0|9Agsy4pod$D)Bfv0Q9t*h;^)y@|e6pdE}=hP#`Z6j_V#RX#)8N!jT`gggnB1 zYb4~Cu@wN>m;T6o;njpA|7yYL-l7V>+~@67aJkQWSi$8!udU#6pI4swlm5v4+y#YS z?&r#Lev)7A=l%=vOaH$_ZKy^@AjvQHc?|`Z`?;G4mwM#-cu?V&`?eW{Uyie96@IyI L`-+0g_3*y|@rP<5 literal 0 HcmV?d00001 diff --git a/src/gzstream/logo.gif b/src/gzstream/logo.gif new file mode 100644 index 0000000000000000000000000000000000000000..e259089fbb097573bdc3bab9eccc75e548cf7251 GIT binary patch literal 1651 zcmc(e`&Z0)7{}l3jC|8rW;J1BOOZK!%QYqubK-O%w{_WMSe0qa7^NBs*~ZDCEjE{2 zHl>gOFG4Tw%OhPVV@tKAD-u&_wzdM^M3pT{k%k> zsZik)AVhEP=U)C!P_alxGV;{8)O_9@nx=%#X|xC1Ik4l zX~hCW3}#ShHK%|DkAPxOV#W(s2sog`n9}QI)oO&mvM`7_4J>R49&;^F)_@q`a4EG! zVqlE|%PmIojQYx=2{15l(E!*2l1(tNAOsA9bR%L}SQ9)jquCYe02o>z5P+hfo(4lZ z1S65Ghbj7BUPuzb(BSeR=HWpynpq2r`U?zQkmymXhKUB@gQ1X}h|~9E5Qbrb z)Lxx5OZWtY`$1i6qK%>Y1(hs2n2=zY``|hX9Ox4e-HLGnh$-aMS``|>Cn^mWg#aJm z8t|VDVq5M~4&neQ5QEp-!N!2d!KkoN3X>BdQih2)bLH8H#XwLX&v_fUhn0aifr+rP zk^@QQ0C}pQQltb?TC51W?mf>coRiIViN1|XLSSnCaQxmsZ({k{` z!5dKi|I6KiVEF}5W>`^U0!ngjCQ&2YBy}ZcYW5FzS(q76diGZ8sOgGlb*1O-q&dw` z853BRd-tetMS3mou)a@L!>-)msvhn54E9vDL{|dQgTKZFag@2_B+n5zH zl3jbv`Qyyj4*YJv8Yi3i39S`w{&MV1Q{*ci?_S=cd}_aA@SUt?_tid3e`X)7%l2=x8>-l~`3s zc4}CEM|H!v_)~j+wy8Xo^yk!3qxU{KEczpCJH^|2Rl%KQz)#A#R%SXGZ z!hC7%@wWBW+W12UrtIr|9_QXlIm>V9PP!y{+S}!(Jd8HJ3h}P$eJS%#F>Fj+^mSc$ znwR&i@gdX8ww>D$oRR$F3Hx`0U5~vuxaOOk5g98ZG8BGy@)q9K?O1g^XAjIY8P6T^b7OPUS1Z`_CnE~jyv#Tgo1e4K&2%C6 zxWc5lkY~z1R-jclUNjfyRY?oi-K=Wg(V~%_j47YnF*f4W;=qjw%Wh#H{+AWCwVF%%i?!{u9KPFZ;~;r^lM?AYt`F-@Ezu319F((xBU4VT z5zq;xMY=YJD8_EKzPsYLFv+yrLr#n-tR5aWCpvxEFQ+cu@vToNzq{(}=l6QAr>Wz0 zzNw#!?@#XMW!5j4KlEMA; +#include +#include +#include + +int main( int argc, char*argv[]) { + if ( argc != 3) { + std::cerr << "Usage: " << argv[0] <<" \n"; + return EXIT_FAILURE; + } + // check alternate way of opening file + igzstream in2; + in2.open( argv[1]); + if ( ! in2.good()) { + std::cerr << "ERROR: Opening file `" << argv[1] << "' failed.\n"; + return EXIT_FAILURE; + } + in2.close(); + if ( ! in2.good()) { + std::cerr << "ERROR: Closing file `" << argv[1] << "' failed.\n"; + return EXIT_FAILURE; + } + // now use the shorter way with the constructor to open the same file + igzstream in( argv[1]); + if ( ! in.good()) { + std::cerr << "ERROR: Opening file `" << argv[1] << "' failed.\n"; + return EXIT_FAILURE; + } + std::ofstream out( argv[2]); + if ( ! out.good()) { + std::cerr << "ERROR: Opening file `" << argv[2] << "' failed.\n"; + return EXIT_FAILURE; + } + char c; + while ( in.get(c)) + out << c; + in.close(); + out.close(); + if ( ! in.eof()) { + std::cerr << "ERROR: Reading file `" << argv[1] << "' failed.\n"; + return EXIT_FAILURE; + } + if ( ! out.good()) { + std::cerr << "ERROR: Writing file `" << argv[2] << "' failed.\n"; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} + +// ============================================================================ +// EOF diff --git a/src/gzstream/test_gzip.C b/src/gzstream/test_gzip.C new file mode 100644 index 0000000..0c691ae --- /dev/null +++ b/src/gzstream/test_gzip.C @@ -0,0 +1,78 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. 
+// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : test_gzip.C +// Revision : $Revision: 1.3 $ +// Revision_date : $Date: 2001/10/04 15:09:28 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Short test program reading a file, compressing it, and writing it. +// ============================================================================ + +#include +#include +#include +#include + +int main( int argc, char*argv[]) { + if ( argc != 3) { + std::cerr << "Usage: " << argv[0] <<" \n"; + return EXIT_FAILURE; + } + // check alternate way of opening file + ogzstream out2; + out2.open( argv[2]); + if ( ! out2.good()) { + std::cerr << "ERROR: Opening file `" << argv[2] << "' failed.\n"; + return EXIT_FAILURE; + } + out2.close(); + if ( ! out2.good()) { + std::cerr << "ERROR: Closing file `" << argv[2] << "' failed.\n"; + return EXIT_FAILURE; + } + // now use the shorter way with the constructor to open the same file + ogzstream out( argv[2]); + if ( ! out.good()) { + std::cerr << "ERROR: Opening file `" << argv[2] << "' failed.\n"; + return EXIT_FAILURE; + } + std::ifstream in( argv[1]); + if ( ! in.good()) { + std::cerr << "ERROR: Opening file `" << argv[1] << "' failed.\n"; + return EXIT_FAILURE; + } + char c; + while ( in.get(c)) + out << c; + in.close(); + out.close(); + if ( ! in.eof()) { + std::cerr << "ERROR: Reading file `" << argv[1] << "' failed.\n"; + return EXIT_FAILURE; + } + if ( ! 
out.good()) { + std::cerr << "ERROR: Writing file `" << argv[2] << "' failed.\n"; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} + +// ============================================================================ +// EOF diff --git a/src/gzstream/version b/src/gzstream/version new file mode 100644 index 0000000..511137d --- /dev/null +++ b/src/gzstream/version @@ -0,0 +1 @@ +1.5 (08 Jan 2003) diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index 0c424c4..48e54e9 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -23,9 +23,9 @@ using namespace std; namespace kraken { - // Build a node->parent map from NCBI Taxonomy nodes.dmp file - map build_parent_map(string filename) { - map pmap; + // Build a node->parent unordered_map from NCBI Taxonomy nodes.dmp file + unordered_map build_parent_map(string filename) { + unordered_map pmap; uint32_t node_id, parent_id; string line; ifstream ifs(filename.c_str()); @@ -47,7 +47,7 @@ namespace kraken { // Return lowest common ancestor of a and b // LCA(0,x) = LCA(x,0) = x // Default ancestor is 1 (root of tree) - uint32_t lca(map &parent_map, + uint32_t lca(unordered_map &parent_map, uint32_t a, uint32_t b) { if (a == 0 || b == 0) @@ -71,12 +71,12 @@ namespace kraken { // Tree resolution: take all hit taxa (plus ancestors), then // return leaf of highest weighted leaf-to-root path. - uint32_t resolve_tree(map &hit_counts, - map &parent_map) + uint32_t resolve_tree(unordered_map &hit_counts, + unordered_map &parent_map) { set max_taxa; uint32_t max_taxon = 0, max_score = 0; - map::iterator it = hit_counts.begin(); + unordered_map::iterator it = hit_counts.begin(); // Sum each taxon's LTR path while (it != hit_counts.end()) { diff --git a/src/krakenutil.hpp b/src/krakenutil.hpp index 196ee1d..30eb67d 100644 --- a/src/krakenutil.hpp +++ b/src/krakenutil.hpp @@ -21,19 +21,20 @@ #define KRAKENUTIL_HPP #include "kraken_headers.hpp" +#include namespace kraken { // Build a map of node to parent from an NCBI taxonomy nodes.dmp file - std::map build_parent_map(std::string filename); + std::unordered_map build_parent_map(std::string filename); // Return the lowest common ancestor of a and b, according to parent_map // NOTE: LCA(0,x) = LCA(x,0) = x - uint32_t lca(std::map &parent_map, + uint32_t lca(std::unordered_map &parent_map, uint32_t a, uint32_t b); // Resolve classification tree - uint32_t resolve_tree(std::map &hit_counts, - std::map &parent_map); + uint32_t resolve_tree(std::unordered_map &hit_counts, + std::unordered_map &parent_map); class KmerScanner { public: diff --git a/src/make_seqid_to_taxid_map.cpp b/src/make_seqid_to_taxid_map.cpp index 8b968aa..c8a30ed 100644 --- a/src/make_seqid_to_taxid_map.cpp +++ b/src/make_seqid_to_taxid_map.cpp @@ -50,12 +50,12 @@ int main(int argc, char **argv) { char *map_filename = argv[1]; char *list_filename = argv[2]; - char *nodes_filename; - char *names_filename; - if (argc == 5) { - nodes_filename = argv[3]; - names_filename = argv[4]; - } + //char *nodes_filename; + //char *names_filename; + //if (argc == 5) { + // nodes_filename = argv[3]; + // names_filename = argv[4]; + //} fill_request_map(list_filename); report_taxo_numbers(map_filename); diff --git a/src/report-cols.h b/src/report-cols.h index 7087a82..a34a755 100644 --- a/src/report-cols.h +++ b/src/report-cols.h @@ -19,6 +19,7 @@ enum class REPORTCOLS : uint8_t { GENOME_SIZE, NUM_READS, NUM_READS_CLADE, + NUM_KMERS, NUM_UNIQUE_KMERS, TOTAL_SCORE, TOTAL_HIT_LENGTH, diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index a0d601a..4e9d40d 
100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -22,6 +22,7 @@ #include "krakendb.hpp" #include "krakenutil.hpp" #include "seqreader.hpp" +#include #define SKIP_LEN 50000 @@ -45,8 +46,8 @@ bool Allow_extra_kmers = false; bool verbose = false; bool Operate_in_RAM = false; bool One_FASTA_file = false; -map Parent_map; -map ID_to_taxon_map; +unordered_map Parent_map; +unordered_map ID_to_taxon_map; KrakenDB Database; int main(int argc, char **argv) { diff --git a/src/taxdb.h b/src/taxdb.h index da11c96..0d93207 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -34,8 +34,8 @@ typedef uint32_t TaxId; struct ReadCounts { - uint32_t n_reads; - uint32_t n_kmers; + uint64_t n_reads = 0; + uint64_t n_kmers = 0; HyperLogLogPlusMinus kmers; // unique k-mer count per taxon }; @@ -83,10 +83,16 @@ class TaxonomyEntry { uint64_t genomeSize = 0; uint64_t genomeSizeOfChildren = 0; uint64_t numBelow = 0; - uint64_t numKmers; + uint64_t numKmers = 0; HyperLogLogPlusMinus kmers; }; +struct TaxonomyEntryPtr_comp { + bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { + return ((a->numReadsAligned+a->numReadsAlignedToChildren) > (b->numReadsAligned+b->numReadsAlignedToChildren)); + } +}; + class TaxonomyDB { public: TaxonomyDB(const std::string inFileName); @@ -116,12 +122,14 @@ class TaxonomyDB { void TaxonomyDB::createPointers() { for (auto& tax : taxIDsAndEntries) { + if (tax.second.parentTaxonomyID != tax.first) { auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); if (parentIt != taxIDsAndEntries.end()) { tax.second.parent = &(parentIt->second); parentIt->second.children.push_back(&tax.second); } } + } } TaxonomyDB::TaxonomyDB(const std::string inFileName) { log("Building taxonomy index"); @@ -209,13 +217,22 @@ void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { uint32_t taxonomyID, parentTaxonomyID; std::string scientificName, rank; - while (inFile >> taxonomyID >> parentTaxonomyID >> rank >> scientificName) { + std::string line; + while (!inFile.eof()) { + inFile >> taxonomyID >> parentTaxonomyID; + inFile.get(); // read tab + std::getline(inFile, scientificName, '\t'); + std::getline(inFile, rank, '\n'); TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); + //cerr << "inserting " << taxonomyID << ";" << parentTaxonomyID << ";" << rank << ";" << scientificName << endl; taxIDsAndEntries.insert({ taxonomyID, newEntry }); } + taxIDsAndEntries.insert({ + 0, {0, 0, "no rank", "unclassified" } + }); } uint32_t TaxonomyDB::getLowestCommonAncestor( @@ -385,18 +402,28 @@ bool TaxonomyDB::isSubSpecies(uint32_t taxonomyID) const { void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { for (auto& elem : taxon_counts) { + //cerr << "fill: "<< elem.first << endl; TaxonomyEntry* tax = &taxIDsAndEntries.at(elem.first); + //cerr << "fill done: "<< elem.first << endl; tax->numReadsAligned += elem.second.n_reads; tax->numKmers += elem.second.n_kmers; tax->kmers += elem.second.kmers; + //std::cerr << "adding " << elem.second.n_reads << " to " << tax->scientificName << ": "; + while (tax->parent != nullptr) { tax = tax->parent; + //std::cerr << " >> " << tax->scientificName; tax->numReadsAlignedToChildren += elem.second.n_reads; tax->numKmers += elem.second.n_kmers; tax->kmers += elem.second.kmers; } + //std::cerr << endl; } + + for (auto& tax : taxIDsAndEntries) { + std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); + } } @@ -418,7 +445,7 @@ class TaxReport { }; TaxReport::TaxReport(std::ostream& 
reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { - _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; + _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; } void TaxReport::printReport(std::string format, std::string rank) { @@ -426,9 +453,13 @@ void TaxReport::printReport(std::string format, std::string rank) { _taxdb.taxIDsAndEntries.at(0).numReadsAligned + _taxdb.taxIDsAndEntries.at(0).numReadsAlignedToChildren + _taxdb.taxIDsAndEntries.at(1).numReadsAligned + - _taxdb.taxIDsAndEntries.at(1).numReadsAlignedToChildren + - _taxdb.taxIDsAndEntries.at(-1).numReadsAligned + - _taxdb.taxIDsAndEntries.at(-1).numReadsAlignedToChildren; // -1 is a magic number in centrifuge for reads not matched to the taxonomy tree + _taxdb.taxIDsAndEntries.at(1).numReadsAlignedToChildren;// + + //_taxdb.taxIDsAndEntries.at(-1).numReadsAligned + + //_taxdb.taxIDsAndEntries.at(-1).numReadsAlignedToChildren; // -1 is a magic number in centrifuge for reads not matched to the taxonomy tree + if (_total_n_reads == 0) { + std::cerr << "total number of reads is zero - not creating a report!" << endl; + return; + } if (format == "kraken") { // A: print number of unidentified reads @@ -436,7 +467,7 @@ void TaxReport::printReport(std::string format, std::string rank) { // B: print normal results printReport(_taxdb.taxIDsAndEntries.at(1),0u); // C: Print Unclassified stuff - printReport(_taxdb.taxIDsAndEntries.at(-1),0u); + //printReport(_taxdb.taxIDsAndEntries.at(-1),0u); } else { // print stuff at a certain level .. //_uid_abundance; @@ -464,12 +495,13 @@ void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << 100*(tax.numReadsAligned + tax.numReadsAlignedToChildren)/_total_n_reads; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(tax.numReadsAligned + tax.numReadsAlignedToChildren)/_total_n_reads; break; //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (tax.numReadsAligned + tax.numReadsAlignedToChildren); break; case REPORTCOLS::NUM_READS: _reportOfb << tax.numReadsAligned; break; case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break; + case REPORTCOLS::NUM_KMERS: _reportOfb << tax.numKmers; break; //case REPORTCOLS::GENOME_SIZE: ; break; //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; //case REPORTCOLS::SUM_SCORE: ; break; From aa6b1794855939d530a705f6298898ecb040b392 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 20 Feb 2017 13:44:22 -0500 Subject: [PATCH 017/105] Don't use nodes file in classify --- scripts/build_kraken_db.sh | 2 +- scripts/kraken | 7 ------- src/classify.cpp | 13 ------------- src/taxdb.h | 8 ++++++-- 4 files changed, 7 insertions(+), 23 deletions(-) diff --git a/scripts/build_kraken_db.sh b/scripts/build_kraken_db.sh index 4f64c14..a464a73 100755 --- a/scripts/build_kraken_db.sh +++ b/scripts/build_kraken_db.sh @@ -169,7 +169,7 @@ else # echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" fi -if [ -e "taxDB" ] +if [ -s "taxDB" ] then echo "Skipping step 4.5, taxDB exists." else diff --git a/scripts/kraken b/scripts/kraken index 29cce0d..6f2e290 100755 --- a/scripts/kraken +++ b/scripts/kraken @@ -95,12 +95,6 @@ if ($@) { die "$PROG: $@"; } -my $taxonomy = $db_prefix[0]."/taxonomy/nodes.dmp"; -if ($quick) { - undef $taxonomy; # Skip loading nodes file, not needed in quick mode -} - - my @kdb_files = map { "$_/database.kdb" } @db_prefix; my @idx_files = map { "$_/database.idx" } @db_prefix; @@ -140,7 +134,6 @@ my @flags; push @flags, map { ("-d", $_) } @kdb_files; push @flags, map { ("-i", $_) } @idx_files; push @flags, "-t", $threads if $threads > 1; -push @flags, "-n", $taxonomy if defined $taxonomy; push @flags, "-q" if $quick; push @flags, "-m", $min_hits if $min_hits > 1; push @flags, "-f" if $fastq_input && ! $paired; # merger always outputs FASTA diff --git a/src/classify.cpp b/src/classify.cpp index 9f7933e..ef7d616 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -44,7 +44,6 @@ unordered_map taxon_counts; // stats per taxon int Num_threads = 1; vector DB_filenames; vector Index_filenames; -string Nodes_filename; bool Quick_mode = false; bool Fastq_input = false; bool Print_classified = false; @@ -117,10 +116,6 @@ int main(int argc, char **argv) { #endif parse_command_line(argc, argv); - //if (! Nodes_filename.empty()) { - // cerr << "Building parent node map " << endl; - // Parent_map = build_parent_map(Nodes_filename); - //} if (!TaxDB_file.empty()) { taxdb = TaxonomyDB(TaxDB_file); @@ -481,9 +476,6 @@ void parse_command_line(int argc, char **argv) { omp_set_num_threads(Num_threads); #endif break; - case 'n' : - Nodes_filename = optarg; - break; case 'q' : Quick_mode = true; break; @@ -542,10 +534,6 @@ void parse_command_line(int argc, char **argv) { cerr << "Missing mandatory option -i" << endl; usage(); } - if (Nodes_filename.empty() && ! 
Quick_mode) { - cerr << "Must specify one of -q or -n" << endl; - usage(); - } if (optind == argc) { cerr << "No sequence data files specified" << endl; } @@ -557,7 +545,6 @@ void usage(int exit_code) { << "Options: (*mandatory)" << endl << "* -d filename Kraken DB filename" << endl << "* -i filename Kraken DB index filename" << endl - << " -n filename NCBI Taxonomy nodes file" << endl << " -o filename Output file for Kraken output" << endl << " -r filename Output file for Kraken report output" << endl << " -a filename TaxDB" << endl diff --git a/src/taxdb.h b/src/taxdb.h index 0d93207..56bd341 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -402,8 +402,12 @@ bool TaxonomyDB::isSubSpecies(uint32_t taxonomyID) const { void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { for (auto& elem : taxon_counts) { - //cerr << "fill: "<< elem.first << endl; - TaxonomyEntry* tax = &taxIDsAndEntries.at(elem.first); + auto it = taxIDsAndEntries.find(elem.first); + if (it == taxIDsAndEntries.end()) { + cerr << "No taxonomy entry for " << elem.first << "!!" << endl; + continue; + } + TaxonomyEntry* tax = &it->second; //cerr << "fill done: "<< elem.first << endl; tax->numReadsAligned += elem.second.n_reads; tax->numKmers += elem.second.n_kmers; From 0587b5ec5aff3c325c9b3cc158e8127cf6525ed9 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 21 Feb 2017 10:04:55 -0500 Subject: [PATCH 018/105] Added generate-taxonomy-ids-for-sequences and typed TaxonomyDB --- scripts/build_kraken_db.sh | 11 +- scripts/kraken-build | 11 ++ src/build_taxdb.cpp | 2 +- src/classify.cpp | 7 +- src/set_lcas.cpp | 74 ++++++++-- src/taxdb.h | 277 +++++++++++++++++++++++-------------- 6 files changed, 255 insertions(+), 127 deletions(-) diff --git a/scripts/build_kraken_db.sh b/scripts/build_kraken_db.sh index a464a73..6d86bf7 100755 --- a/scripts/build_kraken_db.sh +++ b/scripts/build_kraken_db.sh @@ -57,7 +57,7 @@ else echo "Kraken build set to minimize RAM usage." fi -if [ -n "$KRAKEN_REBUILD_DATABASE" ] +if [ "$KRAKEN_REBUILD_DATABASE" == "1" ] then rm -f database.* *.map lca.complete fi @@ -174,7 +174,7 @@ then echo "Skipping step 4.5, taxDB exists." else echo "Creating taxDB (step 4.5 of 5)... " - build_taxdb taxonomy/nodes.dmp taxonomy/names.dmp > taxDB + build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp > taxDB fi @@ -184,10 +184,15 @@ then echo "Skipping step 5, LCAs already set." else echo "Setting LCAs in database (step 5 of 5)..." + PARAM="" + if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ]]; then + echo " Adding taxonomy IDs for sequences" + PARAM=" -a" + fi start_time1=$(date "+%s.%N") find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \ set_lcas $MEMFLAG -x -d database.kdb -i database.idx -v \ - -n taxonomy/nodes.dmp -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 + -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 touch "lca.complete" echo "Database LCAs set. 
[$(report_time_elapsed $start_time1)]" diff --git a/scripts/kraken-build b/scripts/kraken-build index bf36ae6..7170a67 100755 --- a/scripts/kraken-build +++ b/scripts/kraken-build @@ -63,6 +63,8 @@ my ( $standard, $upgrade, $clean, + + $add_taxonomy_ids_for_seq ); $threads = $DEF_THREAD_CT; @@ -71,6 +73,8 @@ $kmer_len = $DEF_KMER_LEN; $work_on_disk = ""; $hash_size = ""; $max_db_size = ""; +$add_taxonomy_ids_for_seq = 0; +$rebuild = 0; # variables corresponding to task options my @TASK_LIST = ( @@ -108,6 +112,8 @@ GetOptions( "upgrade" => \$upgrade, "standard" => \$standard, "clean" => \$clean, + + "generate-taxonomy-ids-for-sequences" => \$add_taxonomy_ids_for_seq ) or usage(); if (@ARGV) { @@ -235,6 +241,10 @@ Options: (default: 1) --work-on-disk Perform most operations on disk rather than in RAM (will slow down build in most cases) + --generate-taxonomy-ids-for-sequences + Generate taxonomy IDs for sequences, starting with 1000000000. + Can be useful to resolve classifications with multiple genomes + for one taxonomy ID. EOF exit $exit_code; } @@ -284,6 +294,7 @@ sub standard_installation { sub build_database { $ENV{"KRAKEN_REBUILD_DATABASE"} = $rebuild; + $ENV{"KRAKEN_ADD_TAXIDS_FOR_SEQ"} = $add_taxonomy_ids_for_seq; exec "build_kraken_db.sh"; } diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index 08e649a..8e1f11e 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -27,7 +27,7 @@ int main(int argc, char **argv) { std::cout << "Provide names.dmp and nodes.dmp\n"; return 1; } - TaxonomyDB taxdb; + TaxonomyDB taxdb; taxdb.writeTaxonomyIndex( std::cout, argv[1], argv[2]); diff --git a/src/classify.cpp b/src/classify.cpp index ef7d616..611497f 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -27,6 +27,7 @@ #include "gzstream.h" const size_t DEF_WORK_UNIT_SIZE = 500000; +int New_taxid_start = 1000000000; using namespace std; using namespace kraken; @@ -65,7 +66,7 @@ ostream *Report_output; vector Open_fstreams; vector Open_gzstreams; size_t Work_unit_size = DEF_WORK_UNIT_SIZE; -TaxonomyDB taxdb; +TaxonomyDB taxdb; uint64_t total_classified = 0; uint64_t total_sequences = 0; @@ -118,7 +119,7 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); if (!TaxDB_file.empty()) { - taxdb = TaxonomyDB(TaxDB_file); + taxdb = TaxonomyDB(TaxDB_file); for (const auto & tax : taxdb.taxIDsAndEntries) { if (tax.first != 0) Parent_map[tax.first] = tax.second.parentTaxonomyID; @@ -196,7 +197,7 @@ int main(int argc, char **argv) { if (Print_kraken_report) { taxdb.fillCounts(taxon_counts); - TaxReport rep = TaxReport(*Report_output, taxdb, false); + TaxReport rep = TaxReport(*Report_output, taxdb, false); rep.printReport("kraken","blu"); } diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 4e9d40d..73bbf67 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -22,6 +22,7 @@ #include "krakendb.hpp" #include "krakenutil.hpp" #include "seqreader.hpp" +#include "taxdb.h" #include #define SKIP_LEN 50000 @@ -37,18 +38,23 @@ void process_file(string filename, uint32_t taxid); void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish); int Num_threads = 1; -string DB_filename, Index_filename, Nodes_filename, +string DB_filename, Index_filename, TaxDB_filename, File_to_taxon_map_filename, ID_to_taxon_map_filename, Multi_fasta_filename; bool force_taxid = false; +int New_taxid_start = 1000000000; bool Allow_extra_kmers = false; bool verbose = false; bool Operate_in_RAM = false; bool One_FASTA_file = false; +bool Add_taxIds_for_Sequences = false; + unordered_map 
Parent_map;
 unordered_map<string, uint32_t> ID_to_taxon_map;
+unordered_map<string, uint32_t> SeqId_added;
 KrakenDB Database;
+TaxonomyDB<uint32_t> taxdb;
 
 int main(int argc, char **argv) {
   #ifdef _OPENMP
@@ -57,8 +63,16 @@ int main(int argc, char **argv) {
 
   parse_command_line(argc, argv);
 
-  if (!force_taxid) {
-    Parent_map = build_parent_map(Nodes_filename);
+  if (!TaxDB_filename.empty() && !force_taxid) {
+    taxdb = TaxonomyDB<uint32_t>(TaxDB_filename);
+    for (const auto & tax : taxdb.taxIDsAndEntries) {
+      if (tax.first != 0)
+        Parent_map[tax.first] = tax.second.parentTaxonomyID;
+    }
+    Parent_map[1] = 0;
+  } else {
+    cerr << "TaxDB argument is required!" << endl;
+    return 1;
   }
 
   QuickFile db_file(DB_filename, "rw");
@@ -96,25 +110,43 @@ int main(int argc, char **argv) {
     delete temp_ptr;
   }
 
+  if (Add_taxIds_for_Sequences && !TaxDB_filename.empty()) {
+    ofstream ofs(TaxDB_filename.c_str());
+    taxdb.writeTaxonomyIndex(ofs);
+    ofs.close();
+  }
+
   return 0;
 }
 
 void process_single_file() {
-  cerr << "Processing multiple FASTA files" << endl;
+  cerr << "Processing FASTA files" << endl;
   ifstream map_file(ID_to_taxon_map_filename.c_str());
   if (map_file.rdstate() & ifstream::failbit) {
     err(EX_NOINPUT, "can't open %s", ID_to_taxon_map_filename.c_str());
   }
-  string line;
+  string line, seq_id;
+  uint32_t parent_taxid, taxid;
   while (map_file.good()) {
     getline(map_file, line);
     if (line.empty())
       break;
-    string seq_id;
-    uint32_t taxid;
     istringstream iss(line);
     iss >> seq_id;
-    iss >> taxid;
+    if (ID_to_taxon_map.find(seq_id) != ID_to_taxon_map.end())
+      continue;
+
+    if (Add_taxIds_for_Sequences) {
+      iss >> parent_taxid;
+      taxid = ++New_taxid_start;
+      Parent_map[taxid] = parent_taxid;
+      auto itEntry = taxdb.taxIDsAndEntries.insert({taxid, TaxonomyEntry<uint32_t>(taxid, parent_taxid, "sequence")});
+      if (!itEntry.second)
+        cerr << "Taxonomy ID " << taxid << " already in Taxonomy DB? Shouldn't happen - run set_lcas without the -a option." << endl;
+    } else {
+      iss >> taxid;
+    }
     ID_to_taxon_map[seq_id] = taxid;
   }
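A minimal, self-contained sketch of the bookkeeping the hunk above performs: under `-a`, every sequence ID receives a fresh pseudo-taxid counted up from `New_taxid_start` (1000000000 in set_lcas.cpp), and the taxid read from the mapping file becomes its parent node. All names below other than `New_taxid_start`'s value are illustrative stand-ins, not the patch's own identifiers:

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>

int main() {
  uint32_t new_taxid_start = 1000000000;                  // mirrors New_taxid_start
  std::unordered_map<std::string, uint32_t> id_to_taxon;  // seq ID -> pseudo-taxid
  std::unordered_map<uint32_t, uint32_t> parent;          // pseudo-taxid -> parent taxid

  // (sequence ID, taxid from the seqid2taxid map file); example values only.
  const std::pair<std::string, uint32_t> mapping[] = {
      {"KC207814.1", 10376}, {"NC_000913.3", 562}};

  for (const auto &m : mapping) {
    uint32_t taxid = ++new_taxid_start;  // mirrors taxid = ++New_taxid_start
    parent[taxid] = m.second;            // mirrors Parent_map[taxid] = parent_taxid
    id_to_taxon[m.first] = taxid;        // mirrors ID_to_taxon_map[seq_id] = taxid
  }

  for (const auto &e : id_to_taxon)
    std::cout << e.first << " -> " << e.second
              << " (parent " << parent.at(e.second) << ")\n";
}
```

With this scheme every k-mer of a sequence maps to that sequence's own node, so reads can be resolved to an individual genome rather than only to its taxon.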
@@ -142,6 +174,15 @@ void process_single_file() {
     } else {
       taxid = ID_to_taxon_map[dna.id];
     }
+
+    if (Add_taxIds_for_Sequences) {
+      auto entryIt = taxdb.taxIDsAndEntries.find(taxid);
+      if (entryIt == taxdb.taxIDsAndEntries.end()) {
+        cerr << "Error! Didn't find " << taxid << " in TaxonomyDB!!" << endl;
+      } else {
+        entryIt->second.scientificName = dna.header_line;
+      }
+    }
 
     if (taxid) {
       #pragma omp parallel for schedule(dynamic)
@@ -155,6 +196,7 @@ void process_single_file() {
 
       ++seqs_no_taxid;
     }
+
     cerr << "\rProcessed " << seqs_processed << " sequences";
   }
   cerr << "\r                                                       ";
@@ -232,7 +274,7 @@ void parse_command_line(int argc, char **argv) {
   if (argc > 1 && strcmp(argv[1], "-h") == 0)
     usage(0);
 
-  while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTv")) != -1) {
+  while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTvb:a")) != -1) {
     switch (opt) {
       case 'f' :
         File_to_taxon_map_filename = optarg;
@@ -263,18 +305,21 @@ void parse_command_line(int argc, char **argv) {
       case 'T' :
         force_taxid = true;
         break;
-      case 'n' :
-        Nodes_filename = optarg;
-        break;
       case 'v' :
         verbose = true;
         break;
      case 'x' :
        Allow_extra_kmers = true;
        break;
+      case 'a' :
+        Add_taxIds_for_Sequences = true;
+        break;
+      case 'b' :
+        TaxDB_filename = optarg;
+        break;
      case 'M' :
        Operate_in_RAM = true;
        break;
+
      default:
        usage();
        break;
@@ -282,7 +327,7 @@ void parse_command_line(int argc, char **argv) {
   }
   if (DB_filename.empty() || Index_filename.empty() ||
-      Nodes_filename.empty())
+      TaxDB_filename.empty())
     usage();
   if (File_to_taxon_map_filename.empty() &&
     (Multi_fasta_filename.empty() || ID_to_taxon_map_filename.empty()))
     usage();
@@ -300,13 +345,14 @@ void usage(int exit_code) {
        << "Options: (*mandatory)" << endl
        << "* -d filename   Kraken DB filename" << endl
        << "* -i filename   Kraken DB index filename" << endl
-       << "* -n filename   NCBI Taxonomy nodes file" << endl
+       << "* -b filename   Taxonomy DB file" << endl
        << "  -t #          Number of threads" << endl
        << "  -M            Copy DB to RAM during operation" << endl
        << "  -x            K-mers not found in DB do not cause errors" << endl
        << "  -f filename   File to taxon map" << endl
        << "  -F filename   Multi-FASTA file with sequence data" << endl
        << "  -m filename   Sequence ID to taxon map" << endl
+       << "  -a            Add taxonomy IDs (starting with " << New_taxid_start << ") for sequences" << endl;
   exit(exit_code);
 }
diff --git a/src/taxdb.h b/src/taxdb.h
index 56bd341..b4d4093 100644
--- a/src/taxdb.h
+++ b/src/taxdb.h
@@ ... @@
-std::vector<std::string> tokenise(const std::string &line, const std::string& delimiters) {
-  std::vector<std::string> tokens;
-  // Skip delimiters at beginning.
-  std::string::size_type lastPos = line.find_first_not_of(delimiters, 0);
-  std::string::size_type pos = line.find_first_of(delimiters, lastPos);
-  while (std::string::npos != pos || std::string::npos != lastPos) {
-    tokens.push_back(line.substr(lastPos, pos - lastPos));
-    // Skip delimiters.
Note the "not_of" - lastPos = line.find_first_not_of(delimiters, pos); - pos = line.find_first_of(delimiters, lastPos); - } - return tokens; +template +uint64_t string_to_T(string str) { + stringstream stream(str); + T result; + stream >> result; + return result; +} + +std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields, size_t end_chars) { + std::vector tokens(max_fields); + size_t delim_length = delimiter.length(); + size_t last = 0; + size_t i = 0; + + for (size_t next = s.find(delimiter, last); + i < max_fields && next != string::npos; + next = s.find(delimiter, last), ++i) { + tokens[i] = s.substr(last, next-last); + last = next + delim_length; + } + if (i < max_fields) { + tokens[max_fields-1] = s.substr(last, s.length()-last-end_chars); + } + + return tokens; } +template class TaxonomyEntry { public: - uint32_t taxonomyID = 0; - uint32_t parentTaxonomyID = 0; + TAXID taxonomyID = 0; + TAXID parentTaxonomyID = 0; std::string rank; std::string scientificName; TaxonomyEntry() {} - TaxonomyEntry(uint32_t taxonomyID_, uint32_t parentTaxonomyID_, std::string rank_, std::string scientificName_) : + + TaxonomyEntry(TAXID taxonomyID_, std::string scientificName_) : + taxonomyID(taxonomyID_), scientificName(scientificName_) {} + + TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_) : + taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_) {} + + TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_) : taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_), scientificName(scientificName_) {} inline bool operator==(const TaxonomyEntry& other) const { @@ -87,40 +106,45 @@ class TaxonomyEntry { HyperLogLogPlusMinus kmers; }; + +template struct TaxonomyEntryPtr_comp { - bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { + bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { return ((a->numReadsAligned+a->numReadsAlignedToChildren) > (b->numReadsAligned+b->numReadsAlignedToChildren)); } }; +template class TaxonomyDB { public: TaxonomyDB(const std::string inFileName); TaxonomyDB() {}; - std::unordered_map taxIDsAndEntries; + //std::unordered_map seqIDsAndTaxIds; + std::unordered_map > taxIDsAndEntries; void parseNamesDump(const std::string namesDumpFileName); void parseNodesDump(const std::string nodesDumpFileName); - uint32_t getTaxIDAtRank(const uint32_t taxID, const std::string& rank) const; - std::string getScientificName(const uint32_t taxID) const; - std::string getRank(const uint32_t taxID) const; - uint32_t getLowestCommonAncestor(const std::vector& taxIDs) const; - uint32_t getParentTaxID(const uint32_t taxID) const; - std::string getLineage(uint32_t taxonomyID) const; - std::string getMetaPhlAnLineage(uint32_t taxonomyID) const; - char* getIndexFileName(const uint32_t hostTaxID) const; + TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const; + std::string getScientificName(const TAXID taxID) const; + std::string getRank(const TAXID taxID) const; + TAXID getLowestCommonAncestor(const std::vector& taxIDs) const; + TAXID getParentTaxID(const TAXID taxID) const; + std::string getLineage(TAXID taxonomyID) const; + std::string getMetaPhlAnLineage(TAXID taxonomyID) const; + char* getIndexFileName(const TAXID hostTaxID) const; void readTaxonomyIndex(const std::string inFileName); + void writeTaxonomyIndex(std::ostream & outs) const; void writeTaxonomyIndex(std::ostream & outs, const 
std::string namesDumpFileName, const std::string nodesDumpFileName); - bool isSubSpecies(uint32_t taxonomyID) const; - int isBelowInTree(uint32_t upper, uint32_t lower) const; - void fillCounts(const unordered_map& taxon_counts); + bool isSubSpecies(TAXID taxonomyID) const; + int isBelowInTree(TAXID upper, TAXID lower) const; + void fillCounts(const unordered_map& taxon_counts); void createPointers(); void printReport(); }; - -void TaxonomyDB::createPointers() { +template +void TaxonomyDB::createPointers() { for (auto& tax : taxIDsAndEntries) { if (tax.second.parentTaxonomyID != tax.first) { auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); @@ -131,7 +155,9 @@ void TaxonomyDB::createPointers() { } } } -TaxonomyDB::TaxonomyDB(const std::string inFileName) { + +template +TaxonomyDB::TaxonomyDB(const std::string inFileName) { log("Building taxonomy index"); readTaxonomyIndex(inFileName); createPointers(); @@ -139,82 +165,103 @@ TaxonomyDB::TaxonomyDB(const std::string inFileName) { " nodes"); } -void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { +template +void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { std::ifstream nodesDumpFile(nodesDumpFileName); if (!nodesDumpFile.is_open()) throw std::runtime_error("unable to open nodes file"); std::string line; + + TAXID taxonomyID; + TAXID parentTaxonomyID; + std::string rank; + while (nodesDumpFile.good()) { getline(nodesDumpFile, line); - std::vector tokens = tokenise(line, "\t|"); - if (tokens.size() > 2) { - TaxonomyEntry newEntry; - newEntry.taxonomyID = stoi(tokens[0]); - newEntry.parentTaxonomyID = stoi(tokens[1]); - newEntry.rank = tokens[2]; - auto entryIt = taxIDsAndEntries.insert({ - newEntry.taxonomyID, newEntry - }); - if (!entryIt.second) { - entryIt.first->second.taxonomyID = newEntry.taxonomyID; - newEntry.parentTaxonomyID = stoi(tokens[1]); - } + std::vector tokens = tokenise(line, "\t|\t", 3, 2); + if (tokens.size() < 3) { + continue; + } + + taxonomyID = string_to_T(tokens[0]); + parentTaxonomyID = string_to_T(tokens[1]); + rank = tokens[2]; + + auto entryIt = taxIDsAndEntries.find(taxonomyID); + if (entryIt == taxIDsAndEntries.end()) { + taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); + } else { + entryIt->second.parentTaxonomyID = parentTaxonomyID; + entryIt->second.rank = rank; } } } -void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { +template +void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { std::ifstream namesDumpFile(namesDumpFileName); if (!namesDumpFile.is_open()) throw std::runtime_error("unable to open names file"); std::string line; + + TAXID taxonomyID; + std::string scientificName; while (namesDumpFile.good()) { getline(namesDumpFile, line); - std::vector tokens = tokenise(line, "|"); - for (auto& token : tokens) { - if (token.size() > 1) { - if (token[0] == '\t') token.erase(0, 1); - if (token[token.size() - 1] == '\t') token.erase(token.size() - 1, 1); - } - } - if (tokens.size() > 3) { - TaxonomyEntry newEntry; - newEntry.taxonomyID = stoi(tokens[0]); - // for(auto & token : tokens) - // std::cout<second.scientificName = newEntry.scientificName; - } + std::vector tokens = tokenise(line, "\t|\t", 4, 2); + if (tokens.size() < 4 || tokens[3] != "scientific name") { + continue; + } + taxonomyID = string_to_T(tokens[0]); + scientificName = tokens[1]; + + auto entryIt = taxIDsAndEntries.find(taxonomyID); + if (entryIt == taxIDsAndEntries.end()) { + taxIDsAndEntries[taxonomyID] = 
TaxonomyEntry(taxonomyID, scientificName); + } else { + entryIt->second.scientificName = scientificName; } } } -void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, +template +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, const std::string namesDumpFileName, const std::string nodesDumpFileName) { parseNodesDump(nodesDumpFileName); parseNamesDump(namesDumpFileName); - for (auto& entry : taxIDsAndEntries) { - outs << entry.first << "\t" << entry.second.parentTaxonomyID << "\t" - << entry.second.scientificName << "\t" << entry.second.rank << "\n"; + writeTaxonomyIndex(outs); +} + +template +std::vector getSortedKeys(const std::unordered_map& unordered) { + std::vector keys; + keys.reserve (unordered.size()); + for (auto& it : unordered) { + keys.push_back(it.first); } + std::sort (keys.begin(), keys.end()); + return keys; } -void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { +template +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { + for (TAXID& key : getSortedKeys(taxIDsAndEntries)) { + const auto& entry = taxIDsAndEntries.at(key); + outs << key << "\t" << entry.parentTaxonomyID << "\t" + << entry.scientificName << "\t" << entry.rank << "\n"; + } +} + + + +template +void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { std::ifstream inFile(inFileName); if (!inFile.is_open()) throw std::runtime_error("unable to open taxonomy index file"); - uint32_t taxonomyID, parentTaxonomyID; + TAXID taxonomyID, parentTaxonomyID; std::string scientificName, rank; std::string line; @@ -223,7 +270,7 @@ void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { inFile.get(); // read tab std::getline(inFile, scientificName, '\t'); std::getline(inFile, rank, '\n'); - TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); + TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); //cerr << "inserting " << taxonomyID << ";" << parentTaxonomyID << ";" << rank << ";" << scientificName << endl; taxIDsAndEntries.insert({ @@ -235,16 +282,17 @@ void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { }); } -uint32_t TaxonomyDB::getLowestCommonAncestor( - const std::vector& taxIDs) const { +template +TAXID TaxonomyDB::getLowestCommonAncestor( + const std::vector& taxIDs) const { if (taxIDs.size() == 0) { return 0; } - std::vector > paths; + std::vector > paths; for (auto& taxID : taxIDs) { bool good = true; - std::vector path; - uint32_t tempTaxID = taxID; + std::vector path; + TAXID tempTaxID = taxID; while (tempTaxID != 0) { path.push_back(tempTaxID); tempTaxID = getParentTaxID(tempTaxID); @@ -257,12 +305,12 @@ uint32_t TaxonomyDB::getLowestCommonAncestor( for (auto& path : paths) std::reverse(path.begin(), path.end()); std::sort(paths.begin(), paths.end(), - [](std::vector i, std::vector j) { + [](std::vector i, std::vector j) { return i.size() < j.size(); }); - uint32_t consensus = 0; + TAXID consensus = 0; for (unsigned i = 0; i < paths[0].size(); i++) { - uint32_t temp = 0; + TAXID temp = 0; for (auto& path : paths) { if (temp == 0) temp = path[i]; @@ -275,7 +323,8 @@ uint32_t TaxonomyDB::getLowestCommonAncestor( return consensus; } -uint32_t TaxonomyDB::getParentTaxID(const uint32_t taxID) const { +template +TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { auto entry = taxIDsAndEntries.find(taxID); if (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) return entry->second.parentTaxonomyID; @@ -283,7 +332,8 @@ uint32_t TaxonomyDB::getParentTaxID(const uint32_t 
taxID) const { return 0; } -std::string TaxonomyDB::getScientificName(const uint32_t taxID) const { +template +std::string TaxonomyDB::getScientificName(const TAXID taxID) const { auto entry = taxIDsAndEntries.find(taxID); if (entry != taxIDsAndEntries.end()) { return entry->second.scientificName; @@ -291,7 +341,8 @@ std::string TaxonomyDB::getScientificName(const uint32_t taxID) const { return std::string(); } -std::string TaxonomyDB::getRank(const uint32_t taxID) const { +template +std::string TaxonomyDB::getRank(const TAXID taxID) const { auto entry = taxIDsAndEntries.find(taxID); if (entry != taxIDsAndEntries.end()) { return entry->second.rank; @@ -299,7 +350,8 @@ std::string TaxonomyDB::getRank(const uint32_t taxID) const { return std::string(); } -std::string TaxonomyDB::getLineage(uint32_t taxonomyID) const { +template +std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { std::string lineage; while (true) { // 131567 = Cellular organisms @@ -316,7 +368,9 @@ std::string TaxonomyDB::getLineage(uint32_t taxonomyID) const { } return lineage; } -std::string TaxonomyDB::getMetaPhlAnLineage(uint32_t taxonomyID) const { + +template +std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { std::string rank = getRank(taxonomyID); if (rank == "superphylum") return std::string(); std::string lineage; @@ -356,7 +410,8 @@ std::string TaxonomyDB::getMetaPhlAnLineage(uint32_t taxonomyID) const { return lineage; } -uint32_t TaxonomyDB::getTaxIDAtRank(const uint32_t taxID, +template +TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, const std::string& rank) const { auto entry = taxIDsAndEntries.find(taxID); while (entry != taxIDsAndEntries.end() && @@ -368,7 +423,9 @@ uint32_t TaxonomyDB::getTaxIDAtRank(const uint32_t taxID, } return 0; } -int TaxonomyDB::isBelowInTree(uint32_t upper, uint32_t lower) const { + +template +int TaxonomyDB::isBelowInTree(TAXID upper, TAXID lower) const { auto entry = taxIDsAndEntries.find(lower); unsigned level = 0; while (entry != taxIDsAndEntries.end() && @@ -382,7 +439,9 @@ int TaxonomyDB::isBelowInTree(uint32_t upper, uint32_t lower) const { } return -1; } -bool TaxonomyDB::isSubSpecies(uint32_t taxonomyID) const { + +template +bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { bool isSubSpecies = false; auto entry = taxIDsAndEntries.find(taxonomyID); int numLevels = 0; @@ -400,14 +459,15 @@ bool TaxonomyDB::isSubSpecies(uint32_t taxonomyID) const { return isSubSpecies; } -void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { +template +void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { for (auto& elem : taxon_counts) { auto it = taxIDsAndEntries.find(elem.first); if (it == taxIDsAndEntries.end()) { cerr << "No taxonomy entry for " << elem.first << "!!" 
<< endl; continue; } - TaxonomyEntry* tax = &it->second; + TaxonomyEntry* tax = &it->second; //cerr << "fill done: "<< elem.first << endl; tax->numReadsAligned += elem.second.n_reads; tax->numKmers += elem.second.n_kmers; @@ -426,33 +486,36 @@ void TaxonomyDB::fillCounts(const unordered_map& taxon_cou } for (auto& tax : taxIDsAndEntries) { - std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); + std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); } } +template class TaxReport { private: std::ostream& _reportOfb; - TaxonomyDB & _taxdb; + TaxonomyDB & _taxdb; std::vector _report_cols; uint64_t _total_n_reads; bool _show_zeros; - void printLine(TaxonomyEntry& tax, unsigned depth); + void printLine(TaxonomyEntry& tax, unsigned depth); public: - TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); + TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); void printReport(std::string format, std::string rank); - void printReport(TaxonomyEntry& tax, unsigned depth); + void printReport(TaxonomyEntry& tax, unsigned depth); }; -TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { +template +TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; } -void TaxReport::printReport(std::string format, std::string rank) { +template +void TaxReport::printReport(std::string format, std::string rank) { _total_n_reads = _taxdb.taxIDsAndEntries.at(0).numReadsAligned + _taxdb.taxIDsAndEntries.at(0).numReadsAlignedToChildren + @@ -480,7 +543,8 @@ void TaxReport::printReport(std::string format, std::string rank) { } } -void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { +template +void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { if (_show_zeros || (tax.numReadsAligned+tax.numReadsAlignedToChildren) > 0) { printLine(tax, depth); @@ -492,7 +556,8 @@ void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { } -void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { +template +void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { for (auto& col : _report_cols) { switch (col) { case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; From 06e7f7ebe6ac745a27361aa7c35830e6b5b37e21 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 21 Feb 2017 17:39:33 -0500 Subject: [PATCH 019/105] Use less critical pragma - better parallelization --- install_kraken.sh | 2 ++ src/classify.cpp | 46 +++++++++++++++++++++++++--------------------- src/taxdb.h | 10 ++++++++++ 3 files changed, 37 insertions(+), 21 deletions(-) diff --git a/install_kraken.sh b/install_kraken.sh index e7af3d7..803989c 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -62,3 +62,5 @@ for file in $KRAKEN_DIR/kraken* do [ -x "$file" ] && echo " $file" done + +exit 0 diff --git a/src/classify.cpp b/src/classify.cpp index 611497f..f5bf9d6 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -35,8 +35,9 @@ using namespace kraken; void parse_command_line(int argc, char **argv); void usage(int exit_code=EX_USAGE); void process_file(char *filename); -void 
classify_sequence(DNASequence &dna, ostringstream &koss, - ostringstream &coss, ostringstream &uoss); +bool classify_sequence(DNASequence &dna, ostringstream &koss, + ostringstream &coss, ostringstream &uoss, + unordered_map&); string hitlist_string(vector &taxa, vector &ambig); set get_ancestry(uint32_t taxon); void report_stats(struct timeval time1, struct timeval time2); @@ -269,15 +270,24 @@ void process_file(char *filename) { if (total_nt == 0) break; + unordered_map my_taxon_counts; + uint64_t my_total_classified = 0; kraken_output_ss.str(""); classified_output_ss.str(""); unclassified_output_ss.str(""); for (size_t j = 0; j < work_unit.size(); j++) - classify_sequence( work_unit[j], kraken_output_ss, - classified_output_ss, unclassified_output_ss ); + my_total_classified += + classify_sequence( work_unit[j], kraken_output_ss, + classified_output_ss, unclassified_output_ss, + my_taxon_counts); #pragma omp critical(write_output) { + total_classified += my_total_classified; + for (auto &it : my_taxon_counts) { + taxon_counts[it.first] += it.second; + } + if (Print_kraken) (*Kraken_output) << kraken_output_ss.str(); if (Print_classified) @@ -286,8 +296,9 @@ void process_file(char *filename) { (*Unclassified_output) << unclassified_output_ss.str(); total_sequences += work_unit.size(); total_bases += total_nt; + //if (Print_Progress && total_sequences % 100000 < work_unit.size()) if (Print_Progress && total_sequences % 100000 < work_unit.size()) - cerr << "\rProcessed " << total_sequences << " sequences (" << total_bases << " bp) ..."; + cerr << "\rProcessed " << total_sequences << " sequences (" << total_classified << " classified) ..."; } } } // end parallel section @@ -304,8 +315,9 @@ uint32_t get_taxon_for_kmer(KrakenDB& database, uint64_t* kmer_ptr, uint64_t& cu return taxon; } -void classify_sequence(DNASequence &dna, ostringstream &koss, - ostringstream &coss, ostringstream &uoss) { +bool classify_sequence(DNASequence &dna, ostringstream &koss, + ostringstream &coss, ostringstream &uoss, + unordered_map& my_taxon_counts) { vector taxa; vector ambig_list; unordered_map hit_counts; @@ -330,11 +342,7 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, if (taxon) break; } - #pragma omp critical - { - taxon_counts[taxon].kmers.add(*kmer_ptr); - ++taxon_counts[taxon].n_kmers; - } + my_taxon_counts[taxon].add_kmer(*kmer_ptr); if (taxon) { hit_counts[taxon]++; @@ -352,12 +360,7 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, else call = resolve_tree(hit_counts, Parent_map); - if (call) - #pragma omp atomic - total_classified++; - - #pragma omp critical - ++(taxon_counts[call].n_reads); + ++(my_taxon_counts[call].n_reads); if (Print_unclassified || Print_classified) { ostringstream *oss_ptr = call ? &coss : &uoss; @@ -377,14 +380,14 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, } if (! 
Print_kraken)
-    return;
+    return call;
 
   if (call) {
     koss << "C\t";
   }
   else {
     if (Only_classified_kraken_output)
-      return;
+      return false;
     koss << "U\t";
   }
   koss << dna.id << "\t" << call << "\t" << dna.seq.size() << "\t";
@@ -402,7 +405,8 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss,
   if (Print_sequence)
     koss << "\t" << dna.seq;
 
-  koss << endl;
+  koss << "\n";
+  return call;
 }
 
 string hitlist_string(vector<uint32_t> &taxa, vector<uint8_t> &ambig)
diff --git a/src/taxdb.h b/src/taxdb.h
index b4d4093..5fc53c9 100644
--- a/src/taxdb.h
+++ b/src/taxdb.h
@@ -35,6 +35,16 @@ struct ReadCounts {
   uint64_t n_reads = 0;
   uint64_t n_kmers = 0;
   HyperLogLogPlusMinus kmers; // unique k-mer count per taxon
+  void add_kmer(uint64_t kmer) {
+    ++ n_kmers;
+    kmers.add(kmer);
+  }
+  ReadCounts& operator+=(const ReadCounts& b) {
+    n_reads += b.n_reads;
+    n_kmers += b.n_kmers;
+    kmers += b.kmers;
+    return *this;
+  }
 };
 

From ff6394483f00d44b8ea82b6560e2dd16f4ea4f4b Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Wed, 22 Feb 2017 12:59:24 -0500
Subject: [PATCH 020/105] Fix for using multiple databases for search

---
 scripts/kraken-build |  9 ++++++---
 src/classify.cpp     | 47 +++++++++++++++++++++++++------------------
 src/set_lcas.cpp     |  1 +
 src/taxdb.h          |  2 +-
 4 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/scripts/kraken-build b/scripts/kraken-build
index 7170a67..1ddea52 100755
--- a/scripts/kraken-build
+++ b/scripts/kraken-build
@@ -67,6 +67,8 @@ my (
   $add_taxonomy_ids_for_seq
 );
 
+my $verbose = 0;
+
 $threads = $DEF_THREAD_CT;
 $minimizer_len = $DEF_MINIMIZER_LEN;
 $kmer_len = $DEF_KMER_LEN;
@@ -74,7 +76,6 @@
 $work_on_disk = "";
 $hash_size = "";
 $max_db_size = "";
 $add_taxonomy_ids_for_seq = 0;
-$rebuild = 0;
 
 # variables corresponding to task options
 my @TASK_LIST = (
@@ -112,6 +113,7 @@ GetOptions(
   "upgrade" => \$upgrade,
   "standard" => \$standard,
   "clean" => \$clean,
+  "verbose" => \$verbose,
 
   "generate-taxonomy-ids-for-sequences" => \$add_taxonomy_ids_for_seq
 ) or usage();
@@ -293,9 +295,10 @@ sub standard_installation {
 }
 
 sub build_database {
-  $ENV{"KRAKEN_REBUILD_DATABASE"} = $rebuild;
+  $ENV{"KRAKEN_REBUILD_DATABASE"} = (defined $rebuild? 1 : 0);
   $ENV{"KRAKEN_ADD_TAXIDS_FOR_SEQ"} = $add_taxonomy_ids_for_seq;
-  exec "build_kraken_db.sh";
+  my $opt = ($verbose? "-x" : "");
+  exec "build_kraken_db.sh $opt";
 }
 
 sub clean_database {
diff --git a/src/classify.cpp b/src/classify.cpp
index f5bf9d6..891f30b 100644
--- a/src/classify.cpp
+++ b/src/classify.cpp
@@ -51,14 +51,13 @@ bool Fastq_input = false;
 bool Print_classified = false;
 bool Print_unclassified = false;
 bool Print_kraken = true;
-bool Print_kraken_report = true;
+bool Print_kraken_report = false;
 bool Populate_memory = false;
 bool Only_classified_kraken_output = false;
 bool Print_sequence = false;
 bool Print_Progress = true;
 uint32_t Minimum_hit_count = 1;
 unordered_map<uint32_t, uint32_t> Parent_map;
-vector<KrakenDB*> KrakenDatabases;
 string Classified_output_file, Unclassified_output_file, Kraken_output_file,
   Report_output_file, TaxDB_file;
 ostream *Classified_output;
 ostream *Unclassified_output;
@@ -68,6 +67,7 @@ vector<ofstream*> Open_fstreams;
 vector<ogzstream*> Open_gzstreams;
 size_t Work_unit_size = DEF_WORK_UNIT_SIZE;
 TaxonomyDB<uint32_t> taxdb;
+static vector<KrakenDB*> KrakenDatabases (DB_filenames.size());
 
 uint64_t total_classified = 0;
 uint64_t total_sequences = 0;
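A minimal sketch of the lookup chain this patch enables, assuming a toy map in place of the real `KrakenDB`/`get_taxon_for_kmer` machinery: each database gets its own cursor (modelled on the `db_status` struct introduced below), because a cached bin-key/position range from one database's index is meaningless in another's, and the first database that knows a k-mer decides its taxon:

```cpp
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

// Per-database cursor, mirroring the db_status struct added in this patch.
struct DBStatus {
  uint64_t current_bin_key = 0;
  int64_t current_min_pos = 1;  // min > max marks the cached range as empty
  int64_t current_max_pos = 0;
};

using FakeDB = std::unordered_map<uint64_t, uint32_t>;  // kmer -> taxid stand-in

// Stand-in for get_taxon_for_kmer(): the real code binary-searches the
// database's sorted k-mer bin, reusing the cursor's cached range when it can.
uint32_t lookup(const FakeDB &db, uint64_t kmer, DBStatus &) {
  auto it = db.find(kmer);
  return it == db.end() ? 0 : it->second;
}

int main() {
  std::vector<FakeDB> dbs = {{{42, 9606}},            // "host" database first
                             {{42, 562}, {7, 562}}};  // then a "bacterial" one
  std::vector<DBStatus> statuses(dbs.size());         // one cursor per database
  for (uint64_t kmer : {42, 7, 13}) {
    uint32_t taxon = 0;
    for (size_t i = 0; i < dbs.size() && taxon == 0; ++i)
      taxon = lookup(dbs[i], kmer, statuses[i]);      // first database wins
    std::cout << "kmer " << kmer << " -> taxon " << taxon << "\n";
  }
}
```

Database order thus encodes priority: k-mer 42 above resolves to the host taxon even though the bacterial database also contains it.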
" << endl; + static vector idx_files (DB_filenames.size()); + static vector db_files (DB_filenames.size()); + static vector db_indices (DB_filenames.size()); + + // TODO: Check DB_filenames and Index_filesnames have the same length for (size_t i=0; i < DB_filenames.size(); ++i) { - //cerr << "\t " << DB_filenames[i] << endl; - static QuickFile db_file; - db_file.open_file(DB_filenames[i]); + cerr << " Database " << DB_filenames[i] << endl; + db_files[i].open_file(DB_filenames[i]); if (Populate_memory) - db_file.load_file(); - static KrakenDB Database = KrakenDB(db_file.ptr()); - KmerScanner::set_k(Database.get_k()); - - static QuickFile idx_file; - idx_file.open_file(Index_filenames[i]); + db_files[i].load_file(); + + KrakenDatabases.push_back(new KrakenDB(db_files[i].ptr())); + idx_files[i].open_file(Index_filenames[i]); if (Populate_memory) - idx_file.load_file(); - static KrakenDBIndex db_index(idx_file.ptr()); - Database.set_index(&db_index); - - - KrakenDatabases.push_back(&Database); + idx_files[i].load_file(); + db_indices[i] = KrakenDBIndex(idx_files[i].ptr()); + KrakenDatabases[i]->set_index(&db_indices[i]); } // TODO: Check all databases have the same k @@ -325,7 +324,14 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, uint32_t taxon = 0; uint32_t hits = 0; // only maintained if in quick mode - uint64_t current_bin_key; int64_t current_min_pos = 1; int64_t current_max_pos = 0; + + struct db_status { + uint64_t current_bin_key; + int64_t current_min_pos = 1; + int64_t current_max_pos = 0; + }; + + vector db_statuses(KrakenDatabases.size()); if (dna.seq.size() >= KrakenDatabases[0]->get_k()) { KmerScanner scanner(dna.seq); @@ -337,8 +343,9 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, else { ambig_list.push_back(0); - for (auto& db : KrakenDatabases) { - taxon = get_taxon_for_kmer(*db, kmer_ptr, current_bin_key, current_min_pos, current_max_pos); + for (size_t i=0; i Date: Wed, 22 Feb 2017 14:09:51 -0500 Subject: [PATCH 021/105] Update README.md --- README.md | 49 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index c414dfe..988deb5 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,45 @@ -Kraken taxonomic sequence classification system +Kraken taxonomic sequence classification system with Unique K-mer Counting =============================================== -Please see the [Kraken webpage] or the [Kraken manual] -for information on installing and operating Kraken. -A local copy of the [Kraken manual] is also present here -in the `docs/` directory (`MANUAL.html` and `MANUAL.markdown`). +[Kraken](https://github.com/DerrickWood/kraken) is a fast taxonomic classifier for metagenomics data. This project, kraken-hll, adds some additional functionality - most notably a unique k-mer count. Spurious identifications due to sequence contamination in the dataset or database often leads to many reads, however they usually cover only a small portion of the genome. -[Kraken webpage]: http://ccb.jhu.edu/software/kraken/ -[Kraken manual]: http://ccb.jhu.edu/software/kraken/MANUAL.html +kraken-hll adds two additional columns to the Kraken report - total number of k-mers observed for taxon, and the total number of unique k-mers observed for taxon (columns 3 and 4, resp.). + +Here's a small example of a classification against a viral database with k=25. 
+Here's a small example of a classification against a viral database with k=25. There are three species identified by just one read - Enterobacteria phage BP-4795, Salmonella phage SEN22, and Sulfolobus monocaudavirus SMV1. Of those, the identification of Salmonella phage SEN22 is the strongest, as its read matched 116 k-mers that are unique to the sequence, while the match to Sulfolobus monocaudavirus SMV1 is based on only a single 25-mer.
+
+```
+99.0958 2192 2192 255510 272869 no rank 0 unclassified
+0.904159 20 0 2361 2318 no rank 1 root
+0.904159 20 0 2361 2318 superkingdom 10239 Viruses
+0.904159 20 0 2361 2318 no rank 35237 dsDNA viruses, no RNA stage
+0.768535 17 0 2074 2063 order 548681 Herpesvirales
+0.768535 17 0 2074 2063 family 10292 Herpesviridae
+0.768535 17 0 2074 2063 subfamily 10374 Gammaherpesvirinae
+0.768535 17 0 2074 2063 genus 10375 Lymphocryptovirus
+0.768535 17 16 2001 1987 species 10376 Human gammaherpesvirus 4
+0.045208 1 1 4 4 sequence 1000041143 KC207814.1 Human herpesvirus 4 strain Mutu, complete genome
+0.0904159 2 0 254 254 order 28883 Caudovirales
+0.045208 1 0 28 28 family 10699 Siphoviridae
+0.045208 1 0 28 28 genus 186765 Lambdavirus
+0.045208 1 0 28 28 no rank 335795 unclassified Lambda-like viruses
+0.045208 1 1 28 28 species 196242 Enterobacteria phage BP-4795
+0.045208 1 0 116 116 family 10744 Podoviridae
+0.045208 1 0 116 116 no rank 196895 unclassified Podoviridae
+0.045208 1 0 116 116 no rank 1758253 Escherichia phage phi191 sensu lato
+0.045208 1 1 116 116 species 1647458 Salmonella phage SEN22
+0.045208 1 0 1 1 no rank 51368 unclassified dsDNA viruses
+0.045208 1 1 1 1 species 1351702 Sulfolobus monocaudavirus SMV1
+```
+
+## Usage
+
+For usage, see `kraken_hll --help`. Note that you can use the same database as Kraken, with one difference - instead of the files `DB_DIR/taxonomy/nodes.dmp` and `DB_DIR/taxonomy/names.dmp` that kraken relies upon, `kraken-hll` needs the file `DB_DIR/taxDB`. This can be generated with the `build_taxdb` program: `KRAKEN_DIR/build_taxdb DB_DIR/taxonomy/names.dmp DB_DIR/taxonomy/nodes.dmp > DB_DIR/taxDB`. The code behind the taxDB is based on [k-SLAM](https://github.com/aindj/k-SLAM).
+
+### Differences to `kraken`
+ - Use `kraken_hll --report-file FILENAME ...` to write the kraken report to `FILENAME`.
+ - Use `kraken_hll --db DB1 --db DB2 --db DB3 ...` to attempt, for each k-mer, an assignment based on DB1 first, then DB2, then DB3. You can use this to prefer identifications based on DB1 (e.g. human and contaminant sequences), then DB2 (e.g. completed bacterial genomes), then DB3, etc. Note that this option is incompatible with `kraken_hll-build --generate-taxonomy-ids-for-sequences`, since the taxDB has to be exactly the same across all databases.
+ - Add the suffix `.gz` to output file names to generate gzipped output files.
+
+### Differences to `kraken-build`
+ - Use `kraken_hll-build --generate-taxonomy-ids-for-sequences ...` to add pseudo-taxonomy IDs for each sequence header. An example of the result is in the output above - one read has been assigned specifically to `KC207814.1 Human herpesvirus 4 strain Mutu, complete genome`.
+ - `seqid2taxid.map` mapping sequence IDs to taxonomy IDs does NOT parse or require `>gi|`, but rather the sequence ID is the header up to just before the first space From 8f4b1841bd23a75d0726b95f8f2847f3d03f3b63 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 11 Apr 2017 22:58:45 -0400 Subject: [PATCH 022/105] Do not need taxonomy nodes anymore --- scripts/kraken | 7 ------- src/classify.cpp | 13 ------------- 2 files changed, 20 deletions(-) diff --git a/scripts/kraken b/scripts/kraken index 29cce0d..6f2e290 100755 --- a/scripts/kraken +++ b/scripts/kraken @@ -95,12 +95,6 @@ if ($@) { die "$PROG: $@"; } -my $taxonomy = $db_prefix[0]."/taxonomy/nodes.dmp"; -if ($quick) { - undef $taxonomy; # Skip loading nodes file, not needed in quick mode -} - - my @kdb_files = map { "$_/database.kdb" } @db_prefix; my @idx_files = map { "$_/database.idx" } @db_prefix; @@ -140,7 +134,6 @@ my @flags; push @flags, map { ("-d", $_) } @kdb_files; push @flags, map { ("-i", $_) } @idx_files; push @flags, "-t", $threads if $threads > 1; -push @flags, "-n", $taxonomy if defined $taxonomy; push @flags, "-q" if $quick; push @flags, "-m", $min_hits if $min_hits > 1; push @flags, "-f" if $fastq_input && ! $paired; # merger always outputs FASTA diff --git a/src/classify.cpp b/src/classify.cpp index 9f7933e..ef7d616 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -44,7 +44,6 @@ unordered_map taxon_counts; // stats per taxon int Num_threads = 1; vector DB_filenames; vector Index_filenames; -string Nodes_filename; bool Quick_mode = false; bool Fastq_input = false; bool Print_classified = false; @@ -117,10 +116,6 @@ int main(int argc, char **argv) { #endif parse_command_line(argc, argv); - //if (! Nodes_filename.empty()) { - // cerr << "Building parent node map " << endl; - // Parent_map = build_parent_map(Nodes_filename); - //} if (!TaxDB_file.empty()) { taxdb = TaxonomyDB(TaxDB_file); @@ -481,9 +476,6 @@ void parse_command_line(int argc, char **argv) { omp_set_num_threads(Num_threads); #endif break; - case 'n' : - Nodes_filename = optarg; - break; case 'q' : Quick_mode = true; break; @@ -542,10 +534,6 @@ void parse_command_line(int argc, char **argv) { cerr << "Missing mandatory option -i" << endl; usage(); } - if (Nodes_filename.empty() && ! 
Quick_mode) { - cerr << "Must specify one of -q or -n" << endl; - usage(); - } if (optind == argc) { cerr << "No sequence data files specified" << endl; } @@ -557,7 +545,6 @@ void usage(int exit_code) { << "Options: (*mandatory)" << endl << "* -d filename Kraken DB filename" << endl << "* -i filename Kraken DB index filename" << endl - << " -n filename NCBI Taxonomy nodes file" << endl << " -o filename Output file for Kraken output" << endl << " -r filename Output file for Kraken report output" << endl << " -a filename TaxDB" << endl From 77cac1358ff7eaca1b4507135775f933c9184d9a Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 11 Apr 2017 22:59:48 -0400 Subject: [PATCH 023/105] Check if there's a taxonomy entry --- src/taxdb.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/taxdb.h b/src/taxdb.h index 0d93207..56bd341 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -402,8 +402,12 @@ bool TaxonomyDB::isSubSpecies(uint32_t taxonomyID) const { void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { for (auto& elem : taxon_counts) { - //cerr << "fill: "<< elem.first << endl; - TaxonomyEntry* tax = &taxIDsAndEntries.at(elem.first); + auto it = taxIDsAndEntries.find(elem.first); + if (it == taxIDsAndEntries.end()) { + cerr << "No taxonomy entry for " << elem.first << "!!" << endl; + continue; + } + TaxonomyEntry* tax = &it->second; //cerr << "fill done: "<< elem.first << endl; tax->numReadsAligned += elem.second.n_reads; tax->numKmers += elem.second.n_kmers; From dcaec2956743c5fb518a5d0240a3198320dfd6cd Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 11 Apr 2017 23:06:55 -0400 Subject: [PATCH 024/105] Use template for ReadCounts --- src/build_taxdb.cpp | 2 +- src/classify.cpp | 23 +++ src/report-cols.h | 3 +- src/taxdb.h | 340 ++++++++++++++++++++++++++++---------------- 4 files changed, 247 insertions(+), 121 deletions(-) diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index 8e1f11e..2710d82 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -27,7 +27,7 @@ int main(int argc, char **argv) { std::cout << "Provide names.dmp and nodes.dmp\n"; return 1; } - TaxonomyDB taxdb; + TaxonomyDB taxdb; taxdb.writeTaxonomyIndex( std::cout, argv[1], argv[2]); diff --git a/src/classify.cpp b/src/classify.cpp index 891f30b..1f5bbac 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -79,6 +79,29 @@ inline bool ends_with(std::string const & value, std::string const & ending) return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); } +struct ReadCounts { + uint64_t n_reads = 0; + uint64_t n_kmers = 0; + HyperLogLogPlusMinus kmers; // unique k-mer count per taxon + void add_kmer(uint64_t kmer) { + ++ n_kmers; + kmers.add(kmer); + } + ReadCounts& operator+=(const ReadCounts& b) { + n_reads += b.n_reads; + n_kmers += b.n_kmers; + kmers += b.kmers; + return *this; + } +}; + +inline +uint64_t reads(const ReadCounts& read_count) { + return(read_count.n_reads); +} + + + ostream* cout_or_file(string file) { if (file == "-") return &cout; diff --git a/src/report-cols.h b/src/report-cols.h index a34a755..007eef5 100644 --- a/src/report-cols.h +++ b/src/report-cols.h @@ -9,6 +9,7 @@ #define REPORT_COLS_H #include +#include enum class REPORTCOLS : uint8_t { SPACED_NAME, @@ -29,7 +30,7 @@ enum class REPORTCOLS : uint8_t { }; -static const std::map report_col_name_map = { +static const std::map report_col_name_map = { {"name", REPORTCOLS::NAME}, {"spaced_name", REPORTCOLS::SPACED_NAME}, {"taxID", REPORTCOLS::TAX_ID}, diff --git 
a/src/taxdb.h b/src/taxdb.h index ff2b2f3..ce45bf8 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -28,25 +28,10 @@ #include #include #include -#include "hyperloglogplus.h" +#include #include "report-cols.h" -struct ReadCounts { - uint64_t n_reads = 0; - uint64_t n_kmers = 0; - HyperLogLogPlusMinus kmers; // unique k-mer count per taxon - void add_kmer(uint64_t kmer) { - ++ n_kmers; - kmers.add(kmer); - } - ReadCounts& operator+=(const ReadCounts& b) { - n_reads += b.n_reads; - n_kmers += b.n_kmers; - kmers += b.kmers; - return *this; - } -}; - +using namespace std; void log (const std::string& s) { std::cerr << s << "\n"; @@ -60,26 +45,85 @@ uint64_t string_to_T(string str) { return result; } -std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields, size_t end_chars) { +template +inline +uint64_t reads(const T read_count) { + cerr << "No reads function for type!! " << endl; + throw ; + return(0); +} + + + +inline +uint64_t reads(const uint64_t read_count) { + return(read_count); +} + +std::vector in_betweens(const std::string &s, const char start_char, const char end_char, size_t start_at = 0) { + std::vector tokens; + size_t i = 0; + size_t next_end = start_at-1; + + for (size_t next_start = s.find(start_char, next_end + 1); \ + next_start != string::npos; + next_start = s.find(start_char, next_end + 1), ++i) { + + next_end = s.find(end_char, next_start + 1); + if (next_end == string::npos) + throw std::runtime_error("unmatched start and end!"); + + tokens.push_back(s.substr(next_start+1, next_end-1)); + } + + return tokens; +} + + + +std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields = 0, size_t end_chars = 0) { std::vector tokens(max_fields); size_t delim_length = delimiter.length(); size_t last = 0; size_t i = 0; for (size_t next = s.find(delimiter, last); - i < max_fields && next != string::npos; + (max_fields > 0 && i < max_fields) && next != string::npos; next = s.find(delimiter, last), ++i) { tokens[i] = s.substr(last, next-last); last = next + delim_length; } - if (i < max_fields) { + if (max_fields > 0 && i < max_fields) { tokens[max_fields-1] = s.substr(last, s.length()-last-end_chars); } return tokens; } -template +std::vector get_fields(const std::string &s, const std::string& delimiter, vector fields) { + std::vector tokens; + tokens.reserve(fields.size()); + size_t delim_length = delimiter.length(); + size_t last = 0; + size_t i = 0; + size_t current_field = 0; + + for (size_t next = s.find(delimiter, last); + tokens.size() < fields.size() && next != string::npos; + next = s.find(delimiter, last), ++i) { + if (i == fields[current_field]) { + tokens.push_back(s.substr(last, next-last)); + ++current_field; + } + last = next + delim_length; + } + + return tokens; +} + + + +template class TaxonomyEntry { public: TAXID taxonomyID = 0; @@ -106,31 +150,35 @@ class TaxonomyEntry { TaxonomyEntry* parent = nullptr; std::vector children; - unsigned numReadsAligned = 0; - unsigned numReadsAlignedToChildren = 0; + READCOUNTS read_counts = 0; + READCOUNTS read_counts_children = 0; + bool used = false; uint64_t genomeSize = 0; uint64_t genomeSizeOfChildren = 0; uint64_t numBelow = 0; - uint64_t numKmers = 0; - HyperLogLogPlusMinus kmers; }; +template<> +TaxonomyEntry::TaxonomyEntry () { + read_counts = 0; + read_counts_children = 0; +} -template +template struct TaxonomyEntryPtr_comp { - bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { - return 
((a->numReadsAligned+a->numReadsAlignedToChildren) > (b->numReadsAligned+b->numReadsAlignedToChildren)); + bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { + return ((reads(a->read_counts)+reads(a->read_counts_children)) > (reads(b->read_counts)+reads(b->read_counts_children))); } }; -template +template class TaxonomyDB { public: TaxonomyDB(const std::string inFileName); TaxonomyDB() {}; //std::unordered_map seqIDsAndTaxIds; - std::unordered_map > taxIDsAndEntries; + std::unordered_map > taxIDsAndEntries; void parseNamesDump(const std::string namesDumpFileName); void parseNodesDump(const std::string nodesDumpFileName); TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const; @@ -138,6 +186,8 @@ class TaxonomyDB { std::string getRank(const TAXID taxID) const; TAXID getLowestCommonAncestor(const std::vector& taxIDs) const; TAXID getParentTaxID(const TAXID taxID) const; + std::unordered_map getParentMap() const; + std::unordered_map getScientificNameMap() const; std::string getLineage(TAXID taxonomyID) const; std::string getMetaPhlAnLineage(TAXID taxonomyID) const; char* getIndexFileName(const TAXID hostTaxID) const; @@ -148,13 +198,34 @@ class TaxonomyDB { const std::string nodesDumpFileName); bool isSubSpecies(TAXID taxonomyID) const; int isBelowInTree(TAXID upper, TAXID lower) const; - void fillCounts(const unordered_map& taxon_counts); + void addCounts(const TAXID taxid, const READCOUNTS& read_counts_); + void fillCounts(const unordered_map& taxon_counts); void createPointers(); void printReport(); }; -template -void TaxonomyDB::createPointers() { +template +std::unordered_map TaxonomyDB::getScientificNameMap() const { + std::unordered_map scientificNameMap; + for (const auto & tax : taxIDsAndEntries) { + scientificNameMap[tax.second.scientificName] = tax.first; + } + return scientificNameMap; +} + +template +unordered_map TaxonomyDB::getParentMap() const { + unordered_map Parent_map; + for (const auto & tax : taxIDsAndEntries) { + if (tax.first != 0) + Parent_map[tax.first] = tax.second.parentTaxonomyID; + } + Parent_map[1] = 1; + return Parent_map; +} + +template +void TaxonomyDB::createPointers() { for (auto& tax : taxIDsAndEntries) { if (tax.second.parentTaxonomyID != tax.first) { auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); @@ -166,8 +237,8 @@ void TaxonomyDB::createPointers() { } } -template -TaxonomyDB::TaxonomyDB(const std::string inFileName) { +template +TaxonomyDB::TaxonomyDB(const std::string inFileName) { log("Building taxonomy index"); readTaxonomyIndex(inFileName); createPointers(); @@ -175,8 +246,8 @@ TaxonomyDB::TaxonomyDB(const std::string inFileName) { " nodes"); } -template -void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { +template +void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { std::ifstream nodesDumpFile(nodesDumpFileName); if (!nodesDumpFile.is_open()) throw std::runtime_error("unable to open nodes file"); @@ -199,7 +270,7 @@ void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { auto entryIt = taxIDsAndEntries.find(taxonomyID); if (entryIt == taxIDsAndEntries.end()) { - taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); + taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); } else { entryIt->second.parentTaxonomyID = parentTaxonomyID; entryIt->second.rank = rank; @@ -207,8 +278,8 @@ void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { } } -template -void 
TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { +template +void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { std::ifstream namesDumpFile(namesDumpFileName); if (!namesDumpFile.is_open()) throw std::runtime_error("unable to open names file"); @@ -227,15 +298,15 @@ void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { auto entryIt = taxIDsAndEntries.find(taxonomyID); if (entryIt == taxIDsAndEntries.end()) { - taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, scientificName); + taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, scientificName); } else { entryIt->second.scientificName = scientificName; } } } -template -void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, +template +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, const std::string namesDumpFileName, const std::string nodesDumpFileName) { parseNodesDump(nodesDumpFileName); @@ -254,8 +325,8 @@ std::vector getSortedKeys(const std::unordered_map& return keys; } -template -void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { +template +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { for (TAXID& key : getSortedKeys(taxIDsAndEntries)) { const auto& entry = taxIDsAndEntries.at(key); outs << key << "\t" << entry.parentTaxonomyID << "\t" @@ -265,11 +336,11 @@ void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { -template -void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { +template +void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { std::ifstream inFile(inFileName); if (!inFile.is_open()) - throw std::runtime_error("unable to open taxonomy index file"); + throw std::runtime_error("unable to open taxonomy index file " + inFileName); TAXID taxonomyID, parentTaxonomyID; std::string scientificName, rank; @@ -280,7 +351,7 @@ void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { inFile.get(); // read tab std::getline(inFile, scientificName, '\t'); std::getline(inFile, rank, '\n'); - TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); + TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); //cerr << "inserting " << taxonomyID << ";" << parentTaxonomyID << ";" << rank << ";" << scientificName << endl; taxIDsAndEntries.insert({ @@ -292,16 +363,16 @@ void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { }); } -template -TAXID TaxonomyDB::getLowestCommonAncestor( +template +TAXID TaxonomyDB::getLowestCommonAncestor( const std::vector& taxIDs) const { if (taxIDs.size() == 0) { return 0; } - std::vector > paths; + std::vector > paths; for (auto& taxID : taxIDs) { bool good = true; - std::vector path; + std::vector path; TAXID tempTaxID = taxID; while (tempTaxID != 0) { path.push_back(tempTaxID); @@ -315,7 +386,7 @@ TAXID TaxonomyDB::getLowestCommonAncestor( for (auto& path : paths) std::reverse(path.begin(), path.end()); std::sort(paths.begin(), paths.end(), - [](std::vector i, std::vector j) { + [](std::vector i, std::vector j) { return i.size() < j.size(); }); TAXID consensus = 0; @@ -333,8 +404,8 @@ TAXID TaxonomyDB::getLowestCommonAncestor( return consensus; } -template -TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { +template +TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { auto entry = taxIDsAndEntries.find(taxID); if (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) return entry->second.parentTaxonomyID; @@ -342,8 +413,8 @@ TAXID TaxonomyDB::getParentTaxID(const TAXID 
taxID) const { return 0; } -template -std::string TaxonomyDB::getScientificName(const TAXID taxID) const { +template +std::string TaxonomyDB::getScientificName(const TAXID taxID) const { auto entry = taxIDsAndEntries.find(taxID); if (entry != taxIDsAndEntries.end()) { return entry->second.scientificName; @@ -351,8 +422,8 @@ std::string TaxonomyDB::getScientificName(const TAXID taxID) const { return std::string(); } -template -std::string TaxonomyDB::getRank(const TAXID taxID) const { +template +std::string TaxonomyDB::getRank(const TAXID taxID) const { auto entry = taxIDsAndEntries.find(taxID); if (entry != taxIDsAndEntries.end()) { return entry->second.rank; @@ -360,8 +431,8 @@ std::string TaxonomyDB::getRank(const TAXID taxID) const { return std::string(); } -template -std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { +template +std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { std::string lineage; while (true) { // 131567 = Cellular organisms @@ -379,8 +450,8 @@ std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { return lineage; } -template -std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { +template +std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { std::string rank = getRank(taxonomyID); if (rank == "superphylum") return std::string(); std::string lineage; @@ -420,8 +491,8 @@ std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { return lineage; } -template -TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, +template +TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, const std::string& rank) const { auto entry = taxIDsAndEntries.find(taxID); while (entry != taxIDsAndEntries.end() && @@ -434,8 +505,8 @@ TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, return 0; } -template -int TaxonomyDB::isBelowInTree(TAXID upper, TAXID lower) const { +template +int TaxonomyDB::isBelowInTree(TAXID upper, TAXID lower) const { auto entry = taxIDsAndEntries.find(lower); unsigned level = 0; while (entry != taxIDsAndEntries.end() && @@ -450,8 +521,8 @@ int TaxonomyDB::isBelowInTree(TAXID upper, TAXID lower) const { return -1; } -template -bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { +template +bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { bool isSubSpecies = false; auto entry = taxIDsAndEntries.find(taxonomyID); int numLevels = 0; @@ -469,70 +540,66 @@ bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { return isSubSpecies; } -template -void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { - for (auto& elem : taxon_counts) { - auto it = taxIDsAndEntries.find(elem.first); +template +void TaxonomyDB::addCounts(const TAXID taxid, const READCOUNTS& read_counts_) { + auto it = taxIDsAndEntries.find(taxid); if (it == taxIDsAndEntries.end()) { - cerr << "No taxonomy entry for " << elem.first << "!!" << endl; - continue; + cerr << "No taxonomy entry for " << taxid << "!!" 
<< endl; + return; } - TaxonomyEntry* tax = &it->second; - //cerr << "fill done: "<< elem.first << endl; - tax->numReadsAligned += elem.second.n_reads; - tax->numKmers += elem.second.n_kmers; - tax->kmers += elem.second.kmers; - - //std::cerr << "adding " << elem.second.n_reads << " to " << tax->scientificName << ": "; + TaxonomyEntry* tax = &it->second; + //cerr << taxid << " rc before: " << tax->read_counts << endl; + tax->read_counts += read_counts_; + //cerr << taxid << " rc after: " << tax->read_counts << endl; while (tax->parent != nullptr) { tax = tax->parent; - //std::cerr << " >> " << tax->scientificName; - tax->numReadsAlignedToChildren += elem.second.n_reads; - tax->numKmers += elem.second.n_kmers; - tax->kmers += elem.second.kmers; + tax->read_counts_children += read_counts_; } - //std::cerr << endl; +} + +template +void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { + for (auto& elem : taxon_counts) { + addCounts(elem.first, elem.second); } for (auto& tax : taxIDsAndEntries) { - std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); + std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); } } -template +template class TaxReport { private: std::ostream& _reportOfb; - TaxonomyDB & _taxdb; + TaxonomyDB & _taxdb; std::vector _report_cols; uint64_t _total_n_reads; bool _show_zeros; - void printLine(TaxonomyEntry& tax, unsigned depth); + void printLine(TaxonomyEntry& tax, unsigned depth); public: - TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); + TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); void printReport(std::string format, std::string rank); - void printReport(TaxonomyEntry& tax, unsigned depth); + void printReport(TaxonomyEntry& tax, unsigned depth); }; -template -TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { +template +TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; } -template -void TaxReport::printReport(std::string format, std::string rank) { +template +void TaxReport::printReport(std::string format, std::string rank) { _total_n_reads = - _taxdb.taxIDsAndEntries.at(0).numReadsAligned + - _taxdb.taxIDsAndEntries.at(0).numReadsAlignedToChildren + - _taxdb.taxIDsAndEntries.at(1).numReadsAligned + - _taxdb.taxIDsAndEntries.at(1).numReadsAlignedToChildren;// + - //_taxdb.taxIDsAndEntries.at(-1).numReadsAligned + - //_taxdb.taxIDsAndEntries.at(-1).numReadsAlignedToChildren; // -1 is a magic number in centrifuge for reads not matched to the taxonomy tree + reads(_taxdb.taxIDsAndEntries.at(0).read_counts) + + reads(_taxdb.taxIDsAndEntries.at(0).read_counts_children) + + reads(_taxdb.taxIDsAndEntries.at(1).read_counts) + + reads(_taxdb.taxIDsAndEntries.at(1).read_counts_children);// + if (_total_n_reads == 0) { std::cerr << "total number of reads is zero - not creating a report!" 
<< endl; return; @@ -553,34 +620,30 @@ void TaxReport::printReport(std::string format, std::string rank) { } } -template -void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { - - if (_show_zeros || (tax.numReadsAligned+tax.numReadsAlignedToChildren) > 0) { +template +void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { + if (_show_zeros || (reads(tax.read_counts)+reads(tax.read_counts_children)) > 0) { printLine(tax, depth); - - for (auto child : tax.children) { + for (auto child : tax.children) printReport(*child, depth+1); - } } - } -template -void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { +template +void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { for (auto& col : _report_cols) { switch (col) { case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? -1 : (int32_t) tax.taxonomyID); break; case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(tax.numReadsAligned + tax.numReadsAlignedToChildren)/_total_n_reads; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.read_counts) + reads(tax.read_counts_children))/_total_n_reads; break; //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; - case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (tax.numReadsAligned + tax.numReadsAlignedToChildren); break; - case REPORTCOLS::NUM_READS: _reportOfb << tax.numReadsAligned; break; - case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break; - case REPORTCOLS::NUM_KMERS: _reportOfb << tax.numKmers; break; + case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.read_counts) + reads(tax.read_counts_children)); break; + case REPORTCOLS::NUM_READS: _reportOfb << (tax.read_counts); break; + //case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break; + //case REPORTCOLS::NUM_KMERS: _reportOfb << tax.numKmers; break; //case REPORTCOLS::GENOME_SIZE: ; break; //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; //case REPORTCOLS::SUM_SCORE: ; break; @@ -596,6 +659,45 @@ void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { } + // Return lowest common ancestor of a and b + // LCA(0,x) = LCA(x,0) = x + // Default ancestor is 1 (root of tree) + uint32_t lca(unordered_map &parent_map, + uint32_t a, uint32_t b) + { + if (a == 0 || b == 0) + return a ? 
a : b; + + // create a path from a to the root + std::unordered_set a_path; + while (a > 0 && a != parent_map[a]) { + if (a == b) + return a; + a_path.insert(a); + a = parent_map[a]; + } + + // search for b in the path from a to the root + while (b > 0 && b != parent_map[b]) { + if (a_path.count(b) > 0) + return b; + b = parent_map[b]; + } + return 1; + } + +template +inline +V find_or_use_default(const std::unordered_map& my_map, const K& query, const V default_value) { + auto itr = my_map.find(query); + + if (itr == my_map.end()) { + return default_value; + } + + return itr->second; +} + #endif /* TAXD_DB_H_ */ From 53560de028f0781395e4200ca4f74fc1420efabe Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Tue, 11 Apr 2017 23:11:48 -0400 Subject: [PATCH 025/105] Renamed to kraken_hll --- install_kraken.sh | 4 ++-- scripts/{kraken => kraken_hll} | 0 scripts/{kraken-build => kraken_hll-build} | 0 scripts/{kraken-filter => kraken_hll-filter} | 0 scripts/{kraken-mpa-report => kraken_hll-mpa-report} | 0 scripts/{kraken-report => kraken_hll-report} | 0 scripts/{kraken-translate => kraken_hll-translate} | 0 scripts/standard_installation.sh | 8 ++++---- 8 files changed, 6 insertions(+), 6 deletions(-) rename scripts/{kraken => kraken_hll} (100%) rename scripts/{kraken-build => kraken_hll-build} (100%) rename scripts/{kraken-filter => kraken_hll-filter} (100%) rename scripts/{kraken-mpa-report => kraken_hll-mpa-report} (100%) rename scripts/{kraken-report => kraken_hll-report} (100%) rename scripts/{kraken-translate => kraken_hll-translate} (100%) diff --git a/install_kraken.sh b/install_kraken.sh index 803989c..b909336 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -19,7 +19,7 @@ set -e -VERSION="0.10.6-unreleased" +VERSION="0.10.7-kraken-hll" if [ -z "$1" ] || [ -n "$2" ] then @@ -58,7 +58,7 @@ echo "Kraken installation complete." 
echo echo "To make things easier for you, you may want to copy/symlink the following" echo "files into a directory in your PATH:" -for file in $KRAKEN_DIR/kraken* +for file in $KRAKEN_DIR/kraken_hll* do [ -x "$file" ] && echo " $file" done diff --git a/scripts/kraken b/scripts/kraken_hll similarity index 100% rename from scripts/kraken rename to scripts/kraken_hll diff --git a/scripts/kraken-build b/scripts/kraken_hll-build similarity index 100% rename from scripts/kraken-build rename to scripts/kraken_hll-build diff --git a/scripts/kraken-filter b/scripts/kraken_hll-filter similarity index 100% rename from scripts/kraken-filter rename to scripts/kraken_hll-filter diff --git a/scripts/kraken-mpa-report b/scripts/kraken_hll-mpa-report similarity index 100% rename from scripts/kraken-mpa-report rename to scripts/kraken_hll-mpa-report diff --git a/scripts/kraken-report b/scripts/kraken_hll-report similarity index 100% rename from scripts/kraken-report rename to scripts/kraken_hll-report diff --git a/scripts/kraken-translate b/scripts/kraken_hll-translate similarity index 100% rename from scripts/kraken-translate rename to scripts/kraken_hll-translate diff --git a/scripts/standard_installation.sh b/scripts/standard_installation.sh index b542a4f..341e4e0 100755 --- a/scripts/standard_installation.sh +++ b/scripts/standard_installation.sh @@ -31,10 +31,10 @@ then fi check_for_jellyfish.sh -kraken-build --db $KRAKEN_DB_NAME --download-taxonomy -kraken-build --db $KRAKEN_DB_NAME --download-library bacteria -kraken-build --db $KRAKEN_DB_NAME --download-library viruses -kraken-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ +kraken_hll-build --db $KRAKEN_DB_NAME --download-taxonomy +kraken_hll-build --db $KRAKEN_DB_NAME --download-library bacteria +kraken_hll-build --db $KRAKEN_DB_NAME --download-library viruses +kraken_hll-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ --jellyfish-hash-size "$KRAKEN_HASH_SIZE" \ --max-db-size "$KRAKEN_MAX_DB_SIZE" \ --minimizer-len $KRAKEN_MINIMIZER_LEN \ From c6cf5bcfd04c0efc5c5bdfaeefe25b4c2dc0fb51 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 12 Apr 2017 10:13:35 -0400 Subject: [PATCH 026/105] Fixed building of kraken_hll --- src/Makefile | 11 +- src/classify.cpp | 31 +----- src/krakenutil.hpp | 2 - src/readcounts.hpp | 30 ++++++ src/set_lcas.cpp | 7 +- src/taxdb.h | 244 ++++++++++++++++++++++++++------------------- 6 files changed, 183 insertions(+), 142 deletions(-) create mode 100644 src/readcounts.hpp diff --git a/src/Makefile b/src/Makefile index 73b6b9c..03f32cb 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ CXX = g++ -CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 -fmax-errors=3 -g +CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb LIBFLAGS = -L. 
-I./gzstream -L./gzstream -lz -lgzstream @@ -19,15 +19,14 @@ db_sort: krakendb.o quickfile.o set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o -classify: krakendb.o quickfile.o krakenutil.o seqreader.o taxdb.h +classify: krakendb.o quickfile.o krakenutil.o seqreader.o $(CXX) $(CXXFLAGS) -o classify classify.cpp $^ $(LIBFLAGS) -make_seqid_to_taxid_map: quickfile.o - build_taxdb: taxdb.h - $(CXX) $(CXXFLAGS) -o build_taxdb build_taxdb.cpp -krakenutil.o: krakenutil.cpp krakenutil.hpp +make_seqid_to_taxid_map: quickfile.o + +krakenutil.o: krakenutil.cpp krakenutil.hpp taxdb.h $(CXX) $(CXXFLAGS) -c krakenutil.cpp krakendb.o: krakendb.cpp krakendb.hpp quickfile.hpp diff --git a/src/classify.cpp b/src/classify.cpp index 1f5bbac..981f1f6 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -22,7 +22,7 @@ #include "krakenutil.hpp" #include "quickfile.hpp" #include "seqreader.hpp" -#include "hyperloglogplus.h" +#include "readcounts.hpp" #include "taxdb.h" #include "gzstream.h" @@ -66,7 +66,7 @@ ostream *Report_output; vector Open_fstreams; vector Open_gzstreams; size_t Work_unit_size = DEF_WORK_UNIT_SIZE; -TaxonomyDB taxdb; +TaxonomyDB taxdb; static vector KrakenDatabases (DB_filenames.size()); uint64_t total_classified = 0; @@ -79,29 +79,6 @@ inline bool ends_with(std::string const & value, std::string const & ending) return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); } -struct ReadCounts { - uint64_t n_reads = 0; - uint64_t n_kmers = 0; - HyperLogLogPlusMinus kmers; // unique k-mer count per taxon - void add_kmer(uint64_t kmer) { - ++ n_kmers; - kmers.add(kmer); - } - ReadCounts& operator+=(const ReadCounts& b) { - n_reads += b.n_reads; - n_kmers += b.n_kmers; - kmers += b.kmers; - return *this; - } -}; - -inline -uint64_t reads(const ReadCounts& read_count) { - return(read_count.n_reads); -} - - - ostream* cout_or_file(string file) { if (file == "-") return &cout; @@ -143,7 +120,7 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); if (!TaxDB_file.empty()) { - taxdb = TaxonomyDB(TaxDB_file); + taxdb = TaxonomyDB(TaxDB_file); for (const auto & tax : taxdb.taxIDsAndEntries) { if (tax.first != 0) Parent_map[tax.first] = tax.second.parentTaxonomyID; @@ -220,7 +197,7 @@ int main(int argc, char **argv) { if (Print_kraken_report) { taxdb.fillCounts(taxon_counts); - TaxReport rep = TaxReport(*Report_output, taxdb, false); + TaxReport rep = TaxReport(*Report_output, taxdb, false); rep.printReport("kraken","blu"); } diff --git a/src/krakenutil.hpp b/src/krakenutil.hpp index 30eb67d..97dd041 100644 --- a/src/krakenutil.hpp +++ b/src/krakenutil.hpp @@ -29,8 +29,6 @@ namespace kraken { // Return the lowest common ancestor of a and b, according to parent_map // NOTE: LCA(0,x) = LCA(x,0) = x - uint32_t lca(std::unordered_map &parent_map, - uint32_t a, uint32_t b); // Resolve classification tree uint32_t resolve_tree(std::unordered_map &hit_counts, diff --git a/src/readcounts.hpp b/src/readcounts.hpp new file mode 100644 index 0000000..9676514 --- /dev/null +++ b/src/readcounts.hpp @@ -0,0 +1,30 @@ + +#ifndef READCOUNTS_HPP +#define READCOUNTS_HPP + +#include "kraken_headers.hpp" +#include "hyperloglogplus.h" + +namespace kraken { + struct ReadCounts { + uint64_t n_reads = 0; + uint64_t n_kmers = 0; + HyperLogLogPlusMinus kmers; // unique k-mer count per taxon + void add_kmer(uint64_t kmer) { + ++ n_kmers; + kmers.add(kmer); + } + ReadCounts& operator+=(const ReadCounts& b) { + n_reads += b.n_reads; + n_kmers += b.n_kmers; + kmers += b.kmers; + return *this; + } + }; + 
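(A sketch of how the accumulator above is meant to be used; this fragment is illustrative and not part of the patch. A classifier loop fills one ReadCounts per taxon and later hands the whole map to taxdb.fillCounts(), as in the classify.cpp changes in this same patch.)

    // Illustrative only: accumulate a classified read's k-mers per taxon.
    // Assumes hyperloglogplus.h from this repository is on the include path.
    #include <cstdint>
    #include <unordered_map>
    #include <vector>
    #include "readcounts.hpp"

    void count_read(std::unordered_map<uint32_t, kraken::ReadCounts>& taxon_counts,
                    uint32_t taxid, const std::vector<uint64_t>& kmer_hashes) {
      kraken::ReadCounts& rc = taxon_counts[taxid];   // default-constructed on first use
      ++rc.n_reads;                // one more classified read for this taxon
      for (uint64_t h : kmer_hashes)
        rc.add_kmer(h);            // bumps n_kmers and feeds the HLL sketch
    }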
+ inline + uint64_t reads(const ReadCounts& read_count) { + return(read_count.n_reads); + } +} +#endif diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 3848d5d..0e60887 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -23,6 +23,7 @@ #include "krakenutil.hpp" #include "seqreader.hpp" #include "taxdb.h" +#include "readcounts.hpp" #include #define SKIP_LEN 50000 @@ -54,7 +55,7 @@ unordered_map Parent_map; unordered_map ID_to_taxon_map; unordered_map SeqId_added; KrakenDB Database; -TaxonomyDB taxdb; +TaxonomyDB taxdb; int main(int argc, char **argv) { #ifdef _OPENMP @@ -64,7 +65,7 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); if (!TaxDB_filename.empty() && !force_taxid) { - taxdb = TaxonomyDB(TaxDB_filename); + taxdb = TaxonomyDB(TaxDB_filename); for (const auto & tax : taxdb.taxIDsAndEntries) { if (tax.first != 0) Parent_map[tax.first] = tax.second.parentTaxonomyID; @@ -141,7 +142,7 @@ void process_single_file() { iss >> parent_taxid; taxid = ++New_taxid_start; Parent_map[taxid] = parent_taxid; - auto itEntry = taxdb.taxIDsAndEntries.insert({taxid, TaxonomyEntry(taxid, parent_taxid, "sequence")}); + auto itEntry = taxdb.taxIDsAndEntries.insert({taxid, TaxonomyEntry(taxid, parent_taxid, "sequence")}); if (!itEntry.second) cerr << "Taxonomy ID " << taxid << " already in Taxonomy DB? Shouldn't happen - run set_lcas without the XXX option." << endl; } else { diff --git a/src/taxdb.h b/src/taxdb.h index ce45bf8..ac3344f 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -33,7 +33,129 @@ using namespace std; -void log (const std::string& s) { +void log_msg (const std::string& s); + +template uint64_t string_to_T(std::string str); + +template +inline uint64_t reads(const T read_count); + +inline uint64_t reads(const uint64_t read_count); + +std::vector in_betweens(const std::string &s, const char start_char, const char end_char, size_t start_at = 0); + +std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields = 0, size_t end_chars = 0); + + +std::vector get_fields(const std::string &s, const std::string& delimiter, std::vector fields); + +template +class TaxonomyEntry { + public: + TAXID taxonomyID = 0; + TAXID parentTaxonomyID = 0; + std::string rank; + std::string scientificName; + + TaxonomyEntry() {} + + TaxonomyEntry(TAXID taxonomyID_, std::string scientificName_) : + taxonomyID(taxonomyID_), scientificName(scientificName_) {} + + TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_) : + taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_) {} + + TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_) : + taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_), scientificName(scientificName_) {} + + inline bool operator==(const TaxonomyEntry& other) const; + TaxonomyEntry* parent = nullptr; + std::vector children; + + READCOUNTS read_counts = READCOUNTS(); + READCOUNTS read_counts_children = READCOUNTS(); + + bool used = false; + uint64_t genomeSize = 0; + uint64_t genomeSizeOfChildren = 0; + uint64_t numBelow = 0; +}; + +//template<> +//TaxonomyEntry::TaxonomyEntry () { +// read_counts = 0; +// read_counts_children = 0; +//} + +template +struct TaxonomyEntryPtr_comp { + bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const; +}; + + +template +class TaxonomyDB { + public: + TaxonomyDB(const std::string inFileName); + TaxonomyDB(); + //std::unordered_map seqIDsAndTaxIds; + std::unordered_map > 
taxIDsAndEntries; + void parseNamesDump(const std::string namesDumpFileName); + void parseNodesDump(const std::string nodesDumpFileName); + TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const; + std::string getScientificName(const TAXID taxID) const; + std::string getRank(const TAXID taxID) const; + TAXID getLowestCommonAncestor(const std::vector& taxIDs) const; + TAXID getParentTaxID(const TAXID taxID) const; + std::unordered_map getParentMap() const; + std::unordered_map getScientificNameMap() const; + std::string getLineage(TAXID taxonomyID) const; + std::string getMetaPhlAnLineage(TAXID taxonomyID) const; + char* getIndexFileName(const TAXID hostTaxID) const; + void readTaxonomyIndex(const std::string inFileName); + void writeTaxonomyIndex(std::ostream & outs) const; + void writeTaxonomyIndex(std::ostream & outs, + const std::string namesDumpFileName, + const std::string nodesDumpFileName); + bool isSubSpecies(TAXID taxonomyID) const; + int isBelowInTree(TAXID upper, TAXID lower) const; + void addCounts(const TAXID taxid, const READCOUNTS& read_counts_); + void fillCounts(const std::unordered_map& taxon_counts); + void createPointers(); + void printReport(); +}; + + +template +class TaxReport { +private: + std::ostream& _reportOfb; + TaxonomyDB & _taxdb; + std::vector _report_cols; + uint64_t _total_n_reads; + bool _show_zeros; + + void printLine(TaxonomyEntry& tax, unsigned depth); + +public: + TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); + + void printReport(std::string format, std::string rank); + void printReport(TaxonomyEntry& tax, unsigned depth); +}; + + + // Return lowest common ancestor of a and b + // LCA(0,x) = LCA(x,0) = x + // Default ancestor is 1 (root of tree) +uint32_t lca(std::unordered_map &parent_map, uint32_t a, uint32_t b); + +template +inline +V find_or_use_default(const std::unordered_map& my_map, const K& query, const V default_value); + +//////////////////////////// DEFINITIONS +void log_msg (const std::string& s) { std::cerr << s << "\n"; } @@ -60,7 +182,7 @@ uint64_t reads(const uint64_t read_count) { return(read_count); } -std::vector in_betweens(const std::string &s, const char start_char, const char end_char, size_t start_at = 0) { +std::vector in_betweens(const std::string &s, const char start_char, const char end_char, size_t start_at) { std::vector tokens; size_t i = 0; size_t next_end = start_at-1; @@ -81,7 +203,7 @@ std::vector in_betweens(const std::string &s, const char start_char -std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields = 0, size_t end_chars = 0) { +std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields, size_t end_chars) { std::vector tokens(max_fields); size_t delim_length = delimiter.length(); size_t last = 0; @@ -123,86 +245,16 @@ std::vector get_fields(const std::string &s, const std::string& del +//template<> +//TaxonomyEntry::TaxonomyEntry () { +// read_counts = 0; +// read_counts_children = 0; +//} template -class TaxonomyEntry { - public: - TAXID taxonomyID = 0; - TAXID parentTaxonomyID = 0; - std::string rank; - std::string scientificName; - - TaxonomyEntry() {} - - TaxonomyEntry(TAXID taxonomyID_, std::string scientificName_) : - taxonomyID(taxonomyID_), scientificName(scientificName_) {} - - TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_) : - taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_) {} - - TaxonomyEntry(TAXID taxonomyID_, TAXID 
parentTaxonomyID_, std::string rank_, std::string scientificName_) : - taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_), scientificName(scientificName_) {} - - inline bool operator==(const TaxonomyEntry& other) const { - return this->taxonomyID == other.taxonomyID && - this->parentTaxonomyID == other.parentTaxonomyID && - this->scientificName == other.scientificName; - } - TaxonomyEntry* parent = nullptr; - std::vector children; - - READCOUNTS read_counts = 0; - READCOUNTS read_counts_children = 0; - - bool used = false; - uint64_t genomeSize = 0; - uint64_t genomeSizeOfChildren = 0; - uint64_t numBelow = 0; -}; - -template<> -TaxonomyEntry::TaxonomyEntry () { - read_counts = 0; - read_counts_children = 0; -} - -template -struct TaxonomyEntryPtr_comp { - bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { - return ((reads(a->read_counts)+reads(a->read_counts_children)) > (reads(b->read_counts)+reads(b->read_counts_children))); - } -}; +bool TaxonomyEntryPtr_comp::operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { + return ((reads(a->read_counts)+reads(a->read_counts_children)) > (reads(b->read_counts)+reads(b->read_counts_children))); + } -template -class TaxonomyDB { - public: - TaxonomyDB(const std::string inFileName); - TaxonomyDB() {}; - //std::unordered_map seqIDsAndTaxIds; - std::unordered_map > taxIDsAndEntries; - void parseNamesDump(const std::string namesDumpFileName); - void parseNodesDump(const std::string nodesDumpFileName); - TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const; - std::string getScientificName(const TAXID taxID) const; - std::string getRank(const TAXID taxID) const; - TAXID getLowestCommonAncestor(const std::vector& taxIDs) const; - TAXID getParentTaxID(const TAXID taxID) const; - std::unordered_map getParentMap() const; - std::unordered_map getScientificNameMap() const; - std::string getLineage(TAXID taxonomyID) const; - std::string getMetaPhlAnLineage(TAXID taxonomyID) const; - char* getIndexFileName(const TAXID hostTaxID) const; - void readTaxonomyIndex(const std::string inFileName); - void writeTaxonomyIndex(std::ostream & outs) const; - void writeTaxonomyIndex(std::ostream & outs, - const std::string namesDumpFileName, - const std::string nodesDumpFileName); - bool isSubSpecies(TAXID taxonomyID) const; - int isBelowInTree(TAXID upper, TAXID lower) const; - void addCounts(const TAXID taxid, const READCOUNTS& read_counts_); - void fillCounts(const unordered_map& taxon_counts); - void createPointers(); - void printReport(); -}; template std::unordered_map TaxonomyDB::getScientificNameMap() const { @@ -237,12 +289,15 @@ void TaxonomyDB::createPointers() { } } +template +TaxonomyDB::TaxonomyDB() { } + template TaxonomyDB::TaxonomyDB(const std::string inFileName) { - log("Building taxonomy index"); + log_msg("Building taxonomy index"); readTaxonomyIndex(inFileName); createPointers(); - log("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + + log_msg("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + " nodes"); } @@ -570,24 +625,6 @@ void TaxonomyDB::fillCounts(const unordered_map -class TaxReport { -private: - std::ostream& _reportOfb; - TaxonomyDB & _taxdb; - std::vector _report_cols; - uint64_t _total_n_reads; - bool _show_zeros; - - void printLine(TaxonomyEntry& tax, unsigned depth); - -public: - TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); - - void printReport(std::string format, std::string rank); - 
void printReport(TaxonomyEntry<TAXID, READCOUNTS>& tax, unsigned depth);
-};

-template<typename TAXID, typename READCOUNTS>
-TaxReport<TAXID, READCOUNTS>::TaxReport(std::ostream& reportOfb, TaxonomyDB<TAXID, READCOUNTS>& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) {
+template<typename TAXID, typename READCOUNTS>
+TaxReport<TAXID, READCOUNTS>::TaxReport(std::ostream& reportOfb, TaxonomyDB<TAXID, READCOUNTS>& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) {
 	_report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME};
 }
@@ -641,7 +678,7 @@ void TaxReport<TAXID, READCOUNTS>::printLine(TaxonomyEntry<TAXID, READCOUNTS>& tax
 			//case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break;
 			//case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break;
 			case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.read_counts) + reads(tax.read_counts_children)); break;
-			case REPORTCOLS::NUM_READS: _reportOfb << (tax.read_counts); break;
+			case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.read_counts); break;
 			//case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break;
 			//case REPORTCOLS::NUM_KMERS: _reportOfb << tax.numKmers; break;
 			//case REPORTCOLS::GENOME_SIZE: ; break;
 			//case REPORTCOLS::NUM_WEIGHTED_READS: ; break;
 			//case REPORTCOLS::SUM_SCORE: ; break;
@@ -662,8 +699,7 @@ void TaxReport<TAXID, READCOUNTS>::printLine(TaxonomyEntry<TAXID, READCOUNTS>& tax
 // Return lowest common ancestor of a and b
 // LCA(0,x) = LCA(x,0) = x
 // Default ancestor is 1 (root of tree)
-	uint32_t lca(unordered_map<uint32_t, uint32_t> &parent_map,
-		uint32_t a, uint32_t b)
+uint32_t lca(unordered_map<uint32_t, uint32_t> &parent_map, uint32_t a, uint32_t b)
 {
 	if (a == 0 || b == 0)
 		return a ? a : b;

From e76de48066c36941010560a1311fd9f5e3873662 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Wed, 12 Apr 2017 10:17:03 -0400
Subject: [PATCH 027/105] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 988deb5..a31d87c 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
-Kraken taxonomic sequence classification system with Unique K-mer Counting
+Kraken taxonomic sequence classification system with unique k-mer counting
 ===============================================

-[Kraken](https://github.com/DerrickWood/kraken) is a fast taxonomic classifier for metagenomics data. This project, kraken-hll, adds some additional functionality - most notably a unique k-mer count. Spurious identifications due to sequence contamination in the dataset or database often leads to many reads, however they usually cover only a small portion of the genome.
+[Kraken](https://github.com/DerrickWood/kraken) is a fast taxonomic classifier for metagenomics data. This project, kraken-hll, adds some additional functionality - most notably a unique k-mer count using the HyperLogLog algorithm. Spurious identifications due to sequence contamination in the dataset or database often lead to many reads; however, these reads usually cover only a small portion of the genome.

 kraken-hll adds two additional columns to the Kraken report - total number of k-mers observed for taxon, and the total number of unique k-mers observed for taxon (columns 3 and 4, resp.).
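(To make the README's claim concrete, here is an illustrative check built only from the ReadCounts fields introduced in patch 026. The 100-read floor and the one-unique-k-mer-per-read cutoff are made up for the example; they are not thresholds used by kraken-hll.)

    // Illustrative only: a contaminant hit tends to show many reads but
    // few distinct k-mers, i.e. low coverage of the reported genome.
    #include "readcounts.hpp"

    bool looks_spurious(kraken::ReadCounts& rc) {
      if (rc.n_reads < 100) return false;   // too few reads to judge
      double unique_per_read =
          double(rc.kmers.cardinality()) / double(rc.n_reads);
      return unique_per_read < 1.0;          // reads pile onto few k-mers
    }

kmers.cardinality() is the HyperLogLog estimate of the number of distinct k-mers (the same call the report columns use in patch 028), which is what makes a per-taxon check like this cheap.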
From 83c075f1b4cc329730bba8f032445a311e545a82 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Wed, 12 Apr 2017 15:10:49 -0400
Subject: [PATCH 028/105] Show number of unique k-mers

---
 src/gzstream/Makefile      |   2 +-
 src/gzstream/libgzstream.a | Bin 14254 -> 14622 bytes
 src/readcounts.hpp         |   2 +-
 src/taxdb.h                |   6 ++----
 4 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/gzstream/Makefile b/src/gzstream/Makefile
index 9884a9e..4c32088 100644
--- a/src/gzstream/Makefile
+++ b/src/gzstream/Makefile
@@ -35,7 +35,7 @@
 # CXX = CC -n32 -LANG:std # for SGI Irix 6.5, MIPSpro CC version 7.30
 CXX = g++ # for Linux RedHat 6.1, g++ version 2.95.2
-CPPFLAGS = -I. -O
+CPPFLAGS = -I. -O -fPIC
 LDFLAGS = -L. -lgzstream -lz
 AR = ar cr

diff --git a/src/gzstream/libgzstream.a b/src/gzstream/libgzstream.a
index 92861086535726f3d31314ecb9e4bfe8ba8724b8..5144238131823c9252a7f0c5ff9f3690b8235c5e 100644
GIT binary patch
literal 14622
[~14 KB of base85-encoded binary data omitted]

diff --git a/src/readcounts.hpp b/src/readcounts.hpp
index 9676514..486edbd 100644
--- a/src/readcounts.hpp
+++ b/src/readcounts.hpp
@@ -9,7 +9,7 @@ namespace kraken {
   struct ReadCounts {
     uint64_t n_reads = 0;
     uint64_t n_kmers = 0;
-    HyperLogLogPlusMinus kmers; // unique k-mer count per taxon
+    HyperLogLogPlusMinus kmers; // unique k-mer count per taxon
     void add_kmer(uint64_t kmer) {
       ++ n_kmers;
       kmers.add(kmer);

diff --git a/src/taxdb.h b/src/taxdb.h
index ac3344f..c8dd2bd 100644
--- a/src/taxdb.h
+++ b/src/taxdb.h
@@ -175,8 +175,6 @@
 uint64_t reads(const T read_count) {
 	return(0);
 }
-
-
 inline
 uint64_t reads(const uint64_t read_count) {
 	return(read_count);
@@ -679,8 +677,8 @@ void TaxReport<TAXID, READCOUNTS>::printLine(TaxonomyEntry<TAXID, READCOUNTS>& tax
 			//case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break;
 			case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.read_counts) + reads(tax.read_counts_children)); break;
 			case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.read_counts); break;
-			//case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break;
-			//case REPORTCOLS::NUM_KMERS: _reportOfb << tax.numKmers; break;
+			case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.read_counts.kmers.cardinality(); break;
+			case REPORTCOLS::NUM_KMERS: _reportOfb << tax.read_counts.n_kmers; break;
 			//case REPORTCOLS::GENOME_SIZE: ; break;
 			//case REPORTCOLS::NUM_WEIGHTED_READS: ; break;
 			//case REPORTCOLS::SUM_SCORE: ; break;

From cf4aeae02705e5250bc9dbeec93bb5d1c9caf3f3 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Thu, 4 May 2017 18:30:40 -0400
Subject: [PATCH 029/105] Start all scripts with kraken_hll

---
 scripts/kraken_hll                            |  4 +-
 ...ibrary.sh => kraken_hll-add_to_library.sh} |  0
 scripts/kraken_hll-build                      |  2 +-
 ...ld_kraken_db.sh => kraken_hll-build_db.sh} | 56 +++++++++++--------
 ...h.sh => kraken_hll-check_for_jellyfish.sh} | 20 +++++--
 .../{clean_db.sh => kraken_hll-clean_db.sh}   |  0
 ...file.pl => kraken_hll-cp_into_tempfile.pl} |  0
 ...=> kraken_hll-download_genomic_library.sh} |  0
 ...omy.sh => kraken_hll-download_taxonomy.sh} | 26 +++------
 ...ad_merger.pl => kraken_hll-read_merger.pl} |  0
 .../{shrink_db.sh => kraken_hll-shrink_db.sh} |  0
 ...sh => kraken_hll-standard_installation.sh} |  0
 ...upgrade_db.sh => kraken_hll-upgrade_db.sh} |  0
 ...ers.pl => kraken_hll-verify_gi_numbers.pl} |  0
 scripts/report_gi_numbers.pl                  | 51 -----------------
 src/set_lcas.cpp                              |  5 +-
 16 files changed, 63 insertions(+), 101 deletions(-)
 rename scripts/{add_to_library.sh => kraken_hll-add_to_library.sh} (100%)
 rename scripts/{build_kraken_db.sh => kraken_hll-build_db.sh} (76%)
 rename scripts/{check_for_jellyfish.sh => kraken_hll-check_for_jellyfish.sh} (68%)
 rename scripts/{clean_db.sh => kraken_hll-clean_db.sh} (100%)
 rename scripts/{cp_into_tempfile.pl => kraken_hll-cp_into_tempfile.pl} (100%)
 rename scripts/{download_genomic_library.sh => kraken_hll-download_genomic_library.sh} (100%)
 rename scripts/{download_taxonomy.sh => kraken_hll-download_taxonomy.sh} (75%)
 rename scripts/{read_merger.pl => kraken_hll-read_merger.pl} (100%)
 rename scripts/{shrink_db.sh =>
kraken_hll-shrink_db.sh} (100%)
 rename scripts/{standard_installation.sh => kraken_hll-standard_installation.sh} (100%)
 rename scripts/{upgrade_db.sh => kraken_hll-upgrade_db.sh} (100%)
 rename scripts/{verify_gi_numbers.pl => kraken_hll-verify_gi_numbers.pl} (100%)
 delete mode 100755 scripts/report_gi_numbers.pl

diff --git a/scripts/kraken_hll b/scripts/kraken_hll
index 6f2e290..b31fca3 100755
--- a/scripts/kraken_hll
+++ b/scripts/kraken_hll
@@ -206,8 +206,8 @@ sub usage {
 Usage: $PROG [options] <filename(s)>

 Options:
-  --db NAME               Name for Kraken DB
-                          (default: $default_db)
+  --db NAME               Name for Kraken DB (default: $default_db)
+  --report-file FILENAME  Write Kraken report to FILENAME
   --threads NUM           Number of threads (default: $def_thread_ct)
   --fasta-input           Input is FASTA format
   --fastq-input           Input is FASTQ format

diff --git a/scripts/add_to_library.sh b/scripts/kraken_hll-add_to_library.sh
similarity index 100%
rename from scripts/add_to_library.sh
rename to scripts/kraken_hll-add_to_library.sh

diff --git a/scripts/kraken_hll-build b/scripts/kraken_hll-build
index 1ddea52..8367fdd 100755
--- a/scripts/kraken_hll-build
+++ b/scripts/kraken_hll-build
@@ -298,7 +298,7 @@ sub build_database {
   $ENV{"KRAKEN_REBUILD_DATABASE"} = (defined $rebuild? 1 : 0);
   $ENV{"KRAKEN_ADD_TAXIDS_FOR_SEQ"} = $add_taxonomy_ids_for_seq;
   my $opt = ($verbose? "-x" : "");
-  exec "build_kraken_db.sh $opt";
+  exec "kraken_hll-build_db.sh $opt";
 }

 sub clean_database {

diff --git a/scripts/build_kraken_db.sh b/scripts/kraken_hll-build_db.sh
similarity index 76%
rename from scripts/build_kraken_db.sh
rename to scripts/kraken_hll-build_db.sh
index 6d86bf7..a8d3293 100755
--- a/scripts/build_kraken_db.sh
+++ b/scripts/kraken_hll-build_db.sh
@@ -37,6 +37,7 @@ function report_time_elapsed() {
 }

 start_time=$(date "+%s.%N")
+script_dir=`dirname $0`

 DATABASE_DIR="$KRAKEN_DB_NAME"
 FIND_OPTS=-L
@@ -59,17 +60,26 @@ fi

 if [ "$KRAKEN_REBUILD_DATABASE" == "1" ]
 then
-  rm -f database.* *.map lca.complete
+  rm -f database.* *.map lca.complete library/seq-files.txt
 fi

+if [ ! -f "library/seq-files.txt" ]; then
+  echo "Finding all library files"
+  find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' > library/seq-files.txt
+fi
+N_FILES=`cat library/seq-files.txt | wc -l`
+echo "Found $N_FILES sequence files (*.{fna,fa,ffn}) in the library"
+
 if [ -e "database.jdb" ]
 then
   echo "Skipping step 1, k-mer set already exists."
 else
-  echo "Creating k-mer set (step 1 of 5)..."
+  echo "Creating k-mer set (step 1 of 6)..."
   start_time1=$(date "+%s.%N")
-  check_for_jellyfish.sh
+  JELLYFISH_BIN=`$script_dir/kraken_hll-check_for_jellyfish.sh`
+  echo "Using $JELLYFISH_BIN"
+  [[ "$JELLYFISH_BIN" != "" ]] || exit 1
   # Estimate hash size as 1.15 * chars in library FASTA files
   if [ -z "$KRAKEN_HASH_SIZE" ]
   then
@@ -77,14 +87,14 @@ else
     echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'"
   fi

-  find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \
-  jellyfish count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \
+  cat library/seq-files.txt | tr '\n' '\0' | xargs -0 cat | \
+  $JELLYFISH_BIN count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \
     -o database /dev/fd/0

   # Merge only if necessary
   if [ -e "database_1" ]
   then
-    jellyfish merge -o database.jdb.tmp database_*
+    $JELLYFISH_BIN merge -o database.jdb.tmp database_*
   else
     mv database_0 database.jdb.tmp
   fi
@@ -111,7 +121,7 @@ else
   then
     echo "Skipping step 2, database reduction unnecessary."
else - echo "Reducing database size (step 2 of 5)..." + echo "Reducing database size (step 2 of 6)..." max_kdb_size=$(echo "$KRAKEN_MAX_DB_SIZE*2^30 - $idx_size" | bc) idx_size_gb=$(printf %.2f $(echo "$idx_size/2^30" | bc) ) if (( $(echo "$max_kdb_size < 0" | bc) == 1 )) @@ -143,7 +153,7 @@ if [ -e "database.kdb" ] then echo "Skipping step 3, k-mer set already sorted." else - echo "Sorting k-mer set (step 3 of 5)..." + echo "Sorting k-mer set (step 3 of 6)..." start_time1=$(date "+%s.%N") db_sort -z $MEMFLAG -t $KRAKEN_THREAD_CT -n $KRAKEN_MINIMIZER_LEN \ -d database.jdb -o database.kdb.tmp \ @@ -159,38 +169,37 @@ if [ -e "seqid2taxid.map" ] then echo "Skipping step 4, seqID to taxID map already complete." else - echo "Creating seqID to taxID map (step 4 of 5)... [blu]" -# start_time1=$(date "+%s.%N") -# make_seqid_to_taxid_map taxonomy/gi_taxid_nucl.dmp gi2seqid.map \ -# > seqid2taxid.map.tmp -# mv seqid2taxid.map.tmp seqid2taxid.map -# line_ct=$(wc -l seqid2taxid.map | awk '{print $1}') - -# echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" + echo "Creating seqID to taxID map (step 4 of 6).." + start_time1=$(date "+%s.%N") + cat library/seq-files.txt | tr '\n' '\0' | xargs -0 grep '^>' | sed 's/.//' | sed 's/ .*//' | sort > library/seq-headers.txt + join -t $'\t' nucl_gb.accession2taxid.sorted library/seq-headers.txt > seqid2taxid.map.tmp + mv seqid2taxid.map.tmp seqid2taxid.map + line_ct=$(wc -l seqid2taxid.map | awk '{print $1}') + + echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" fi if [ -s "taxDB" ] then - echo "Skipping step 4.5, taxDB exists." + echo "Skipping step 5, taxDB exists." else - echo "Creating taxDB (step 4.5 of 5)... " + echo "Creating taxDB (step 5 of 6)... " build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp > taxDB fi - if [ -e "lca.complete" ] then - echo "Skipping step 5, LCAs already set." + echo "Skipping step 6, LCAs already set." else - echo "Setting LCAs in database (step 5 of 5)..." + echo "Setting LCAs in database (step 6 of 6)..." PARAM="" if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ]]; then echo " Adding taxonomy IDs for sequences" PARAM=" -a" fi start_time1=$(date "+%s.%N") - find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -exec cat {} + | \ + cat library/seq-files.txt | tr '\n' '\0' | xargs -0 cat | \ set_lcas $MEMFLAG -x -d database.kdb -i database.idx -v \ -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 touch "lca.complete" @@ -198,4 +207,5 @@ else echo "Database LCAs set. [$(report_time_elapsed $start_time1)]" fi -echo "Database construction complete. [Total: $(report_time_elapsed $start_time)]" +echo "Database construction complete. [Total: $(report_time_elapsed $start_time)] +You can delete all files but database.{kdb,idx} and taxDB now, if you want" diff --git a/scripts/check_for_jellyfish.sh b/scripts/kraken_hll-check_for_jellyfish.sh similarity index 68% rename from scripts/check_for_jellyfish.sh rename to scripts/kraken_hll-check_for_jellyfish.sh index 63cc620..9143b62 100755 --- a/scripts/check_for_jellyfish.sh +++ b/scripts/kraken_hll-check_for_jellyfish.sh @@ -1,6 +1,7 @@ #!/bin/bash # Copyright 2013-2015, Derrick Wood +# modified by Florian Breitwieser, 2017 # # This file is part of the Kraken taxonomic sequence classification system. # @@ -24,12 +25,23 @@ set -u # Protect against uninitialized vars. 
set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands -JELLYFISH_VERSION=$(jellyfish --version | awk '{print $2}') +JELLYFISH_BIN="jellyfish" +if hash jellyfish1 2>/dev/null; then + JELLYFISH_BIN="jellyfish1" +elif hash jellyfish 2>/dev/null; then + JELLYFISH_BIN="jellyfish" +else + echo "Did not find jellyfish!" 1>&2 + exit 1 +fi + +JELLYFISH_VERSION=$( $JELLYFISH_BIN --version | awk '{print $2}') if [[ $JELLYFISH_VERSION =~ ^1\. ]] then - echo "Found jellyfish v$JELLYFISH_VERSION" + echo "Found jellyfish v$JELLYFISH_VERSION" 1>&2 else - echo "Found jellyfish v$JELLYFISH_VERSION" - echo "Kraken requires jellyfish version 1" + echo "Found jellyfish v$JELLYFISH_VERSION" 1>&2 + echo "Kraken requires jellyfish version 1" 1>&2 exit 1 fi +echo $JELLYFISH_BIN diff --git a/scripts/clean_db.sh b/scripts/kraken_hll-clean_db.sh similarity index 100% rename from scripts/clean_db.sh rename to scripts/kraken_hll-clean_db.sh diff --git a/scripts/cp_into_tempfile.pl b/scripts/kraken_hll-cp_into_tempfile.pl similarity index 100% rename from scripts/cp_into_tempfile.pl rename to scripts/kraken_hll-cp_into_tempfile.pl diff --git a/scripts/download_genomic_library.sh b/scripts/kraken_hll-download_genomic_library.sh similarity index 100% rename from scripts/download_genomic_library.sh rename to scripts/kraken_hll-download_genomic_library.sh diff --git a/scripts/download_taxonomy.sh b/scripts/kraken_hll-download_taxonomy.sh similarity index 75% rename from scripts/download_taxonomy.sh rename to scripts/kraken_hll-download_taxonomy.sh index fa73616..fc27842 100755 --- a/scripts/download_taxonomy.sh +++ b/scripts/kraken_hll-download_taxonomy.sh @@ -31,30 +31,18 @@ THIS_DIR=$PWD mkdir -p "$TAXONOMY_DIR" cd "$TAXONOMY_DIR" -if [ ! -e "gimap.dlflag" ] +if [ ! -e "nucl_gb.accession2taxid.flag" ] then - wget $FTP_SERVER/pub/taxonomy/gi_taxid_nucl.dmp.gz - touch gimap.dlflag - echo "Downloaded GI to taxon map" -fi - -if [ ! -e "taxdump.dlflag" ] -then - wget $FTP_SERVER/pub/taxonomy/taxdump.tar.gz - touch taxdump.dlflag - echo "Downloaded taxonomy tree data" -fi - -if [ ! -e "gimap.flag" ] -then - gunzip gi_taxid_nucl.dmp.gz - touch gimap.flag - echo "Uncompressed GI to taxon map" + wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz + time gunzip -c nucl_gb.accession2taxid.gz | cut -f 2,3 | sort -k 1,1 > nucl_gb.accession2taxid.sorted + touch nucl_gb.accession2taxid.flag + echo "Downloaded and sorted GB to taxon map" fi if [ ! 
-e "taxdump.flag" ] then + wget $FTP_SERVER/pub/taxonomy/taxdump.tar.gz tar zxf taxdump.tar.gz touch taxdump.flag - echo "Uncompressed taxonomy tree data" + echo "Downloaded and uncompressed taxonomy tree data" fi diff --git a/scripts/read_merger.pl b/scripts/kraken_hll-read_merger.pl similarity index 100% rename from scripts/read_merger.pl rename to scripts/kraken_hll-read_merger.pl diff --git a/scripts/shrink_db.sh b/scripts/kraken_hll-shrink_db.sh similarity index 100% rename from scripts/shrink_db.sh rename to scripts/kraken_hll-shrink_db.sh diff --git a/scripts/standard_installation.sh b/scripts/kraken_hll-standard_installation.sh similarity index 100% rename from scripts/standard_installation.sh rename to scripts/kraken_hll-standard_installation.sh diff --git a/scripts/upgrade_db.sh b/scripts/kraken_hll-upgrade_db.sh similarity index 100% rename from scripts/upgrade_db.sh rename to scripts/kraken_hll-upgrade_db.sh diff --git a/scripts/verify_gi_numbers.pl b/scripts/kraken_hll-verify_gi_numbers.pl similarity index 100% rename from scripts/verify_gi_numbers.pl rename to scripts/kraken_hll-verify_gi_numbers.pl diff --git a/scripts/report_gi_numbers.pl b/scripts/report_gi_numbers.pl deleted file mode 100755 index 0d07b85..0000000 --- a/scripts/report_gi_numbers.pl +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/perl - -# Copyright 2013-2015, Derrick Wood -# -# This file is part of the Kraken taxonomic sequence classification system. -# -# Kraken is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Kraken is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Kraken. If not, see . - -# Reads multi-FASTA input and for each sequence ID reports a -# tab-delimited line: -# -# -# or in the case of a sequence with Kraken taxid information: -# -# TAXID -# -# Assumes all sequence IDs actually have GI numbers or Kraken -# taxid information. - -use strict; -use warnings; -use File::Basename; - -my $PROG = basename $0; - -while (<>) { - next unless /^>(\S+)/; - my $seq_id = $1; - if ($seq_id =~ /(^|\|)kraken:taxid\|(\d+)/) { - - print "TAXID\t$2\t$seq_id\t$_\n"; - next; - } - - if ($seq_id !~ /(^|\|)gi\|(\d+)/) { - die "$PROG: sequence ID $seq_id lacks GI number, aborting.\n"; - } - - print "$2\t$seq_id\t$_\n"; -} diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 0e60887..61504d7 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -144,7 +144,7 @@ void process_single_file() { Parent_map[taxid] = parent_taxid; auto itEntry = taxdb.taxIDsAndEntries.insert({taxid, TaxonomyEntry(taxid, parent_taxid, "sequence")}); if (!itEntry.second) - cerr << "Taxonomy ID " << taxid << " already in Taxonomy DB? Shouldn't happen - run set_lcas without the XXX option." << endl; + cerr << "Taxonomy ID " << taxid << " already in Taxonomy DB? Shouldn't happen - run set_lcas without the -a option." 
<< endl; } else { iss >> taxid; } @@ -172,6 +172,9 @@ void process_single_file() { string prefix = "kraken:taxid|"; if (dna.id.substr(0,prefix.size()) == prefix) { taxid = std::atoi(dna.id.substr(prefix.size()).c_str()); + const auto strBegin = dna.header_line.find_first_not_of("\t "); + if (strBegin != std::string::npos) + dna.header_line = dna.header_line.substr(strBegin); } else { taxid = ID_to_taxon_map[dna.id]; } From e683cbf7eae40e75f2de07c0d9cec62437705cc3 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 4 May 2017 19:44:21 -0400 Subject: [PATCH 030/105] Refactor TaxonomyDB constructor --- src/taxdb.h | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/taxdb.h b/src/taxdb.h index c8dd2bd..0e449e5 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -96,12 +96,10 @@ struct TaxonomyEntryPtr_comp { template class TaxonomyDB { public: + TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName); TaxonomyDB(const std::string inFileName); - TaxonomyDB(); - //std::unordered_map seqIDsAndTaxIds; - std::unordered_map > taxIDsAndEntries; - void parseNamesDump(const std::string namesDumpFileName); - void parseNodesDump(const std::string nodesDumpFileName); + void writeTaxonomyIndex(std::ostream & outs) const; + TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const; std::string getScientificName(const TAXID taxID) const; std::string getRank(const TAXID taxID) const; @@ -111,18 +109,21 @@ class TaxonomyDB { std::unordered_map getScientificNameMap() const; std::string getLineage(TAXID taxonomyID) const; std::string getMetaPhlAnLineage(TAXID taxonomyID) const; - char* getIndexFileName(const TAXID hostTaxID) const; - void readTaxonomyIndex(const std::string inFileName); - void writeTaxonomyIndex(std::ostream & outs) const; - void writeTaxonomyIndex(std::ostream & outs, - const std::string namesDumpFileName, - const std::string nodesDumpFileName); + bool isSubSpecies(TAXID taxonomyID) const; int isBelowInTree(TAXID upper, TAXID lower) const; + void addCounts(const TAXID taxid, const READCOUNTS& read_counts_); void fillCounts(const std::unordered_map& taxon_counts); - void createPointers(); void printReport(); + + std::unordered_map > taxIDsAndEntries; + private: + TaxonomyDB(); + void readTaxonomyIndex(const std::string inFileName); + void parseNamesDump(const std::string namesDumpFileName); + void parseNodesDump(const std::string nodesDumpFileName); + void createPointers(); }; @@ -292,13 +293,21 @@ TaxonomyDB::TaxonomyDB() { } template TaxonomyDB::TaxonomyDB(const std::string inFileName) { - log_msg("Building taxonomy index"); + log_msg("Building taxonomy index from " + inFileName); readTaxonomyIndex(inFileName); createPointers(); log_msg("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + " nodes"); } +template +TaxonomyDB::TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName) { + log_msg("Building taxonomy index from " + nodesDumpFileName + " and " + namesDumpFileName); + parseNodesDump(nodesDumpFileName); + parseNamesDump(namesDumpFileName); + log_msg("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + " nodes"); +} + template void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { std::ifstream nodesDumpFile(nodesDumpFileName); @@ -358,15 +367,6 @@ void TaxonomyDB::parseNamesDump(const std::string namesDumpFil } } -template -void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, - const 
std::string namesDumpFileName, - const std::string nodesDumpFileName) { - parseNodesDump(nodesDumpFileName); - parseNamesDump(namesDumpFileName); - writeTaxonomyIndex(outs); -} - template std::vector getSortedKeys(const std::unordered_map& unordered) { std::vector keys; From ffea4a786f5fbcb937780c4117edc40f6dd86de0 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 4 May 2017 19:44:43 -0400 Subject: [PATCH 031/105] Build taxDB at the end --- scripts/kraken_hll-build_db.sh | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/scripts/kraken_hll-build_db.sh b/scripts/kraken_hll-build_db.sh index a8d3293..da4187d 100755 --- a/scripts/kraken_hll-build_db.sh +++ b/scripts/kraken_hll-build_db.sh @@ -41,6 +41,7 @@ script_dir=`dirname $0` DATABASE_DIR="$KRAKEN_DB_NAME" FIND_OPTS=-L +JELLYFISH_BIN=`$script_dir/kraken_hll-check_for_jellyfish.sh` if [ ! -d "$DATABASE_DIR" ] then @@ -77,7 +78,6 @@ else echo "Creating k-mer set (step 1 of 6)..." start_time1=$(date "+%s.%N") - JELLYFISH_BIN=`$script_dir/kraken_hll-check_for_jellyfish.sh` echo "Using $JELLYFISH_BIN" [[ "$JELLYFISH_BIN" != "" ]] || exit 1 # Estimate hash size as 1.15 * chars in library FASTA files @@ -179,20 +179,11 @@ else echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" fi -if [ -s "taxDB" ] -then - echo "Skipping step 5, taxDB exists." -else - echo "Creating taxDB (step 5 of 6)... " - build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp > taxDB -fi - - if [ -e "lca.complete" ] then - echo "Skipping step 6, LCAs already set." + echo "Skipping step 5, LCAs already set." else - echo "Setting LCAs in database (step 6 of 6)..." + echo "Setting LCAs in database (step 5 of 6)..." PARAM="" if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ]]; then echo " Adding taxonomy IDs for sequences" @@ -207,5 +198,15 @@ else echo "Database LCAs set. [$(report_time_elapsed $start_time1)]" fi +if [ -s "taxDB" ] +then + echo "Skipping step 6, taxDB exists." +else + echo "Creating taxDB (step 6 of 6)... " + build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp > taxDB.dmp + mv taxDB.tmp taxDB +fi + + echo "Database construction complete.
[Total: $(report_time_elapsed $start_time)] You can delete all files but database.{kdb,idx} and taxDB now, if you want" From c550a1b8ef0b92e9c586f1a53d0f75f85dc4e4f2 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 4 May 2017 19:45:12 -0400 Subject: [PATCH 032/105] Update build_taxdb.cpp --- src/build_taxdb.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index 2710d82..3432294 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -27,8 +27,7 @@ int main(int argc, char **argv) { std::cout << "Provide names.dmp and nodes.dmp\n"; return 1; } - TaxonomyDB taxdb; - taxdb.writeTaxonomyIndex( - std::cout, argv[1], argv[2]); + TaxonomyDB taxdb(argv[1], argv[2]); + taxdb.writeTaxonomyIndex(std::cout); } From 112b89d5b31cdc9048eb4bbe4edbf33e2488c491 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 5 May 2017 13:19:47 -0400 Subject: [PATCH 033/105] Include number of k-mers in database in taxDB --- scripts/kraken_hll-build_db.sh | 3 +- src/build_taxdb.cpp | 20 +++-- src/classify.cpp | 2 +- src/gzstream/libgzstream.a | Bin 14622 -> 14814 bytes src/krakendb.cpp | 21 +++++ src/krakendb.hpp | 5 ++ src/report-cols.h | 1 + src/taxdb.h | 145 +++++++++++++++++++++------------ 8 files changed, 140 insertions(+), 57 deletions(-) diff --git a/scripts/kraken_hll-build_db.sh b/scripts/kraken_hll-build_db.sh index da4187d..75a678d 100755 --- a/scripts/kraken_hll-build_db.sh +++ b/scripts/kraken_hll-build_db.sh @@ -203,7 +203,8 @@ then echo "Skipping step 6, taxDB exists." else echo "Creating taxDB (step 6 of 6)... " - build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp > taxDB.dmp + jellyfish1 dump database.kdb | grep '^>' | sed 's/.//' | sort | uniq -c | sort -rn | sed 's/^ *\([0-9]\+\) \+\([0-9]\+\)$/\2\t\1/' > database.taxon_count + build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp database.taxon_count > taxDB.tmp mv taxDB.tmp taxDB fi diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index 3432294..f4a4957 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -18,16 +18,26 @@ */ #include "taxdb.h" - +#include "quickfile.hpp" #include +#include +#include + using namespace std; int main(int argc, char **argv) { - if (argc != 3) { - std::cout << "Provide names.dmp and nodes.dmp\n"; + if (argc < 3 || argc > 4) { + std::cerr << "Usage: a.out names.dmp nodes.dmp [taxon-counts]\n"; return 1; } - TaxonomyDB taxdb(argv[1], argv[2]); + TaxonomyDB taxdb {(string)argv[1], (string)argv[2]}; + if (argc == 4) { + ifstream ifs(argv[3]); + uint32_t taxon; uint64_t count; + while (ifs >> taxon >> count) { + taxdb.setGenomeSize(taxon, count); + } + taxdb.genomeSizes_are_set = true; + } taxdb.writeTaxonomyIndex(std::cout); - } diff --git a/src/classify.cpp b/src/classify.cpp index 981f1f6..690715e 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -196,7 +196,7 @@ int main(int argc, char **argv) { std::cerr << "Finishing up ..\n"; if (Print_kraken_report) { - taxdb.fillCounts(taxon_counts); + taxdb.setReadCounts(taxon_counts); TaxReport rep = TaxReport(*Report_output, taxdb, false); rep.printReport("kraken","blu"); } diff --git a/src/gzstream/libgzstream.a b/src/gzstream/libgzstream.a index 5144238131823c9252a7f0c5ff9f3690b8235c5e..916d33d4ec4647b96c6db9a1c8625146f6079612 100644 Binary files a/src/gzstream/libgzstream.a and b/src/gzstream/libgzstream.a differ
diff --git a/src/krakendb.cpp b/src/krakendb.cpp --- a/src/krakendb.cpp +++ b/src/krakendb.cpp using std::string; using std::vector; @@ -68,6 +69,26 @@ KrakenDB::KrakenDB(char *ptr) { key_len = key_bits / 8 + !! (key_bits % 8); } +std::unordered_map KrakenDB::count_taxons() { + throw std::runtime_error("count_taxons() is not working"); + // Not working currently!!
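+ // Note (editorial assumption, not in the original patch): a likely culprit
+ // is the pointer arithmetic below. In "(uint32_t *) ptr + pair_sz * i + key_len"
+ // the cast binds before the additions, so the byte offsets pair_sz*i and
+ // key_len are scaled by sizeof(uint32_t). Casting after offsetting would
+ // read the intended field: uint32_t *taxon = (uint32_t *) (ptr + pair_sz * i + key_len);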
+ char *ptr = get_pair_ptr(); + size_t pair_sz = pair_size(); + + std::unordered_map taxon_counts; + for (uint64_t i = 0; i < key_ct; i++) { + uint32_t* taxon = (uint32_t *) ptr + pair_sz * i + key_len; + if (taxon == NULL) { + std::cerr << "taxon is NULL (i is " << i << " and key_ct is " << key_ct << ")" << std::endl; + } else { + uint32_t taxon_i = *taxon; + ++taxon_counts[taxon_i]; + } + } + return taxon_counts; +} + + // Creates an index, indicating starting positions of each bin // Bins contain k-mer/taxon pairs with k-mers that share a bin key void KrakenDB::make_index(string index_filename, uint8_t nt) { diff --git a/src/krakendb.hpp b/src/krakendb.hpp index c30eeb5..f586026 100644 --- a/src/krakendb.hpp +++ b/src/krakendb.hpp @@ -21,6 +21,7 @@ #define KRAKENDB_HPP #include "kraken_headers.hpp" +#include namespace kraken { class KrakenDBIndex { @@ -60,6 +61,10 @@ namespace kraken { uint32_t *kmer_query(uint64_t kmer, uint64_t *last_bin_key, int64_t *min_pos, int64_t *max_pos, bool retry_on_failure=true); + + + // return a count of k-mers for all taxons + std::unordered_map count_taxons(); // return "bin key" for kmer, based on index // If idx_nt not specified, use index's value diff --git a/src/report-cols.h b/src/report-cols.h index 007eef5..ff19275 100644 --- a/src/report-cols.h +++ b/src/report-cols.h @@ -22,6 +22,7 @@ enum class REPORTCOLS : uint8_t { NUM_READS_CLADE, NUM_KMERS, NUM_UNIQUE_KMERS, + NUM_KMERS_IN_DATABASE, TOTAL_SCORE, TOTAL_HIT_LENGTH, ABUNDANCE, diff --git a/src/taxdb.h b/src/taxdb.h index 0e449e5..12518da 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -65,15 +65,16 @@ class TaxonomyEntry { TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_) : taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_) {} - TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_) : - taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_), scientificName(scientificName_) {} + TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_, uint64_t genomeSize_ = 0, uint64_t genomeSizeOfChildren_ = 0) : + taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_), scientificName(scientificName_), + genomeSize(genomeSize_), genomeSizeOfChildren(genomeSizeOfChildren_) {} inline bool operator==(const TaxonomyEntry& other) const; TaxonomyEntry* parent = nullptr; std::vector children; - READCOUNTS read_counts = READCOUNTS(); - READCOUNTS read_counts_children = READCOUNTS(); + READCOUNTS readCounts = READCOUNTS(); + READCOUNTS readCountsOfChildren = READCOUNTS(); bool used = false; uint64_t genomeSize = 0; @@ -83,8 +84,8 @@ class TaxonomyEntry { //template<> //TaxonomyEntry::TaxonomyEntry () { -// read_counts = 0; -// read_counts_children = 0; +// readCounts = 0; +// readCountsOfChildren = 0; //} template @@ -97,7 +98,8 @@ template class TaxonomyDB { public: TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName); - TaxonomyDB(const std::string inFileName); + TaxonomyDB(const std::string inFileName, bool hasGenomeSizes = false); + TaxonomyDB(); void writeTaxonomyIndex(std::ostream & outs) const; TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const; @@ -113,17 +115,21 @@ class TaxonomyDB { bool isSubSpecies(TAXID taxonomyID) const; int isBelowInTree(TAXID upper, TAXID lower) const; - void addCounts(const TAXID taxid, const READCOUNTS& read_counts_); - void fillCounts(const 
std::unordered_map& taxon_counts); + void setGenomeSizes(const std::unordered_map & genomeSizes); + void setReadCounts(const std::unordered_map& readCounts); + void setGenomeSize(const TAXID taxid, const uint64_t genomeSize); + void addReadCount(const TAXID taxid, const READCOUNTS& readCounts_); + void printReport(); std::unordered_map > taxIDsAndEntries; + bool genomeSizes_are_set = false; private: - TaxonomyDB(); - void readTaxonomyIndex(const std::string inFileName); + std::unordered_map > + readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes); void parseNamesDump(const std::string namesDumpFileName); void parseNodesDump(const std::string nodesDumpFileName); - void createPointers(); + void createPointers(std::unordered_map >& taxIDsAndEntries); }; @@ -243,15 +249,14 @@ std::vector get_fields(const std::string &s, const std::string& del } - //template<> //TaxonomyEntry::TaxonomyEntry () { -// read_counts = 0; -// read_counts_children = 0; +// readCounts = 0; +// readCountsOfChildren = 0; //} template bool TaxonomyEntryPtr_comp::operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { - return ((reads(a->read_counts)+reads(a->read_counts_children)) > (reads(b->read_counts)+reads(b->read_counts_children))); + return ((reads(a->readCounts)+reads(a->readCountsOfChildren)) > (reads(b->readCounts)+reads(b->readCountsOfChildren))); } @@ -276,7 +281,7 @@ unordered_map TaxonomyDB::getParentMap() const { } template -void TaxonomyDB::createPointers() { +void TaxonomyDB::createPointers(std::unordered_map >& taxIDsAndEntries) { for (auto& tax : taxIDsAndEntries) { if (tax.second.parentTaxonomyID != tax.first) { auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); @@ -292,20 +297,17 @@ template TaxonomyDB::TaxonomyDB() { } template -TaxonomyDB::TaxonomyDB(const std::string inFileName) { - log_msg("Building taxonomy index from " + inFileName); - readTaxonomyIndex(inFileName); - createPointers(); - log_msg("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + - " nodes"); -} +TaxonomyDB::TaxonomyDB(const std::string inFileName, bool hasGenomeSizes) : + taxIDsAndEntries( readTaxonomyIndex(inFileName, hasGenomeSizes) ), genomeSizes_are_set(hasGenomeSizes) + { } template TaxonomyDB::TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName) { log_msg("Building taxonomy index from " + nodesDumpFileName + " and " + namesDumpFileName); parseNodesDump(nodesDumpFileName); parseNamesDump(namesDumpFileName); - log_msg("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + " nodes"); + createPointers(taxIDsAndEntries); + log_msg("Built a tree with " + std::to_string(taxIDsAndEntries.size()) + " taxa"); } template @@ -382,29 +384,49 @@ template void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { for (TAXID& key : getSortedKeys(taxIDsAndEntries)) { const auto& entry = taxIDsAndEntries.at(key); - outs << key << "\t" << entry.parentTaxonomyID << "\t" - << entry.scientificName << "\t" << entry.rank << "\n"; + outs << key << '\t' << entry.parentTaxonomyID << '\t' + << entry.scientificName << '\t' << entry.rank; + if (genomeSizes_are_set) { + outs << '\t' << entry.genomeSize << '\t' << entry.genomeSizeOfChildren; + } + outs << '\n'; } + outs.flush(); } - +template +void TaxonomyDB::setGenomeSizes(const std::unordered_map & genomeSizes) { + for (const auto& it : genomeSizes) { + setGenomeSize(it.first, it.second); + } + genomeSizes_are_set = true; +} template -void TaxonomyDB::readTaxonomyIndex(const 
std::string inFileName) { +std::unordered_map > + TaxonomyDB::readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes) { + log_msg("Reading taxonomy index from " + inFileName); std::ifstream inFile(inFileName); if (!inFile.is_open()) throw std::runtime_error("unable to open taxonomy index file " + inFileName); + std::unordered_map > taxIDsAndEntries; TAXID taxonomyID, parentTaxonomyID; std::string scientificName, rank; + uint64_t genomeSize, genomeSizeOfChildren = 0; std::string line; while (!inFile.eof()) { inFile >> taxonomyID >> parentTaxonomyID; inFile.get(); // read tab std::getline(inFile, scientificName, '\t'); - std::getline(inFile, rank, '\n'); - TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); + if (hasGenomeSizes) { + std::getline(inFile, rank, '\t'); + inFile >> genomeSize >> genomeSizeOfChildren; + } else { + std::getline(inFile, rank, '\n'); + } + TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName, genomeSize, genomeSizeOfChildren); //cerr << "inserting " << taxonomyID << ";" << parentTaxonomyID << ";" << rank << ";" << scientificName << endl; taxIDsAndEntries.insert({ @@ -414,6 +436,9 @@ void TaxonomyDB::readTaxonomyIndex(const std::string inFileNam taxIDsAndEntries.insert({ 0, {0, 0, "no rank", "unclassified" } }); + createPointers(taxIDsAndEntries); + log_msg("Finished, read " + std::to_string(taxIDsAndEntries.size()) + " taxa"); + return(taxIDsAndEntries); } template @@ -594,27 +619,46 @@ bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { } template -void TaxonomyDB::addCounts(const TAXID taxid, const READCOUNTS& read_counts_) { +void TaxonomyDB::addReadCount(const TAXID taxid, const READCOUNTS& readCounts_) { + auto it = taxIDsAndEntries.find(taxid); + if (it == taxIDsAndEntries.end()) { + cerr << "No taxonomy entry for " << taxid << "!!" << endl; + return; + } + TaxonomyEntry* tax = &it->second; + //cerr << taxid << " rc before: " << tax->readCounts << endl; + tax->readCounts += readCounts_; + //cerr << taxid << " rc after: " << tax->readCounts << endl; + + while (tax->parent != nullptr) { + tax = tax->parent; + tax->readCountsOfChildren += readCounts_; + } +} + +template +void TaxonomyDB::setGenomeSize(const TAXID taxid, const uint64_t genomeSize) { auto it = taxIDsAndEntries.find(taxid); if (it == taxIDsAndEntries.end()) { cerr << "No taxonomy entry for " << taxid << "!!" 
<< endl; return; } TaxonomyEntry* tax = &it->second; - //cerr << taxid << " rc before: " << tax->read_counts << endl; - tax->read_counts += read_counts_; - //cerr << taxid << " rc after: " << tax->read_counts << endl; + tax->genomeSize += genomeSize; while (tax->parent != nullptr) { tax = tax->parent; - tax->read_counts_children += read_counts_; + //std::cerr << "setting genomeSizeOfChildren of parent" << std::endl; + tax->genomeSizeOfChildren += genomeSize; } } + + template -void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { - for (auto& elem : taxon_counts) { - addCounts(elem.first, elem.second); +void TaxonomyDB::setReadCounts(const unordered_map& readCounts) { + for (auto& elem : readCounts) { + addReadCount(elem.first, elem.second); } for (auto& tax : taxIDsAndEntries) { @@ -625,16 +669,16 @@ void TaxonomyDB::fillCounts(const unordered_map TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { - _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; + _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_KMERS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS_IN_DATABASE, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; } template void TaxReport::printReport(std::string format, std::string rank) { _total_n_reads = - reads(_taxdb.taxIDsAndEntries.at(0).read_counts) + - reads(_taxdb.taxIDsAndEntries.at(0).read_counts_children) + - reads(_taxdb.taxIDsAndEntries.at(1).read_counts) + - reads(_taxdb.taxIDsAndEntries.at(1).read_counts_children);// + + reads(_taxdb.taxIDsAndEntries.at(0).readCounts) + + reads(_taxdb.taxIDsAndEntries.at(0).readCountsOfChildren) + + reads(_taxdb.taxIDsAndEntries.at(1).readCounts) + + reads(_taxdb.taxIDsAndEntries.at(1).readCountsOfChildren);// + if (_total_n_reads == 0) { std::cerr << "total number of reads is zero - not creating a report!" << endl; return; @@ -657,7 +701,7 @@ void TaxReport::printReport(std::string format, std::string ra template void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { - if (_show_zeros || (reads(tax.read_counts)+reads(tax.read_counts_children)) > 0) { + if (_show_zeros || (reads(tax.readCounts)+reads(tax.readCountsOfChildren)) > 0) { printLine(tax, depth); for (auto child : tax.children) printReport(*child, depth+1); @@ -672,13 +716,14 @@ void TaxReport::printLine(TaxonomyEntry& tax case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.read_counts) + reads(tax.read_counts_children))/_total_n_reads; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.readCounts) + reads(tax.readCountsOfChildren))/_total_n_reads; break; //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; - case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.read_counts) + reads(tax.read_counts_children)); break; - case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.read_counts); break; - case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.read_counts.kmers.cardinality(); break; - case REPORTCOLS::NUM_KMERS: _reportOfb << tax.read_counts.n_kmers; break; + case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.readCounts) + reads(tax.readCountsOfChildren)); break; + case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.readCounts); break; + case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.readCounts.kmers.cardinality(); break; + case REPORTCOLS::NUM_KMERS: _reportOfb << tax.readCounts.n_kmers; break; + case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break; //case REPORTCOLS::GENOME_SIZE: ; break; //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; //case REPORTCOLS::SUM_SCORE: ; break; From 78a61d44008ba03e3b521174325e8ffb7b41c75e Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sat, 6 May 2017 23:53:59 -0400 Subject: [PATCH 034/105] Update to read columns --- scripts/kraken_hll-build_db.sh | 4 +-- src/classify.cpp | 15 ++++++++- src/report-cols.h | 15 +++++++-- src/taxdb.h | 58 ++++++++++++++++++++++++++-------- 4 files changed, 73 insertions(+), 19 deletions(-) diff --git a/scripts/kraken_hll-build_db.sh b/scripts/kraken_hll-build_db.sh index 75a678d..fa90e6b 100755 --- a/scripts/kraken_hll-build_db.sh +++ b/scripts/kraken_hll-build_db.sh @@ -203,8 +203,8 @@ then echo "Skipping step 6, taxDB exists." else echo "Creating taxDB (step 6 of 6)... 
" - jellyfish1 dump database.kdb | grep '^>' | sed 's/.//' | sort | uniq -c | sort -rn | sed 's/^ *\([0-9]\+\) \+\([0-9]\+\)$/\2\t\1/' > database.taxon_count - build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp database.taxon_count > taxDB.tmp + time $JELLYFISH_BIN histo --high 100000000 database.kdb | tee database.taxon_count + build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp database.taxon_count | sort -t$'\t' -rnk6,6 -rnk5,5 > taxDB.tmp mv taxDB.tmp taxDB fi diff --git a/src/classify.cpp b/src/classify.cpp index 690715e..a2a61b3 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -120,7 +120,7 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); if (!TaxDB_file.empty()) { - taxdb = TaxonomyDB(TaxDB_file); + taxdb = TaxonomyDB(TaxDB_file, true); for (const auto & tax : taxdb.taxIDsAndEntries) { if (tax.first != 0) Parent_map[tax.first] = tax.second.parentTaxonomyID; @@ -198,6 +198,19 @@ int main(int argc, char **argv) { if (Print_kraken_report) { taxdb.setReadCounts(taxon_counts); TaxReport rep = TaxReport(*Report_output, taxdb, false); + rep.setReportCols({ + "percReadsClade", + "numReadsClade", + "numReadsTaxon", + "numUniqueKmersClade", + "numUniqueKmersTaxon", + "numKmersClade", + "numKmersTaxon", + "numKmersInDatabaseClade", + "numKmersInDatabaseTaxon", + "taxID", + "taxRank", + "indentedName"}); rep.printReport("kraken","blu"); } diff --git a/src/report-cols.h b/src/report-cols.h index ff19275..2392bd8 100644 --- a/src/report-cols.h +++ b/src/report-cols.h @@ -21,8 +21,11 @@ enum class REPORTCOLS : uint8_t { NUM_READS, NUM_READS_CLADE, NUM_KMERS, + NUM_KMERS_CLADE, NUM_UNIQUE_KMERS, + NUM_UNIQUE_KMERS_CLADE, NUM_KMERS_IN_DATABASE, + NUM_KMERS_IN_DATABASE_CLADE, TOTAL_SCORE, TOTAL_HIT_LENGTH, ABUNDANCE, @@ -33,19 +36,25 @@ enum class REPORTCOLS : uint8_t { static const std::map report_col_name_map = { {"name", REPORTCOLS::NAME}, - {"spaced_name", REPORTCOLS::SPACED_NAME}, + {"indentedName", REPORTCOLS::SPACED_NAME}, {"taxID", REPORTCOLS::TAX_ID}, {"taxRank", REPORTCOLS::TAX_RANK}, {"depth", REPORTCOLS::DEPTH}, {"genomeSize", REPORTCOLS::GENOME_SIZE}, - {"numReads", REPORTCOLS::NUM_READS}, + {"numReadsTaxon", REPORTCOLS::NUM_READS}, {"numReadsClade", REPORTCOLS::NUM_READS_CLADE}, - {"numUniqueKmers", REPORTCOLS::NUM_UNIQUE_KMERS}, + {"numKmersTaxon", REPORTCOLS::NUM_KMERS}, + {"numKmersClade", REPORTCOLS::NUM_KMERS_CLADE}, + {"numUniqueKmersTaxon", REPORTCOLS::NUM_UNIQUE_KMERS}, + {"numUniqueKmersClade", REPORTCOLS::NUM_UNIQUE_KMERS_CLADE}, + {"numKmersInDatabaseTaxon", REPORTCOLS::NUM_KMERS_IN_DATABASE}, + {"numKmersInDatabaseClade", REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE}, {"totalHitLen", REPORTCOLS::TOTAL_HIT_LENGTH}, {"totalScore", REPORTCOLS::TOTAL_SCORE}, {"abundance", REPORTCOLS::ABUNDANCE}, {"abundance_len", REPORTCOLS::ABUNDANCE_LEN}, + {"percReadsClade", REPORTCOLS::PERCENTAGE}, {"percent", REPORTCOLS::PERCENTAGE}, {"taxId", REPORTCOLS::TAX_ID}, {"reads_clade", REPORTCOLS::NUM_READS_CLADE}, // Change to clade reads! 
diff --git a/src/taxdb.h b/src/taxdb.h index 12518da..3b825f2 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -138,17 +138,18 @@ class TaxReport { private: std::ostream& _reportOfb; TaxonomyDB & _taxdb; - std::vector _report_cols; uint64_t _total_n_reads; bool _show_zeros; - void printLine(TaxonomyEntry& tax, unsigned depth); public: TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); - void printReport(std::string format, std::string rank); void printReport(TaxonomyEntry& tax, unsigned depth); + void setReportCols(std::vector names); + + std::vector _report_col_names; + std::vector _report_cols; }; @@ -668,8 +669,23 @@ void TaxonomyDB::setReadCounts(const unordered_map -TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { - _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_KMERS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS_IN_DATABASE, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; + TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { + _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_KMERS_CLADE, REPORTCOLS::NUM_UNIQUE_KMERS_CLADE, REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; +} + + +template +void TaxReport::setReportCols(std::vector names) { + _report_cols.clear(); + for (auto& s : names) { + auto it = report_col_name_map.find(s); + if (it == report_col_name_map.end()) { + throw std::runtime_error(s + " is not a valid report column name"); + } + _report_cols.push_back(it->second); + } + _report_col_names = names; + } template @@ -683,6 +699,19 @@ void TaxReport::printReport(std::string format, std::string ra std::cerr << "total number of reads is zero - not creating a report!" << endl; return; } + if (_report_cols.size() == _report_col_names.size()) { + // print header + bool first_one = true; + for (std::string s : _report_col_names) { + if (first_one) { + first_one = false; + } else { + _reportOfb << '\t'; + } + _reportOfb << s; + } + _reportOfb << endl; + } if (format == "kraken") { // A: print number of unidentified reads @@ -712,18 +741,21 @@ template void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { for (auto& col : _report_cols) { switch (col) { - case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; + case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; - case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? -1 : (int32_t) tax.taxonomyID); break; - case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.readCounts) + reads(tax.readCountsOfChildren))/_total_n_reads; break; - //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; + case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; + case REPORTCOLS::DEPTH: _reportOfb << depth; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.readCounts) + reads(tax.readCountsOfChildren))/_total_n_reads; break; + //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; + case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.readCounts); break; case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.readCounts) + reads(tax.readCountsOfChildren)); break; - case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.readCounts); break; case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.readCounts.kmers.cardinality(); break; - case REPORTCOLS::NUM_KMERS: _reportOfb << tax.readCounts.n_kmers; break; - case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break; + case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << (tax.readCounts.kmers.cardinality() + tax.readCountsOfChildren.kmers.cardinality()); break; + case REPORTCOLS::NUM_KMERS: _reportOfb << tax.readCounts.n_kmers; break; + case REPORTCOLS::NUM_KMERS_CLADE: _reportOfb << tax.readCounts.n_kmers + tax.readCountsOfChildren.n_kmers; break; + case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break; + case REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break; //case REPORTCOLS::GENOME_SIZE: ; break; //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; //case REPORTCOLS::SUM_SCORE: ; break; From 7ae1f5de0e3f3bdae8decdad0fa118d8884f7161 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 13 Aug 2017 14:34:07 -0400 Subject: [PATCH 035/105] Added taxdb.cpp --- src/taxdb.cpp | 584 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 584 insertions(+) create mode 100644 src/taxdb.cpp diff --git a/src/taxdb.cpp b/src/taxdb.cpp new file mode 100644 index 0000000..41ba0ee --- /dev/null +++ b/src/taxdb.cpp @@ -0,0 +1,584 @@ +#include "taxdb.h" +using namespace std; + +void log_msg (const std::string& s) { + std::cerr << s << "\n"; +} + +template +uint64_t string_to_T(string str) { + stringstream stream(str); + T result; + stream >> result; + return result; +} + +template +inline +uint64_t reads(const T read_count) { + cerr << "No reads function for type!! 
" << endl; + throw ; + return(0); +} + + + +inline +uint64_t reads(const uint64_t read_count) { + return(read_count); +} + +std::vector in_betweens(const std::string &s, const char start_char, const char end_char, size_t start_at) { + std::vector tokens; + size_t i = 0; + size_t next_end = start_at-1; + + for (size_t next_start = s.find(start_char, next_end + 1); \ + next_start != string::npos; + next_start = s.find(start_char, next_end + 1), ++i) { + + next_end = s.find(end_char, next_start + 1); + if (next_end == string::npos) + throw std::runtime_error("unmatched start and end!"); + + tokens.push_back(s.substr(next_start+1, next_end-1)); + } + + return tokens; +} + + + +std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields, size_t end_chars) { + std::vector tokens(max_fields); + size_t delim_length = delimiter.length(); + size_t last = 0; + size_t i = 0; + + for (size_t next = s.find(delimiter, last); + (max_fields > 0 && i < max_fields) && next != string::npos; + next = s.find(delimiter, last), ++i) { + tokens[i] = s.substr(last, next-last); + last = next + delim_length; + } + if (max_fields > 0 && i < max_fields) { + tokens[max_fields-1] = s.substr(last, s.length()-last-end_chars); + } + + return tokens; +} + +std::vector get_fields(const std::string &s, const std::string& delimiter, vector fields) { + std::vector tokens; + tokens.reserve(fields.size()); + size_t delim_length = delimiter.length(); + size_t last = 0; + size_t i = 0; + size_t current_field = 0; + + for (size_t next = s.find(delimiter, last); + tokens.size() < fields.size() && next != string::npos; + next = s.find(delimiter, last), ++i) { + if (i == fields[current_field]) { + tokens.push_back(s.substr(last, next-last)); + ++current_field; + } + last = next + delim_length; + } + + return tokens; +} + + + +//template<> +//TaxonomyEntry::TaxonomyEntry () { +// read_counts = 0; +// read_counts_children = 0; +//} +template +bool TaxonomyEntryPtr_comp::operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { + return ((reads(a->read_counts)+reads(a->read_counts_children)) > (reads(b->read_counts)+reads(b->read_counts_children))); + } + + +template +std::unordered_map TaxonomyDB::getScientificNameMap() const { + std::unordered_map scientificNameMap; + for (const auto & tax : taxIDsAndEntries) { + scientificNameMap[tax.second.scientificName] = tax.first; + } + return scientificNameMap; +} + +template +unordered_map TaxonomyDB::getParentMap() const { + unordered_map Parent_map; + for (const auto & tax : taxIDsAndEntries) { + if (tax.first != 0) + Parent_map[tax.first] = tax.second.parentTaxonomyID; + } + Parent_map[1] = 1; + return Parent_map; +} + +template +void TaxonomyDB::createPointers() { + for (auto& tax : taxIDsAndEntries) { + if (tax.second.parentTaxonomyID != tax.first) { + auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); + if (parentIt != taxIDsAndEntries.end()) { + tax.second.parent = &(parentIt->second); + parentIt->second.children.push_back(&tax.second); + } + } + } +} + +template +TaxonomyDB::TaxonomyDB() { } + +template +TaxonomyDB::TaxonomyDB(const std::string inFileName) { + log_msg("Building taxonomy index"); + readTaxonomyIndex(inFileName); + createPointers(); + log_msg("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + + " nodes"); +} + +template +void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { + std::ifstream nodesDumpFile(nodesDumpFileName); + if (!nodesDumpFile.is_open()) + throw 
std::runtime_error("unable to open nodes file"); + std::string line; + + TAXID taxonomyID; + TAXID parentTaxonomyID; + std::string rank; + + while (nodesDumpFile.good()) { + getline(nodesDumpFile, line); + std::vector tokens = tokenise(line, "\t|\t", 3, 2); + if (tokens.size() < 3) { + continue; + } + + taxonomyID = string_to_T(tokens[0]); + parentTaxonomyID = string_to_T(tokens[1]); + rank = tokens[2]; + + auto entryIt = taxIDsAndEntries.find(taxonomyID); + if (entryIt == taxIDsAndEntries.end()) { + taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); + } else { + entryIt->second.parentTaxonomyID = parentTaxonomyID; + entryIt->second.rank = rank; + } + } +} + +template +void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { + std::ifstream namesDumpFile(namesDumpFileName); + if (!namesDumpFile.is_open()) + throw std::runtime_error("unable to open names file"); + std::string line; + + TAXID taxonomyID; + std::string scientificName; + while (namesDumpFile.good()) { + getline(namesDumpFile, line); + std::vector tokens = tokenise(line, "\t|\t", 4, 2); + if (tokens.size() < 4 || tokens[3] != "scientific name") { + continue; + } + taxonomyID = string_to_T(tokens[0]); + scientificName = tokens[1]; + + auto entryIt = taxIDsAndEntries.find(taxonomyID); + if (entryIt == taxIDsAndEntries.end()) { + taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, scientificName); + } else { + entryIt->second.scientificName = scientificName; + } + } +} + +template +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, + const std::string namesDumpFileName, + const std::string nodesDumpFileName) { + parseNodesDump(nodesDumpFileName); + parseNamesDump(namesDumpFileName); + writeTaxonomyIndex(outs); +} + +template +std::vector getSortedKeys(const std::unordered_map& unordered) { + std::vector keys; + keys.reserve (unordered.size()); + for (auto& it : unordered) { + keys.push_back(it.first); + } + std::sort (keys.begin(), keys.end()); + return keys; +} + +template +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { + for (TAXID& key : getSortedKeys(taxIDsAndEntries)) { + const auto& entry = taxIDsAndEntries.at(key); + outs << key << "\t" << entry.parentTaxonomyID << "\t" + << entry.scientificName << "\t" << entry.rank << "\n"; + } +} + + + +template +void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { + std::ifstream inFile(inFileName); + if (!inFile.is_open()) + throw std::runtime_error("unable to open taxonomy index file " + inFileName); + + TAXID taxonomyID, parentTaxonomyID; + std::string scientificName, rank; + + std::string line; + while (!inFile.eof()) { + inFile >> taxonomyID >> parentTaxonomyID; + inFile.get(); // read tab + std::getline(inFile, scientificName, '\t'); + std::getline(inFile, rank, '\n'); + TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); + + //cerr << "inserting " << taxonomyID << ";" << parentTaxonomyID << ";" << rank << ";" << scientificName << endl; + taxIDsAndEntries.insert({ + taxonomyID, newEntry + }); + } + taxIDsAndEntries.insert({ + 0, {0, 0, "no rank", "unclassified" } + }); +} + +template +TAXID TaxonomyDB::getLowestCommonAncestor( + const std::vector& taxIDs) const { + if (taxIDs.size() == 0) { + return 0; + } + std::vector > paths; + for (auto& taxID : taxIDs) { + bool good = true; + std::vector path; + TAXID tempTaxID = taxID; + while (tempTaxID != 0) { + path.push_back(tempTaxID); + tempTaxID = getParentTaxID(tempTaxID); + } + if (good) paths.push_back(path); + } + if 
(paths.size() == 0) { + return 0; + } + for (auto& path : paths) + std::reverse(path.begin(), path.end()); + std::sort(paths.begin(), paths.end(), + [](std::vector i, std::vector j) { + return i.size() < j.size(); + }); + TAXID consensus = 0; + for (unsigned i = 0; i < paths[0].size(); i++) { + TAXID temp = 0; + for (auto& path : paths) { + if (temp == 0) + temp = path[i]; + else if (temp != path[i]) { + return consensus; + } + } + consensus = temp; + } + return consensus; +} + +template +TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { + auto entry = taxIDsAndEntries.find(taxID); + if (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) + return entry->second.parentTaxonomyID; + else + return 0; +} + +template +std::string TaxonomyDB::getScientificName(const TAXID taxID) const { + auto entry = taxIDsAndEntries.find(taxID); + if (entry != taxIDsAndEntries.end()) { + return entry->second.scientificName; + } else + return std::string(); +} + +template +std::string TaxonomyDB::getRank(const TAXID taxID) const { + auto entry = taxIDsAndEntries.find(taxID); + if (entry != taxIDsAndEntries.end()) { + return entry->second.rank; + } else + return std::string(); +} + +template +std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { + std::string lineage; + while (true) { + // 131567 = Cellular organisms + if (taxonomyID != 131567) { + if (lineage.size()) lineage.insert(0, "; "); + lineage.insert(0, getScientificName(taxonomyID)); + if (getRank(taxonomyID) == "species") lineage.clear(); + } + taxonomyID = getParentTaxID(taxonomyID); + if (taxonomyID == 0) { + if (lineage.size()) lineage.append("."); + break; + } + } + return lineage; +} + +template +std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { + std::string rank = getRank(taxonomyID); + if (rank == "superphylum") return std::string(); + std::string lineage; + while (true) { + // 131567 = Cellular organisms + if (taxonomyID != 131567) { + std::string rank = getRank(taxonomyID); + if (rank == "species") { + lineage.insert(0, "|s__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "genus") { + lineage.insert(0, "|g__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "family") { + lineage.insert(0, "|f__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "order") { + lineage.insert(0, "|o__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "class") { + lineage.insert(0, "|c__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "phylum") { + lineage.insert(0, "|p__"); + lineage.insert(4, getScientificName(taxonomyID)); + } else if (rank == "superkingdom") { + lineage.insert(0, "k__"); + lineage.insert(3, getScientificName(taxonomyID)); + } + } + taxonomyID = getParentTaxID(taxonomyID); + if (taxonomyID == 0) { + break; + } + } + std::replace(lineage.begin(), lineage.end(), ' ', '_'); + return lineage; +} + +template +TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, + const std::string& rank) const { + auto entry = taxIDsAndEntries.find(taxID); + while (entry != taxIDsAndEntries.end() && + entry->second.parentTaxonomyID != 1) { + if (entry->second.rank == rank) { + return entry->second.taxonomyID; + } else + entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + } + return 0; +} + +template +int TaxonomyDB::isBelowInTree(TAXID upper, TAXID lower) const { + auto entry = taxIDsAndEntries.find(lower); + unsigned level = 0; + while (entry != 
taxIDsAndEntries.end() && + entry->second.parentTaxonomyID != 1) { + if (entry->first == upper) { + return level; + } else { + entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + level++; + } + } + return -1; +} + +template +bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { + bool isSubSpecies = false; + auto entry = taxIDsAndEntries.find(taxonomyID); + int numLevels = 0; + while (entry != taxIDsAndEntries.end() && + entry->second.parentTaxonomyID != 1) { + if (entry->second.rank == "species") { + if (numLevels > 0) { + isSubSpecies = true; + } + break; + } else + entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + numLevels++; + } + return isSubSpecies; +} + +template +void TaxonomyDB::addCounts(const TAXID taxid, const READCOUNTS& read_counts_) { + auto it = taxIDsAndEntries.find(taxid); + if (it == taxIDsAndEntries.end()) { + cerr << "No taxonomy entry for " << taxid << "!!" << endl; + return; + } + TaxonomyEntry* tax = &it->second; + //cerr << taxid << " rc before: " << tax->read_counts << endl; + tax->read_counts += read_counts_; + //cerr << taxid << " rc after: " << tax->read_counts << endl; + + while (tax->parent != nullptr) { + tax = tax->parent; + tax->read_counts_children += read_counts_; + } +} + +template +void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { + for (auto& elem : taxon_counts) { + addCounts(elem.first, elem.second); + } + + for (auto& tax : taxIDsAndEntries) { + std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); + } +} + + +template +TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { + _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; +} + +template +void TaxReport::printReport(std::string format, std::string rank) { + _total_n_reads = + reads(_taxdb.taxIDsAndEntries.at(0).read_counts) + + reads(_taxdb.taxIDsAndEntries.at(0).read_counts_children) + + reads(_taxdb.taxIDsAndEntries.at(1).read_counts) + + reads(_taxdb.taxIDsAndEntries.at(1).read_counts_children);// + + if (_total_n_reads == 0) { + std::cerr << "total number of reads is zero - not creating a report!" << endl; + return; + } + + if (format == "kraken") { + // A: print number of unidentified reads + printReport(_taxdb.taxIDsAndEntries.at(0),0u); + // B: print normal results + printReport(_taxdb.taxIDsAndEntries.at(1),0u); + // C: Print Unclassified stuff + //printReport(_taxdb.taxIDsAndEntries.at(-1),0u); + } else { + // print stuff at a certain level .. + //_uid_abundance; + //_taxinfo + + } +} + +template +void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { + if (_show_zeros || (reads(tax.read_counts)+reads(tax.read_counts_children)) > 0) { + printLine(tax, depth); + for (auto child : tax.children) + printReport(*child, depth+1); + } +} + +template +void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { + for (auto& col : _report_cols) { + switch (col) { + case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; + case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; + case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; + case REPORTCOLS::DEPTH: _reportOfb << depth; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.read_counts) + reads(tax.read_counts_children))/_total_n_reads; break; + //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; + //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; + case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.read_counts) + reads(tax.read_counts_children)); break; + case REPORTCOLS::NUM_READS: _reportOfb << (tax.read_counts); break; + //case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break; + //case REPORTCOLS::NUM_KMERS: _reportOfb << tax.numKmers; break; + //case REPORTCOLS::GENOME_SIZE: ; break; + //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; + //case REPORTCOLS::SUM_SCORE: ; break; + case REPORTCOLS::TAX_RANK: _reportOfb << tax.rank; break; + default: _reportOfb << "NA"; + } + if (&col == &_report_cols.back()) { + _reportOfb << '\n'; + } else { + _reportOfb << '\t'; + } + } +} + + + // Return lowest common ancestor of a and b + // LCA(0,x) = LCA(x,0) = x + // Default ancestor is 1 (root of tree) +uint32_t lca(unordered_map &parent_map, uint32_t a, uint32_t b) + { + if (a == 0 || b == 0) + return a ? a : b; + + // create a path from a to the root + std::unordered_set a_path; + while (a > 0 && a != parent_map[a]) { + if (a == b) + return a; + a_path.insert(a); + a = parent_map[a]; + } + + // search for b in the path from a to the root + while (b > 0 && b != parent_map[b]) { + if (a_path.count(b) > 0) + return b; + b = parent_map[b]; + } + return 1; + } + +template +inline +V find_or_use_default(const std::unordered_map& my_map, const K& query, const V default_value) { + auto itr = my_map.find(query); + + if (itr == my_map.end()) { + return default_value; + } + + return itr->second; +} + + + From c5318e0245d5997251fa4db695984959f5dec3f3 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 25 Aug 2017 10:49:05 -0400 Subject: [PATCH 036/105] Add UID mapping to Kraken-HLL --- scripts/kraken_hll | 12 + scripts/kraken_hll-build_db.sh | 17 +- src/build_taxdb.cpp | 2 +- src/classify.cpp | 93 +++++- src/db_sort.cpp | 3 + src/krakenutil.cpp | 151 ++++++++- src/krakenutil.hpp | 14 +- src/query_taxdb.cpp | 138 ++++++++ src/read_uid_mapping.cpp | 76 +++++ src/set_lcas.cpp | 172 ++++++++-- src/taxdb.cpp | 584 --------------------------------- src/taxdb.h | 47 ++- 12 files changed, 655 insertions(+), 654 deletions(-) create mode 100644 src/query_taxdb.cpp create mode 100644 src/read_uid_mapping.cpp delete mode 100644 src/taxdb.cpp diff --git a/scripts/kraken_hll b/scripts/kraken_hll index b31fca3..e2d8412 100755 --- a/scripts/kraken_hll +++ b/scripts/kraken_hll @@ -58,6 +58,7 @@ my $classified_out; my $outfile; my $report_file; my $print_sequence = 0; +my $uid_mapping = 0; GetOptions( "help" => \&display_help, @@ -78,6 +79,7 @@ GetOptions( "check-names" => \$check_names, "gzip-compressed" => \$gunzip, "bzip2-compressed" => \$bunzip2, + "uid-mapping" => \$uid_mapping, "only-classified-output" => \$only_classified_output, ) or die $!; @@ -145,6 +147,16 @@ push @flags, "-M" if $preload; push @flags, "-r", $report_file if defined $report_file; push @flags, "-a", $db_prefix[0]."/taxDB"; push @flags, "-s" if $print_sequence; +if ($uid_mapping) { + my $uid_mapping_file = "$db_prefix[0]/uid_to_taxid"; + if (!-f $uid_mapping_file) { + print STDERR "Missing required file $uid_mapping_file for UID mapping.\n"; + exit(1); + } + push @flags, 
"-I", $uid_mapping_file; +} else { + +} # handle piping for decompression/merging my @pipe_argv; diff --git a/scripts/kraken_hll-build_db.sh b/scripts/kraken_hll-build_db.sh index fa90e6b..402dc45 100755 --- a/scripts/kraken_hll-build_db.sh +++ b/scripts/kraken_hll-build_db.sh @@ -23,6 +23,7 @@ set -u # Protect against uninitialized vars. set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands +set -x function report_time_elapsed() { curr_time=$(date "+%s.%N") @@ -61,14 +62,14 @@ fi if [ "$KRAKEN_REBUILD_DATABASE" == "1" ] then - rm -f database.* *.map lca.complete library/seq-files.txt + rm -f database.* *.map lca.complete library-files.txt fi -if [ !-f "library/seq-files.txt" ]; then +if [ ! -f "library-files.txt" ]; then echo "Finding all library files" - find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' > library/seq-files.txt + find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' > library-files.txt fi -N_FILES=`cat library/seq-files.txt | wc -l` +N_FILES=`cat library-files.txt | wc -l` echo "Found $N_FILES sequence files (*.{fna,fa,ffn} in the library)" if [ -e "database.jdb" ] @@ -87,7 +88,7 @@ else echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'" fi - cat library/seq-files.txt | tr '\n' '\0' | xargs -0 cat | \ + cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ $JELLYFISH_BIN count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \ -o database /dev/fd/0 @@ -171,8 +172,8 @@ then else echo "Creating seqID to taxID map (step 4 of 6).." start_time1=$(date "+%s.%N") - cat library/seq-files.txt | tr '\n' '\0' | xargs -0 grep '^>' | sed 's/.//' | sed 's/ .*//' | sort > library/seq-headers.txt - join -t $'\t' nucl_gb.accession2taxid.sorted library/seq-headers.txt > seqid2taxid.map.tmp + cat library-files.txt | tr '\n' '\0' | xargs -0 grep '^>' | sed 's/.//' | sed 's/ .*//' | sort > library-headers.txt + join -t $'\t' nucl_gb.accession2taxid.sorted library-headers.txt > seqid2taxid.map.tmp mv seqid2taxid.map.tmp seqid2taxid.map line_ct=$(wc -l seqid2taxid.map | awk '{print $1}') @@ -190,7 +191,7 @@ else PARAM=" -a" fi start_time1=$(date "+%s.%N") - cat library/seq-files.txt | tr '\n' '\0' | xargs -0 cat | \ + cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ set_lcas $MEMFLAG -x -d database.kdb -i database.idx -v \ -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 touch "lca.complete" diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index f4a4957..6f33763 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -27,7 +27,7 @@ using namespace std; int main(int argc, char **argv) { if (argc < 3 || argc > 4) { - std::cerr << "Usage: a.out names.dmp nodes.dmp [taxon-counts]\n"; + std::cerr << "Usage: build_taxdb names.dmp nodes.dmp [taxon-counts]\n"; return 1; } TaxonomyDB taxdb {(string)argv[1], (string)argv[2]}; diff --git a/src/classify.cpp b/src/classify.cpp index a2a61b3..990012f 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -25,6 +25,7 @@ #include "readcounts.hpp" #include "taxdb.h" #include "gzstream.h" +#include const size_t DEF_WORK_UNIT_SIZE = 500000; int New_taxid_start = 1000000000; @@ -56,6 +57,12 @@ bool Populate_memory = false; bool Only_classified_kraken_output = false; bool Print_sequence = false; bool Print_Progress = true; + +bool Map_UIDs = false; +string UID_to_TaxID_map_filename; +map > UID_to_taxids_map; +QuickFile UID_to_TaxID_map_file; + uint32_t Minimum_hit_count = 1; unordered_map Parent_map; string 
Classified_output_file, Unclassified_output_file, Kraken_output_file, Report_output_file, TaxDB_file; @@ -94,8 +101,6 @@ ostream* cout_or_file(string file) { } } - - void loadKrakenDB(KrakenDB& database, string DB_filename, string Index_filename) { QuickFile db_file; db_file.open_file(DB_filename); @@ -112,6 +117,35 @@ void loadKrakenDB(KrakenDB& database, string DB_filename, string Index_filename) database.set_index(&db_index); } +vector get_taxids_for_uid(uint32_t uid, char* fptr) { + size_t int_size = sizeof(int); + size_t block_size = sizeof(int)*2; + // TODO: Just get a uint64_t and shift the bits, probably faster + uint32_t taxid = *(uint32_t*)(fptr+(uid-1)*block_size); + uint32_t parent_uid = *(uint32_t*)(fptr+(uid-1)*block_size + int_size); + + vector taxids = {taxid}; + while (parent_uid != 0) { + taxid = *(uint32_t*)(fptr+(parent_uid-1)*block_size); + parent_uid = *(uint32_t*)(fptr+(parent_uid-1)*block_size + int_size); + taxids.push_back(taxid); + } + std::sort(taxids.begin(), taxids.end()); + return(taxids); +} + +vector get_taxids_for_uid_from_map(uint32_t uid, char* fptr, unordered_map >& uid_map ) { + auto it = uid_map.find(uid); + if (it != uid_map.end()) { + return it->second; + } + vector taxids = get_taxids_for_uid(uid, fptr); + uid_map[uid] = taxids; + return(taxids); +} + + + int main(int argc, char **argv) { #ifdef _OPENMP omp_set_num_threads(1); @@ -119,11 +153,25 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); + if (Map_UIDs) { + if (DB_filenames.size() > 1) { + cerr << "Cannot use more than one database with UID mapping!" << endl; + return 1; + } + + cerr << "Reading UID mapping file " << UID_to_TaxID_map_filename << endl; + UID_to_TaxID_map_file.open_file(UID_to_TaxID_map_filename); + if (Populate_memory) { + UID_to_TaxID_map_file.load_file(); + } + } + if (!TaxDB_file.empty()) { - taxdb = TaxonomyDB(TaxDB_file, true); + // TODO: Define if the taxDB has read counts or not!! + taxdb = TaxonomyDB(TaxDB_file, false); for (const auto & tax : taxdb.taxIDsAndEntries) { if (tax.first != 0) - Parent_map[tax.first] = tax.second.parentTaxonomyID; + Parent_map[tax.first] = tax.second.parentTaxonomyID; } Parent_map[1] = 0; } else { @@ -287,11 +335,12 @@ void process_file(char *filename) { kraken_output_ss.str(""); classified_output_ss.str(""); unclassified_output_ss.str(""); - for (size_t j = 0; j < work_unit.size(); j++) + for (size_t j = 0; j < work_unit.size(); j++) { my_total_classified += classify_sequence( work_unit[j], kraken_output_ss, classified_output_ss, unclassified_output_ss, my_taxon_counts); + } #pragma omp critical(write_output) { @@ -330,6 +379,7 @@ uint32_t get_taxon_for_kmer(KrakenDB& database, uint64_t* kmer_ptr, uint64_t& cu bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, unordered_map& my_taxon_counts) { + // TODO: use vector::reserve vector taxa; vector ambig_list; unordered_map hit_counts; @@ -356,12 +406,15 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, else { ambig_list.push_back(0); + // go through multiple databases to map k-mer for (size_t i=0; i= Minimum_hit_count ? taxon : 0; - else - call = resolve_tree(hit_counts, Parent_map); + if (Map_UIDs) { + if (Quick_mode) { + cerr << "Quick mode not available when mapping UIDs" << endl; + exit(1); + } else { + call = resolve_uids2(hit_counts, Parent_map, UID_to_TaxID_map_file.ptr()); + } + } else { + if (Quick_mode) + call = hits >= Minimum_hit_count ? 
taxon : 0; + else + call = resolve_tree(hit_counts, Parent_map); + } ++(my_taxon_counts[call].n_reads); @@ -482,7 +544,7 @@ void parse_command_line(int argc, char **argv) { if (argc > 1 && strcmp(argv[1], "-h") == 0) usage(0); - while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:Ma:r:s")) != -1) { + while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:Ma:r:sI:")) != -1) { switch (opt) { case 'd' : DB_filenames.push_back(optarg); @@ -545,6 +607,10 @@ void parse_command_line(int argc, char **argv) { case 'M' : Populate_memory = true; break; + case 'I' : + UID_to_TaxID_map_filename = optarg; + Map_UIDs = true; + break; default: usage(); break; @@ -573,6 +639,7 @@ void usage(int exit_code) { << " -o filename Output file for Kraken output" << endl << " -r filename Output file for Kraken report output" << endl << " -a filename TaxDB" << endl + << " -I filename UID to TaxId map" << endl << " -t # Number of threads" << endl << " -u # Thread work unit size (in bp)" << endl << " -q Quick operation" << endl diff --git a/src/db_sort.cpp b/src/db_sort.cpp index 1bafef3..713119a 100644 --- a/src/db_sort.cpp +++ b/src/db_sort.cpp @@ -44,6 +44,7 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); + cerr << "db_sort: Getting database into memory ..."; QuickFile input_db_file(Input_DB_filename); KrakenDB *input_db = new KrakenDB(input_db_file.ptr()); Key_len = input_db->get_key_len(); @@ -62,10 +63,12 @@ int main(int argc, char **argv) { input_db = new KrakenDB(header); input_db_file.close_file(); // Stop using memory-mapped file + cerr << "db_sort: Sorting ..."; char *data = new char[ key_ct * (Key_len + val_len) ]; // Populate data w/ pairs from DB and sort bins in parallel bin_and_sort_data(*input_db, data, db_index); + cerr << "db_sort: Sorting complete - writing database to disk ..." << endl; ofstream output_file(Output_DB_filename.c_str(), std::ofstream::binary); output_file.write(header, skip_len); output_file.write(data, key_ct * (Key_len + val_len)); diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index 48e54e9..28ca837 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -17,12 +17,14 @@ * along with Kraken. If not, see . */ +#include "assert_helpers.h" #include "kraken_headers.hpp" #include "krakenutil.hpp" using namespace std; namespace kraken { + // Build a node->parent unordered_map from NCBI Taxonomy nodes.dmp file unordered_map build_parent_map(string filename) { unordered_map pmap; @@ -47,7 +49,7 @@ namespace kraken { // Return lowest common ancestor of a and b // LCA(0,x) = LCA(x,0) = x // Default ancestor is 1 (root of tree) - uint32_t lca(unordered_map &parent_map, + uint32_t lca(const unordered_map &parent_map, uint32_t a, uint32_t b) { if (a == 0 || b == 0) @@ -57,35 +59,41 @@ namespace kraken { set a_path; while (a > 0) { a_path.insert(a); - a = parent_map[a]; + assert(parent_map.find(a) != parent_map.end()); + a = parent_map.at(a); } // search for b in the path from a to the root while (b > 0) { if (a_path.count(b) > 0) return b; - b = parent_map[b]; + assert(parent_map.find(b) != parent_map.end()); + b = parent_map.at(b); } return 1; } // Tree resolution: take all hit taxa (plus ancestors), then // return leaf of highest weighted leaf-to-root path. 
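// Illustrative example: with hit_counts {562: 3, 561: 1} and 561 the parent of
// 562, the leaf-to-root score of 562 is 3 + 1 = 4 versus 1 for 561, so 562 is
// returned.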
- uint32_t resolve_tree(unordered_map &hit_counts, - unordered_map &parent_map) + uint32_t resolve_tree(const unordered_map &hit_counts, + const unordered_map &parent_map) { set max_taxa; uint32_t max_taxon = 0, max_score = 0; - unordered_map::iterator it = hit_counts.begin(); // Sum each taxon's LTR path - while (it != hit_counts.end()) { + for (auto it = hit_counts.begin(); + it != hit_counts.end(); ++it) { uint32_t taxon = it->first; uint32_t node = taxon; uint32_t score = 0; while (node > 0) { - score += hit_counts[node]; - node = parent_map[node]; + auto it2 = hit_counts.find(node); + if (it2 != hit_counts.end()) { + score += it2->second; + } + node = parent_map.at(node); + } if (score > max_score) { @@ -98,8 +106,6 @@ namespace kraken { max_taxa.insert(max_taxon); max_taxa.insert(taxon); } - - it++; } // If two LTR paths are tied for max, return LCA of all @@ -113,6 +119,129 @@ namespace kraken { return max_taxon; } + + // Tree resolution: take all hit taxa (plus ancestors), then + // return leaf of highest weighted leaf-to-root path. + uint32_t resolve_uids( + const unordered_map &uid_hit_counts, + const unordered_map &parent_map, + const vector< vector > &UID_to_taxids_vec) { + unordered_map taxid_counts; + unordered_map frac_taxid_counts; + + if (uid_hit_counts.size() == 0) { + return(0); + } + + for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { + uint32_t uid = it->first; + double frac_count = ((double)it->second / (double)UID_to_taxids_vec[uid-1].size()); + for (auto taxid : UID_to_taxids_vec[uid-1]) { + taxid_counts[taxid] += it->second; + frac_taxid_counts[taxid] += frac_count; + } + } + vector max_taxids; + uint32_t max_count = 0; + double max_frac_count = 0; + for (auto it : taxid_counts) { + if (it.second == max_count) { + if (frac_taxid_counts[it.first] == max_frac_count) { + max_taxids.push_back(it.first); + } else if (frac_taxid_counts[it.first] > max_frac_count) { + max_frac_count = frac_taxid_counts[it.first]; + max_taxids = { it.first }; + } + } else if (it.second > max_count) { + max_taxids = { it.first }; + max_count = it.second; + max_frac_count = frac_taxid_counts[it.first]; + } + } + + uint32_t max_taxon = max_taxids[0]; + auto sit = max_taxids.begin(); + for (++sit; sit != max_taxids.end(); ++sit) { + max_taxon = lca(parent_map, max_taxon, *sit); + + } + + // return the taxid that appeared most often + return max_taxon; + } + + // Tree resolution: take all hit taxa (plus ancestors), then + // return leaf of highest weighted leaf-to-root path. 
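+  // resolve_uids2 (below) computes the same weighted vote as resolve_uids
+  // above, but follows each UID's (taxid, parent UID) chain directly in the
+  // memory-mapped UID map file via fptr instead of a prebuilt lookup vector;
+  // ties between top-scoring taxa are again broken by taking their LCA.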
+ uint32_t resolve_uids2( + const unordered_map &uid_hit_counts, + const unordered_map &parent_map, + char* fptr) { + unordered_map taxid_counts; + unordered_map frac_taxid_counts; + + if (uid_hit_counts.size() == 0) { + return(0); + } + + size_t int_size = sizeof(int); + size_t block_size = sizeof(int)*2; + for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { + uint32_t uid = it->first; + if (uid == 0) { + continue; + } + uint32_t taxid; + // TODO: Just get a uint64_t and shift the bits, probably faster + vector taxids; + do { + taxid = *(uint32_t*)(fptr+(uid-1)*block_size); + uid = *(uint32_t*)(fptr+(uid-1)*block_size + int_size); + + taxid_counts[taxid] += it->second; + taxids.push_back(taxid); + } while (uid != 0); + + double frac_count = (double)it->second / (double)taxids.size(); + for (uint32_t taxid : taxids) { + frac_taxid_counts[taxid] += frac_count; + } + } + + if (taxid_counts.size() == 0) { + return(0); + } + vector max_taxids; + uint32_t max_count = 0; + double max_frac_count = 0; + for (auto it : taxid_counts) { + if (it.second == max_count) { + if (frac_taxid_counts[it.first] == max_frac_count) { + max_taxids.push_back(it.first); + } else if (frac_taxid_counts[it.first] > max_frac_count) { + max_frac_count = frac_taxid_counts[it.first]; + max_taxids = { it.first }; + } + } else if (it.second > max_count) { + max_taxids = { it.first }; + max_count = it.second; + max_frac_count = frac_taxid_counts[it.first]; + } + } + + uint32_t max_taxon = max_taxids[0]; + auto sit = max_taxids.begin(); + for (++sit; sit != max_taxids.end(); ++sit) { + max_taxon = lca(parent_map, max_taxon, *sit); + + } + + // return the taxid that appeared most often + return max_taxon; + } + + + + uint8_t KmerScanner::k = 0; uint64_t KmerScanner::kmer_mask = 0; uint32_t KmerScanner::mini_kmer_mask = 0; diff --git a/src/krakenutil.hpp b/src/krakenutil.hpp index 97dd041..854e26b 100644 --- a/src/krakenutil.hpp +++ b/src/krakenutil.hpp @@ -31,8 +31,18 @@ namespace kraken { // NOTE: LCA(0,x) = LCA(x,0) = x // Resolve classification tree - uint32_t resolve_tree(std::unordered_map &hit_counts, - std::unordered_map &parent_map); + uint32_t resolve_tree(const std::unordered_map &hit_counts, + const std::unordered_map &parent_map); + + uint32_t resolve_uids( + const std::unordered_map &uid_hit_counts, + const std::unordered_map &parent_map, + const std::vector< std::vector > &UID_to_taxids_vec); + + uint32_t resolve_uids2( + const std::unordered_map &uid_hit_counts, + const std::unordered_map &parent_map, + char* fptr); class KmerScanner { public: diff --git a/src/query_taxdb.cpp b/src/query_taxdb.cpp new file mode 100644 index 0000000..7412792 --- /dev/null +++ b/src/query_taxdb.cpp @@ -0,0 +1,138 @@ +/* + * Copyright 2017, Florian Breitwieser + * + * This file is part of the Kraken taxonomic sequence classification system. + * + * Kraken is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Kraken is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Kraken. If not, see . 
+ */ + +#include "taxdb.h" +#include +#include +#include +#include +#include +#include + +using namespace std; + +string return_rank; + +void process_taxID(char mode, uint32_t taxID); +void process_taxIDs(char mode, vector taxIDs); +size_t parse_command_line(int argc, char **argv); +void usage(int exit_code=EX_USAGE); + +TaxonomyDB taxdb; + +int main(int argc, char **argv) { + size_t optind = parse_command_line(argc, argv); + + string line; + uint32_t taxID; + char mode = *argv[optind++]; + for (;optind < argc; ++optind) { + if (strcmp(argv[optind],"-") == 0) { + // read STDIN + if (mode == 'l') { + while (getline(std::cin, line)) { + stringstream ss(line); + vector taxIDs; + while (ss >> taxID) { + taxIDs.push_back(taxID); + } + process_taxIDs(mode,taxIDs); + } + } + while (std::cin >> taxID) { + process_taxID(mode,taxID); + } + } else { + taxID = atol(argv[optind]); + process_taxID(mode,taxID); + } + } + + exit(1); +} +void process_taxIDs(char mode, vector taxIDs) { + switch (mode) { + + case 'r': + if (!return_rank.empty()) { + cout << taxdb.getTaxIDAtRank(taxIDs[0], return_rank) << '\n'; + } + break; + case 'l': + cout << taxdb.getEntry(taxdb.getLowestCommonAncestor(taxIDs)).rank << endl; + break; + default: + usage(); + break; + } +} + + +void process_taxID(char mode, uint32_t taxID) { + switch (mode) { + case 'r': + if (!return_rank.empty()) { + cout << taxdb.getTaxIDAtRank(taxID, return_rank) << '\n'; + } + break; + case 'l': + default: + usage(); + break; + } +} + +size_t parse_command_line(int argc, char **argv) { + int opt; + long long sig; + + if (argc > 1 && strcmp(argv[1], "-h") == 0) + usage(0); + + while ((opt = getopt(argc, argv, "r:m:")) != -1) { + switch (opt) { + case 'r': + return_rank = optarg; + break; + default: + usage(); + break; + } + } + + if (argv[optind] == NULL || argv[optind + 1] == NULL) { + printf("Mandatory argument(s) missing\n"); + exit(1); + } + + taxdb.readTaxonomyIndex(argv[optind++], false); + return optind; +} + +void usage(int exit_code) { + cerr << "Usage: query_taxdb [options] taxDB mode [taxIDs]" << endl + << endl + << "Options: (*mandatory)" << endl + << " -m mode Mode: l for LCA, r for rank" << endl + << " -r rank Output parent rank of taxIDs" << endl + << " -h Print this message" << endl + << endl; + exit(exit_code); +} + diff --git a/src/read_uid_mapping.cpp b/src/read_uid_mapping.cpp new file mode 100644 index 0000000..76b839a --- /dev/null +++ b/src/read_uid_mapping.cpp @@ -0,0 +1,76 @@ + +#include "kraken_headers.hpp" +#include "quickfile.hpp" +#include +#include + +using namespace std; +using namespace kraken; + +vector get_taxids_for_uid(uint32_t uid, char* fptr) { + size_t int_size = sizeof(int); + size_t block_size = sizeof(int)*2; + // TODO: Just get a uint64_t and shift the bits, probably faster + uint32_t taxid = *(uint32_t*)(fptr+(uid-1)*block_size); + uint32_t parent_uid = *(uint32_t*)(fptr+(uid-1)*block_size + int_size); + + vector taxids = {taxid}; + while (parent_uid != 0) { + taxid = *(uint32_t*)(fptr+(parent_uid-1)*block_size); + parent_uid = *(uint32_t*)(fptr+(parent_uid-1)*block_size + int_size); + taxids.push_back(taxid); + } + std::sort(taxids.begin(), taxids.end()); + return(taxids); +} + + +vector get_taxids_for_uid_from_map(uint32_t uid, char* fptr, unordered_map >& uid_map ) { + auto it = uid_map.find(uid); + if (it != uid_map.end()) { + return it->second; + } + vector taxids = get_taxids_for_uid(uid, fptr); + uid_map[uid] = taxids; + return(taxids); +} + +int main(int argc, char **argv) { + if (argc < 2) { + 
std::cerr << "Usage: read_uid_mapping []" + "The file is supposed to have lines terminated by '\n'." + << std::endl; + return 1; + } + char *filename = argv[1]; + kraken::QuickFile UID_to_TaxID_map_file; + UID_to_TaxID_map_file.open_file(filename); + + char* fptr = UID_to_TaxID_map_file.ptr(); + if (argc == 2) { + vector< vector > UIDs_to_taxids; + uint32_t UID = 1; + size_t int_size = sizeof(UID); + size_t i = 0; + for (size_t pos = 0; pos < UID_to_TaxID_map_file.size(); pos += 2*int_size) { + uint32_t* taxid_ptr = (uint32_t*)(fptr+pos); + uint32_t* parent_uid = (uint32_t*)(fptr+pos+int_size); + //UIDs_to_taxids.push_back( { UIDs_to_taxids[] } ); + //pos += int_size; + cout << ++i << '\t' << *taxid_ptr << '\t' << *parent_uid << endl; + } + } else { + unordered_map > UID_to_TaxID_map; + for (int i=2; i taxids = get_taxids_for_uid(UID, UID_to_TaxID_map, fptr); + cout << UID << '\t'; + for (auto t : taxids) { + cout << t << ' '; + } + cout << endl; + } + } + + return 0; +} diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 61504d7..1396a7f 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -1,3 +1,4 @@ +// vim: noai:ts=2:sw=2:expandtab:smarttab /* * Copyright 2013-2015, Derrick Wood * @@ -25,6 +26,7 @@ #include "taxdb.h" #include "readcounts.hpp" #include +#include #define SKIP_LEN 50000 @@ -39,7 +41,8 @@ void process_file(string filename, uint32_t taxid); void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish); int Num_threads = 1; -string DB_filename, Index_filename, TaxDB_filename, +string DB_filename, Index_filename, + Output_DB_filename, TaxDB_filename, File_to_taxon_map_filename, ID_to_taxon_map_filename, Multi_fasta_filename; bool force_taxid = false; @@ -50,8 +53,23 @@ bool verbose = false; bool Operate_in_RAM = false; bool One_FASTA_file = false; bool Add_taxIds_for_Sequences = false; +bool Use_uids_instead_of_taxids = false; +bool Output_UID_map_to_STDOUT = false; +bool Pretend = false; +string UID_map_filename; +ofstream UID_map_file; + +uint32_t current_uid = 0; +uint32_t max_uid = -1; unordered_map Parent_map; +//unordered_multimap Children_map; +//typedef std::_Rb_tree_iterator, unsigned int> > map_it; +//typedef std::_Rb_tree_iterator, unsigned int> > map_it; +typedef const vector* map_it; +vector< map_it > UID_to_taxids_vec; +map< vector, uint32_t> Taxids_to_UID_map; + unordered_map ID_to_taxon_map; unordered_map SeqId_added; KrakenDB Database; @@ -65,15 +83,26 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); if (!TaxDB_filename.empty() && !force_taxid) { - taxdb = TaxonomyDB(TaxDB_filename); - for (const auto & tax : taxdb.taxIDsAndEntries) { - if (tax.first != 0) - Parent_map[tax.first] = tax.second.parentTaxonomyID; - } - Parent_map[1] = 0; + taxdb = TaxonomyDB(TaxDB_filename); + for (const auto & tax : taxdb.taxIDsAndEntries) { + if (tax.first != 0) + Parent_map[tax.first] = tax.second.parentTaxonomyID; +// Children_map[tax.second.parentTaxonomyID].insert(tax.first); + } + Parent_map[1] = 0; } else { - cerr << "TaxDB argument is required!" << endl; - return 1; + cerr << "TaxDB argument is required!" << endl; + return 1; + } + + if (Use_uids_instead_of_taxids) { + UID_map_file.open(UID_map_filename, ios_base::out | ios_base::binary); + + if (!UID_map_file.is_open()) { + cerr << "Something went wrong while creating the file." 
<< endl; + exit(1); + } + } QuickFile db_file(DB_filename, "rw"); @@ -90,6 +119,14 @@ int main(int argc, char **argv) { Database = KrakenDB(temp_ptr); cerr << "done" << endl; } else { + if (Output_DB_filename.size() > 0) { + cerr << "You need to operate in RAM (flag -M) to use output to a different file (flag -o)" << endl; + return 1; + } + //std::ifstream ifs("input.txt", std::ios::binary); + //std::ofstream ofs("output.txt", std::ios::binary); + //ofs << ifs.rdbuf(); + Database = KrakenDB(db_file.ptr()); } @@ -104,15 +141,22 @@ int main(int argc, char **argv) { else process_files(); - if (Operate_in_RAM) { + if (Operate_in_RAM && !Pretend) { + if (Output_DB_filename.size() > 0) { + DB_filename = Output_DB_filename; + } + cerr << "Writing database from RAM back to " << DB_filename << " ..." << endl; ofstream ofs(DB_filename.c_str(), ofstream::binary); ofs.write(temp_ptr, db_file_size); ofs.close(); delete temp_ptr; } + UID_map_file.close(); - if (Add_taxIds_for_Sequences && !TaxDB_filename.empty()) { + // Write new TaxDB file if new taxids were added + if (Add_taxIds_for_Sequences && !TaxDB_filename.empty() && !Pretend) { + cerr << "Writing new TaxDB ..." << endl; ofstream ofs(TaxDB_filename.c_str()); taxdb.writeTaxonomyIndex(ofs); ofs.close(); @@ -171,7 +215,10 @@ void process_single_file() { uint32_t taxid; string prefix = "kraken:taxid|"; if (dna.id.substr(0,prefix.size()) == prefix) { - taxid = std::atoi(dna.id.substr(prefix.size()).c_str()); + taxid = std::stol(dna.id.substr(prefix.size())); + if (taxid == 0) { + cerr << "Error: taxid is zero for the line '" << dna.id << "'?!" << endl; + } const auto strBegin = dna.header_line.find_first_not_of("\t "); if (strBegin != std::string::npos) dna.header_line = dna.header_line.substr(strBegin); @@ -181,9 +228,9 @@ void process_single_file() { if (Add_taxIds_for_Sequences) { auto entryIt = taxdb.taxIDsAndEntries.find(taxid); - if (entryIt == taxdb.taxIDsAndEntries.end()) { + if (entryIt == taxdb.taxIDsAndEntries.end()) { cerr << "Error! Didn't find " << taxid << " in TaxonomyDB!!" << endl; - } else { + } else { entryIt->second.scientificName = dna.header_line; } } @@ -195,10 +242,9 @@ void process_single_file() { ++seqs_processed; } else { - if (verbose) - cerr << "Skipping sequence with header [" << dna.header_line << "] - no taxid" << endl; - - ++seqs_no_taxid; + if (verbose) + cerr << "Skipping sequence with header [" << dna.header_line << "] - no taxid" << endl; + ++seqs_no_taxid; } cerr << "\rProcessed " << seqs_processed << " sequences"; @@ -255,7 +301,7 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { continue; val_ptr = Database.kmer_query( Database.canonical_representation(*kmer_ptr) - ); + ); if (val_ptr == NULL) { if (! Allow_extra_kmers) { errx(EX_DATAERR, "kmer found in sequence that is not in database"); @@ -265,10 +311,69 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { } continue; } - if (!force_taxid) - *val_ptr = lca(Parent_map, taxid, *val_ptr); - else - *val_ptr = taxid; + if (Use_uids_instead_of_taxids) { + uint32_t kmer_uid = *val_ptr; + bool new_taxid = kmer_uid == 0; + vector taxid_set; + if (new_taxid) { + taxid_set.push_back(taxid); + } else { + if (kmer_uid > UID_to_taxids_vec.size()) { + // This can happen when set_lcas is called on a database that is not all zeros + cerr << "kmer_uid ("<< kmer_uid <<") greater than UID vector size ("<< UID_to_taxids_vec.size()<<")!!" 
<< endl; + exit(1); + } + taxid_set = *(UID_to_taxids_vec.at(kmer_uid-1)); + auto it = std::lower_bound( taxid_set.begin(), taxid_set.end(), taxid); // find proper position in descending order + + if (it == taxid_set.end() || *it != taxid) { + // add the taxid to the set, in the right position + taxid_set.insert( it, taxid ); // insert before iterator it + new_taxid = true; + } + } + + if (new_taxid) { + if (max_uid <= current_uid) { + cerr << "Maxxed out on the UIDs!!" << endl; + exit(1); + } + + // get a new taxid for this set + #pragma omp critical(new_uid) + { + auto insert_res = Taxids_to_UID_map.insert( { std::move(taxid_set), current_uid + 1 } ); + if (insert_res.second) { + ++current_uid; + + // print result for map: + if (Output_UID_map_to_STDOUT) { + auto tid_it = insert_res.first->first.begin(); + cout << current_uid << '\t' << *tid_it++; + while (tid_it != insert_res.first->first.end()) { cout << ' ' << *tid_it++; } + cout << '\n'; + } + + // FORMAT: TAXID PARENT + // TODO: Consider using mmap here + UID_map_file.write((char*)&taxid, sizeof(taxid)); + UID_map_file.write((char*)&kmer_uid, sizeof(kmer_uid)); + + //UID_to_taxids_vec[current_uid] = taxid_set; + UID_to_taxids_vec.push_back( &(insert_res.first->first) ); + *val_ptr = current_uid; + } else { + *val_ptr = insert_res.first->second; + } + } + } + } else if (!force_taxid) { + *val_ptr = lca(Parent_map, taxid, *val_ptr); + } else { + // When force_taxid is set, do not compute lca, but assign the taxid + // of the (last) sequence to k-mers + *val_ptr = taxid; + } } } @@ -278,11 +383,18 @@ void parse_command_line(int argc, char **argv) { if (argc > 1 && strcmp(argv[1], "-h") == 0) usage(0); - while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTvb:a")) != -1) { + while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTvb:apI:o:S")) != -1) { switch (opt) { case 'f' : File_to_taxon_map_filename = optarg; break; + case 'I' : + Use_uids_instead_of_taxids = true; + UID_map_filename = optarg; + break; + case 'S' : + Output_UID_map_to_STDOUT = true; + break; case 'd' : DB_filename = optarg; break; @@ -324,7 +436,12 @@ void parse_command_line(int argc, char **argv) { case 'M' : Operate_in_RAM = true; break; - + case 'o' : + Output_DB_filename = optarg; + break; + case 'p' : + Pretend = true; + break; default: usage(); break; @@ -353,12 +470,16 @@ void usage(int exit_code) { << "* -b filename Taxonomy DB file" << endl << " -t # Number of threads" << endl << " -M Copy DB to RAM during operation" << endl + << " -o filename Output database to filename, instead of overwriting the input database" << endl << " -x K-mers not found in DB do not cause errors" << endl << " -f filename File to taxon map" << endl << " -F filename Multi-FASTA file with sequence data" << endl << " -m filename Sequence ID to taxon map" << endl << " -a Add taxonomy IDs (starting with "< -uint64_t string_to_T(string str) { - stringstream stream(str); - T result; - stream >> result; - return result; -} - -template -inline -uint64_t reads(const T read_count) { - cerr << "No reads function for type!! 
" << endl; - throw ; - return(0); -} - - - -inline -uint64_t reads(const uint64_t read_count) { - return(read_count); -} - -std::vector in_betweens(const std::string &s, const char start_char, const char end_char, size_t start_at) { - std::vector tokens; - size_t i = 0; - size_t next_end = start_at-1; - - for (size_t next_start = s.find(start_char, next_end + 1); \ - next_start != string::npos; - next_start = s.find(start_char, next_end + 1), ++i) { - - next_end = s.find(end_char, next_start + 1); - if (next_end == string::npos) - throw std::runtime_error("unmatched start and end!"); - - tokens.push_back(s.substr(next_start+1, next_end-1)); - } - - return tokens; -} - - - -std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields, size_t end_chars) { - std::vector tokens(max_fields); - size_t delim_length = delimiter.length(); - size_t last = 0; - size_t i = 0; - - for (size_t next = s.find(delimiter, last); - (max_fields > 0 && i < max_fields) && next != string::npos; - next = s.find(delimiter, last), ++i) { - tokens[i] = s.substr(last, next-last); - last = next + delim_length; - } - if (max_fields > 0 && i < max_fields) { - tokens[max_fields-1] = s.substr(last, s.length()-last-end_chars); - } - - return tokens; -} - -std::vector get_fields(const std::string &s, const std::string& delimiter, vector fields) { - std::vector tokens; - tokens.reserve(fields.size()); - size_t delim_length = delimiter.length(); - size_t last = 0; - size_t i = 0; - size_t current_field = 0; - - for (size_t next = s.find(delimiter, last); - tokens.size() < fields.size() && next != string::npos; - next = s.find(delimiter, last), ++i) { - if (i == fields[current_field]) { - tokens.push_back(s.substr(last, next-last)); - ++current_field; - } - last = next + delim_length; - } - - return tokens; -} - - - -//template<> -//TaxonomyEntry::TaxonomyEntry () { -// read_counts = 0; -// read_counts_children = 0; -//} -template -bool TaxonomyEntryPtr_comp::operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { - return ((reads(a->read_counts)+reads(a->read_counts_children)) > (reads(b->read_counts)+reads(b->read_counts_children))); - } - - -template -std::unordered_map TaxonomyDB::getScientificNameMap() const { - std::unordered_map scientificNameMap; - for (const auto & tax : taxIDsAndEntries) { - scientificNameMap[tax.second.scientificName] = tax.first; - } - return scientificNameMap; -} - -template -unordered_map TaxonomyDB::getParentMap() const { - unordered_map Parent_map; - for (const auto & tax : taxIDsAndEntries) { - if (tax.first != 0) - Parent_map[tax.first] = tax.second.parentTaxonomyID; - } - Parent_map[1] = 1; - return Parent_map; -} - -template -void TaxonomyDB::createPointers() { - for (auto& tax : taxIDsAndEntries) { - if (tax.second.parentTaxonomyID != tax.first) { - auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); - if (parentIt != taxIDsAndEntries.end()) { - tax.second.parent = &(parentIt->second); - parentIt->second.children.push_back(&tax.second); - } - } - } -} - -template -TaxonomyDB::TaxonomyDB() { } - -template -TaxonomyDB::TaxonomyDB(const std::string inFileName) { - log_msg("Building taxonomy index"); - readTaxonomyIndex(inFileName); - createPointers(); - log_msg("Built a taxonomy tree with " + std::to_string(taxIDsAndEntries.size()) + - " nodes"); -} - -template -void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { - std::ifstream nodesDumpFile(nodesDumpFileName); - if (!nodesDumpFile.is_open()) - throw 
std::runtime_error("unable to open nodes file"); - std::string line; - - TAXID taxonomyID; - TAXID parentTaxonomyID; - std::string rank; - - while (nodesDumpFile.good()) { - getline(nodesDumpFile, line); - std::vector tokens = tokenise(line, "\t|\t", 3, 2); - if (tokens.size() < 3) { - continue; - } - - taxonomyID = string_to_T(tokens[0]); - parentTaxonomyID = string_to_T(tokens[1]); - rank = tokens[2]; - - auto entryIt = taxIDsAndEntries.find(taxonomyID); - if (entryIt == taxIDsAndEntries.end()) { - taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); - } else { - entryIt->second.parentTaxonomyID = parentTaxonomyID; - entryIt->second.rank = rank; - } - } -} - -template -void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { - std::ifstream namesDumpFile(namesDumpFileName); - if (!namesDumpFile.is_open()) - throw std::runtime_error("unable to open names file"); - std::string line; - - TAXID taxonomyID; - std::string scientificName; - while (namesDumpFile.good()) { - getline(namesDumpFile, line); - std::vector tokens = tokenise(line, "\t|\t", 4, 2); - if (tokens.size() < 4 || tokens[3] != "scientific name") { - continue; - } - taxonomyID = string_to_T(tokens[0]); - scientificName = tokens[1]; - - auto entryIt = taxIDsAndEntries.find(taxonomyID); - if (entryIt == taxIDsAndEntries.end()) { - taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, scientificName); - } else { - entryIt->second.scientificName = scientificName; - } - } -} - -template -void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs, - const std::string namesDumpFileName, - const std::string nodesDumpFileName) { - parseNodesDump(nodesDumpFileName); - parseNamesDump(namesDumpFileName); - writeTaxonomyIndex(outs); -} - -template -std::vector getSortedKeys(const std::unordered_map& unordered) { - std::vector keys; - keys.reserve (unordered.size()); - for (auto& it : unordered) { - keys.push_back(it.first); - } - std::sort (keys.begin(), keys.end()); - return keys; -} - -template -void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { - for (TAXID& key : getSortedKeys(taxIDsAndEntries)) { - const auto& entry = taxIDsAndEntries.at(key); - outs << key << "\t" << entry.parentTaxonomyID << "\t" - << entry.scientificName << "\t" << entry.rank << "\n"; - } -} - - - -template -void TaxonomyDB::readTaxonomyIndex(const std::string inFileName) { - std::ifstream inFile(inFileName); - if (!inFile.is_open()) - throw std::runtime_error("unable to open taxonomy index file " + inFileName); - - TAXID taxonomyID, parentTaxonomyID; - std::string scientificName, rank; - - std::string line; - while (!inFile.eof()) { - inFile >> taxonomyID >> parentTaxonomyID; - inFile.get(); // read tab - std::getline(inFile, scientificName, '\t'); - std::getline(inFile, rank, '\n'); - TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName); - - //cerr << "inserting " << taxonomyID << ";" << parentTaxonomyID << ";" << rank << ";" << scientificName << endl; - taxIDsAndEntries.insert({ - taxonomyID, newEntry - }); - } - taxIDsAndEntries.insert({ - 0, {0, 0, "no rank", "unclassified" } - }); -} - -template -TAXID TaxonomyDB::getLowestCommonAncestor( - const std::vector& taxIDs) const { - if (taxIDs.size() == 0) { - return 0; - } - std::vector > paths; - for (auto& taxID : taxIDs) { - bool good = true; - std::vector path; - TAXID tempTaxID = taxID; - while (tempTaxID != 0) { - path.push_back(tempTaxID); - tempTaxID = getParentTaxID(tempTaxID); - } - if (good) paths.push_back(path); - } - if 
(paths.size() == 0) { - return 0; - } - for (auto& path : paths) - std::reverse(path.begin(), path.end()); - std::sort(paths.begin(), paths.end(), - [](std::vector i, std::vector j) { - return i.size() < j.size(); - }); - TAXID consensus = 0; - for (unsigned i = 0; i < paths[0].size(); i++) { - TAXID temp = 0; - for (auto& path : paths) { - if (temp == 0) - temp = path[i]; - else if (temp != path[i]) { - return consensus; - } - } - consensus = temp; - } - return consensus; -} - -template -TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { - auto entry = taxIDsAndEntries.find(taxID); - if (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) - return entry->second.parentTaxonomyID; - else - return 0; -} - -template -std::string TaxonomyDB::getScientificName(const TAXID taxID) const { - auto entry = taxIDsAndEntries.find(taxID); - if (entry != taxIDsAndEntries.end()) { - return entry->second.scientificName; - } else - return std::string(); -} - -template -std::string TaxonomyDB::getRank(const TAXID taxID) const { - auto entry = taxIDsAndEntries.find(taxID); - if (entry != taxIDsAndEntries.end()) { - return entry->second.rank; - } else - return std::string(); -} - -template -std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { - std::string lineage; - while (true) { - // 131567 = Cellular organisms - if (taxonomyID != 131567) { - if (lineage.size()) lineage.insert(0, "; "); - lineage.insert(0, getScientificName(taxonomyID)); - if (getRank(taxonomyID) == "species") lineage.clear(); - } - taxonomyID = getParentTaxID(taxonomyID); - if (taxonomyID == 0) { - if (lineage.size()) lineage.append("."); - break; - } - } - return lineage; -} - -template -std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { - std::string rank = getRank(taxonomyID); - if (rank == "superphylum") return std::string(); - std::string lineage; - while (true) { - // 131567 = Cellular organisms - if (taxonomyID != 131567) { - std::string rank = getRank(taxonomyID); - if (rank == "species") { - lineage.insert(0, "|s__"); - lineage.insert(4, getScientificName(taxonomyID)); - } else if (rank == "genus") { - lineage.insert(0, "|g__"); - lineage.insert(4, getScientificName(taxonomyID)); - } else if (rank == "family") { - lineage.insert(0, "|f__"); - lineage.insert(4, getScientificName(taxonomyID)); - } else if (rank == "order") { - lineage.insert(0, "|o__"); - lineage.insert(4, getScientificName(taxonomyID)); - } else if (rank == "class") { - lineage.insert(0, "|c__"); - lineage.insert(4, getScientificName(taxonomyID)); - } else if (rank == "phylum") { - lineage.insert(0, "|p__"); - lineage.insert(4, getScientificName(taxonomyID)); - } else if (rank == "superkingdom") { - lineage.insert(0, "k__"); - lineage.insert(3, getScientificName(taxonomyID)); - } - } - taxonomyID = getParentTaxID(taxonomyID); - if (taxonomyID == 0) { - break; - } - } - std::replace(lineage.begin(), lineage.end(), ' ', '_'); - return lineage; -} - -template -TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, - const std::string& rank) const { - auto entry = taxIDsAndEntries.find(taxID); - while (entry != taxIDsAndEntries.end() && - entry->second.parentTaxonomyID != 1) { - if (entry->second.rank == rank) { - return entry->second.taxonomyID; - } else - entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); - } - return 0; -} - -template -int TaxonomyDB::isBelowInTree(TAXID upper, TAXID lower) const { - auto entry = taxIDsAndEntries.find(lower); - unsigned level = 0; - while (entry != 
taxIDsAndEntries.end() && - entry->second.parentTaxonomyID != 1) { - if (entry->first == upper) { - return level; - } else { - entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); - level++; - } - } - return -1; -} - -template -bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { - bool isSubSpecies = false; - auto entry = taxIDsAndEntries.find(taxonomyID); - int numLevels = 0; - while (entry != taxIDsAndEntries.end() && - entry->second.parentTaxonomyID != 1) { - if (entry->second.rank == "species") { - if (numLevels > 0) { - isSubSpecies = true; - } - break; - } else - entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); - numLevels++; - } - return isSubSpecies; -} - -template -void TaxonomyDB::addCounts(const TAXID taxid, const READCOUNTS& read_counts_) { - auto it = taxIDsAndEntries.find(taxid); - if (it == taxIDsAndEntries.end()) { - cerr << "No taxonomy entry for " << taxid << "!!" << endl; - return; - } - TaxonomyEntry* tax = &it->second; - //cerr << taxid << " rc before: " << tax->read_counts << endl; - tax->read_counts += read_counts_; - //cerr << taxid << " rc after: " << tax->read_counts << endl; - - while (tax->parent != nullptr) { - tax = tax->parent; - tax->read_counts_children += read_counts_; - } -} - -template -void TaxonomyDB::fillCounts(const unordered_map& taxon_counts) { - for (auto& elem : taxon_counts) { - addCounts(elem.first, elem.second); - } - - for (auto& tax : taxIDsAndEntries) { - std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); - } -} - - -template -TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { - _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_UNIQUE_KMERS, REPORTCOLS::NUM_KMERS, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; -} - -template -void TaxReport::printReport(std::string format, std::string rank) { - _total_n_reads = - reads(_taxdb.taxIDsAndEntries.at(0).read_counts) + - reads(_taxdb.taxIDsAndEntries.at(0).read_counts_children) + - reads(_taxdb.taxIDsAndEntries.at(1).read_counts) + - reads(_taxdb.taxIDsAndEntries.at(1).read_counts_children);// + - if (_total_n_reads == 0) { - std::cerr << "total number of reads is zero - not creating a report!" << endl; - return; - } - - if (format == "kraken") { - // A: print number of unidentified reads - printReport(_taxdb.taxIDsAndEntries.at(0),0u); - // B: print normal results - printReport(_taxdb.taxIDsAndEntries.at(1),0u); - // C: Print Unclassified stuff - //printReport(_taxdb.taxIDsAndEntries.at(-1),0u); - } else { - // print stuff at a certain level .. - //_uid_abundance; - //_taxinfo - - } -} - -template -void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { - if (_show_zeros || (reads(tax.read_counts)+reads(tax.read_counts_children)) > 0) { - printLine(tax, depth); - for (auto child : tax.children) - printReport(*child, depth+1); - } -} - -template -void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { - for (auto& col : _report_cols) { - switch (col) { - case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; - case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; - case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; - case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.read_counts) + reads(tax.read_counts_children))/_total_n_reads; break; - //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; - //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; - case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.read_counts) + reads(tax.read_counts_children)); break; - case REPORTCOLS::NUM_READS: _reportOfb << (tax.read_counts); break; - //case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.kmers.cardinality(); break; - //case REPORTCOLS::NUM_KMERS: _reportOfb << tax.numKmers; break; - //case REPORTCOLS::GENOME_SIZE: ; break; - //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; - //case REPORTCOLS::SUM_SCORE: ; break; - case REPORTCOLS::TAX_RANK: _reportOfb << tax.rank; break; - default: _reportOfb << "NA"; - } - if (&col == &_report_cols.back()) { - _reportOfb << '\n'; - } else { - _reportOfb << '\t'; - } - } -} - - - // Return lowest common ancestor of a and b - // LCA(0,x) = LCA(x,0) = x - // Default ancestor is 1 (root of tree) -uint32_t lca(unordered_map &parent_map, uint32_t a, uint32_t b) - { - if (a == 0 || b == 0) - return a ? a : b; - - // create a path from a to the root - std::unordered_set a_path; - while (a > 0 && a != parent_map[a]) { - if (a == b) - return a; - a_path.insert(a); - a = parent_map[a]; - } - - // search for b in the path from a to the root - while (b > 0 && b != parent_map[b]) { - if (a_path.count(b) > 0) - return b; - b = parent_map[b]; - } - return 1; - } - -template -inline -V find_or_use_default(const std::unordered_map& my_map, const K& query, const V default_value) { - auto itr = my_map.find(query); - - if (itr == my_map.end()) { - return default_value; - } - - return itr->second; -} - - - diff --git a/src/taxdb.h b/src/taxdb.h index 3b825f2..7c94f33 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -101,16 +101,21 @@ class TaxonomyDB { TaxonomyDB(const std::string inFileName, bool hasGenomeSizes = false); TaxonomyDB(); void writeTaxonomyIndex(std::ostream & outs) const; + void readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes); TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const; std::string getScientificName(const TAXID taxID) const; std::string getRank(const TAXID taxID) const; TAXID getLowestCommonAncestor(const std::vector& taxIDs) const; + TAXID getParentTaxID(const TAXID taxID) const; std::unordered_map getParentMap() const; std::unordered_map getScientificNameMap() const; std::string getLineage(TAXID taxonomyID) const; std::string getMetaPhlAnLineage(TAXID taxonomyID) const; + TaxonomyEntry getEntry(TAXID taxID) const; + + size_t distance(TAXID taxID1, TAXID taxID2) const; bool isSubSpecies(TAXID taxonomyID) const; int isBelowInTree(TAXID upper, TAXID lower) const; @@ -125,8 +130,9 @@ class TaxonomyDB { std::unordered_map > taxIDsAndEntries; bool genomeSizes_are_set = false; private: - std::unordered_map > - readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes); + + std::unordered_map > + readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes); void parseNamesDump(const std::string namesDumpFileName); void parseNodesDump(const std::string nodesDumpFileName); void createPointers(std::unordered_map >& taxIDsAndEntries); @@ -281,6 +287,17 @@ unordered_map TaxonomyDB::getParentMap() const { return Parent_map; } +template +TaxonomyEntry 
TaxonomyDB::getEntry(TAXID taxID) const { + auto it = taxIDsAndEntries.find(taxID); + if (it == taxIDsAndEntries.end()) { + TaxonomyEntry ti { 0, 0, "NA"}; + return ti; + } else { + return it->second; + } +} + template void TaxonomyDB::createPointers(std::unordered_map >& taxIDsAndEntries) { for (auto& tax : taxIDsAndEntries) { @@ -299,7 +316,7 @@ TaxonomyDB::TaxonomyDB() { } template TaxonomyDB::TaxonomyDB(const std::string inFileName, bool hasGenomeSizes) : - taxIDsAndEntries( readTaxonomyIndex(inFileName, hasGenomeSizes) ), genomeSizes_are_set(hasGenomeSizes) + taxIDsAndEntries( readTaxonomyIndex_(inFileName, hasGenomeSizes) ), genomeSizes_are_set(hasGenomeSizes) { } template @@ -403,9 +420,15 @@ void TaxonomyDB::setGenomeSizes(const std::unordered_map +void TaxonomyDB::readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes) { + taxIDsAndEntries = readTaxonomyIndex_(inFileName, hasGenomeSizes); + genomeSizes_are_set = hasGenomeSizes; +} + template std::unordered_map > - TaxonomyDB::readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes) { + TaxonomyDB::readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes) { log_msg("Reading taxonomy index from " + inFileName); std::ifstream inFile(inFileName); if (!inFile.is_open()) @@ -418,12 +441,12 @@ std::unordered_map > std::string line; while (!inFile.eof()) { - inFile >> taxonomyID >> parentTaxonomyID; - inFile.get(); // read tab - std::getline(inFile, scientificName, '\t'); + inFile >> taxonomyID >> parentTaxonomyID; + inFile.get(); // read tab + std::getline(inFile, scientificName, '\t'); if (hasGenomeSizes) { std::getline(inFile, rank, '\t'); - inFile >> genomeSize >> genomeSizeOfChildren; + inFile >> genomeSize >> genomeSizeOfChildren; } else { std::getline(inFile, rank, '\n'); } @@ -438,7 +461,7 @@ std::unordered_map > 0, {0, 0, "no rank", "unclassified" } }); createPointers(taxIDsAndEntries); - log_msg("Finished, read " + std::to_string(taxIDsAndEntries.size()) + " taxa"); + log_msg("done reading TaxDB, read " + std::to_string(taxIDsAndEntries.size()) + " taxa"); return(taxIDsAndEntries); } @@ -469,6 +492,7 @@ TAXID TaxonomyDB::getLowestCommonAncestor( return i.size() < j.size(); }); TAXID consensus = 0; + // assumes equal paths lengths?? 
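+  // (It does not: the paths are root-to-leaf and sorted by ascending length,
+  // so the loop below only indexes up to the end of the shortest path and
+  // keeps the deepest position on which all paths still agree.)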
for (unsigned i = 0; i < paths[0].size(); i++) { TAXID temp = 0; for (auto& path : paths) { @@ -574,12 +598,15 @@ template TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, const std::string& rank) const { auto entry = taxIDsAndEntries.find(taxID); + //cerr << "getTaxIDAtRank(" << taxID << "," << rank << ")" << endl; while (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) { + //cerr << "Checking rank of " << entry->second.taxonomyID << ": " << entry->second.rank << endl; if (entry->second.rank == rank) { return entry->second.taxonomyID; - } else + } else { entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + } } return 0; } From 2873a79b35b482eab30d9a196ece730ed631eb63 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 28 Aug 2017 09:54:52 -0400 Subject: [PATCH 037/105] Renamed to KrakenU --- .gitignore | 1 + README.md | 8 ++++---- install_kraken.sh | 4 ++-- scripts/{kraken_hll => krakenu} | 0 ...en_hll-add_to_library.sh => krakenu-add_to_library.sh} | 0 scripts/{kraken_hll-build => krakenu-build} | 2 +- scripts/{kraken_hll-build_db.sh => krakenu-build_db.sh} | 2 +- ...ck_for_jellyfish.sh => krakenu-check_for_jellyfish.sh} | 0 scripts/{kraken_hll-clean_db.sh => krakenu-clean_db.sh} | 0 ...ll-cp_into_tempfile.pl => krakenu-cp_into_tempfile.pl} | 0 ...mic_library.sh => krakenu-download_genomic_library.sh} | 0 ...-download_taxonomy.sh => krakenu-download_taxonomy.sh} | 0 scripts/{kraken_hll-filter => krakenu-filter} | 0 scripts/{kraken_hll-mpa-report => krakenu-mpa-report} | 0 .../{kraken_hll-read_merger.pl => krakenu-read_merger.pl} | 0 scripts/{kraken_hll-report => krakenu-report} | 0 scripts/{kraken_hll-shrink_db.sh => krakenu-shrink_db.sh} | 0 ...d_installation.sh => krakenu-standard_installation.sh} | 8 ++++---- scripts/{kraken_hll-translate => krakenu-translate} | 0 .../{kraken_hll-upgrade_db.sh => krakenu-upgrade_db.sh} | 0 ...-verify_gi_numbers.pl => krakenu-verify_gi_numbers.pl} | 0 src/Makefile | 2 +- 22 files changed, 14 insertions(+), 13 deletions(-) rename scripts/{kraken_hll => krakenu} (100%) rename scripts/{kraken_hll-add_to_library.sh => krakenu-add_to_library.sh} (100%) rename scripts/{kraken_hll-build => krakenu-build} (99%) rename scripts/{kraken_hll-build_db.sh => krakenu-build_db.sh} (99%) rename scripts/{kraken_hll-check_for_jellyfish.sh => krakenu-check_for_jellyfish.sh} (100%) rename scripts/{kraken_hll-clean_db.sh => krakenu-clean_db.sh} (100%) rename scripts/{kraken_hll-cp_into_tempfile.pl => krakenu-cp_into_tempfile.pl} (100%) rename scripts/{kraken_hll-download_genomic_library.sh => krakenu-download_genomic_library.sh} (100%) rename scripts/{kraken_hll-download_taxonomy.sh => krakenu-download_taxonomy.sh} (100%) rename scripts/{kraken_hll-filter => krakenu-filter} (100%) rename scripts/{kraken_hll-mpa-report => krakenu-mpa-report} (100%) rename scripts/{kraken_hll-read_merger.pl => krakenu-read_merger.pl} (100%) rename scripts/{kraken_hll-report => krakenu-report} (100%) rename scripts/{kraken_hll-shrink_db.sh => krakenu-shrink_db.sh} (100%) rename scripts/{kraken_hll-standard_installation.sh => krakenu-standard_installation.sh} (83%) rename scripts/{kraken_hll-translate => krakenu-translate} (100%) rename scripts/{kraken_hll-upgrade_db.sh => krakenu-upgrade_db.sh} (100%) rename scripts/{kraken_hll-verify_gi_numbers.pl => krakenu-verify_gi_numbers.pl} (100%) diff --git a/.gitignore b/.gitignore index 500b4a0..d6ff918 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /install/ +/Debug/ diff --git a/README.md 
b/README.md index a31d87c..83ae11b 100644 --- a/README.md +++ b/README.md @@ -33,13 +33,13 @@ Here's a small example of a classification against a viral database with k=25. T ## Usage -For usage, see `kraken_hll --help`. Note that you can use the same database as Kraken with one difference - instead of the files `DB_DIR/taxonomy/nodes.dmp` and `DB_DIR/taxonomy/names.dmp` than kraken relies upon, `kraken-hll` needs the file `DB_DIR/taxDB`. This can be generated with the script `build_taxdb`: `KRAKEN_DIR/build_taxdb DB_DIR/taxonomy/names.dmp DB_DIR/taxonomy/nodes.dmp > DB_DIR/taxDB`. The code behind the taxDB is based on [k-SLAM](https://github.com/aindj/k-SLAM). +For usage, see `krakenu --help`. Note that you can use the same database as Kraken with one difference - instead of the files `DB_DIR/taxonomy/nodes.dmp` and `DB_DIR/taxonomy/names.dmp` than kraken relies upon, `kraken-hll` needs the file `DB_DIR/taxDB`. This can be generated with the script `build_taxdb`: `KRAKEN_DIR/build_taxdb DB_DIR/taxonomy/names.dmp DB_DIR/taxonomy/nodes.dmp > DB_DIR/taxDB`. The code behind the taxDB is based on [k-SLAM](https://github.com/aindj/k-SLAM). ### Differences to `kraken` - - Use `kraken_hll --report-file FILENAME ...` to write the kraken report to `FILENAME`. - - Use `kraken_hll --db DB1 --db DB2 --db DB3 ...` to first attempt, for each k-mer, to assign it based on DB1, then DB2, then DB3. You can use this to prefer identifications based on DB1 (e.g. human and contaminant sequences), then DB2 (e.g. completed bacterial genomes), then DB3, etc. Note that this option is incompatible with `kraken_hll-build --generate-taxonomy-ids-for-sequences` since the taxDB between the databases has to be absolutely the same. + - Use `krakenu --report-file FILENAME ...` to write the kraken report to `FILENAME`. + - Use `krakenu --db DB1 --db DB2 --db DB3 ...` to first attempt, for each k-mer, to assign it based on DB1, then DB2, then DB3. You can use this to prefer identifications based on DB1 (e.g. human and contaminant sequences), then DB2 (e.g. completed bacterial genomes), then DB3, etc. Note that this option is incompatible with `krakenu-build --generate-taxonomy-ids-for-sequences` since the taxDB between the databases has to be absolutely the same. - Add a suffix `.gz` to output files to generate gzipped output files ### Differences to `kraken-build` - - Use `kraken_hll-build --generate-taxonomy-ids-for-sequences ...` to add pseudo-taxonomy IDs for each sequence header. An example for the result using this is in the ouput above - one read has been assigned specifically to `KC207814.1 Human herpesvirus 4 strain Mutu, complete genome`. + - Use `krakenu-build --generate-taxonomy-ids-for-sequences ...` to add pseudo-taxonomy IDs for each sequence header. An example for the result using this is in the ouput above - one read has been assigned specifically to `KC207814.1 Human herpesvirus 4 strain Mutu, complete genome`. - `seqid2taxid.map` mapping sequence IDs to taxonomy IDs does NOT parse or require `>gi|`, but rather the sequence ID is the header up to just before the first space diff --git a/install_kraken.sh b/install_kraken.sh index b909336..f6f5701 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -39,7 +39,7 @@ fi export KRAKEN_DIR=$(perl -MCwd=abs_path -le 'print abs_path(shift)' "$1") mkdir -p "$KRAKEN_DIR" -make -C src clean +#make -C src clean make -C src install for file in scripts/* do @@ -58,7 +58,7 @@ echo "Kraken installation complete." 
echo echo "To make things easier for you, you may want to copy/symlink the following" echo "files into a directory in your PATH:" -for file in $KRAKEN_DIR/kraken_hll* +for file in $KRAKEN_DIR/krakenu* do [ -x "$file" ] && echo " $file" done diff --git a/scripts/kraken_hll b/scripts/krakenu similarity index 100% rename from scripts/kraken_hll rename to scripts/krakenu diff --git a/scripts/kraken_hll-add_to_library.sh b/scripts/krakenu-add_to_library.sh similarity index 100% rename from scripts/kraken_hll-add_to_library.sh rename to scripts/krakenu-add_to_library.sh diff --git a/scripts/kraken_hll-build b/scripts/krakenu-build similarity index 99% rename from scripts/kraken_hll-build rename to scripts/krakenu-build index 8367fdd..9df965e 100755 --- a/scripts/kraken_hll-build +++ b/scripts/krakenu-build @@ -298,7 +298,7 @@ sub build_database { $ENV{"KRAKEN_REBUILD_DATABASE"} = (defined $rebuild? 1 : 0); $ENV{"KRAKEN_ADD_TAXIDS_FOR_SEQ"} = $add_taxonomy_ids_for_seq; my $opt = ($verbose? "-x" : ""); - exec "kraken_hll-build_db.sh $opt"; + exec "krakenu-build_db.sh"; } sub clean_database { diff --git a/scripts/kraken_hll-build_db.sh b/scripts/krakenu-build_db.sh similarity index 99% rename from scripts/kraken_hll-build_db.sh rename to scripts/krakenu-build_db.sh index 402dc45..e3d38ad 100755 --- a/scripts/kraken_hll-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -42,7 +42,7 @@ script_dir=`dirname $0` DATABASE_DIR="$KRAKEN_DB_NAME" FIND_OPTS=-L -JELLYFISH_BIN=`$script_dir/kraken_hll-check_for_jellyfish.sh` +JELLYFISH_BIN=`$script_dir/krakenu-check_for_jellyfish.sh` if [ ! -d "$DATABASE_DIR" ] then diff --git a/scripts/kraken_hll-check_for_jellyfish.sh b/scripts/krakenu-check_for_jellyfish.sh similarity index 100% rename from scripts/kraken_hll-check_for_jellyfish.sh rename to scripts/krakenu-check_for_jellyfish.sh diff --git a/scripts/kraken_hll-clean_db.sh b/scripts/krakenu-clean_db.sh similarity index 100% rename from scripts/kraken_hll-clean_db.sh rename to scripts/krakenu-clean_db.sh diff --git a/scripts/kraken_hll-cp_into_tempfile.pl b/scripts/krakenu-cp_into_tempfile.pl similarity index 100% rename from scripts/kraken_hll-cp_into_tempfile.pl rename to scripts/krakenu-cp_into_tempfile.pl diff --git a/scripts/kraken_hll-download_genomic_library.sh b/scripts/krakenu-download_genomic_library.sh similarity index 100% rename from scripts/kraken_hll-download_genomic_library.sh rename to scripts/krakenu-download_genomic_library.sh diff --git a/scripts/kraken_hll-download_taxonomy.sh b/scripts/krakenu-download_taxonomy.sh similarity index 100% rename from scripts/kraken_hll-download_taxonomy.sh rename to scripts/krakenu-download_taxonomy.sh diff --git a/scripts/kraken_hll-filter b/scripts/krakenu-filter similarity index 100% rename from scripts/kraken_hll-filter rename to scripts/krakenu-filter diff --git a/scripts/kraken_hll-mpa-report b/scripts/krakenu-mpa-report similarity index 100% rename from scripts/kraken_hll-mpa-report rename to scripts/krakenu-mpa-report diff --git a/scripts/kraken_hll-read_merger.pl b/scripts/krakenu-read_merger.pl similarity index 100% rename from scripts/kraken_hll-read_merger.pl rename to scripts/krakenu-read_merger.pl diff --git a/scripts/kraken_hll-report b/scripts/krakenu-report similarity index 100% rename from scripts/kraken_hll-report rename to scripts/krakenu-report diff --git a/scripts/kraken_hll-shrink_db.sh b/scripts/krakenu-shrink_db.sh similarity index 100% rename from scripts/kraken_hll-shrink_db.sh rename to scripts/krakenu-shrink_db.sh diff --git 
a/scripts/kraken_hll-standard_installation.sh b/scripts/krakenu-standard_installation.sh similarity index 83% rename from scripts/kraken_hll-standard_installation.sh rename to scripts/krakenu-standard_installation.sh index 341e4e0..e10254b 100755 --- a/scripts/kraken_hll-standard_installation.sh +++ b/scripts/krakenu-standard_installation.sh @@ -31,10 +31,10 @@ then fi check_for_jellyfish.sh -kraken_hll-build --db $KRAKEN_DB_NAME --download-taxonomy -kraken_hll-build --db $KRAKEN_DB_NAME --download-library bacteria -kraken_hll-build --db $KRAKEN_DB_NAME --download-library viruses -kraken_hll-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ +krakenu-build --db $KRAKEN_DB_NAME --download-taxonomy +krakenu-build --db $KRAKEN_DB_NAME --download-library bacteria +krakenu-build --db $KRAKEN_DB_NAME --download-library viruses +krakenu-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ --jellyfish-hash-size "$KRAKEN_HASH_SIZE" \ --max-db-size "$KRAKEN_MAX_DB_SIZE" \ --minimizer-len $KRAKEN_MINIMIZER_LEN \ diff --git a/scripts/kraken_hll-translate b/scripts/krakenu-translate similarity index 100% rename from scripts/kraken_hll-translate rename to scripts/krakenu-translate diff --git a/scripts/kraken_hll-upgrade_db.sh b/scripts/krakenu-upgrade_db.sh similarity index 100% rename from scripts/kraken_hll-upgrade_db.sh rename to scripts/krakenu-upgrade_db.sh diff --git a/scripts/kraken_hll-verify_gi_numbers.pl b/scripts/krakenu-verify_gi_numbers.pl similarity index 100% rename from scripts/kraken_hll-verify_gi_numbers.pl rename to scripts/krakenu-verify_gi_numbers.pl diff --git a/src/Makefile b/src/Makefile index 03f32cb..f721cf4 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ CXX = g++ -CXXFLAGS = -Wall -std=c++11 -fopenmp -O3 +CXXFLAGS = -Wall -std=c++11 -fopenmp -O2 -g PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb LIBFLAGS = -L. 
-I./gzstream -L./gzstream -lz -lgzstream From 607cb0cf4826ec844abe11a6144a35a391f5be9b Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 31 Aug 2017 13:34:36 -0400 Subject: [PATCH 038/105] Put version string in separate file --- VERSION | 1 + install_kraken.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 VERSION diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..49d5957 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.1 diff --git a/install_kraken.sh b/install_kraken.sh index f6f5701..000c9b7 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -19,7 +19,7 @@ set -e -VERSION="0.10.7-kraken-hll" +VERSION=`cat $(dirname $0)/VERSION` if [ -z "$1" ] || [ -n "$2" ] then From 432d6ceb4eb87fb18e1f91529e324129e85940c9 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 31 Aug 2017 14:11:39 -0400 Subject: [PATCH 039/105] Fixed script paths --- scripts/krakenu-build | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/krakenu-build b/scripts/krakenu-build index 9df965e..267fc7d 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenu-build @@ -262,7 +262,7 @@ sub display_version { } sub download_taxonomy { - exec "download_taxonomy.sh"; + exec "krakenu-download_taxonomy.sh"; } sub download_library { @@ -271,12 +271,12 @@ sub download_library { warn "Unknown library type \"$type\"\n"; usage(); } - exec "download_genomic_library.sh", $type; + exec "krakenu-download_genomic_library.sh", $type; } sub add_to_library { my $arg = shift; - exec "add_to_library.sh", $arg; + exec "krakenu-add_to_library.sh", $arg; } sub shrink_db { @@ -287,11 +287,11 @@ sub shrink_db { if (! defined($new_db)) { die "Must specify new database name to perform shrink task\n"; } - exec "shrink_db.sh", $new_count, $new_db, $shrink_block_offset; + exec "krakenu-shrink_db.sh", $new_count, $new_db, $shrink_block_offset; } sub standard_installation { - exec "standard_installation.sh"; + exec "krakenu-standard_installation.sh"; } sub build_database { @@ -302,9 +302,9 @@ sub build_database { } sub clean_database { - exec "clean_db.sh"; + exec "krakenu-clean_db.sh"; } sub upgrade_database { - exec "upgrade_db.sh"; + exec "krakenu-upgrade_db.sh"; } From e52b7e0f8c183b19b11fc09b11cf06ea84456847 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Fri, 22 Sep 2017 11:32:44 -0400 Subject: [PATCH 040/105] Major update for UID mapping and having taxon entries for assemblies --- scripts/krakenu-build | 29 +- scripts/krakenu-build_db.sh | 127 ++++-- scripts/krakenu-download | 431 ++++++++++++++++++++ scripts/krakenu-download_genomic_library.sh | 119 ------ scripts/krakenu-download_taxonomy.sh | 48 --- scripts/krakenu-standard_installation.sh | 6 +- src/Makefile | 6 +- src/classify.cpp | 116 ++---- src/krakendb.cpp | 13 +- src/krakendb.hpp | 3 +- src/krakenutil.cpp | 153 +------ src/krakenutil.hpp | 18 +- src/read_uid_mapping.cpp | 3 +- src/set_lcas.cpp | 188 +++++---- src/taxdb.h | 64 +-- src/uid_mapping.cpp | 196 +++++++++ src/uid_mapping.hpp | 45 ++ 17 files changed, 1017 insertions(+), 548 deletions(-) create mode 100755 scripts/krakenu-download delete mode 100755 scripts/krakenu-download_genomic_library.sh delete mode 100755 scripts/krakenu-download_taxonomy.sh create mode 100644 src/uid_mapping.cpp create mode 100644 src/uid_mapping.hpp diff --git a/scripts/krakenu-build b/scripts/krakenu-build index 267fc7d..2303f76 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenu-build @@ -64,7 +64,12 @@ my ( $upgrade, $clean, + 
$build_uid_database, + $build_lca_database, + + $add_taxonomy_ids_for_genome, $add_taxonomy_ids_for_seq + ); my $verbose = 0; @@ -75,7 +80,10 @@ $kmer_len = $DEF_KMER_LEN; $work_on_disk = ""; $hash_size = ""; $max_db_size = ""; +$add_taxonomy_ids_for_genome = 0; $add_taxonomy_ids_for_seq = 0; +$build_uid_database = 1; +$build_lca_database = 1; # variables corresponding to task options my @TASK_LIST = ( @@ -115,7 +123,10 @@ GetOptions( "clean" => \$clean, "verbose" => \$verbose, - "generate-taxonomy-ids-for-sequences" => \$add_taxonomy_ids_for_seq + "taxids-for-genomes" => \$add_taxonomy_ids_for_genome, + "taxids-for-sequences" => \$add_taxonomy_ids_for_seq, + "lca-database!" => \$build_lca_database, + "uid-database!" => \$build_uid_database ) or usage(); if (@ARGV) { @@ -241,12 +252,17 @@ Options: --shrink-block-offset NUM When shrinking, select the k-mer that is NUM positions from the end of a block of k-mers (default: 1) + --uid-database Build a UID database (default no) + --lca-database Build a LCA database (default yes) + --no-lca-database Do not build a LCA database --work-on-disk Perform most operations on disk rather than in RAM (will slow down build in most cases) - --generate-taxonomy-ids-for-sequences - Generate taxonomy IDs for sequences, starting with 1000000000. - Can be useful to resolve classifications with multiple genomes - for one taxonomy ID. + --taxids-for-genomes Add taxonomy IDs (starting with 1bio) for genomes. + Only works with 3-column seqid2taxid map with third + column being the name + --taxids-for-sequences Add taxonomy IDs for sequences, starting with 1bio. + Can be useful to resolve classifications with multiple genomes + for one taxonomy ID. EOF exit $exit_code; } @@ -297,6 +313,9 @@ sub standard_installation { sub build_database { $ENV{"KRAKEN_REBUILD_DATABASE"} = (defined $rebuild? 1 : 0); $ENV{"KRAKEN_ADD_TAXIDS_FOR_SEQ"} = $add_taxonomy_ids_for_seq; + $ENV{"KRAKEN_ADD_TAXIDS_FOR_GENOME"} = $add_taxonomy_ids_for_genome; + $ENV{"KRAKEN_UID_DATABASE"} = $build_uid_database; + $ENV{"KRAKEN_LCA_DATABASE"} = $build_lca_database; my $opt = ($verbose? "-x" : ""); exec "krakenu-build_db.sh"; } diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index e3d38ad..aebff74 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -1,4 +1,5 @@ #!/bin/bash +#vim: noai:ts=2:sw=2 # Copyright 2013-2015, Derrick Wood # @@ -23,9 +24,9 @@ set -u # Protect against uninitialized vars. set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands -set -x function report_time_elapsed() { + set -x curr_time=$(date "+%s.%N") perl -e '$time = $ARGV[1] - $ARGV[0];' \ -e '$sec = int($time); $nsec = $time - $sec;' \ @@ -37,12 +38,25 @@ function report_time_elapsed() { $1 $curr_time } +export VERBOSE=1 + +function cmd () { + export start_time1=$(date "+%s.%N") + if [[ $VERBOSE -eq 1 ]]; then + echo "EXECUTING $@" + fi + $@ +} + + start_time=$(date "+%s.%N") script_dir=`dirname $0` DATABASE_DIR="$KRAKEN_DB_NAME" FIND_OPTS=-L JELLYFISH_BIN=`$script_dir/krakenu-check_for_jellyfish.sh` +NCBI_SERVER="ftp.ncbi.nih.gov" +FTP_SERVER="ftp://$NCBI_SERVER" if [ ! -d "$DATABASE_DIR" ] then @@ -72,7 +86,7 @@ fi N_FILES=`cat library-files.txt | wc -l` echo "Found $N_FILES sequence files (*.{fna,fa,ffn} in the library)" -if [ -e "database.jdb" ] +if [ -e "database.jdb" ] || [ -e "database0.kdb" ] then echo "Skipping step 1, k-mer set already exists." 
else @@ -150,18 +164,19 @@ else fi fi -if [ -e "database.kdb" ] +SORTED_DB_NAME=database0.kdb +if [ -e "$SORTED_DB_NAME" ] then echo "Skipping step 3, k-mer set already sorted." else echo "Sorting k-mer set (step 3 of 6)..." start_time1=$(date "+%s.%N") db_sort -z $MEMFLAG -t $KRAKEN_THREAD_CT -n $KRAKEN_MINIMIZER_LEN \ - -d database.jdb -o database.kdb.tmp \ + -d database.jdb -o $SORTED_DB_NAME.tmp \ -i database.idx # Once here, DB is sorted, can put file in proper place. - mv database.kdb.tmp database.kdb + mv $SORTED_DB_NAME.tmp $SORTED_DB_NAME echo "K-mer set sorted. [$(report_time_elapsed $start_time1)]" fi @@ -180,35 +195,99 @@ else echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" fi -if [ -e "lca.complete" ] + +if [ -s "taxDB" ] then - echo "Skipping step 5, LCAs already set." + echo "Skipping step 5, taxDB exists." else - echo "Setting LCAs in database (step 5 of 6)..." - PARAM="" - if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ]]; then - echo " Adding taxonomy IDs for sequences" - PARAM=" -a" - fi + echo "Creating taxDB (step 5 of 6)... " start_time1=$(date "+%s.%N") - cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ - set_lcas $MEMFLAG -x -d database.kdb -i database.idx -v \ - -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 - touch "lca.complete" + if [ ! -f taxonomy/names.dmp ] || [ ! -f taxonomy/nodes.dmp ]; then + echo "taxonomy/names.dmp or taxonomy/nodes.dmp does not exist - downloading it ..." + [ -d taxonomy ] || mkdir taxonomy + cd taxonomy + wget $FTP_SERVER/pub/taxonomy/taxdump.tar.gz + tar zxf taxdump.tar.gz + cd .. + fi + build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp | sort -t$'\t' -rnk6,6 -rnk5,5 > taxDB.tmp + mv taxDB.tmp taxDB + echo "taxDB construction finished. [$(report_time_elapsed $start_time1)]" +fi + +if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then + if [ -e "database.kdb" ] + then + echo "Skipping step 6, LCAs already set." + else + echo "Building standard Kraken LCA database (step 6 of 6)..." + PARAM="" + if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ]]; then + echo " Adding taxonomy IDs for sequences" + PARAM=" -a" + fi + if [[ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]]; then + echo " Adding taxonomy IDs for genomes" + PARAM="$PARAM -A" + fi + start_time1=$(date "+%s.%N") + set -x + cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ + set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -o database.kdb -i database.idx -v \ + -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c database.kmer_count \ + -F /dev/fd/0 > seqid2taxid-plus.map - echo "Database LCAs set. [$(report_time_elapsed $start_time1)]" + ## Make a classification report + krakenu --db . --report-file $(basename `pwd`).report --threads 10 --fasta-input library/archaea.fna > $(basename `pwd`).kraken + set +x + if [ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ] || [ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]; then + mv seqid2taxid.map seqid2taxid.map.orig + mv seqid2taxid-plus.map seqid2taxid.map + fi + echo "LCA database created. [$(report_time_elapsed $start_time1)]" + fi fi -if [ -s "taxDB" ] + +if [ "$KRAKEN_UID_DATABASE" != "0" ]; then + if [ -e "uid_database.complete" ] + then + echo "Skipping step 6.3, UIDs already set." + else + echo "Building UID database (step 6.3 of 6)..." 
+      PARAM=""
+      if [[ "$KRAKEN_LCA_DATABASE" == "0" ]]; then
+        if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ]]; then
+          echo " Adding taxonomy IDs for sequences"
+          PARAM=" -a"
+        fi
+        if [[ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]]; then
+          echo " Adding taxonomy IDs for genomes"
+          PARAM="$PARAM -A"
+        fi
+      fi
+      start_time1=$(date "+%s.%N")
+      cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \
+        set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -I uid_to_taxid.map -o uid_database.kdb -i database.idx -v \
+        -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0
+      touch "uid_database.complete"
+
+      echo "UID Database created. [$(report_time_elapsed $start_time1)]"
+    fi
+fi
+
+if [ -s "uid_database.kmer_count" ]
 then
-  echo "Skipping step 6, taxDB exists."
+  echo "Skipping step 6.4, uid_database.kmer_count exists."
 else
-  echo "Creating taxDB (step 6 of 6)... "
-  time $JELLYFISH_BIN histo --high 100000000 database.kdb | tee database.taxon_count
-  build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp database.taxon_count | sort -t$'\t' -rnk6,6 -rnk5,5 > taxDB.tmp
-  mv taxDB.tmp taxDB
+  echo "Creating uid_database.kmer_count (step 6.4 of 6)... "
+  start_time1=$(date "+%s.%N")
+  time $JELLYFISH_BIN histo --high 100000000 uid_database.kdb > uid_database.kmer_count
+  echo "uid_database.kmer_count finished. [$(report_time_elapsed $start_time1)]"
 fi
 
 echo "Database construction complete. [Total: $(report_time_elapsed $start_time)]
 You can delete all files but database.{kdb,idx} and taxDB now, if you want"
+
+
diff --git a/scripts/krakenu-download b/scripts/krakenu-download
new file mode 100755
index 0000000..f3aa4bd
--- /dev/null
+++ b/scripts/krakenu-download
@@ -0,0 +1,431 @@
+#!/usr/bin/env perl
+
+# krakenu-download.pl - based on centrifuge-download
+# (c) Florian Breitwieser, 2017
+
+use strict;
+use warnings;
+use File::Basename;
+use File::Fetch;
+use File::Copy;
+use File::Path qw/make_path remove_tree/;
+use IO::Uncompress::Gunzip qw/gunzip $GunzipError/;
+use autodie;
+use Term::ANSIColor;
+use Getopt::Long;
+use Parallel::ForkManager;
+
+sub download_taxonomy(@);
+sub download_contaminats(@);
+sub download(@);
+sub print_header_lines(@);
+sub download_domain(@);
+sub download_viral_neighbors(@);
+
+my $FTP="ftp://ftp.ncbi.nih.gov";
+my @ALL_GENOMES=qw/bacteria viral archaea fungi protozoa invertebrate plant vertebrate_mammalian vertebrate_other/;
+my @ALL_DATABASES=qw/refseq genbank taxonomy contaminants/;
+my @ALL_ASSEMBLY_LEVELS=("Complete Genome", "Chromosome", "Scaffold", "Contig");
+my @SMALL_GENOMES=qw/mitochondrion plasmid plastid/;
+
+## Option parsing
+my $DATABASE="refseq";
+my $ASSEMBLY_LEVEL="Complete Genome";
+my $REFSEQ_CATEGORY;
+my $TAXID;
+
+my $BASE_DIR;
+my $DB_DIR;
+my $N_PROC=5;
+my $CHANGE_HEADER=0;
+my $DOWNLOAD_RNA=0;
+my $DO_DUST=0;
+my $FILTER_UNPLACED=0;
+my $VERBOSE=0;
+my $OVERWRITE_FILES=0;
+my $INCLUDE_VIRAL_NEIGHBORS=0;
+my $DOMAINS;
+my $DL_MOD_RSYNC;
+
+my %ac_to_taxid;
+my $downloaded_viral_refseq=0;
+my $FNA_FILES="genomic";
+
+my $USAGE="\n".basename($0).
+" [<options>] <database>*
+
+ARGUMENT
+ <database>        One of refseq, genbank, contaminants or taxonomy:
+   - contaminants gets contaminant sequences from UniVec and EmVec,
+   - taxonomy for taxonomy mappings.
+   - use refseq or genbank for genomic sequences,
+   - refseq and genbank can be followed by '/DOMAIN' or '/DOMAIN/ASS_LEVEL', e.g.
+   - refseq/archaea, refseq/viral/Any, or genbank/bacteria
+   - if ASS_LEVEL is not given, the default is used
+
+COMMON OPTIONS
+ -o <directory>    Folder to which the files are downloaded. Default: '.'
+ --db <directory>  Alternative to -o: Download to <directory>/{library,taxonomy}.
+ -P <# of threads> Number of processes when downloading (uses Parallel::ForkManager). Default: '$N_PROC'
+ --rsync, -R       Download using rsync.
+ --overwrite       Redownload and overwrite files with the same name.
+ -v                Verbose.
+
+WHEN USING database refseq OR genbank:
+ -d <domain>       What domain to download. One or more of @ALL_GENOMES (comma separated).
+ -a <level>        Only download genomes with the specified assembly level. Default: '$ASSEMBLY_LEVEL'. Use 'Any' for any assembly level.
+ -c <category>     Only download genomes in the specified refseq category. Default: any.
+ -t <taxids>       Only download the specified taxonomy IDs, comma separated. Default: any.
+ --fna <types>     Comma-separated list of sequence types, including genomic, rna, rna_from_genomic, cds_from_genomic. Default: $FNA_FILES.
+ -u                Filter unplaced sequences.
+ -m                Mask low-complexity regions using dustmasker.
+ -l                Modify sequence header to include taxonomy ID for Kraken (i.e. add '>kraken:taxid|TAXID' to each sequence).
+ --include-viral-neighbors  Include neighbors for viral genomes as defined at https://www.ncbi.nlm.nih.gov/genome/viruses/.
+                   Only works if refseq viral is downloaded in the same session!
+";
+
+# Parse command-line options with Getopt::Long
+Getopt::Long::Configure('no_auto_abbrev','pass_through');
+GetOptions(
+  "output|o=s" => \$BASE_DIR,
+  "db=s" => \$DB_DIR,
+  "threads|P=i" => \$N_PROC,
+  "domain|d=s" => \$DOMAINS,
+  "assembly-level|a=s" => \$ASSEMBLY_LEVEL,
+  "category|c=s" => \$REFSEQ_CATEGORY,
+  "taxonomy-id|t=s" => \$TAXID,
+  "fna=s" => \$FNA_FILES,
+  "rsync|R" => \$DL_MOD_RSYNC,
+  "include-viral-neighbors" => \$INCLUDE_VIRAL_NEIGHBORS,
+  "filter-unplaced|u" => \$FILTER_UNPLACED,
+  "mask|m" => \$DO_DUST,
+  "change-header|l" => \$CHANGE_HEADER,
+  "overwrite|force" => \$OVERWRITE_FILES,
+  "verbose|v" => \$VERBOSE) or die "Error in command line arguments";
+
+if (!defined $ARGV[0] || $ARGV[0] !~ /refseq|genbank|taxonomy|contaminants/) {
+  print STDERR $USAGE;
+  exit 1;
+}
+
+if (defined $BASE_DIR && defined $DB_DIR) {
+  print STDERR "Define either --db or -o, not both!\n";
+  exit 1;
+}
+
+# Use current directory as base directory
+$BASE_DIR = "." unless defined $DB_DIR || defined $BASE_DIR;
+
+# If DB directory is defined, use that as base directory
+# -- kept -o and --db options to allow the use of either Kraken and Centrifuge type command line
+my $add_dir = defined $DB_DIR;
+$BASE_DIR = $DB_DIR if defined $DB_DIR;
+sub get_dir {
+  my ($dir, $name) = @_;
+  my $dir1 = $add_dir? "$dir/$name" : $dir;
+  make_path $dir1;
+  return $dir1;
+}
+
+my $pm = Parallel::ForkManager->new($N_PROC);
+$pm->run_on_finish(sub {
+    my ($pid, $exit_code, $indent, $exit_signal, $core_dump, $data) = @_;
+    if (defined $data) {
+      @ac_to_taxid{keys %$data} = values %$data;
+    }
+}
+);
+
+my %select_taxonomy_ids;
+if (defined $TAXID) {
+  %select_taxonomy_ids = map { $_ => 1 } split(/,/, $TAXID);
+}
+
+if (!defined $ARGV[0]) {
+  print STDERR $USAGE;
+  exit 1;
+}
+
+foreach my $DATABASE (@ARGV) {
+  if ( $DATABASE eq "taxonomy" ) {
+    download_taxonomy(get_dir($BASE_DIR,"taxonomy"));
+  } elsif ( $DATABASE eq "contaminants" ) {
+    download_contaminats(get_dir($BASE_DIR,"library/contaminants"));
+  } elsif ( $DATABASE =~ /^refseq/ || $DATABASE =~ /^genbank/ ) {
+    my ($db, $domain, @levels) = split(/\//, $DATABASE);
+    if (!defined $domain) {
+      foreach my $domain (split(/,/,$DOMAINS)) {
+        my $lib_dir = $add_dir?
"$BASE_DIR/library/$domain" : "$BASE_DIR/$domain"; + download_domain($lib_dir, $domain, $ASSEMBLY_LEVEL); + } + } else { + my $lib_dir = $add_dir? "$BASE_DIR/library/$domain" : "$BASE_DIR/$domain"; + my $level = $ASSEMBLY_LEVEL; + my $taxid = $TAXID; + foreach (@levels) { + if (/taxid(.*)/) { + $taxid = $1; + } else { + $level = $_; + } + } + download_domain($lib_dir, $domain, $level, $taxid); + } + } else { + print STDERR "Unknown database $DATABASE. \n"; + print STDERR $USAGE; + exit 1; + } +} + +if ($INCLUDE_VIRAL_NEIGHBORS) { + if (!$downloaded_viral_refseq) { + print STDERR "--include-viral-neighbors only works when RefSeq viral is downloaded in the same session!"; + } else { + my $lib_dir = $add_dir? "$BASE_DIR/library/viral-neighbors" : "$BASE_DIR/viral-neighbors"; + download_viral_neighbors($lib_dir); + } +} + + + +######################################################### +## Functions + +sub download(@) { + my ($url, $file) = @_; + if (-f $file && !$OVERWRITE_FILES) { + print STDERR "Not fetching $url - file $file exists.\n" if $VERBOSE; + return 1; + } + + $url =~ s/https/http/; + + if ( $DL_MOD_RSYNC && $url =~ /^ftp/ ) { + $url =~ s/^ftp/rsync/; + } + + print STDERR "Fetching $url to $file ...\n" if $VERBOSE; + my $ff = File::Fetch->new(uri=>"$url"); + my $where = $ff->fetch(to=> dirname($file)) or die $ff->error; + #my $where = $ff->fetch(to=> dirname($file)) or die "\n$ff->error for $url!"; + move($where, $file); + return -f $file; +} + +sub download_viral_neighbors(@) { + my ($dir) = @_; + print STDERR "Downloading viral neighbors into $dir ...\n"; + my $url = "https://www.ncbi.nlm.nih.gov/genomes/GenomesGroup.cgi?taxid=10239&cmd=download2"; + my $nbr_file = "$dir/viral_neighbors-taxid10239.nbr"; + download($url, $nbr_file); + open(my $F, "<", $nbr_file); + my @file = <$F>; + close($F); + my $i = 0; + my $n_genomes = scalar @file; + + foreach (@file) { + next if /^#/; + ++$i; + print STDERR "\r Downloading viral neighbor sequence $i/$n_genomes ..." unless $VERBOSE; + my $pid = $pm->start and next; + my ($rep_acs, $nbr_ac, undef, undef, $nname, $sname) = split /\t/; + my ($name, $taxid); + foreach my $rep_ac (split (/,/, $rep_acs)) { + if (defined $ac_to_taxid{$rep_ac}) { + ($name, $taxid) = @{$ac_to_taxid{$rep_ac}}; + last; + } + } + if (!defined $taxid) { + print STDERR "No mapping for viral neighbor $nbr_ac [rep: $rep_acs, $nname]!\n"; + $pm->finish(0); + next; + } + (my $name1 = $name) =~ s/[^a-zA-Z0-9_]/_/g; + $name1 =~ s/__/_/g; + my $file = "$dir/$name1-tax$taxid/$nbr_ac.fna"; + my $url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&retmode=text&id=$nbr_ac"; + if (download($url,$file)) { + print_header_lines($file, $taxid, "$nname $sname neighbor $nbr_ac"); + } + ## TODO: dust viral neighbors + $pm->finish(0); + } + print STDERR "\n"; + $pm->wait_all_children(); +} + +sub print_header_lines(@) { + my ($file, $taxid, $name, $map_ref) = @_; + #return if -f "$file.map"; + open (my $F, ">", "$file.map"); + open (my $G, "<", $file); + while (<$G>) { + next unless /^>([^ ]*)/; + my $ac = $1; + print $F "$ac\t$taxid\t$name\n"; + $ac =~ s/\.[0-9]*$//; + $map_ref->{$ac} = [$name, $taxid] if defined $map_ref; + #$ac_to_taxid{$ac} = [$name, $taxid] if $downloaded_viral_refseq && $INCLUDE_VIRAL_NEIGHBORS; + } + close($G); + close($F); +} + +sub download_contaminats(@) { + my ($CONTAMINANT_DIR) = @_; + print STDERR "Downloading contaminant databases ... 
\n"; + my $CONTAMINANT_TAXID=32630; + make_path $CONTAMINANT_DIR; + + # download UniVec and EmVec database + download("ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec","$CONTAMINANT_DIR/UniVec.fna"); + download("ftp://ftp.ebi.ac.uk/pub/databases/emvec/emvec.dat.gz","$CONTAMINANT_DIR/emvec.dat.gz"); + + open(my $E1, "|-", "gunzip -c emvec.dat.gz"); + open(my $E2, ">", "$CONTAMINANT_DIR/EmVec.fna"); + + my ($ac,$de); + my $in_seq = 0; + while(<$E1>) { + if (/^AC\s+(.*)/) { + $ac = $1; + $ac =~ s/;$//; + } elsif (/^DE\s+(.*)/) { + $de = $1; + } elsif (/^SQ/) { + $in_seq = 1; + print $E2 ">$ac $de\n"; + print "$ac\t$CONTAMINANT_TAXID\tEmVec\n"; + } elsif ($in_seq) { + if (/^\s+[agct]/) { + s/\s+[0-9]+$//; + s/ //g; + print $_; + } else { + $in_seq = 0; + } + } + } + close($E2); + close($E1); + unlink("emvec.dat.gz"); + + if ( $CHANGE_HEADER ) { + system("sed -i 's/^>/>taxid|$CONTAMINANT_TAXID /' $CONTAMINANT_DIR/UniVec.fna"); + system("sed -i 's/^>/>taxid|$CONTAMINANT_TAXID /' $CONTAMINANT_DIR/EmVec.fna"); + } else { + print_header_lines("$CONTAMINANT_DIR/UniVec.fna", $CONTAMINANT_TAXID, "UniVec"); + } +} + +sub download_taxonomy(@) { + my ($dir) = @_; + print STDERR "Downloading NCBI taxonomy ... \n"; + make_path $dir; + + download("$FTP/pub/taxonomy/taxdump.tar.gz", "$dir/taxdump.tar.gz"); + system("tar -C $dir -zxvf $dir/taxdump.tar.gz nodes.dmp names.dmp 1>&2"); + system("date > $dir/timestamp"); +} + +sub download_domain(@) { + my ($domain_dir, $domain, $_assembly_level, $_taxid) = @_; + print STDERR "Downloading assembly summary file for $domain genomes.\n"; + die unless defined $domain_dir && defined $domain; + if (-d $domain_dir) { + print STDERR "WARNING: $domain_dir already exists - potentially overwriting files.\n"; + } else { + make_path $domain_dir; + } + my $ass_file = "$domain_dir/assembly_summary.txt"; + my $ass_file_filtered = "$domain_dir/assembly_summary_filtered.txt"; + my $n_genomes = 0; + download("ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/$domain/assembly_summary.txt", $ass_file) or die "Could not download assembly summary file!"; + + $downloaded_viral_refseq =1 if $domain eq "viral"; + + my @genomes_to_dl; + open(my $A1, "<", $ass_file); + open(my $A2, ">", $ass_file_filtered); + while (<$A1>) { + next if /^#/; + my ($assembly_accession, $bioproject, $biosample, $wgs_master, $refseq_category, + $taxid, $species_taxid, $organism_name, $infraspecific_name, $isolate, $version_status, + $assembly_level, $release_type, $genome_rep, $seq_rel_date, $asm_name, $submitter, + $gbrs_paired_asm, $paired_asm_comp, $ftp_path, $excluded_from_refseq, $relation_to_type_material) = split /\t/; + + next unless $version_status eq "latest"; + next if ($_assembly_level ne "Any" && $assembly_level ne $_assembly_level); + next if (defined $REFSEQ_CATEGORY && $refseq_category ne $REFSEQ_CATEGORY); + next if (defined $_taxid && $taxid ne $_taxid); + print $A2 $_; + ++ $n_genomes; + push @genomes_to_dl, [$ftp_path, $taxid, $organism_name, $infraspecific_name, $assembly_accession]; + } + close $A2; + close $A1; + + my $i = 0; + foreach my $g (@genomes_to_dl) { + my ($ftp_path, $taxid, $organism_name, $infraspecific_name, $assembly_accession) = @$g; + ++$i; + + if (defined $infraspecific_name) { + (my $i1 = $infraspecific_name) =~ s/strain=//; + $organism_name .= " $infraspecific_name" unless $organism_name =~ /$i1/ || $i1 eq ""; + } + + print STDERR "\r Downloading $domain genomes: $i/$n_genomes ..." 
unless $VERBOSE; + + my $bname = basename($ftp_path); + ( my $organism_name1 = $organism_name ) =~ s/[^a-zA-Z0-9_]/_/g; + $organism_name1 = substr($organism_name1, 0, 100); + $organism_name1 =~ s/__/_/g; + $organism_name1 =~ s/_$//; + my $bname1 = "${organism_name1}-tax${taxid}-${bname}"; + my $pid = $pm->start and next; + my %local_ac_to_taxid; + + foreach my $ext (split(/,/, $FNA_FILES)) { + my $full_ftp_path = "$ftp_path/${bname}_${ext}.fna.gz"; + my $bfname = $bname1."_".$ext; + my $fname = $bfname.".fna"; + + if (!$OVERWRITE_FILES && -f "$domain_dir/$fname") { + print STDERR "$domain_dir/$fname exists - not downloading.. \n" if $VERBOSE; + } else { + download($full_ftp_path, "$domain_dir/$fname.gz"); + gunzip "$domain_dir/$fname.gz" => "$domain_dir/$fname" or die "gunzip failed: $GunzipError"; + unlink "$domain_dir/$fname.gz"; + } + + if ($CHANGE_HEADER) { + system("sed -i 's/^>/>kraken:taxid|$taxid /' '$domain_dir/$fname'"); + } + if ($FILTER_UNPLACED) { + ## Not implemented yet! + } + + ## Output sequenceID to taxonomy ID map to STDOUT + + if ($domain eq "viral" && $INCLUDE_VIRAL_NEIGHBORS) { + print_header_lines("$domain_dir/$fname", $taxid, "$organism_name $assembly_accession", \%local_ac_to_taxid); + } else { + print_header_lines("$domain_dir/$fname", $taxid, "$organism_name $assembly_accession"); + } + + if ($DO_DUST) { + ## TODO: Consider hard-masking only low-complexity stretches with 10 or more bps + system("dustmasker -infmt fasta -in '$domain_dir/$fname' -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]/N/g' > '$domain_dir/${bfname}_dustmasked.fna'"); + unlink("$domain_dir/$fname"); + } + } + $pm->finish(0, \%local_ac_to_taxid); + } + + $pm->wait_all_children; + print STDERR "\n"; +} diff --git a/scripts/krakenu-download_genomic_library.sh b/scripts/krakenu-download_genomic_library.sh deleted file mode 100755 index b1a7f13..0000000 --- a/scripts/krakenu-download_genomic_library.sh +++ /dev/null @@ -1,119 +0,0 @@ -#!/bin/bash - -# Copyright 2013-2015, Derrick Wood -# -# This file is part of the Kraken taxonomic sequence classification system. -# -# Kraken is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Kraken is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Kraken. If not, see . - -# Download specific genomic libraries for use with Kraken. -# Supported choices are: -# bacteria - NCBI RefSeq complete bacterial/archaeal genomes -# plasmids - NCBI RefSeq plasmid sequences -# viruses - NCBI RefSeq complete viral DNA and RNA genomes -# human - NCBI RefSeq GRCh38 human reference genome - -set -u # Protect against uninitialized vars. -set -e # Stop on error - -LIBRARY_DIR="$KRAKEN_DB_NAME/library" -NCBI_SERVER="ftp.ncbi.nih.gov" -FTP_SERVER="ftp://$NCBI_SERVER" -RSYNC_SERVER="rsync://$NCBI_SERVER" -THIS_DIR=$PWD - -case "$1" in - "bacteria") - mkdir -p $LIBRARY_DIR/Bacteria - cd $LIBRARY_DIR/Bacteria - if [ ! -e "lib.complete" ] - then - rm -f all.fna.tar.gz - wget $FTP_SERVER/genomes/Bacteria/all.fna.tar.gz - echo -n "Unpacking..." - tar zxf all.fna.tar.gz - rm all.fna.tar.gz - echo " complete." 
- touch "lib.complete" - else - echo "Skipping download of bacterial genomes, already downloaded here." - fi - ;; - "plasmids") - mkdir -p $LIBRARY_DIR/Plasmids - cd $LIBRARY_DIR/Plasmids - if [ ! -e "lib.complete" ] - then - rm -f plasmids.all.fna.tar.gz - wget $FTP_SERVER/genomes/Plasmids/plasmids.all.fna.tar.gz - echo -n "Unpacking..." - tar zxf plasmids.all.fna.tar.gz - rm plasmids.all.fna.tar.gz - echo " complete." - touch "lib.complete" - else - echo "Skipping download of plasmids, already downloaded here." - fi - ;; - "viruses") - mkdir -p $LIBRARY_DIR/Viruses - cd $LIBRARY_DIR/Viruses - if [ ! -e "lib.complete" ] - then - rm -f all.fna.tar.gz - rm -f all.ffn.tar.gz - wget $FTP_SERVER/genomes/Viruses/all.fna.tar.gz - wget $FTP_SERVER/genomes/Viruses/all.ffn.tar.gz - echo -n "Unpacking..." - tar zxf all.fna.tar.gz - tar zxf all.ffn.tar.gz - rm all.fna.tar.gz - rm all.ffn.tar.gz - echo " complete." - touch "lib.complete" - else - echo "Skipping download of viral genomes, already downloaded here." - fi - ;; - "human") - mkdir -p $LIBRARY_DIR/Human - cd $LIBRARY_DIR/Human - if [ ! -e "lib.complete" ] - then - # get list of CHR_* directories - wget --spider --no-remove-listing $FTP_SERVER/genomes/H_sapiens/ - directories=$(perl -nle '/^d/ and /(CHR_\w+)\s*$/ and print $1' .listing) - rm .listing - - # For each CHR_* directory, get GRCh* fasta gzip file name, d/l, unzip, and add - for directory in $directories - do - wget --spider --no-remove-listing $FTP_SERVER/genomes/H_sapiens/$directory/ - file=$(perl -nle '/^-/ and /\b(hs_ref_GRCh\S+\.fa\.gz)\s*$/ and print $1' .listing) - [ -z "$file" ] && exit 1 - rm .listing - wget $FTP_SERVER/genomes/H_sapiens/$directory/$file - gunzip "$file" - done - - touch "lib.complete" - else - echo "Skipping download of human genome, already downloaded here." - fi - ;; - *) - echo "Unsupported library. Valid options are: " - echo " bacteria plasmids virus human" - ;; -esac diff --git a/scripts/krakenu-download_taxonomy.sh b/scripts/krakenu-download_taxonomy.sh deleted file mode 100755 index fc27842..0000000 --- a/scripts/krakenu-download_taxonomy.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -# Copyright 2013-2015, Derrick Wood -# -# This file is part of the Kraken taxonomic sequence classification system. -# -# Kraken is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Kraken is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Kraken. If not, see . - -# Download NCBI taxonomy information for Kraken. -# Designed to be called by kraken_build - -set -u # Protect against uninitialized vars. -set -e # Stop on error - -TAXONOMY_DIR="$KRAKEN_DB_NAME/taxonomy" -NCBI_SERVER="ftp.ncbi.nih.gov" -FTP_SERVER="ftp://$NCBI_SERVER" -THIS_DIR=$PWD - -mkdir -p "$TAXONOMY_DIR" -cd "$TAXONOMY_DIR" - -if [ ! -e "nucl_gb.accession2taxid.flag" ] -then - wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz - time gunzip -c nucl_gb.accession2taxid.gz | cut -f 2,3 | sort -k 1,1 > nucl_gb.accession2taxid.sorted - touch nucl_gb.accession2taxid.flag - echo "Downloaded and sorted GB to taxon map" -fi - -if [ ! 
-e "taxdump.flag" ] -then - wget $FTP_SERVER/pub/taxonomy/taxdump.tar.gz - tar zxf taxdump.tar.gz - touch taxdump.flag - echo "Downloaded and uncompressed taxonomy tree data" -fi diff --git a/scripts/krakenu-standard_installation.sh b/scripts/krakenu-standard_installation.sh index e10254b..815d482 100755 --- a/scripts/krakenu-standard_installation.sh +++ b/scripts/krakenu-standard_installation.sh @@ -31,9 +31,9 @@ then fi check_for_jellyfish.sh -krakenu-build --db $KRAKEN_DB_NAME --download-taxonomy -krakenu-build --db $KRAKEN_DB_NAME --download-library bacteria -krakenu-build --db $KRAKEN_DB_NAME --download-library viruses +krakenu-download -o $KRAKEN_DB_NAME/taxonomy --download-taxonomy +krakenu-download -o $KRAKEN_DB_NAME/library -d archaea,bacteria refseq > $KRAKEN_DB_NAME/seqid2taxid.map +krakenu-download -o $KRAKEN_DB_NAME/library -d viral -a Any refseq >> $KRAKEN_DB_NAME/seqid2taxid.map krakenu-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ --jellyfish-hash-size "$KRAKEN_HASH_SIZE" \ --max-db-size "$KRAKEN_MAX_DB_SIZE" \ diff --git a/src/Makefile b/src/Makefile index f721cf4..82246e9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ CXX = g++ -CXXFLAGS = -Wall -std=c++11 -fopenmp -O2 -g +CXXFLAGS = -Wall -std=c++11 -fopenmp -O2 -g -Wfatal-errors PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream @@ -17,9 +17,9 @@ db_shrink: krakendb.o quickfile.o db_sort: krakendb.o quickfile.o -set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o +set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.cpp -classify: krakendb.o quickfile.o krakenutil.o seqreader.o +classify: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.cpp $(CXX) $(CXXFLAGS) -o classify classify.cpp $^ $(LIBFLAGS) build_taxdb: taxdb.h diff --git a/src/classify.cpp b/src/classify.cpp index 990012f..b5e196f 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -25,6 +25,7 @@ #include "readcounts.hpp" #include "taxdb.h" #include "gzstream.h" +#include "uid_mapping.hpp" #include const size_t DEF_WORK_UNIT_SIZE = 500000; @@ -39,7 +40,6 @@ void process_file(char *filename); bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, unordered_map&); -string hitlist_string(vector &taxa, vector &ambig); set get_ancestry(uint32_t taxon); void report_stats(struct timeval time1, struct timeval time2); unordered_map taxon_counts; // stats per taxon @@ -79,6 +79,7 @@ static vector KrakenDatabases (DB_filenames.size()); uint64_t total_classified = 0; uint64_t total_sequences = 0; uint64_t total_bases = 0; +uint32_t ambig_taxon = -1; inline bool ends_with(std::string const & value, std::string const & ending) { @@ -117,35 +118,6 @@ void loadKrakenDB(KrakenDB& database, string DB_filename, string Index_filename) database.set_index(&db_index); } -vector get_taxids_for_uid(uint32_t uid, char* fptr) { - size_t int_size = sizeof(int); - size_t block_size = sizeof(int)*2; - // TODO: Just get a uint64_t and shift the bits, probably faster - uint32_t taxid = *(uint32_t*)(fptr+(uid-1)*block_size); - uint32_t parent_uid = *(uint32_t*)(fptr+(uid-1)*block_size + int_size); - - vector taxids = {taxid}; - while (parent_uid != 0) { - taxid = *(uint32_t*)(fptr+(parent_uid-1)*block_size); - parent_uid = *(uint32_t*)(fptr+(parent_uid-1)*block_size + int_size); - taxids.push_back(taxid); - } - std::sort(taxids.begin(), taxids.end()); - return(taxids); -} - -vector 
get_taxids_for_uid_from_map(uint32_t uid, char* fptr, unordered_map >& uid_map ) { - auto it = uid_map.find(uid); - if (it != uid_map.end()) { - return it->second; - } - vector taxids = get_taxids_for_uid(uid, fptr); - uid_map[uid] = taxids; - return(taxids); -} - - - int main(int argc, char **argv) { #ifdef _OPENMP omp_set_num_threads(1); @@ -161,9 +133,11 @@ int main(int argc, char **argv) { cerr << "Reading UID mapping file " << UID_to_TaxID_map_filename << endl; UID_to_TaxID_map_file.open_file(UID_to_TaxID_map_filename); - if (Populate_memory) { - UID_to_TaxID_map_file.load_file(); - } + + // Always Populate memory + //if (Populate_memory) { + UID_to_TaxID_map_file.load_file(); + //} } if (!TaxDB_file.empty()) { @@ -376,12 +350,27 @@ uint32_t get_taxon_for_kmer(KrakenDB& database, uint64_t* kmer_ptr, uint64_t& cu return taxon; } +inline +void append_hitlist_string(string& hitlist_string, uint32_t& last_taxon, uint32_t& last_counter, uint32_t current_taxon) { + if (last_taxon == current_taxon) { + ++last_counter; + } else { + if (last_counter > 0) { + if (last_taxon == ambig_taxon) { + hitlist_string += "A:" + std::to_string(last_counter) + ' '; + } else { + hitlist_string += std::to_string(last_taxon) + ':' + std::to_string(last_counter) + ' '; + } + } + last_counter = 1; + last_taxon = current_taxon; + } +} + bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, unordered_map& my_taxon_counts) { // TODO: use vector::reserve - vector taxa; - vector ambig_list; unordered_map hit_counts; uint64_t *kmer_ptr; uint32_t taxon = 0; @@ -394,6 +383,10 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, int64_t current_max_pos = 0; }; + string hitlist_string; + uint32_t last_taxon; + uint32_t last_counter; + vector db_statuses(KrakenDatabases.size()); if (dna.seq.size() >= KrakenDatabases[0]->get_k()) { @@ -401,10 +394,9 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, while ((kmer_ptr = scanner.next_kmer()) != NULL) { taxon = 0; if (scanner.ambig_kmer()) { - ambig_list.push_back(1); + append_hitlist_string(hitlist_string, last_taxon, last_counter, ambig_taxon); } else { - ambig_list.push_back(0); // go through multiple databases to map k-mer for (size_t i=0; i &taxa, vector &ambig) -{ - int64_t last_code; - int code_count = 1; - ostringstream hitlist; - - if (ambig[0]) { last_code = -1; } - else { last_code = taxa[0]; } - - for (size_t i = 1; i < taxa.size(); i++) { - int64_t code; - if (ambig[i]) { code = -1; } - else { code = taxa[i]; } - - if (code == last_code) { - code_count++; - } - else { - if (last_code >= 0) { - hitlist << last_code << ":" << code_count << " "; - } - else { - hitlist << "A:" << code_count << " "; - } - code_count = 1; - last_code = code; - } - } - if (last_code >= 0) { - hitlist << last_code << ":" << code_count; - } - else { - hitlist << "A:" << code_count; - } - return hitlist.str(); -} - set get_ancestry(uint32_t taxon) { set path; diff --git a/src/krakendb.cpp b/src/krakendb.cpp index ec9927c..f89f869 100644 --- a/src/krakendb.cpp +++ b/src/krakendb.cpp @@ -69,15 +69,18 @@ KrakenDB::KrakenDB(char *ptr) { key_len = key_bits / 8 + !! (key_bits % 8); } -std::unordered_map KrakenDB::count_taxons() { - throw std::runtime_error("count_taxons() is not working"); - // Not working currently!! 
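For reference, the count_taxons() rewrite here walks the database's key/value pair block and tallies the 4-byte taxon value that follows each fixed-width k-mer key; returning a std::map instead of an unordered_map means the histogram comes out sorted by taxon ID. A minimal, self-contained sketch of that pair walk, with the key length and pair count passed in directly rather than read from the database header (names are illustrative, not the actual KrakenDB API):

    // Tally how many k-mers map to each taxon by walking a raw block of
    // back-to-back (key, uint32 taxon) pairs, as in a Kraken-style database.
    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <map>
    #include <vector>

    std::map<uint32_t, uint64_t> count_taxons_raw(const char *pairs, size_t pair_ct,
                                                  size_t key_len) {
      const size_t val_len = sizeof(uint32_t);
      const size_t pair_sz = key_len + val_len;
      std::map<uint32_t, uint64_t> counts;  // std::map keeps taxon IDs sorted
      for (size_t i = 0; i < pair_ct; i++) {
        uint32_t taxon;
        std::memcpy(&taxon, pairs + i * pair_sz + key_len, val_len);  // value follows key
        ++counts[taxon];
      }
      return counts;
    }

    int main() {
      // Two 8-byte keys, each followed by a 4-byte taxon value; both map to 9606.
      std::vector<char> buf(2 * (8 + 4), 0);
      uint32_t taxon = 9606;
      std::memcpy(buf.data() + 8, &taxon, 4);
      std::memcpy(buf.data() + 12 + 8, &taxon, 4);
      for (const auto &kv : count_taxons_raw(buf.data(), 2, 8))
        std::cout << kv.first << '\t' << kv.second << '\n';  // prints: 9606  2
    }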
+//using std::map to have the keys sorted +std::map KrakenDB::count_taxons() { char *ptr = get_pair_ptr(); size_t pair_sz = pair_size(); - std::unordered_map taxon_counts; + std::map taxon_counts; + if (ptr == NULL) { + std::cerr << "Kraken database pointer is NULL [pair_sz: " << pair_sz << ", key_ct: "< +#include namespace kraken { class KrakenDBIndex { @@ -64,7 +65,7 @@ namespace kraken { // return a count of k-mers for all taxons - std::unordered_map count_taxons(); + std::map count_taxons(); // return "bin key" for kmer, based on index // If idx_nt not specified, use index's value diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index 28ca837..2da433e 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -20,6 +20,7 @@ #include "assert_helpers.h" #include "kraken_headers.hpp" #include "krakenutil.hpp" +#include using namespace std; @@ -49,30 +50,30 @@ namespace kraken { // Return lowest common ancestor of a and b // LCA(0,x) = LCA(x,0) = x // Default ancestor is 1 (root of tree) - uint32_t lca(const unordered_map &parent_map, - uint32_t a, uint32_t b) - { + uint32_t lca(const unordered_map &parent_map, uint32_t a, uint32_t b) { if (a == 0 || b == 0) return a ? a : b; // create a path from a to the root - set a_path; - while (a > 0) { + std::unordered_set a_path; + while (a > 0 && a != parent_map.at(a)) { + if (a == b) + return a; a_path.insert(a); - assert(parent_map.find(a) != parent_map.end()); a = parent_map.at(a); } // search for b in the path from a to the root - while (b > 0) { + while (b > 0 && b != parent_map.at(b)) { if (a_path.count(b) > 0) return b; - assert(parent_map.find(b) != parent_map.end()); b = parent_map.at(b); } return 1; } + + // Tree resolution: take all hit taxa (plus ancestors), then // return leaf of highest weighted leaf-to-root path. uint32_t resolve_tree(const unordered_map &hit_counts, @@ -120,126 +121,6 @@ namespace kraken { } - // Tree resolution: take all hit taxa (plus ancestors), then - // return leaf of highest weighted leaf-to-root path. - uint32_t resolve_uids( - const unordered_map &uid_hit_counts, - const unordered_map &parent_map, - const vector< vector > &UID_to_taxids_vec) { - unordered_map taxid_counts; - unordered_map frac_taxid_counts; - - if (uid_hit_counts.size() == 0) { - return(0); - } - - for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { - uint32_t uid = it->first; - double frac_count = ((double)it->second / (double)UID_to_taxids_vec[uid-1].size()); - for (auto taxid : UID_to_taxids_vec[uid-1]) { - taxid_counts[taxid] += it->second; - frac_taxid_counts[taxid] += frac_count; - } - } - vector max_taxids; - uint32_t max_count = 0; - double max_frac_count = 0; - for (auto it : taxid_counts) { - if (it.second == max_count) { - if (frac_taxid_counts[it.first] == max_frac_count) { - max_taxids.push_back(it.first); - } else if (frac_taxid_counts[it.first] > max_frac_count) { - max_frac_count = frac_taxid_counts[it.first]; - max_taxids = { it.first }; - } - } else if (it.second > max_count) { - max_taxids = { it.first }; - max_count = it.second; - max_frac_count = frac_taxid_counts[it.first]; - } - } - - uint32_t max_taxon = max_taxids[0]; - auto sit = max_taxids.begin(); - for (++sit; sit != max_taxids.end(); ++sit) { - max_taxon = lca(parent_map, max_taxon, *sit); - - } - - // return the taxid that appeared most often - return max_taxon; - } - - // Tree resolution: take all hit taxa (plus ancestors), then - // return leaf of highest weighted leaf-to-root path. 
- uint32_t resolve_uids2( - const unordered_map &uid_hit_counts, - const unordered_map &parent_map, - char* fptr) { - unordered_map taxid_counts; - unordered_map frac_taxid_counts; - - if (uid_hit_counts.size() == 0) { - return(0); - } - - size_t int_size = sizeof(int); - size_t block_size = sizeof(int)*2; - for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { - uint32_t uid = it->first; - if (uid == 0) { - continue; - } - uint32_t taxid; - // TODO: Just get a uint64_t and shift the bits, probably faster - vector taxids; - do { - taxid = *(uint32_t*)(fptr+(uid-1)*block_size); - uid = *(uint32_t*)(fptr+(uid-1)*block_size + int_size); - - taxid_counts[taxid] += it->second; - taxids.push_back(taxid); - } while (uid != 0); - - double frac_count = (double)it->second / (double)taxids.size(); - for (uint32_t taxid : taxids) { - frac_taxid_counts[taxid] += frac_count; - } - } - - if (taxid_counts.size() == 0) { - return(0); - } - vector max_taxids; - uint32_t max_count = 0; - double max_frac_count = 0; - for (auto it : taxid_counts) { - if (it.second == max_count) { - if (frac_taxid_counts[it.first] == max_frac_count) { - max_taxids.push_back(it.first); - } else if (frac_taxid_counts[it.first] > max_frac_count) { - max_frac_count = frac_taxid_counts[it.first]; - max_taxids = { it.first }; - } - } else if (it.second > max_count) { - max_taxids = { it.first }; - max_count = it.second; - max_frac_count = frac_taxid_counts[it.first]; - } - } - - uint32_t max_taxon = max_taxids[0]; - auto sit = max_taxids.begin(); - for (++sit; sit != max_taxids.end(); ++sit) { - max_taxon = lca(parent_map, max_taxon, *sit); - - } - - // return the taxid that appeared most often - return max_taxon; - } - - uint8_t KmerScanner::k = 0; @@ -277,14 +158,19 @@ namespace kraken { } uint64_t *KmerScanner::next_kmer() { + bool skip_pos = false; if (curr_pos >= pos2) return NULL; if (loaded_nt) loaded_nt--; while (loaded_nt < k) { - loaded_nt++; - kmer <<= 2; - ambig <<= 1; + if (skip_pos) { + skip_pos = false; + } else { + loaded_nt++; + kmer <<= 2; + ambig <<= 1; + } switch ((*str)[curr_pos++]) { case 'A': case 'a': break; @@ -297,6 +183,11 @@ namespace kraken { case 'T': case 't': kmer |= 3; break; + case '\n': case '\r': + --loaded_nt; + skip_pos = true; + continue; + break; default: ambig |= 1; break; diff --git a/src/krakenutil.hpp b/src/krakenutil.hpp index 854e26b..46e8eb8 100644 --- a/src/krakenutil.hpp +++ b/src/krakenutil.hpp @@ -27,23 +27,17 @@ namespace kraken { // Build a map of node to parent from an NCBI taxonomy nodes.dmp file std::unordered_map build_parent_map(std::string filename); - // Return the lowest common ancestor of a and b, according to parent_map - // NOTE: LCA(0,x) = LCA(x,0) = x + // Return lowest common ancestor of a and b + // LCA(0,x) = LCA(x,0) = x + // Default ancestor is 1 (root of tree) +uint32_t lca(const std::unordered_map &parent_map, uint32_t a, uint32_t b); + + // Resolve classification tree uint32_t resolve_tree(const std::unordered_map &hit_counts, const std::unordered_map &parent_map); - uint32_t resolve_uids( - const std::unordered_map &uid_hit_counts, - const std::unordered_map &parent_map, - const std::vector< std::vector > &UID_to_taxids_vec); - - uint32_t resolve_uids2( - const std::unordered_map &uid_hit_counts, - const std::unordered_map &parent_map, - char* fptr); - class KmerScanner { public: diff --git a/src/read_uid_mapping.cpp b/src/read_uid_mapping.cpp index 76b839a..0ac84db 100644 --- a/src/read_uid_mapping.cpp +++ b/src/read_uid_mapping.cpp @@ 
-7,6 +7,7 @@ using namespace std; using namespace kraken; +inline vector get_taxids_for_uid(uint32_t uid, char* fptr) { size_t int_size = sizeof(int); size_t block_size = sizeof(int)*2; @@ -24,7 +25,7 @@ vector get_taxids_for_uid(uint32_t uid, char* fptr) { return(taxids); } - +inline vector get_taxids_for_uid_from_map(uint32_t uid, char* fptr, unordered_map >& uid_map ) { auto it = uid_map.find(uid); if (it != uid_map.end()) { diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 1396a7f..504e2b6 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -25,6 +25,7 @@ #include "seqreader.hpp" #include "taxdb.h" #include "readcounts.hpp" +#include "uid_mapping.hpp" #include #include @@ -43,6 +44,7 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish); int Num_threads = 1; string DB_filename, Index_filename, Output_DB_filename, TaxDB_filename, + Kmer_count_filename, File_to_taxon_map_filename, ID_to_taxon_map_filename, Multi_fasta_filename; bool force_taxid = false; @@ -52,6 +54,7 @@ bool Allow_extra_kmers = false; bool verbose = false; bool Operate_in_RAM = false; bool One_FASTA_file = false; +bool Add_taxIds_for_Assembly = false; bool Add_taxIds_for_Sequences = false; bool Use_uids_instead_of_taxids = false; bool Output_UID_map_to_STDOUT = false; @@ -61,20 +64,20 @@ string UID_map_filename; ofstream UID_map_file; uint32_t current_uid = 0; -uint32_t max_uid = -1; unordered_map Parent_map; //unordered_multimap Children_map; //typedef std::_Rb_tree_iterator, unsigned int> > map_it; //typedef std::_Rb_tree_iterator, unsigned int> > map_it; -typedef const vector* map_it; -vector< map_it > UID_to_taxids_vec; -map< vector, uint32_t> Taxids_to_UID_map; +vector< const TaxidSet* > UID_to_taxids_vec; +map< TaxidSet, uint32_t> Taxids_to_UID_map; unordered_map ID_to_taxon_map; unordered_map SeqId_added; KrakenDB Database; TaxonomyDB taxdb; +const string prefix = "kraken:taxid|"; + int main(int argc, char **argv) { #ifdef _OPENMP omp_set_num_threads(1); @@ -141,6 +144,16 @@ int main(int argc, char **argv) { else process_files(); + if (!Kmer_count_filename.empty()) { + ofstream ofs(Kmer_count_filename.c_str()); + cerr << "Writing kmer counts to " << Kmer_count_filename << "..." << endl; + auto counts = Database.count_taxons(); + for (auto const & kv : counts) { + ofs << kv.first << '\t' << kv.second << '\n'; + } + ofs.close(); + } + if (Operate_in_RAM && !Pretend) { if (Output_DB_filename.size() > 0) { DB_filename = Output_DB_filename; @@ -155,7 +168,7 @@ int main(int argc, char **argv) { UID_map_file.close(); // Write new TaxDB file if new taxids were added - if (Add_taxIds_for_Sequences && !TaxDB_filename.empty() && !Pretend) { + if ((Add_taxIds_for_Sequences || Add_taxIds_for_Assembly) && !TaxDB_filename.empty() && !Pretend) { cerr << "Writing new TaxDB ..." << endl; ofstream ofs(TaxDB_filename.c_str()); taxdb.writeTaxonomyIndex(ofs); @@ -165,35 +178,78 @@ int main(int argc, char **argv) { return 0; } -void process_single_file() { - cerr << "Processing FASTA files" << endl; +inline +uint32_t get_taxid( + unordered_map& name_to_taxid_map, + unordered_map& Parent_map, + string name, uint32_t parent_taxid, const string & rank_name) { + + auto it = name_to_taxid_map.find(name); + if (it == name_to_taxid_map.end()) { + uint32_t new_taxid = ++New_taxid_start; + bool insert_res = taxdb.insert(new_taxid, parent_taxid, rank_name, name); + if (!insert_res) + cerr << "Taxonomy ID " << new_taxid << " already in Taxonomy DB? Shouldn't happen - run set_lcas without the -a option." 
<< endl; + // insert_res shows if insert failed, but we don't care + // cerr << "Adding assembly: " << name << " with taxid " << new_taxid << endl; + Parent_map[new_taxid] = parent_taxid; + name_to_taxid_map[name] = new_taxid; + return new_taxid; + } else { + return it->second; + } +} + +unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_filename, + TaxonomyDB& taxdb, unordered_map& Parent_map, + bool Add_taxIds_for_Assembly, bool Add_taxIds_for_Sequences) { + + unordered_map ID_to_taxon_map; ifstream map_file(ID_to_taxon_map_filename.c_str()); if (map_file.rdstate() & ifstream::failbit) { err(EX_NOINPUT, "can't open %s", ID_to_taxon_map_filename.c_str()); } string line, seq_id; - uint32_t parent_taxid, taxid; + uint32_t taxid; + + // Used when adding new taxids for assembly or sequence + unordered_map name_to_taxid_map; + while (map_file.good()) { getline(map_file, line); if (line.empty()) break; istringstream iss(line); - iss >> seq_id; - if (ID_to_taxon_map.find(seq_id) != ID_to_taxon_map.end()) - continue; + iss >> seq_id >> taxid; + + auto it = ID_to_taxon_map.find(seq_id); + if (it != ID_to_taxon_map.end()) { + // The sequence ID has been seen before, ignore + continue; + } + + if (Add_taxIds_for_Assembly && iss.good()) { + iss.get(); + string name; + getline(iss, name); + taxid = get_taxid(name_to_taxid_map, Parent_map, name, taxid, "assembly"); + } if (Add_taxIds_for_Sequences) { - iss >> parent_taxid; - taxid = ++New_taxid_start; - Parent_map[taxid] = parent_taxid; - auto itEntry = taxdb.taxIDsAndEntries.insert({taxid, TaxonomyEntry(taxid, parent_taxid, "sequence")}); - if (!itEntry.second) - cerr << "Taxonomy ID " << taxid << " already in Taxonomy DB? Shouldn't happen - run set_lcas without the -a option." << endl; - } else { - iss >> taxid; + taxid = get_taxid(name_to_taxid_map, Parent_map, seq_id, taxid, "sequence"); + } + if (Add_taxIds_for_Assembly || Add_taxIds_for_Sequences) { + cout << seq_id << '\t' << taxid << '\n'; } ID_to_taxon_map[seq_id] = taxid; } + return std::move(ID_to_taxon_map); +} + +void process_single_file() { + cerr << "Processing FASTA files" << endl; + + ID_to_taxon_map = read_seqid_to_taxid_map(ID_to_taxon_map_filename, taxdb, Parent_map, Add_taxIds_for_Assembly, Add_taxIds_for_Sequences); FastaReader reader(Multi_fasta_filename); DNASequence dna; @@ -213,23 +269,27 @@ void process_single_file() { // Get the taxid. If the header specifies kraken:taxid, use that uint32_t taxid; - string prefix = "kraken:taxid|"; - if (dna.id.substr(0,prefix.size()) == prefix) { + auto it = ID_to_taxon_map.find(dna.id); + if (it != ID_to_taxon_map.end()) { + taxid = it->second; + } else if (dna.id.size() >= prefix.size() && dna.id.substr(0,prefix.size()) == prefix) { taxid = std::stol(dna.id.substr(prefix.size())); if (taxid == 0) { - cerr << "Error: taxid is zero for the line '" << dna.id << "'?!" << endl; + cerr << "Error: taxonomy ID is zero for sequence '" << dna.id << "'?!" << endl; } const auto strBegin = dna.header_line.find_first_not_of("\t "); if (strBegin != std::string::npos) dna.header_line = dna.header_line.substr(strBegin); } else { - taxid = ID_to_taxon_map[dna.id]; + cerr << "Error! Didn't find taxonomy ID mapping for sequence " << dna.id << "!!" << endl; + ++seqs_skipped; + continue; } if (Add_taxIds_for_Sequences) { auto entryIt = taxdb.taxIDsAndEntries.find(taxid); if (entryIt == taxdb.taxIDsAndEntries.end()) { - cerr << "Error! Didn't find " << taxid << " in TaxonomyDB!!" << endl; + cerr << "Error! 
Didn't find taxid " << taxid << " in TaxonomyDB - can't update it!! ["<second.scientificName = dna.header_line; } @@ -271,6 +331,7 @@ void process_files() { istringstream iss(line); iss >> filename; iss >> taxid; + // TODO: Support a mapping file with only file names, not taxids process_file(filename, taxid); cerr << "\rProcessed " << ++seqs_processed << " sequences"; } @@ -291,6 +352,11 @@ void process_file(string filename, uint32_t taxid) { set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1); } +void process_sequence(DNASequence dna) { + // TODO: Refactor such that a list of files + taxid can be given. + // Or maybe asembly_summary file? +} + void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { KmerScanner scanner(seq, start, finish); uint64_t *kmer_ptr; @@ -311,62 +377,11 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { } continue; } - if (Use_uids_instead_of_taxids) { - uint32_t kmer_uid = *val_ptr; - bool new_taxid = kmer_uid == 0; - vector taxid_set; - if (new_taxid) { - taxid_set.push_back(taxid); - } else { - if (kmer_uid > UID_to_taxids_vec.size()) { - // This can happen when set_lcas is called on a database that is not all zeros - cerr << "kmer_uid ("<< kmer_uid <<") greater than UID vector size ("<< UID_to_taxids_vec.size()<<")!!" << endl; - exit(1); - } - taxid_set = *(UID_to_taxids_vec.at(kmer_uid-1)); - auto it = std::lower_bound( taxid_set.begin(), taxid_set.end(), taxid); // find proper position in descending order - if (it == taxid_set.end() || *it != taxid) { - // add the taxid to the set, in the right position - taxid_set.insert( it, taxid ); // insert before iterator it - new_taxid = true; - } - } - - if (new_taxid) { - if (max_uid <= current_uid) { - cerr << "Maxxed out on the UIDs!!" << endl; - exit(1); - } - - // get a new taxid for this set - #pragma omp critical(new_uid) - { - auto insert_res = Taxids_to_UID_map.insert( { std::move(taxid_set), current_uid + 1 } ); - if (insert_res.second) { - ++current_uid; - - // print result for map: - if (Output_UID_map_to_STDOUT) { - auto tid_it = insert_res.first->first.begin(); - cout << current_uid << '\t' << *tid_it++; - while (tid_it != insert_res.first->first.end()) { cout << ' ' << *tid_it++; } - cout << '\n'; - } - - // FORMAT: TAXID PARENT - // TODO: Consider using mmap here - UID_map_file.write((char*)&taxid, sizeof(taxid)); - UID_map_file.write((char*)&kmer_uid, sizeof(kmer_uid)); - - //UID_to_taxids_vec[current_uid] = taxid_set; - UID_to_taxids_vec.push_back( &(insert_res.first->first) ); - *val_ptr = current_uid; - } else { - *val_ptr = insert_res.first->second; - } - } - } + // TODO: Should I use pragma omp critical here? 
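For orientation: each record in the UID map file written here is a pair of uint32s, the newly added taxid followed by the previous UID whose set it extends ("FORMAT: TAXID PARENT"), so UID u's record sits at byte offset (u-1)*8 and its full taxid set is recovered by walking the previous-UID links until 0, as get_taxids_for_uid does. A small in-memory sketch of that chain walk (the struct and names are illustrative, not the shipped reader):

    #include <cstdint>
    #include <algorithm>
    #include <iostream>
    #include <vector>

    // One record per UID, 1-based: the taxid added and the UID it extends (0 = none).
    struct UidRecord { uint32_t taxid; uint32_t prev_uid; };

    std::vector<uint32_t> taxids_for_uid(const std::vector<UidRecord> &recs, uint32_t uid) {
      std::vector<uint32_t> taxids;
      while (uid != 0) {
        const UidRecord &r = recs.at(uid - 1);  // UIDs start at 1
        taxids.push_back(r.taxid);
        uid = r.prev_uid;                       // follow the chain toward UID 0
      }
      std::sort(taxids.begin(), taxids.end());
      return taxids;
    }

    int main() {
      // UID 1 = {561}, UID 2 = {561,562} (extends UID 1), UID 3 = {561,562,9606}.
      std::vector<UidRecord> recs{{561, 0}, {562, 1}, {9606, 2}};
      for (uint32_t t : taxids_for_uid(recs, 3)) std::cout << t << ' ';
      std::cout << '\n';  // prints: 561 562 9606
    }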
+ if (Use_uids_instead_of_taxids) { + #pragma omp critical(new_uid) + *val_ptr = uid_mapping(Taxids_to_UID_map, UID_to_taxids_vec, taxid, *val_ptr, current_uid, UID_map_file); } else if (!force_taxid) { *val_ptr = lca(Parent_map, taxid, *val_ptr); } else { @@ -383,7 +398,7 @@ void parse_command_line(int argc, char **argv) { if (argc > 1 && strcmp(argv[1], "-h") == 0) usage(0); - while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTvb:apI:o:S")) != -1) { + while ((opt = getopt(argc, argv, "f:d:i:t:n:m:F:xMTvb:aApI:o:Sc:")) != -1) { switch (opt) { case 'f' : File_to_taxon_map_filename = optarg; @@ -392,9 +407,6 @@ void parse_command_line(int argc, char **argv) { Use_uids_instead_of_taxids = true; UID_map_filename = optarg; break; - case 'S' : - Output_UID_map_to_STDOUT = true; - break; case 'd' : DB_filename = optarg; break; @@ -430,9 +442,15 @@ void parse_command_line(int argc, char **argv) { case 'a' : Add_taxIds_for_Sequences = true; break; + case 'A' : + Add_taxIds_for_Assembly = true; + break; case 'b' : TaxDB_filename = optarg; break; + case 'c' : + Kmer_count_filename = optarg; + break; case 'M' : Operate_in_RAM = true; break; @@ -475,10 +493,10 @@ void usage(int exit_code) { << " -f filename File to taxon map" << endl << " -F filename Multi-FASTA file with sequence data" << endl << " -m filename Sequence ID to taxon map" << endl - << " -a Add taxonomy IDs (starting with "< getParentMap() const; + TAXID getByScientificName(string name) const; std::unordered_map getScientificNameMap() const; std::string getLineage(TAXID taxonomyID) const; std::string getMetaPhlAnLineage(TAXID taxonomyID) const; TaxonomyEntry getEntry(TAXID taxID) const; + bool insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_); + size_t distance(TAXID taxID1, TAXID taxID2) const; bool isSubSpecies(TAXID taxonomyID) const; @@ -158,12 +161,6 @@ class TaxReport { std::vector _report_cols; }; - - // Return lowest common ancestor of a and b - // LCA(0,x) = LCA(x,0) = x - // Default ancestor is 1 (root of tree) -uint32_t lca(std::unordered_map &parent_map, uint32_t a, uint32_t b); - template inline V find_or_use_default(const std::unordered_map& my_map, const K& query, const V default_value); @@ -266,6 +263,15 @@ bool TaxonomyEntryPtr_comp::operator() ( const TaxonomyEntryreadCounts)+reads(a->readCountsOfChildren)) > (reads(b->readCounts)+reads(b->readCountsOfChildren))); } +template +TAXID TaxonomyDB::getByScientificName(string name) const { + for (const auto & tax : taxIDsAndEntries) { + if (tax.second.scientificName == name) { + return tax.first; + } + } + return 0; +} template std::unordered_map TaxonomyDB::getScientificNameMap() const { @@ -507,6 +513,26 @@ TAXID TaxonomyDB::getLowestCommonAncestor( return consensus; } +template +bool TaxonomyDB::insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, + std::string rank_, std::string scientificName_) { + + TaxonomyEntry newEntry(taxonomyID_, parentTaxonomyID_, rank_, scientificName_, 0, 0); + + auto parentIt = taxIDsAndEntries.find(parentTaxonomyID_); + if (parentIt == taxIDsAndEntries.end() || parentTaxonomyID_ == taxonomyID_) { + cerr << "ERROR while inserting taxonomy entry - taxonomy ID " << taxonomyID_ <<"; parent taxonomy ID " << parentTaxonomyID_ << "!" 
<< endl; + return false; + } + + newEntry.parent = &(parentIt->second); + auto insert_res = taxIDsAndEntries.insert({taxonomyID_, newEntry}); + parentIt->second.children.push_back(&insert_res.first->second); + + return insert_res.second; + +} + template TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { auto entry = taxIDsAndEntries.find(taxID); @@ -798,32 +824,6 @@ void TaxReport::printLine(TaxonomyEntry& tax } - // Return lowest common ancestor of a and b - // LCA(0,x) = LCA(x,0) = x - // Default ancestor is 1 (root of tree) -uint32_t lca(unordered_map &parent_map, uint32_t a, uint32_t b) - { - if (a == 0 || b == 0) - return a ? a : b; - - // create a path from a to the root - std::unordered_set a_path; - while (a > 0 && a != parent_map[a]) { - if (a == b) - return a; - a_path.insert(a); - a = parent_map[a]; - } - - // search for b in the path from a to the root - while (b > 0 && b != parent_map[b]) { - if (a_path.count(b) > 0) - return b; - b = parent_map[b]; - } - return 1; - } - template inline V find_or_use_default(const std::unordered_map& my_map, const K& query, const V default_value) { diff --git a/src/uid_mapping.cpp b/src/uid_mapping.cpp new file mode 100644 index 0000000..966a685 --- /dev/null +++ b/src/uid_mapping.cpp @@ -0,0 +1,196 @@ + +#include +#include "uid_mapping.hpp" +#include "krakenutil.hpp" +#include "assert_helpers.h" + +using namespace std; + +namespace kraken { + + static size_t INT_SIZE=sizeof(uint32_t); + static size_t UID_BLOCK_SIZE=2*INT_SIZE; + static uint32_t max_uid = -1; + + uint32_t uid_mapping( + map< TaxidSet, uint32_t>& Taxids_to_UID_map, + vector< const TaxidSet* >& UID_to_taxids_vec, + uint32_t taxid, + uint32_t kmer_uid, + uint32_t& current_uid, + ofstream& UID_map_file) { + + vector taxid_set; + if (kmer_uid == 0) { + taxid_set.push_back(taxid); + } else { + if (kmer_uid > UID_to_taxids_vec.size()) { + // This can happen when set_lcas is called more than once on a database (ie not all values start w/ 0) + cerr << "kmer_uid ("<< kmer_uid <<") greater than UID vector size ("<< UID_to_taxids_vec.size()<<")!!" << endl; + exit(1); + } + taxid_set = *(UID_to_taxids_vec.at(kmer_uid-1)); + auto it = std::lower_bound( taxid_set.begin(), taxid_set.end(), taxid); // find proper position in descending order + if (it == taxid_set.end() || *it != taxid) { + // add the taxid to the set, in the right position such that it remains sorted + taxid_set.insert( it, taxid ); // insert before iterator it + } else { + // the taxid is already part of the set for kmer_uid, return kmer_uid + return kmer_uid; + } + } + + // This taxid is not part of kmer_uids set, but is this new taxon_set already assigned to another UID? + // Try inserting .. + auto insert_res = Taxids_to_UID_map.insert( { std::move(taxid_set), current_uid + 1 } ); + if (!insert_res.second) { + // Insert unsuccessful, taxid set already has an UID + return insert_res.first->second; + } + + // Get a new UID + if (max_uid <= ++current_uid) { + cerr << "Maxxed out on UIDs!!" << endl; + exit(1); + } + + UID_to_taxids_vec.push_back( &(insert_res.first->first) ); + assert(UID_to_taxids_vec.size() == current_uid); + + // Write to mapping file + // format: TAXID PARENT + // read it with read_uid_mapping + UID_map_file.write((char*)&taxid, sizeof(taxid)); + UID_map_file.write((char*)&kmer_uid, sizeof(kmer_uid)); + + return current_uid; + } // end of uid_mapping + + + // Tree resolution: take all hit taxa (plus ancestors), then + // return leaf of highest weighted leaf-to-root path. 
+ uint32_t resolve_uids( + const unordered_map &uid_hit_counts, + const unordered_map &parent_map, + const vector< vector > &UID_to_taxids_vec) { + unordered_map taxid_counts; + unordered_map frac_taxid_counts; + + if (uid_hit_counts.size() == 0) { + return(0); + } + + for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { + uint32_t uid = it->first; + double frac_count = ((double)it->second / (double)UID_to_taxids_vec[uid-1].size()); + for (auto taxid : UID_to_taxids_vec[uid-1]) { + taxid_counts[taxid] += it->second; + frac_taxid_counts[taxid] += frac_count; + } + } + vector max_taxids; + uint32_t max_count = 0; + double max_frac_count = 0; + for (auto it : taxid_counts) { + if (it.second == max_count) { + if (frac_taxid_counts[it.first] == max_frac_count) { + max_taxids.push_back(it.first); + } else if (frac_taxid_counts[it.first] > max_frac_count) { + max_frac_count = frac_taxid_counts[it.first]; + max_taxids = { it.first }; + } + } else if (it.second > max_count) { + max_taxids = { it.first }; + max_count = it.second; + max_frac_count = frac_taxid_counts[it.first]; + } + } + + uint32_t max_taxon = max_taxids[0]; + auto sit = max_taxids.begin(); + for (++sit; sit != max_taxids.end(); ++sit) { + max_taxon = lca(parent_map, max_taxon, *sit); + + } + + // return the taxid that appeared most often + return max_taxon; + } + + // Tree resolution: take all hit taxa (plus ancestors), then + // return leaf of highest weighted leaf-to-root path. + uint32_t resolve_uids2( + const unordered_map &uid_hit_counts, + const unordered_map &parent_map, + const uint32_t* fptr, const size_t fsize) { + + unordered_map taxid_counts; + unordered_map frac_taxid_counts; + + if (uid_hit_counts.size() == 0) { + return(0); + } + + for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { + uint32_t next_uid = it->first; + if (next_uid == 0) { + continue; + } + uint32_t taxid; + // TODO: Just get a uint64_t and shift the bits, probably faster + vector taxids; + do { + // Check if the accessed memory is out of range + // -- move this to a DEBUG-only assert + // UID-1 is used because UIDs start at 1 + uint32_t offset = (next_uid-1)*UID_BLOCK_SIZE; + if (offset >= fsize) { + cerr << "It seems you are trying to access a block after the file end: \n" << + " fptr: " << fptr << "; uid: " << next_uid << "; " << " addr: " << (offset + INT_SIZE) << endl; + exit(1); + } + taxid = *(fptr + offset); + next_uid = *(fptr+ offset + INT_SIZE); + taxid_counts[taxid] += it->second; + taxids.push_back(taxid); + } while (next_uid != 0); + + double frac_count = (double)it->second / (double)taxids.size(); + for (uint32_t taxid : taxids) { + frac_taxid_counts[taxid] += frac_count; + } + } + + if (taxid_counts.size() == 0) { + return(0); + } + vector max_taxids; + uint32_t max_count = 0; + double max_frac_count = 0; + for (auto it : taxid_counts) { + if (it.second == max_count) { + if (frac_taxid_counts[it.first] == max_frac_count) { + max_taxids.push_back(it.first); + } else if (frac_taxid_counts[it.first] > max_frac_count) { + max_frac_count = frac_taxid_counts[it.first]; + max_taxids = { it.first }; + } + } else if (it.second > max_count) { + max_taxids = { it.first }; + max_count = it.second; + max_frac_count = frac_taxid_counts[it.first]; + } + } + + uint32_t max_taxon = max_taxids[0]; + auto sit = max_taxids.begin(); + for (++sit; sit != max_taxids.end(); ++sit) { + max_taxon = lca(parent_map, max_taxon, *sit); + + } + + // return the taxid that appeared most often + return max_taxon; + } + +} diff 
--git a/src/uid_mapping.hpp b/src/uid_mapping.hpp new file mode 100644 index 0000000..7c7d0fa --- /dev/null +++ b/src/uid_mapping.hpp @@ -0,0 +1,45 @@ + +#ifndef UID_MAPPING_H +#define UID_MAPPING_H + +#include <map> +#include <vector> +#include <unordered_map> +#include <fstream> +using namespace std; + + +// Takes the current UID kmer_uid, and checks whether +// - taxid is in taxon set T specified in UID_to_taxids_vec[kmer_uid]? +// - yes: return kmer_uid +// - no: is there a set (T,taxid) in Taxids_to_UID_map? +// - yes: return the uid of that set +// - no: +// - increment current_uid by one and set this as the set uid +// - add the set to Taxids_to_UID_map and UID_to_taxids_vec +// - write the mapping to UID_map_file +// + +using TaxidSet = vector<uint32_t>; + +namespace kraken { +uint32_t uid_mapping( + map< TaxidSet, uint32_t>& Taxids_to_UID_map, + vector< const TaxidSet* >& UID_to_taxids_vec, + uint32_t taxid, + uint32_t kmer_uid, + uint32_t& current_uid, + ofstream& UID_map_file); + + +uint32_t resolve_uids( + const unordered_map<uint32_t, uint32_t> &uid_hit_counts, + const unordered_map<uint32_t, uint32_t> &parent_map, + const vector< vector<uint32_t> > &UID_to_taxids_vec); + +uint32_t resolve_uids2( + const unordered_map<uint32_t, uint32_t> &uid_hit_counts, + const unordered_map<uint32_t, uint32_t> &parent_map, + const uint32_t* fptr, const size_t fsize); +} +#endif From bcca5a8569a378f4f16b92de64f6b67b20693ed1 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 10:54:19 -0400 Subject: [PATCH 041/105] Updated build script, and add some info when loading database --- scripts/krakenu-build_db.sh | 10 ++++++---- src/krakendb.cpp | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index aebff74..89dc7af 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -187,9 +187,10 @@ then else echo "Creating seqID to taxID map (step 4 of 6).." start_time1=$(date "+%s.%N") - cat library-files.txt | tr '\n' '\0' | xargs -0 grep '^>' | sed 's/.//' | sed 's/ .*//' | sort > library-headers.txt - join -t $'\t' nucl_gb.accession2taxid.sorted library-headers.txt > seqid2taxid.map.tmp - mv seqid2taxid.map.tmp seqid2taxid.map + #cat library-files.txt | tr '\n' '\0' | xargs -0 grep '^>' | sed 's/.//' | sed 's/ .*//' | sort > library-headers.txt + #join -t $'\t' nucl_gb.accession2taxid.sorted library-headers.txt > seqid2taxid.map.tmp + #mv seqid2taxid.map.tmp seqid2taxid.map + find library -name '*.map' -exec cat {} \; > seqid2taxid.map line_ct=$(wc -l seqid2taxid.map | awk '{print $1}') echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" @@ -238,7 +239,8 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then -F /dev/fd/0 > seqid2taxid-plus.map ## Make a classification report - krakenu --db . --report-file $(basename `pwd`).report --threads 10 --fasta-input library/archaea.fna > $(basename `pwd`).kraken + cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ + krakenu --db . --report-file $(basename `pwd`).report --threads 10 --fasta-input /dev/stdin > $(basename `pwd`).kraken set +x if [ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ] || [ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]; then mv seqid2taxid.map seqid2taxid.map.orig diff --git a/src/krakendb.cpp b/src/krakendb.cpp index f89f869..de33901 100644 --- a/src/krakendb.cpp +++ b/src/krakendb.cpp @@ -67,6 +67,7 @@ KrakenDB::KrakenDB(char *ptr) { errx(EX_DATAERR, "can only handle 4 byte DB values"); k = key_bits / 2; key_len = key_bits / 8 + !!
(key_bits % 8); + std::cerr << "Loaded database with " << key_ct << " keys with k of " << (size_t)k << " [val_len " << val_len << ", key_len " << key_len << "]." << std::endl; } //using std::map to have the keys sorted From a13e9fc28798d3e4ce8e2d09488a1870eaa7006d Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 18:03:44 -0400 Subject: [PATCH 042/105] Added jellyfish submodule --- .gitmodules | 3 +++ Jellyfish | 1 + 2 files changed, 4 insertions(+) create mode 100644 .gitmodules create mode 160000 Jellyfish diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..4bab269 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "Jellyfish"] + path = Jellyfish + url = https://github.com/gmarcais/Jellyfish diff --git a/Jellyfish b/Jellyfish new file mode 160000 index 0000000..fa9b676 --- /dev/null +++ b/Jellyfish @@ -0,0 +1 @@ +Subproject commit fa9b67610f604c0ca14a51dd68c5dd408c251317 From 4d3694058cc54c6a4c09354a6c13e9c37adfe4b0 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 18:05:49 -0400 Subject: [PATCH 043/105] specify branch for jellyfish --- .gitmodules | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitmodules b/.gitmodules index 4bab269..899f8c8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ [submodule "Jellyfish"] path = Jellyfish url = https://github.com/gmarcais/Jellyfish + branch = series-1.1 From e8f687330b3e345fd236ed1017279e5ec7b3dbaf Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 19:00:21 -0400 Subject: [PATCH 044/105] Install jellyfish from code archive --- .gitignore | 1 + .gitmodules | 4 ---- Jellyfish | 1 - install_kraken.sh | 31 +++++++++++++++++++++++--- scripts/krakenu-check_for_jellyfish.sh | 15 ++++++++----- 5 files changed, 38 insertions(+), 14 deletions(-) delete mode 100644 .gitmodules delete mode 160000 Jellyfish diff --git a/.gitignore b/.gitignore index d6ff918..4b1f087 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /install/ /Debug/ +/tests/dbs diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 899f8c8..0000000 --- a/.gitmodules +++ /dev/null @@ -1,4 +0,0 @@ -[submodule "Jellyfish"] - path = Jellyfish - url = https://github.com/gmarcais/Jellyfish - branch = series-1.1 diff --git a/Jellyfish b/Jellyfish deleted file mode 160000 index fa9b676..0000000 --- a/Jellyfish +++ /dev/null @@ -1 +0,0 @@ -Subproject commit fa9b67610f604c0ca14a51dd68c5dd408c251317 diff --git a/install_kraken.sh b/install_kraken.sh index 000c9b7..12c50f1 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -19,11 +19,22 @@ set -e +DIR=$(dirname $0) VERSION=`cat $(dirname $0)/VERSION` +if [ "$1" == "--install-jellyfish" ]; then + INSTALL_JELLYFISH=1; + shift; +fi + if [ -z "$1" ] || [ -n "$2" ] then - echo "Usage: $(basename $0) KRAKEN_DIR" + echo "Usage: $(basename $0) [--install-jellyfish] KRAKEN_DIR + +If --install-jellyfish is specified, the source code for version 1.1 +is downloaded from http://www.cbcb.umd.edu/software/jellyfish and installed +in KRAKEN_DIR. Note that this may overwrite other jellyfish installations in +the same path." exit 64 fi @@ -34,14 +45,28 @@ then exit 1 fi + # Perl cmd used to canonicalize dirname - "readlink -f" doesn't work # on OS X.
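# For example (illustrative): a relative argument like "../kraken-install" is resolved to an absolute path such as "/home/user/kraken-install".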
export KRAKEN_DIR=$(perl -MCwd=abs_path -le 'print abs_path(shift)' "$1") +if [ "$INSTALL_JELLYFISH" == "1" ]; then + WD=`pwd` + cd /tmp + wget http://www.cbcb.umd.edu/software/jellyfish/jellyfish-1.1.11.tar.gz + tar xvvf jellyfish-1.1.11.tar.gz + cd jellyfish-1.1.11 + ./configure + make + cp bin/jellyfish $KRAKEN_DIR + #rm -r jellyfish-1.1.11.tar.gz jellyfish-1.1.11 + cd $WD +fi + mkdir -p "$KRAKEN_DIR" #make -C src clean -make -C src install -for file in scripts/* +make -C $DIR/src install +for file in $DIR/scripts/* do perl -pl -e 'BEGIN { while (@ARGV) { $_ = shift; ($k,$v) = split /=/, $_, 2; $H{$k} = $v } }'\ -e 's/#####=(\w+)=#####/$H{$1}/g' \ diff --git a/scripts/krakenu-check_for_jellyfish.sh b/scripts/krakenu-check_for_jellyfish.sh index 9143b62..311e307 100755 --- a/scripts/krakenu-check_for_jellyfish.sh +++ b/scripts/krakenu-check_for_jellyfish.sh @@ -25,12 +25,15 @@ set -u # Protect against uninitialized vars. set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands -JELLYFISH_BIN="jellyfish" -if hash jellyfish1 2>/dev/null; then - JELLYFISH_BIN="jellyfish1" -elif hash jellyfish 2>/dev/null; then - JELLYFISH_BIN="jellyfish" -else +JELLYFISH_BIN="" +for JF in $(dirname $0)/jellyfish jellyfish1 jellyfish; do + if hash $JF 2>/dev/null; then + JELLYFISH_BIN=$JF; + break + fi +done + +if [ "$JELLYFISH_BIN" == "" ]; then echo "Did not find jellyfish!" 1>&2 exit 1 fi From c60e100ee2a291194c67a72b6b5273c02942b73f Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 19:13:37 -0400 Subject: [PATCH 045/105] Update .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 4b1f087..27c6246 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ /install/ /Debug/ /tests/dbs +/tests/data +/tests/install From a2f75cbe472fbade3e4c9ade9cac89205d69aaa1 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 19:21:38 -0400 Subject: [PATCH 046/105] Use /usr/bin/env --- scripts/krakenu | 2 +- scripts/krakenu-build | 2 +- scripts/krakenu-cp_into_tempfile.pl | 2 +- scripts/krakenu-filter | 2 +- scripts/krakenu-mpa-report | 2 +- scripts/krakenu-read_merger.pl | 2 +- scripts/krakenu-report | 2 +- scripts/krakenu-translate | 2 +- scripts/krakenu-verify_gi_numbers.pl | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/krakenu b/scripts/krakenu index e2d8412..243bcda 100755 --- a/scripts/krakenu +++ b/scripts/krakenu @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # diff --git a/scripts/krakenu-build b/scripts/krakenu-build index 2303f76..1461663 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenu-build @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # diff --git a/scripts/krakenu-cp_into_tempfile.pl b/scripts/krakenu-cp_into_tempfile.pl index 4e24ff2..c502d2e 100755 --- a/scripts/krakenu-cp_into_tempfile.pl +++ b/scripts/krakenu-cp_into_tempfile.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # diff --git a/scripts/krakenu-filter b/scripts/krakenu-filter index 04dcb7c..5ab01df 100755 --- a/scripts/krakenu-filter +++ b/scripts/krakenu-filter @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013, Derrick Wood # diff --git a/scripts/krakenu-mpa-report b/scripts/krakenu-mpa-report index 7813569..526a167 100755 --- a/scripts/krakenu-mpa-report +++ b/scripts/krakenu-mpa-report @@ -1,4 +1,4 @@ -#!/usr/bin/perl 
+#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # diff --git a/scripts/krakenu-read_merger.pl b/scripts/krakenu-read_merger.pl index 6e97099..adbecf9 100755 --- a/scripts/krakenu-read_merger.pl +++ b/scripts/krakenu-read_merger.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # diff --git a/scripts/krakenu-report b/scripts/krakenu-report index 99cab1b..e9cdaf5 100755 --- a/scripts/krakenu-report +++ b/scripts/krakenu-report @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # diff --git a/scripts/krakenu-translate b/scripts/krakenu-translate index 89a067a..46c9102 100755 --- a/scripts/krakenu-translate +++ b/scripts/krakenu-translate @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # diff --git a/scripts/krakenu-verify_gi_numbers.pl b/scripts/krakenu-verify_gi_numbers.pl index ec616f5..0bb5cdf 100755 --- a/scripts/krakenu-verify_gi_numbers.pl +++ b/scripts/krakenu-verify_gi_numbers.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Copyright 2013-2015, Derrick Wood # From 8a434fc21fe9083dfd021a9f76d118680ea81cef Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 19:22:36 -0400 Subject: [PATCH 047/105] Much faster krakenu-download using forks and LWP --- scripts/krakenu-download | 164 ++++++++++++++++++++++++++------------- 1 file changed, 109 insertions(+), 55 deletions(-) diff --git a/scripts/krakenu-download b/scripts/krakenu-download index f3aa4bd..7cf6fd3 100755 --- a/scripts/krakenu-download +++ b/scripts/krakenu-download @@ -1,4 +1,5 @@ -#!/bin/env perl +#!/usr/bin/env perl +#vim: et:ts=2:sw=2 # krakenu-download.pl - based on centrifuge-download # (c) Florian Breitwieser, 2017 @@ -13,7 +14,8 @@ use IO::Uncompress::Gunzip qw/gunzip $GunzipError/; use autodie; use Term::ANSIColor; use Getopt::Long; -use Parallel::ForkManager; +use LWP::UserAgent; + sub download_taxonomy(@); sub download_contaminats(@); @@ -46,8 +48,9 @@ my $OVERWRITE_FILES=0; my $INCLUDE_VIRAL_NEIGHBORS=0; my $DOMAINS; my $DL_MOD_RSYNC; +my $n_children = 0; +my @pids; -my %ac_to_taxid; my $downloaded_viral_refseq=0; my $FNA_FILES="genomic"; @@ -113,6 +116,8 @@ if (defined $BASE_DIR && defined $DB_DIR) { exit 1; } +my $ua = LWP::UserAgent->new( ssl_opts => { verify_hostname => 0 } ); + # Use current directory as base directory $BASE_DIR = "." unless defined $DB_DIR || defined $BASE_DIR; @@ -127,15 +132,6 @@ sub get_dir { return $dir1; } -my $pm = new Parallel::ForkManager($N_PROC); -$pm->run_on_finish(sub { - my ($pid, $exit_code, $indent, $exit_signal, $core_dump, $data) = @_; - if (defined $data) { - @ac_to_taxid{keys %$data} = values %$data; - } -} -); - my %select_taxonomy_ids; if (defined $TAXID) { %select_taxonomy_ids = map { $_ => 1 } split(/,/, $TAXID); @@ -182,8 +178,9 @@ if ($INCLUDE_VIRAL_NEIGHBORS) { if (!$downloaded_viral_refseq) { print STDERR "--include-viral-neighbors only works when RefSeq viral is downloaded in the same session!"; } else { - my $lib_dir = $add_dir? "$BASE_DIR/library/viral-neighbors" : "$BASE_DIR/viral-neighbors"; - download_viral_neighbors($lib_dir); + my $nbr_lib_dir = $add_dir? "$BASE_DIR/library/viral-neighbors" : "$BASE_DIR/viral-neighbors"; + my $viral_lib_dir = $add_dir? 
"$BASE_DIR/library/viral" : "$BASE_DIR/viral"; + download_viral_neighbors($viral_lib_dir, $nbr_lib_dir); } } @@ -193,35 +190,103 @@ if ($INCLUDE_VIRAL_NEIGHBORS) { ## Functions sub download(@) { - my ($url, $file) = @_; - if (-f $file && !$OVERWRITE_FILES) { + my ($url, $file, $gunzipped_filename) = @_; + if (-s $file && !$OVERWRITE_FILES) { print STDERR "Not fetching $url - file $file exists.\n" if $VERBOSE; return 1; } - $url =~ s/https/http/; + start_fork() and return; + if ($url =~ /^http/) { + print STDERR "Fetching $url to $file ..." if $VERBOSE; + if (!-d dirname($file)) { + make_path(dirname($file)); + } + my $response = $ua->get($url, ':content_file' => $file); + if (!$response->is_success) { + print STDERR "\nFAIL: Error downloading $url!\n"; + print STDERR $response->status_line."\n"; + exit; + } else { + print STDERR "SUCCESS\n" if $VERBOSE; + } + } else { + if ( $DL_MOD_RSYNC && $url =~ /^ftp/ ) { + $url =~ s/^ftp/rsync/; + } + print STDERR "Fetching $url to $file ..." if $VERBOSE; + + my $ff = File::Fetch->new(uri=>"$url"); + my $where = $ff->fetch(to=> dirname($file)) or die $ff->error; + move($where, $file); - if ( $DL_MOD_RSYNC && $url =~ /^ftp/ ) { - $url =~ s/^ftp/rsync/; + if (defined $gunzipped_filename) { + print STDERR " GUNZIPPING"; + gunzip $file => $gunzipped_filename or die "gunzip failed: $GunzipError"; + unlink $file; + $file = $gunzipped_filename; + } + print STDERR " SUCCESS\n" if $VERBOSE; } - - print STDERR "Fetching $url to $file ...\n" if $VERBOSE; - my $ff = File::Fetch->new(uri=>"$url"); - my $where = $ff->fetch(to=> dirname($file)) or die $ff->error; + exit; #my $where = $ff->fetch(to=> dirname($file)) or die "\n$ff->error for $url!"; - move($where, $file); - return -f $file; + return -s $file; +} + +sub start_fork() { + my $pid; + return if $N_PROC <= 1; + if ($n_children == $N_PROC) { + $pid = wait(); + --$n_children; + } + if (defined($pid = fork())) { + if ($pid) { + ++$n_children; + #print STDERR "Parent: forked child $pid\n"; + push @pids, $pid; + } + } else { + print STDERR "ERROR: Failed to fork\n"; + } + return $pid; +} + +sub wait_children() { + foreach my $pid (@pids) { + waitpid $pid, 0; + } + @pids = (); + $n_children = 0; +} + +sub end_fork() { + exit() unless $N_PROC == 1; } sub download_viral_neighbors(@) { - my ($dir) = @_; - print STDERR "Downloading viral neighbors into $dir ...\n"; + my ($viral_dir, $nbr_dir) = @_; + print STDERR "Reading map files from $viral_dir ... \n"; + my %ac_to_taxid; + foreach my $f (glob("$viral_dir/*.map")) { + open (my $F, "<", $f); + while (<$F>) { + chomp; + my ($ac, $taxid, $name) = split(/\t/); + $ac =~ s/\.[0-9]*$//; + $ac_to_taxid{$ac} = [$name, $taxid]; + } + close ($F); + } + + print STDERR "Downloading viral neighbors into $nbr_dir ...\n"; my $url = "https://www.ncbi.nlm.nih.gov/genomes/GenomesGroup.cgi?taxid=10239&cmd=download2"; - my $nbr_file = "$dir/viral_neighbors-taxid10239.nbr"; + my $nbr_file = "$nbr_dir/viral_neighbors-taxid10239.nbr"; download($url, $nbr_file); open(my $F, "<", $nbr_file); my @file = <$F>; close($F); + my $i = 0; my $n_genomes = scalar @file; @@ -229,46 +294,44 @@ sub download_viral_neighbors(@) { next if /^#/; ++$i; print STDERR "\r Downloading viral neighbor sequence $i/$n_genomes ..." 
unless $VERBOSE; - my $pid = $pm->start and next; +# my $pid = $pm->start and next; + my ($rep_acs, $nbr_ac, undef, undef, $nname, $sname) = split /\t/; my ($name, $taxid); foreach my $rep_ac (split (/,/, $rep_acs)) { if (defined $ac_to_taxid{$rep_ac}) { ($name, $taxid) = @{$ac_to_taxid{$rep_ac}}; last; - } + } } if (!defined $taxid) { - print STDERR "No mapping for viral neighbor $nbr_ac [rep: $rep_acs, $nname]!\n"; - $pm->finish(0); + print STDERR "\nNo mapping for viral neighbor $nbr_ac [rep: $rep_acs, $nname]!\n"; next; } (my $name1 = $name) =~ s/[^a-zA-Z0-9_]/_/g; $name1 =~ s/__/_/g; - my $file = "$dir/$name1-tax$taxid/$nbr_ac.fna"; + my $file = "$nbr_dir/$name1-tax$taxid/$nbr_ac.fna"; my $url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&retmode=text&id=$nbr_ac"; if (download($url,$file)) { print_header_lines($file, $taxid, "$nname $sname neighbor $nbr_ac"); } - ## TODO: dust viral neighbors - $pm->finish(0); } print STDERR "\n"; - $pm->wait_all_children(); + wait_children; + +# $pm->wait_all_children(); } sub print_header_lines(@) { - my ($file, $taxid, $name, $map_ref) = @_; - #return if -f "$file.map"; + my ($file, $taxid, $name) = @_; + return if -s "$file.map"; + print STDERR "Making map file for $file\n" if ($VERBOSE); open (my $F, ">", "$file.map"); open (my $G, "<", $file); while (<$G>) { next unless /^>([^ ]*)/; my $ac = $1; print $F "$ac\t$taxid\t$name\n"; - $ac =~ s/\.[0-9]*$//; - $map_ref->{$ac} = [$name, $taxid] if defined $map_ref; - #$ac_to_taxid{$ac} = [$name, $taxid] if $downloaded_viral_refseq && $INCLUDE_VIRAL_NEIGHBORS; } close($G); close($F); @@ -372,13 +435,13 @@ sub download_domain(@) { foreach my $g (@genomes_to_dl) { my ($ftp_path, $taxid, $organism_name, $infraspecific_name, $assembly_accession) = @$g; ++$i; + print STDERR "\r Downloading $domain genomes: $i/$n_genomes ..." unless $VERBOSE; if (defined $infraspecific_name) { (my $i1 = $infraspecific_name) =~ s/strain=//; $organism_name .= " $infraspecific_name" unless $organism_name =~ /$i1/ || $i1 eq ""; } - print STDERR "\r Downloading $domain genomes: $i/$n_genomes ..." unless $VERBOSE; my $bname = basename($ftp_path); ( my $organism_name1 = $organism_name ) =~ s/[^a-zA-Z0-9_]/_/g; @@ -386,20 +449,16 @@ sub download_domain(@) { $organism_name1 =~ s/__/_/g; $organism_name1 =~ s/_$//; my $bname1 = "${organism_name1}-tax${taxid}-${bname}"; - my $pid = $pm->start and next; - my %local_ac_to_taxid; foreach my $ext (split(/,/, $FNA_FILES)) { my $full_ftp_path = "$ftp_path/${bname}_${ext}.fna.gz"; my $bfname = $bname1."_".$ext; my $fname = $bfname.".fna"; - if (!$OVERWRITE_FILES && -f "$domain_dir/$fname") { + if (!$OVERWRITE_FILES && -s "$domain_dir/$fname") { print STDERR "$domain_dir/$fname exists - not downloading.. 
\n" if $VERBOSE; } else { - download($full_ftp_path, "$domain_dir/$fname.gz"); - gunzip "$domain_dir/$fname.gz" => "$domain_dir/$fname" or die "gunzip failed: $GunzipError"; - unlink "$domain_dir/$fname.gz"; + download($full_ftp_path, "$domain_dir/$fname.gz", "$domain_dir/$fname"); } if ($CHANGE_HEADER) { @@ -411,11 +470,7 @@ sub download_domain(@) { ## Output sequenceID to taxonomy ID map to STDOUT - if ($domain eq "viral" && $INCLUDE_VIRAL_NEIGHBORS) { - print_header_lines("$domain_dir/$fname", $taxid, "$organism_name $assembly_accession", \%local_ac_to_taxid); - } else { - print_header_lines("$domain_dir/$fname", $taxid, "$organism_name $assembly_accession"); - } + print_header_lines("$domain_dir/$fname", $taxid, "$organism_name $assembly_accession"); if ($DO_DUST) { ## TODO: Consider hard-masking only low-complexity stretches with 10 or more bps @@ -423,9 +478,8 @@ sub download_domain(@) { unlink("$domain_dir/$fname"); } } - $pm->finish(0, \%local_ac_to_taxid); } - $pm->wait_all_children; +# $pm->wait_all_children; print STDERR "\n"; } From 7185d736f17a4a33667c0936e6290952098bfeb5 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 25 Sep 2017 19:24:02 -0400 Subject: [PATCH 048/105] Add test files --- .gitignore | 2 ++ scripts/krakenu-standard_installation.sh | 2 +- tests/init.sh | 7 +++++++ tests/install_viral_databases.sh | 15 +++++++++++++++ 4 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 tests/init.sh create mode 100755 tests/install_viral_databases.sh diff --git a/.gitignore b/.gitignore index 27c6246..10700cd 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ /tests/dbs /tests/data /tests/install + +\.idea/ diff --git a/scripts/krakenu-standard_installation.sh b/scripts/krakenu-standard_installation.sh index 815d482..e09de80 100755 --- a/scripts/krakenu-standard_installation.sh +++ b/scripts/krakenu-standard_installation.sh @@ -30,7 +30,7 @@ then WOD_FLAG="--work-on-disk" fi -check_for_jellyfish.sh +krakenu-check_for_jellyfish.sh krakenu-download -o $KRAKEN_DB_NAME/taxonomy --download-taxonomy krakenu-download -o $KRAKEN_DB_NAME/library -d archaea,bacteria refseq > $KRAKEN_DB_NAME/seqid2taxid.map krakenu-download -o $KRAKEN_DB_NAME/library -d viral -a Any refseq >> $KRAKEN_DB_NAME/seqid2taxid.map diff --git a/tests/init.sh b/tests/init.sh new file mode 100644 index 0000000..c6cd8f3 --- /dev/null +++ b/tests/init.sh @@ -0,0 +1,7 @@ + +## Install KrakenU locally into install/ +../install_kraken.sh `pwd`/install + +## Download taxonomy and genomic data into data/ +install/krakenu-download --db data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any + diff --git a/tests/install_viral_databases.sh b/tests/install_viral_databases.sh new file mode 100755 index 0000000..b7e6c3d --- /dev/null +++ b/tests/install_viral_databases.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -xeu + +mkdir -p dbs/refseq-viral/library +mkdir -p dbs/refseq-viral-plus/library + +[[ -L dbs/refseq-viral/taxonomy ]] || ln -s data/taxonomy dbs/refseq-viral +[[ -L dbs/refseq-viral/library/viral ]] || ln -s data/library/viral/ dbs/refseq-viral/library +[[ -L dbs/refseq-viral-plus/library/viral ]] || ln -s data/library/viral/ dbs/refseq-viral-plus/library +[[ -L dbs/refseq-viral-plus/library/viral-neighbors ]] || ln -s data/library/viral-neighbors/ dbs/refseq-viral-plus/library + +export PATH="install:$PATH" +krakenu-build --db refseq-viral --build + From c7ea4b89cfe54755a7f74f126dc1e53e8493bc03 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: 
Wed, 27 Sep 2017 23:44:08 -0400 Subject: [PATCH 049/105] Added files for automated tests --- tests/build-dbs.sh | 13 +++++++++++++ tests/classify-reads.sh | 10 ++++++++++ tests/init.sh | 13 +++++++++++-- tests/install_viral_databases.sh | 15 --------------- tests/simulate-reads.sh | 9 +++++++++ 5 files changed, 43 insertions(+), 17 deletions(-) create mode 100755 tests/build-dbs.sh create mode 100755 tests/classify-reads.sh mode change 100644 => 100755 tests/init.sh delete mode 100755 tests/install_viral_databases.sh create mode 100755 tests/simulate-reads.sh diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh new file mode 100755 index 0000000..3e489e3 --- /dev/null +++ b/tests/build-dbs.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -xeu + +[[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 + +mkdir -p $DIR/dbs/refseq-viral-plus/library +[[ -L $DIR/dbs/refseq-viral-plus/library/viral ]] || ln -s $DIR/data/library/viral/ $DIR/dbs/refseq-viral-plus/library/ +[[ -L $DIR/dbs/refseq-viral-plus/library/viral-neighbors ]] || ln -s $DIR/data/library/viral-neighbors/ $DIR/dbs/refseq-viral-plus/library/ + +export PATH="$DIR/install:$PATH" +krakenu-build --db $DIR/dbs/refseq-viral --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy + diff --git a/tests/classify-reads.sh b/tests/classify-reads.sh new file mode 100755 index 0000000..807d287 --- /dev/null +++ b/tests/classify-reads.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -xeu + +[[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 +SDIR=$DIR/simulated_reads +CDIR=$DIR/classification-results +mkdir -p $CDIR + +NAM=viral-neighbors-10m +time $DIR/install/krakenu --threads 4 --db $DIR/dbs/refseq-viral --fastq ~/kraken-hll-test/simulated_reads/$NAM.fq --report-file $CDIR/$NAM.krakenu.report > $CDIR/$NAM.krakenu diff --git a/tests/init.sh b/tests/init.sh old mode 100644 new mode 100755 index c6cd8f3..a289d0d --- a/tests/init.sh +++ b/tests/init.sh @@ -1,7 +1,16 @@ +#!/bin/bash + +DIR=$1 +[[ "$DIR" == "" ]] && DIR=`pwd` ## Install KrakenU locally into install/ -../install_kraken.sh `pwd`/install +$(dirname $0)/install_kraken.sh --install-jellyfish $DIR/install ## Download taxonomy and genomic data into data/ -install/krakenu-download --db data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any +$DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any +for i in viral viral-neighbors archaea bacteria; do + if [[ ! 
-f "$DIR/data/all-$i.fna" ]]; then + find $DIR/data/library/$i -name '*.fna' -exec cat {} \; > $DIR/data/all-$i.fna + fi +done diff --git a/tests/install_viral_databases.sh b/tests/install_viral_databases.sh deleted file mode 100755 index b7e6c3d..0000000 --- a/tests/install_viral_databases.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -set -xeu - -mkdir -p dbs/refseq-viral/library -mkdir -p dbs/refseq-viral-plus/library - -[[ -L dbs/refseq-viral/taxonomy ]] || ln -s data/taxonomy dbs/refseq-viral -[[ -L dbs/refseq-viral/library/viral ]] || ln -s data/library/viral/ dbs/refseq-viral/library -[[ -L dbs/refseq-viral-plus/library/viral ]] || ln -s data/library/viral/ dbs/refseq-viral-plus/library -[[ -L dbs/refseq-viral-plus/library/viral-neighbors ]] || ln -s data/library/viral-neighbors/ dbs/refseq-viral-plus/library - -export PATH="install:$PATH" -krakenu-build --db refseq-viral --build - diff --git a/tests/simulate-reads.sh b/tests/simulate-reads.sh new file mode 100755 index 0000000..d5fd965 --- /dev/null +++ b/tests/simulate-reads.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +set -xeu + +[[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 +SDIR=$DIR/simulated_reads +mkdir -p $SDIR + +randomreads.sh ref=$DIR/data/all-viral-neighbors.fna out=$SDIR/viral-neighbors-10m.fq reads=10m len=150 From f9307644f7801033def058755c44a8a97927fac6 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 27 Sep 2017 23:46:11 -0400 Subject: [PATCH 050/105] Added OSX files --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 10700cd..4979752 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,5 @@ /tests/dbs /tests/data /tests/install - +*.dSYM \.idea/ From a359ba388ed7ca28cb921edbb6d906876bede04b Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 27 Sep 2017 23:53:38 -0400 Subject: [PATCH 051/105] Fix jellyfish installation - don't use make install --- install_kraken.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/install_kraken.sh b/install_kraken.sh index 12c50f1..0e662b6 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -50,20 +50,24 @@ fi # on OS X. export KRAKEN_DIR=$(perl -MCwd=abs_path -le 'print abs_path(shift)' "$1") +mkdir -p "$KRAKEN_DIR" if [ "$INSTALL_JELLYFISH" == "1" ]; then WD=`pwd` - cd /tmp - wget http://www.cbcb.umd.edu/software/jellyfish/jellyfish-1.1.11.tar.gz - tar xvvf jellyfish-1.1.11.tar.gz - cd jellyfish-1.1.11 - ./configure + cd $KRAKEN_DIR + if [[ ! 
-d jellyfish ]]; then + wget http://www.cbcb.umd.edu/software/jellyfish/jellyfish-1.1.11.tar.gz + tar xvvf jellyfish-1.1.11.tar.gz + mv jellyfish-1.1.11 jellyfish + fi + cd jellyfish + [[ -f Makefile ]] || ./configure make - cp bin/jellyfish $KRAKEN_DIR + #make install ## does not work for me on OSX + #cp $KRAKEN_DIR/jellyfish-install/bin/jellyfish $KRAKEN_DIR #rm -r jellyfish-1.1.11.tar.gz jellyfish-1.1.11 cd $WD fi -mkdir -p "$KRAKEN_DIR" #make -C src clean make -C $DIR/src install for file in $DIR/scripts/* From b32fdd8a7733bd84c7c89222d9f7c5d3f3a268be Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 27 Sep 2017 23:54:34 -0400 Subject: [PATCH 052/105] Add parameters --library-dir and --taxonomy-dir --- scripts/krakenu-build | 17 +++++++-- scripts/krakenu-build_db.sh | 76 +++++++++++++++++++++---------------- 2 files changed, 57 insertions(+), 36 deletions(-) diff --git a/scripts/krakenu-build b/scripts/krakenu-build index 1461663..59b0d50 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenu-build @@ -68,7 +68,10 @@ my ( $build_lca_database, $add_taxonomy_ids_for_genome, - $add_taxonomy_ids_for_seq + $add_taxonomy_ids_for_seq, + + $library_dir, + $taxonomy_dir ); @@ -125,8 +128,12 @@ GetOptions( "taxids-for-genomes" => \$add_taxonomy_ids_for_genome, "taxids-for-sequences" => \$add_taxonomy_ids_for_seq, + "lca-database!" => \$build_lca_database, - "uid-database!" => \$build_uid_database + "uid-database!"
-s "library-files.txt" ]; then echo "Finding all library files" - find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' > library-files.txt + find $FIND_OPTS $LIBRARY_DIR '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' > library-files.txt fi + +files0() { + cat library-files.txt | tr '\n' '\0' +} +cat_library() { + cat library-files.txt | tr '\n' '\0' | xargs -0 cat +} + N_FILES=`cat library-files.txt | wc -l` -echo "Found $N_FILES sequence files (*.{fna,fa,ffn} in the library)" +if [[ "$N_FILES" -eq 0 ]]; then + echo "ERROR: No fna, fa, or ffn files found in $LIBRARY_DIR!"; + exit 1 +fi +echo "Found $N_FILES sequence files (*.{fna,fa,ffn}) in the library directory." + if [ -e "database.jdb" ] || [ -e "database0.kdb" ] then @@ -98,13 +117,12 @@ else # Estimate hash size as 1.15 * chars in library FASTA files if [ -z "$KRAKEN_HASH_SIZE" ] then - KRAKEN_HASH_SIZE=$(find $FIND_OPTS library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -printf '%s\n' | perl -nle '$sum += $_; END {print int(1.15 * $sum)}') + KRAKEN_HASH_SIZE=$( files0 | xargs -0 stat -f%z | perl -nle '$sum += $_; END {print int(1.15 * $sum)}') echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'" fi - cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ - $JELLYFISH_BIN count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \ - -o database /dev/fd/0 + $JELLYFISH_BIN count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \ + -o database <( cat_library ) # Merge only if necessary if [ -e "database_1" ] @@ -181,16 +199,13 @@ else echo "K-mer set sorted. [$(report_time_elapsed $start_time1)]" fi -if [ -e "seqid2taxid.map" ] +if [ -s "seqid2taxid.map" ] then echo "Skipping step 4, seqID to taxID map already complete." else echo "Creating seqID to taxID map (step 4 of 6).." start_time1=$(date "+%s.%N") - #cat library-files.txt | tr '\n' '\0' | xargs -0 grep '^>' | sed 's/.//' | sed 's/ .*//' | sort > library-headers.txt - #join -t $'\t' nucl_gb.accession2taxid.sorted library-headers.txt > seqid2taxid.map.tmp - #mv seqid2taxid.map.tmp seqid2taxid.map - find library -name '*.map' -exec cat {} \; > seqid2taxid.map + find -L $LIBRARY_DIR/ -name '*.map' -exec cat {} \; > seqid2taxid.map line_ct=$(wc -l seqid2taxid.map | awk '{print $1}') echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" @@ -233,33 +248,34 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then fi start_time1=$(date "+%s.%N") set -x - cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -o database.kdb -i database.idx -v \ -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c database.kmer_count \ - -F /dev/fd/0 > seqid2taxid-plus.map - - ## Make a classification report - cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ - krakenu --db . --report-file $(basename `pwd`).report --threads 10 --fasta-input /dev/stdin > $(basename `pwd`).kraken + -F <( cat_library ) > seqid2taxid-plus.map set +x if [ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ] || [ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]; then mv seqid2taxid.map seqid2taxid.map.orig mv seqid2taxid-plus.map seqid2taxid.map fi + echo "LCA database created. [$(report_time_elapsed $start_time1)]" fi + ## Make a classification report + if [[ ! -s $(basename `pwd`).report ]]; then + echo "Creating database summary report ..." + krakenu --db . 
--report-file $(basename `pwd`).report --threads $KRAKEN_THREAD_CT --fasta-input <( cat_library ) > $(basename `pwd`).kraken + fi fi if [ "$KRAKEN_UID_DATABASE" != "0" ]; then if [ -e "uid_database.complete" ] then - echo "Skipping step 6.3, UIDs already set." + echo "Skipping step 6.3, UID database already generated." else echo "Building UID database (step 6.3 of 6)..." PARAM="" if [[ "$KRAKEN_LCA_DATABASE" == "0" ]]; then - if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" && ]]; then + if [[ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ]]; then echo " Adding taxonomy IDs for sequences" PARAM=" -a" fi @@ -269,26 +285,20 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then fi fi start_time1=$(date "+%s.%N") - cat library-files.txt | tr '\n' '\0' | xargs -0 cat | \ set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -I uid_to_taxid.map -o uid_database.kdb -i database.idx -v \ - -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0 + -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c uid_database.kmer_count -F <( cat_library ) touch "uid_database.complete" echo "UID Database created. [$(report_time_elapsed $start_time1)]" fi -fi -if [ -s "uid_database.count" ] -then - echo "Skipping step 6.4, uid_database.kmer_count exists." -else - echo "Creating uid_database.kmer_count (step 6.4 of 6)... " - start_time1=$(date "+%s.%N") - time $JELLYFISH_BIN histo --high 100000000 uid_database.kdb > uid_database.kmer_count - echo "uid_database.kmer_count finished. [$(report_time_elapsed $start_time1)]" + ## Make a classification report + if [[ ! -s $(basename `pwd`).uid_report ]]; then + echo "Creating database summary report ..." + krakenu --db . --report-file $(basename `pwd`).uid_report --threads $KRAKEN_THREAD_CT --fasta-input <(cat_library) > $(basename `pwd`).uid_kraken + fi fi - echo "Database construction complete.
[Total: $(report_time_elapsed $start_time)] You can delete all files but database.{kdb,idx} and taxDB now, if you want" From 3c4e7e44f70056b9b9db55afb532b11d91f0fbf1 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 27 Sep 2017 23:55:18 -0400 Subject: [PATCH 053/105] Look for locally installed jellyfish first --- scripts/krakenu-check_for_jellyfish.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/krakenu-check_for_jellyfish.sh b/scripts/krakenu-check_for_jellyfish.sh index 311e307..c2aa2d7 100755 --- a/scripts/krakenu-check_for_jellyfish.sh +++ b/scripts/krakenu-check_for_jellyfish.sh @@ -26,8 +26,8 @@ set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands JELLYFISH_BIN="" -for JF in $(dirname $0)/jellyfish jellyfish1 jellyfish; do - if hash $JF 2>/dev/null; then +for JF in $(dirname $0)/jellyfish/bin/jellyfish /usr/local/opt/jellyfish-1.1/bin/jellyfish jellyfish1 jellyfish; do + if test -f $JF || hash $JF 2>/dev/null; then JELLYFISH_BIN=$JF; break fi From 3b7642d1c0c6c45ed8578309afa1eaac8c099e79 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 27 Sep 2017 23:58:50 -0400 Subject: [PATCH 054/105] Add slurp_file for file reading - fix for OSX --- src/quickfile.cpp | 44 ++++++++++++++++++++++++++++++++++++++++ src/quickfile.hpp | 4 ++++ src/set_lcas.cpp | 34 +++++++++++++++++------------------ 3 files changed, 65 insertions(+), 17 deletions(-) diff --git a/src/quickfile.cpp b/src/quickfile.cpp index ddabe9a..c518dd9 100644 --- a/src/quickfile.cpp +++ b/src/quickfile.cpp @@ -129,4 +129,48 @@ void QuickFile::close_file() { valid = false; } +// from http://programanddesign.com/cpp/human-readable-file-size-in-c/ +char* readable_fs(double size/*in bytes*/, char *buf) { + int i = 0; + const char* units[] = {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"}; + while (size > 1024) { + size /= 1024; + i++; + } + sprintf(buf, "%.*f %s", i, size, units[i]); + return buf; +} + + + +std::vector<char> slurp_file(std::string filename, size_t lSize) { + FILE * pFile; + size_t result; + + pFile = fopen ( filename.c_str() , "rb" ); + if (pFile==NULL) {fputs ("File error",stderr); exit (1);} + + if (lSize == 0) { + // obtain file size: + fseek (pFile , 0 , SEEK_END); + lSize = ftell (pFile); + rewind (pFile); + } + + char buf[50]; + readable_fs(lSize, buf); + std::cerr << "Getting " << filename << " into memory (" << buf << ") ..."; + + // copy the file into the vector: + std::vector<char> buffer(lSize); + result = fread (buffer.data(),1,lSize,pFile); + if (result != lSize) {fputs ("Reading error",stderr); exit (3);} + fclose (pFile); + + std::cerr << " Done" << std::endl; + return(std::move(buffer)); +} + + + } // namespace diff --git a/src/quickfile.hpp b/src/quickfile.hpp index 5533580..8f57642 100644 --- a/src/quickfile.hpp +++ b/src/quickfile.hpp @@ -21,6 +21,7 @@ #define QUICKFILE_HPP #include "kraken_headers.hpp" +#include <vector> namespace kraken { class QuickFile { @@ -43,6 +44,9 @@ namespace kraken { char *fptr; size_t filesize; }; + + std::vector<char> slurp_file(std::string filename, size_t lSize = 0); + } #endif diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 504e2b6..8db1033 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -78,6 +78,7 @@ TaxonomyDB<uint32_t> taxdb; const string prefix = "kraken:taxid|"; + int main(int argc, char **argv) { #ifdef _OPENMP omp_set_num_threads(1); @@ -105,31 +106,24 @@ int main(int argc, char **argv) { cerr << "Something went wrong while creating the file."
<< endl; exit(1); } + } + if (!Operate_in_RAM && Output_DB_filename.size() > 0) { + cerr << "You need to operate in RAM (flag -M) to use output to a different file (flag -o)" << endl; + return 1; } QuickFile db_file(DB_filename, "rw"); - - char *temp_ptr = NULL; size_t db_file_size = db_file.size(); + vector<char> dat; if (Operate_in_RAM) { - cerr << "Getting " << DB_filename << " into memory ... "; db_file.close_file(); - temp_ptr = new char[ db_file_size ]; - ifstream ifs(DB_filename.c_str(), ifstream::binary); - ifs.read(temp_ptr, db_file_size); - ifs.close(); - Database = KrakenDB(temp_ptr); - cerr << "done" << endl; + dat = slurp_file(DB_filename, db_file_size); + Database = KrakenDB(dat.data()); } else { if (Output_DB_filename.size() > 0) { - cerr << "You need to operate in RAM (flag -M) to use output to a different file (flag -o)" << endl; - return 1; + //system("cp " + DB_filename + " " + Output_DB_filename); } - //std::ifstream ifs("input.txt", std::ios::binary); - //std::ofstream ofs("output.txt", std::ios::binary); - //ofs << ifs.rdbuf(); Database = KrakenDB(db_file.ptr()); } @@ -160,9 +154,9 @@ int main(int argc, char **argv) { } cerr << "Writing database from RAM back to " << DB_filename << " ..." << endl; ofstream ofs(DB_filename.c_str(), ofstream::binary); - ofs.write(temp_ptr, db_file_size); + ofs.write(dat.data(), db_file_size); ofs.close(); - delete temp_ptr; + dat.clear(); } UID_map_file.close(); @@ -204,6 +198,8 @@ unordered_map<string, uint32_t> read_seqid_to_taxid_map(string ID_to_taxon_map_fi TaxonomyDB<uint32_t>& taxdb, unordered_map<uint32_t, uint32_t>& Parent_map, bool Add_taxIds_for_Assembly, bool Add_taxIds_for_Sequences) { + cerr << "Reading sequence ID to taxonomy ID mapping ... "; + unordered_map<string, uint32_t> ID_to_taxon_map; ifstream map_file(ID_to_taxon_map_filename.c_str()); if (map_file.rdstate() & ifstream::failbit) { @@ -243,6 +239,10 @@ unordered_map<string, uint32_t> read_seqid_to_taxid_map(string ID_to_taxon_map_fi } ID_to_taxon_map[seq_id] = taxid; } + if (ID_to_taxon_map.size() == 0) { + cerr << "Error: No ID mappings present!!" << endl; + } + cerr << " Done - read " << ID_to_taxon_map.size() << " mappings."
<< endl; return std::move(ID_to_taxon_map); } From da0978e4d4e22705231f59d659be249d331336e5 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Thu, 28 Sep 2017 01:31:12 -0400 Subject: [PATCH 055/105] Added classification rater (not working yet) --- src/grade_classification.cpp | 76 ++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 src/grade_classification.cpp diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp new file mode 100644 index 0000000..8f9b1e0 --- /dev/null +++ b/src/grade_classification.cpp @@ -0,0 +1,76 @@ +/* + * Copyright 2017, Florian Breitwieser + * licensed under GPLv3 + */ + +#include "taxdb.h" +#include "quickfile.hpp" +#include <iostream> +#include <fstream> +#include <sstream> + +using namespace std; + +unordered_map<string, uint32_t> read_seqid_mapping(string filename) { + unordered_map<string, uint32_t> ID_to_taxon_map; + ifstream map_file(filename.c_str()); + if (map_file.rdstate() & ifstream::failbit) { + err(EX_NOINPUT, "can't open %s", filename.c_str()); + } + string line, seq_id; + uint32_t taxid; + + while (map_file.good()) { + getline(map_file, line); + if (line.empty()) + break; + istringstream iss(line); + iss >> seq_id >> taxid; + ID_to_taxon_map[seq_id] = taxid; + } + map_file.close(); + return ID_to_taxon_map; +} + +int main(int argc, char **argv) { + if (argc != 4) { + std::cerr << "Usage: grade_classification taxDB seqid2taxid.map classification_file\n"; + return 1; + } + TaxonomyDB<uint32_t> taxdb = TaxonomyDB<uint32_t>(argv[1], false); + unordered_map<string, uint32_t> seqid_map = read_seqid_mapping(argv[2]); + cerr << "Read " << seqid_map.size() << " taxa mappings" << endl; + + ifstream k_file(argv[3]); + if (k_file.rdstate() & ifstream::failbit) { + err(EX_NOINPUT, "can't open %s", argv[3]); + } + string line, classification_state, read_id, seq_id; + uint32_t taxid; + uint32_t seq_taxid; + + while (k_file.good()) { + getline(k_file, line); + if (line.empty()) + continue; + istringstream iss(line); + iss >> classification_state >> read_id >> taxid; + seq_id = read_id.substr(read_id.find_last_of("_")+1); + auto it = seqid_map.find(seq_id); + if (it == seqid_map.end()) { + cerr << "ERROR: Couldn't find taxid for " << seq_id << endl; + } else { + seq_taxid = it->second; + size_t distance_between_taxids; + string lowest_common_rank; + seq_taxid = taxdb.getTaxIDAtRank(seq_taxid, "species"); + taxid = taxdb.getTaxIDAtRank(taxid, "species"); + pair<uint32_t, size_t> lca_taxid_dist = taxdb.getLowestCommonAncestor(seq_taxid, taxid); + string lca_rank = taxdb.getRank(lca_taxid_dist.first); + cout << seq_taxid << '\t' << taxid << '\t' << lca_rank << '\t' << lca_taxid_dist.first << '\t' << lca_taxid_dist.second << endl; + } + } + k_file.close(); + + +} From f49630a3b8648ba57d57116ba93829e7e26e402c Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sat, 30 Sep 2017 13:15:51 -0400 Subject: [PATCH 056/105] Allow multiple library directories on command line --- scripts/krakenu-build | 7 ++++--- scripts/krakenu-build_db.sh | 14 +++++++------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/scripts/krakenu-build b/scripts/krakenu-build index 59b0d50..8f72697 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenu-build @@ -70,11 +70,12 @@ my ( $build_lca_database, $add_taxonomy_ids_for_genome, - $add_taxonomy_ids_for_seq, - $library_dir, + $add_taxonomy_ids_for_seq, $taxonomy_dir ); +my @library_dirs; + my $verbose = 0; $threads = $DEF_THREAD_CT; @@ -132,7 +133,7 @@ GetOptions( "lca-database!" => \$build_lca_database, "uid-database!"
=> \$build_uid_database, - "library-dir=s" => \$library_dir, + "library-dir=s" => \@library_dirs, "taxonomy-dir=s" => \$taxonomy_dir ) or usage(); @@ -325,7 +326,7 @@ sub build_database { $ENV{"KRAKEN_ADD_TAXIDS_FOR_GENOME"} = $add_taxonomy_ids_for_genome; $ENV{"KRAKEN_UID_DATABASE"} = $build_uid_database; $ENV{"KRAKEN_LCA_DATABASE"} = $build_lca_database; - $ENV{"KRAKEN_LIBRARY_DIR"} = $library_dir; + $ENV{"KRAKEN_LIBRARY_DIRS"} = "@library_dirs"; $ENV{"KRAKEN_TAXONOMY_DIR"} = $taxonomy_dir; my $opt = ($verbose? "-x" : ""); exec "krakenu-build_db.sh"; diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index 48260ad..fb79fac 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -80,9 +80,9 @@ then fi LIBRARY_DIR="library/" -[[ "$KRAKEN_LIBRARY_DIR" != "" ]] && LIBRARY_DIR="$KRAKEN_LIBRARY_DIR" +[[ "$KRAKEN_LIBRARY_DIRS" != "" ]] && LIBRARY_DIR="$KRAKEN_LIBRARY_DIRS" -TAXONOMY_DIR="library/" +TAXONOMY_DIR="taxonomy/" [[ "$KRAKEN_TAXONOMY_DIR" != "" ]] && TAXONOMY_DIR="$KRAKEN_TAXONOMY_DIR" if [ ! -s "library-files.txt" ]; then @@ -218,15 +218,15 @@ then else echo "Creating taxDB (step 5 of 6)... " start_time1=$(date "+%s.%N") - if [ ! -f taxonomy/names.dmp ] || [ ! -f taxonomy/nodes.dmp ]; then - echo "taxonomy/names.dmp or taxonomy/nodes.dmp does not exist - downloading it ..." - [ -d taxonomy ] || mkdir taxonomy - cd taxonomy + if [ ! -f $TAXONOMY_DIR/names.dmp ] || [ ! -f $TAXONOMY_DIR/nodes.dmp ]; then + echo "$TAXONOMY_DIR/names.dmp or $TAXONOMY_DIR/nodes.dmp does not exist - downloading it ..." + [ -d $TAXONOMY_DIR ] || mkdir $TAXONOMY_DIR + cd $TAXONOMY_DIR wget $FTP_SERVER/pub/taxonomy/taxdump.tar.gz tar zxf taxdump.tar.gz cd .. fi - build_taxdb taxonomy/names.dmp taxonomy/nodes.dmp | sort -t$'\t' -rnk6,6 -rnk5,5 > taxDB.tmp + build_taxdb $TAXONOMY_DIR/names.dmp $TAXONOMY_DIR/nodes.dmp | sort -t$'\t' -rnk6,6 -rnk5,5 > taxDB.tmp mv taxDB.tmp taxDB echo "taxDB construction finished. [$(report_time_elapsed $start_time1)]" fi From 314f49c3966a02bb712b33bcbda71b0969b50364 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sat, 30 Sep 2017 13:17:44 -0400 Subject: [PATCH 057/105] Update on tests --- src/Makefile | 7 +- src/grade_classification.cpp | 129 +++++++++++++++++++++++--- src/krakendb.cpp | 17 +++- src/krakendb.hpp | 6 +- src/taxdb.h | 173 +++++++++++++++++++++++++++++++++-- tests/build-dbs.sh | 10 +- tests/classify-reads.sh | 8 +- tests/init.sh | 7 +- tests/simulate-reads.sh | 44 ++++++++- 9 files changed, 372 insertions(+), 29 deletions(-) diff --git a/src/Makefile b/src/Makefile index 82246e9..f127108 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,7 @@ CXX = g++ -CXXFLAGS = -Wall -std=c++11 -fopenmp -O2 -g -Wfatal-errors -PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb +FOPENMP?=-fopenmp +CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O2 -g -Wfatal-errors +PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification LIBFLAGS = -L. 
-I./gzstream -L./gzstream -lz -lgzstream .PHONY: all install clean @@ -19,6 +20,8 @@ db_sort: krakendb.o quickfile.o set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.cpp +grade_classification: taxdb.h + classify: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.cpp $(CXX) $(CXXFLAGS) -o classify classify.cpp $^ $(LIBFLAGS) diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp index 8f9b1e0..f787065 100644 --- a/src/grade_classification.cpp +++ b/src/grade_classification.cpp @@ -8,9 +8,12 @@ #include <iostream> #include <fstream> #include <sstream> +#include <iomanip> using namespace std; +using TAXID = uint32_t; + unordered_map<string, uint32_t> read_seqid_mapping(string filename) { unordered_map<string, uint32_t> ID_to_taxon_map; ifstream map_file(filename.c_str()); @@ -33,20 +36,35 @@ unordered_map<string, uint32_t> read_seqid_mapping(string filename) { } int main(int argc, char **argv) { - if (argc != 4) { - std::cerr << "Usage: grade_classification taxDB seqid2taxid.map classification_file\n"; + if (argc != 5) { + std::cerr << "Usage: grade_classification taxDB seqid2taxid.map classification_file result_file\n"; return 1; } TaxonomyDB<uint32_t> taxdb = TaxonomyDB<uint32_t>(argv[1], false); unordered_map<string, uint32_t> seqid_map = read_seqid_mapping(argv[2]); cerr << "Read " << seqid_map.size() << " taxa mappings" << endl; + ofstream out_file(argv[4]); + set<string> all_ranks; + unordered_map< string, size_t > rank_counts; + map< int, set<TAXID> > simulated_taxids_at_rank; + map< int, set<TAXID> > identified_taxids_at_rank; + map< int, size_t > correct_reads_at_rank; + map< int, size_t > incorrect_reads_at_rank; + map< int, size_t > reads_at_higher_rank; + size_t total_reads = 0; + size_t unidentified_reads = 0; + + + vector<TaxRank::RANK> ranks_of_interest = {TaxRank::RANK::assembly, TaxRank::RANK::species, TaxRank::RANK::genus, TaxRank::RANK::family, TaxRank::RANK::order}; + ifstream k_file(argv[3]); if (k_file.rdstate() & ifstream::failbit) { err(EX_NOINPUT, "can't open %s", argv[3]); } + string line, classification_state, read_id, seq_id; - uint32_t taxid; + uint32_t identified_taxid; uint32_t seq_taxid; while (k_file.good()) { getline(k_file, line); if (line.empty()) continue; istringstream iss(line); - iss >> classification_state >> read_id >> taxid; + iss >> classification_state >> read_id >> identified_taxid; + + ++total_reads; + if (identified_taxid == 0) { + ++unidentified_reads; + } + + // sequence id is after the 5th underscore with random_reads.sh - find it + size_t pos = 0; + size_t count = 0; + do { + pos = read_id.find("_", pos) + 1; + ++count; + } while (count <= 5 && pos != std::string::npos); + + seq_id = read_id.substr(pos); auto it = seqid_map.find(seq_id); if (it == seqid_map.end()) { cerr << "ERROR: Couldn't find taxid for " << seq_id << endl; + exit(1); } else { seq_taxid = it->second; - size_t distance_between_taxids; - string lowest_common_rank; - seq_taxid = taxdb.getTaxIDAtRank(seq_taxid, "species"); - taxid = taxdb.getTaxIDAtRank(taxid, "species"); - pair<uint32_t, size_t> lca_taxid_dist = taxdb.getLowestCommonAncestor(seq_taxid, taxid); - string lca_rank = taxdb.getRank(lca_taxid_dist.first); - cout << seq_taxid << '\t' << taxid << '\t' << lca_rank << '\t' << lca_taxid_dist.first << '\t' << lca_taxid_dist.second << endl; + + // go up to species level or next proper (i.e.
not 'no rank') rank for + // both real and assigned taxon + if (0) { + seq_taxid = taxdb.getTaxIDAtRank(seq_taxid, "species"); + uint32_t identified_species_taxid = taxdb.getTaxIDAtRank(identified_taxid, "species"); + if (identified_species_taxid != 0) { + identified_taxid = identified_species_taxid; + } else { + identified_taxid = taxdb.getTaxIDAtNextProperRank(identified_taxid); + } + } + + string seq_species = taxdb.getScientificName(seq_taxid); + // getLowestCommonAncestor returns lca taxon as well as distance between the taxa + pair lca_taxid_dist = taxdb.getLowestCommonAncestor(seq_taxid, identified_taxid); + string lca_rank_string = taxdb.getNextProperRank(lca_taxid_dist.first); + TaxRank::RANK lca_rank = TaxRank::toRank(lca_rank_string); + + TaxRank::RANK identified_rank = TaxRank::toRank(taxdb.getRank(identified_taxid)); + for (TaxRank::RANK rank : ranks_of_interest) { + TAXID simulated_taxid_at_rank = taxdb.getTaxIDAtRank(seq_taxid, TaxRank::toString(rank)); + TAXID identified_taxid_at_rank = taxdb.getTaxIDAtRank(identified_taxid, TaxRank::toString(rank)); + simulated_taxids_at_rank[rank].insert(simulated_taxid_at_rank); + // only consider identifications at the rank or more specific + // alternative: count identifications that are further up, too + if (identified_rank <= rank) { + identified_taxids_at_rank[rank].insert(identified_taxid_at_rank); + if (simulated_taxid_at_rank == identified_taxid_at_rank) { + ++correct_reads_at_rank[rank]; + } else { + ++incorrect_reads_at_rank[rank]; + } + } else { + ++reads_at_higher_rank[rank]; + } + } + + if (identified_taxid == 0) + lca_rank_string = "unidentified"; + ++rank_counts[lca_rank_string]; + out_file << seq_species << '\t' << seq_taxid << '\t' << identified_taxid << '\t' << lca_rank_string << '\t' << lca_taxid_dist.first << '\t' << lca_taxid_dist.second << '\n'; } } k_file.close(); + cout << "#LCA_RANK_READ_COUNTS" << endl; + for (const auto & kv : rank_counts) { + cout << kv.first << '\t' << kv.second << endl; + } + cout << "\n#rank; total_reads; correct; incorrect; at_higher_rank; unidentified" << endl; + for (TaxRank::RANK rank : ranks_of_interest) { + cout << TaxRank::toString(rank) << '\t' << total_reads + << '\t' << correct_reads_at_rank[rank] + << '\t' << incorrect_reads_at_rank[rank] + << '\t' << reads_at_higher_rank[rank] + << '\t' << unidentified_reads + << '\n'; + } + cout << "\n#rank;P;TP;FP;sens;prec" << endl; + for (TaxRank::RANK rank : ranks_of_interest) { + size_t true_positives = 0; + size_t false_positives = 0; + + for (const auto & tid : identified_taxids_at_rank[rank]) { + if (simulated_taxids_at_rank[rank].count(tid) == 1) { + ++true_positives; + } else { + ++false_positives; + } + } + + double sensitivity = 100.0*(double)true_positives/(double)simulated_taxids_at_rank[rank].size(); + double specificity = 100.0*(double)true_positives/(double)(true_positives+false_positives); + + cout << TaxRank::toString(rank) + << '\t' << simulated_taxids_at_rank[rank].size() + << '\t' << true_positives + << '\t' << false_positives << setprecision(2) << std::fixed + << '\t' << sensitivity << '%' + << '\t' << specificity << '%' + << '\n'; + } } diff --git a/src/krakendb.cpp b/src/krakendb.cpp index de33901..cae738f 100644 --- a/src/krakendb.cpp +++ b/src/krakendb.cpp @@ -52,14 +52,21 @@ KrakenDB::KrakenDB() { key_len = 0; key_bits = 0; k = 0; + _filesize = 0; } // Assumes ptr points to start of a readable mmap'ed file -KrakenDB::KrakenDB(char *ptr) { +KrakenDB::KrakenDB(char *ptr, size_t filesize) { + _filesize = filesize; 
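+ // Keeping the on-disk size here lets filesize() report it later without querying the filesystem again.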
index_ptr = NULL; fptr = ptr; - if (strncmp(ptr, DATABASE_FILE_TYPE, strlen(DATABASE_FILE_TYPE))) - errx(EX_DATAERR, "database in improper format"); + if (ptr == NULL) { + errx(EX_DATAERR, "pointer is NULL"); + } + if (strncmp(ptr, DATABASE_FILE_TYPE, strlen(DATABASE_FILE_TYPE))) { + string msg = "database in improper format - found " + string(ptr, strlen(DATABASE_FILE_TYPE)); + errx(EX_DATAERR, msg.c_str()); + } memcpy(&key_bits, ptr + 8, 8); memcpy(&val_len, ptr + 16, 8); memcpy(&key_ct, ptr + 48, 8); @@ -70,6 +77,10 @@ KrakenDB::KrakenDB(char *ptr) { std::cerr << "Loaded database with " << key_ct << " keys with k of " << (size_t)k << " [val_len " << val_len << ", key_len " << key_len << "]." << std::endl; } +size_t KrakenDB::filesize() const { + return _filesize; +} + //using std::map to have the keys sorted std::map KrakenDB::count_taxons() { char *ptr = get_pair_ptr(); diff --git a/src/krakendb.hpp b/src/krakendb.hpp index 4683654..5a19a71 100644 --- a/src/krakendb.hpp +++ b/src/krakendb.hpp @@ -86,14 +86,18 @@ namespace kraken { void set_index(KrakenDBIndex *i_ptr); + size_t filesize() const; + // Null constructor KrakenDB(); // ptr points to start of mmap'ed DB in read or read/write mode - KrakenDB(char *ptr); + KrakenDB(char *ptr, size_t filesize = 0); + private: + size_t _filesize; char *fptr; KrakenDBIndex *index_ptr; uint8_t k; diff --git a/src/taxdb.h b/src/taxdb.h index 0495c8c..28313f7 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -46,15 +46,109 @@ std::vector in_betweens(const std::string &s, const char start_char std::vector tokenise(const std::string &s, const std::string& delimiter, size_t max_fields = 0, size_t end_chars = 0); - std::vector get_fields(const std::string &s, const std::string& delimiter, std::vector fields); +// TODO: Consider using TaxRank instead of string in TaxonomyEntry +// However, then it would not be possible to define custom ranks.. 
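+// The enum below is ordered from most to least specific, so a plain integer
+// comparison means "at this rank or more specific". A minimal usage sketch
+// (illustration only, not part of the header):
+//
+//   TaxRank::RANK r = TaxRank::toRank(taxdb.getRank(taxid));
+//   if (r <= TaxRank::RANK::species) {
+//     // taxid is resolved at species level or below, e.g. a subspecies
+//   }
+//
+// grade_classification.cpp relies on exactly this ordering when it tests
+// identified_rank <= rank for each rank of interest.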
+struct TaxRank { + // All ranks that appear in the NCBI taxonomy database, + // plus 'sequence', 'assembly', and 'root' + //static constexpr vector rank_strings = { + // "no rank", "sequence", "assembly", + // "subspecies", "species", "subgenus", "genus", "tribe", "subfamily", + //"family", "superfamily", "parvorder", "infraorder", "suborder", + //"order", "superorder", "parvclass", "infraclass", "subclass", + //"class", "superclass", "subphylum", "phylum", "kingdom", + //"superkingdom", "root"}; + + enum RANK { unknown, no_rank, sequence, assembly, + subspecies, species, subgenus, genus, tribe, subfamily, + family, superfamily, parvorder, infraorder, suborder, + order, superorder, parvclass, infraclass, subclass, + class_, superclass, subphylum, phylum, kingdom, + superkingdom, root + }; + + static const unordered_map string_to_rank; + + static const RANK toRank(const string& rank) { + return string_to_rank.at(rank); + } + + static const char* toString(const TaxRank::RANK& rank) { + switch(rank) { + case RANK::unknown: return "unknown"; + case RANK::no_rank: return "no rank"; + case RANK::sequence: return "sequence"; + case RANK::assembly: return "assembly"; + case RANK::subspecies: return "subspecies"; + case RANK::species: return "species"; + case RANK::subgenus: return "subgenus"; + case RANK::genus: return "genus"; + case RANK::tribe: return "tribe"; + case RANK::subfamily: return "subfamily"; + case RANK::family: return "family"; + case RANK::superfamily: return "superfamily"; + case RANK::parvorder: return "parvorder"; + case RANK::infraorder: return "infraorder"; + case RANK::suborder: return "suborder"; + case RANK::order: return "order"; + case RANK::superorder: return "superorder"; + case RANK::parvclass: return "parvclass"; + case RANK::infraclass: return "infraclass"; + case RANK::subclass: return "subclass"; + case RANK::class_: return "class"; + case RANK::superclass: return "superclass"; + case RANK::subphylum: return "subphylum"; + case RANK::phylum: return "phylum"; + case RANK::kingdom: return "kingdom"; + case RANK::superkingdom: return "superkingdom"; + case RANK::root: return "root"; + default: + log_msg("Invalid rank!"); + } + return "NA"; + } + +}; + +const unordered_map TaxRank::string_to_rank = { + {"unknown", TaxRank::unknown}, + {"no rank", TaxRank::no_rank}, + {"sequence", TaxRank::sequence}, + {"assembly", TaxRank::assembly}, + {"subspecies", TaxRank::subspecies}, + {"species", TaxRank::species}, + {"subgenus", TaxRank::subgenus}, + {"genus", TaxRank::genus}, + {"tribe", TaxRank::tribe}, + {"subfamily", TaxRank::subfamily}, + {"family", TaxRank::family}, + {"superfamily", TaxRank::superfamily}, + {"parvorder", TaxRank::parvorder}, + {"infraorder", TaxRank::infraorder}, + {"suborder", TaxRank::suborder}, + {"order", TaxRank::order}, + {"superorder", TaxRank::superorder}, + {"parvclass", TaxRank::parvclass}, + {"infraclass", TaxRank::infraclass}, + {"subclass", TaxRank::subclass}, + {"class", TaxRank::class_}, + {"superclass", TaxRank::superclass}, + {"subphylum", TaxRank::subphylum}, + {"phylum", TaxRank::phylum}, + {"kingdom", TaxRank::kingdom}, + {"superkingdom", TaxRank::superkingdom}, + {"root", TaxRank::root} +}; + + template class TaxonomyEntry { public: TAXID taxonomyID = 0; TAXID parentTaxonomyID = 0; - std::string rank; + string rank; std::string scientificName; TaxonomyEntry() {} @@ -107,6 +201,9 @@ class TaxonomyDB { std::string getScientificName(const TAXID taxID) const; std::string getRank(const TAXID taxID) const; TAXID 
getLowestCommonAncestor(const std::vector<TAXID>& taxIDs) const; + pair<TAXID, int> getLowestCommonAncestor(TAXID a, TAXID b) const; + string getNextProperRank(TAXID a) const; + TAXID getTaxIDAtNextProperRank(TAXID a) const; TAXID getParentTaxID(const TAXID taxID) const; std::unordered_map<TAXID, TAXID> getParentMap() const; @@ -132,6 +229,7 @@ class TaxonomyDB { std::unordered_map<TAXID, TaxonomyEntry<TAXID, READCOUNTS> > taxIDsAndEntries; bool genomeSizes_are_set = false; + private: std::unordered_map<TAXID, TaxonomyEntry<TAXID, READCOUNTS> > @@ -471,6 +569,63 @@ std::unordered_map<TAXID, TaxonomyEntry<TAXID, READCOUNTS> > return(taxIDsAndEntries); } +template<typename TAXID, typename READCOUNTS> +string TaxonomyDB<TAXID, READCOUNTS>::getNextProperRank(TAXID a) const { + if (a == 0) { + return "NA"; + } + while (getRank(a) == "no rank" && a != getParentTaxID(a)) { + a = getParentTaxID(a); + } + if ( a == 1 ) { + return "root"; + } + return getRank(a); +} + +template<typename TAXID, typename READCOUNTS> +TAXID TaxonomyDB<TAXID, READCOUNTS>::getTaxIDAtNextProperRank(TAXID a) const { + if (a == 0 || a == 1) { + return 0; + } + while (getRank(a) == "no rank" && a != getParentTaxID(a)) { + a = getParentTaxID(a); + } + return a; +} + +template<typename TAXID, typename READCOUNTS> +pair<TAXID, int> TaxonomyDB<TAXID, READCOUNTS>::getLowestCommonAncestor(TAXID a, TAXID b) const { + if (a == 0 || b == 0) { + return a ? pair<TAXID, int>(a,-1) : pair<TAXID, int>(b,-1); + } + + // create a path from a to the root, remembering each node's distance from a + std::unordered_map<TAXID, int> a_path; + int distA = 0; + while (a > 0 && a != getParentTaxID(a)) { + if (a == b) + return pair<TAXID, int>{a, distA}; + a_path[a] = distA; + a = getParentTaxID(a); + ++distA; + } + + int distB = 0; + // search for b in the path from a to the root; on a hit, it->second is the + // distance from the original a to the common ancestor + while (b > 0 && b != getParentTaxID(b)) { + auto it = a_path.find(b); + if (it != a_path.end()) { + return pair<TAXID, int>(b, distB + it->second); + } + b = getParentTaxID(b); + ++distB; + } + return pair<TAXID, int>(1, distA+distB); +} + + + template<typename TAXID, typename READCOUNTS> TAXID TaxonomyDB<TAXID, READCOUNTS>::getLowestCommonAncestor( const std::vector<TAXID>& taxIDs) const { @@ -623,11 +778,13 @@ std::string TaxonomyDB<TAXID, READCOUNTS>::getMetaPhlAnLineage(TAXID taxonomyID) template<typename TAXID, typename READCOUNTS> TAXID TaxonomyDB<TAXID, READCOUNTS>::getTaxIDAtRank(const TAXID taxID, const std::string& rank) const { + if (taxID == 0 || taxID == 1) + return 0; auto entry = taxIDsAndEntries.find(taxID); - //cerr << "getTaxIDAtRank(" << taxID << "," << rank << ")" << endl; + // cerr << "getTaxIDAtRank(" << taxID << "," << rank << ")" << endl; while (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) { - //cerr << "Checking rank of " << entry->second.taxonomyID << ": " << entry->second.rank << endl; + // cerr << "Checking rank of " << entry->second.taxonomyID << ": " << entry->second.rank << endl; if (entry->second.rank == rank) { return entry->second.taxonomyID; } else { @@ -722,8 +879,12 @@ void TaxonomyDB<TAXID, READCOUNTS>::setReadCounts(const unordered_map<TAXID, READCOUNTS>& readCounts) - TaxReport<READCOUNTS>::TaxReport(std::ostream& reportOfb, TaxonomyDB<uint32_t, READCOUNTS>& taxdb, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { - _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_KMERS_CLADE, REPORTCOLS::NUM_UNIQUE_KMERS_CLADE, REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, REPORTCOLS::SPACED_NAME}; + TaxReport<READCOUNTS>::TaxReport(std::ostream& reportOfb, TaxonomyDB<uint32_t, READCOUNTS>& taxdb, + bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) { + _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, + REPORTCOLS::NUM_KMERS_CLADE, REPORTCOLS::NUM_UNIQUE_KMERS_CLADE, + REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID, + REPORTCOLS::SPACED_NAME}; } diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index 3e489e3..8002fe9 100755 --- a/tests/build-dbs.sh +++
b/tests/build-dbs.sh @@ -9,5 +9,13 @@ mkdir -p $DIR/dbs/refseq-viral-plus/library [[ -L $DIR/dbs/refseq-viral-plus/library/viral-neighbors ]] || ln -s $DIR/data/library/viral-neighbors/ $DIR/dbs/refseq-viral-plus/library/ export PATH="$DIR/install:$PATH" -krakenu-build --db $DIR/dbs/refseq-viral --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy +for K in 21 26 31; do + mkdir -p $DIR/dbs/refseq-viral-k$K + krakenu-build --kmer-len $K --minimizer-len 12 --threads 4 --db $DIR/dbs/refseq-viral-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy + + if [[ `uname` != "Darwin" ]]; then + krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --taxonomy-dir=$DIR/data/taxonomy + + fi +done diff --git a/tests/classify-reads.sh b/tests/classify-reads.sh index 807d287..802b29b 100755 --- a/tests/classify-reads.sh +++ b/tests/classify-reads.sh @@ -7,4 +7,10 @@ CDIR=$DIR/classification-results mkdir -p $CDIR NAM=viral-neighbors-10m -time $DIR/install/krakenu --threads 4 --db $DIR/dbs/refseq-viral --fastq ~/kraken-hll-test/simulated_reads/$NAM.fq --report-file $CDIR/$NAM.krakenu.report > $CDIR/$NAM.krakenu +for K in 21 26 31; do + KFILE=$CDIR/$NAM.k$K.krakenu + [[ -s $KFILE ]] || time $DIR/install/krakenu --threads 4 --db $DIR/dbs/refseq-viral-k$K --fastq ~/kraken-hll-test/simulated_reads/$NAM.fq --report-file $KFILE.report > $KFILE 2> $KFILE.log + [[ -s $KFILE.results ]] || $DIR/install/grade_classification $DIR/dbs/refseq-viral-k$K/taxDB $DIR/data/all-viral-neighbors.map $KFILE > $KFILE.results + [[ -s $KFILE.results.stats ]] || cut -f 4 $KFILE.results | sort | uniq -c | sort -n > $KFILE.results.stats + +done diff --git a/tests/init.sh b/tests/init.sh index a289d0d..0c341fa 100755 --- a/tests/init.sh +++ b/tests/init.sh @@ -4,13 +4,16 @@ DIR=$1 [[ "$DIR" == "" ]] && DIR=`pwd` ## Install KrakenU locally into install/ -$(dirname $0)/install_kraken.sh --install-jellyfish $DIR/install +$(dirname $0)/../install_kraken.sh --install-jellyfish $DIR/install ## Download taxonomy and genomic data into data/ -$DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any +#$DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any for i in viral viral-neighbors archaea bacteria; do if [[ ! -f "$DIR/data/all-$i.fna" ]]; then find $DIR/data/library/$i -name '*.fna' -exec cat {} \; > $DIR/data/all-$i.fna fi + if [[ ! 
-f "$DIR/data/all-$i.map" ]]; then + find $DIR/data/library/$i -name '*.map' -exec cat {} \; > $DIR/data/all-$i.map + fi done diff --git a/tests/simulate-reads.sh b/tests/simulate-reads.sh index d5fd965..09d7db7 100755 --- a/tests/simulate-reads.sh +++ b/tests/simulate-reads.sh @@ -4,6 +4,48 @@ set -xeu [[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 SDIR=$DIR/simulated_reads +CDIR=$DIR/classification-results +mkdir -p $CDIR mkdir -p $SDIR -randomreads.sh ref=$DIR/data/all-viral-neighbors.fna out=$SDIR/viral-neighbors-10m.fq reads=10m len=150 +run_krakenu_viral() { + FQ=$1 + NAM=$2 + K=$3 + DAT=$4 + + KFILE=$CDIR/$NAM.k$K.krakenu + [[ -s $KFILE ]] || time $DIR/install/krakenu --threads 4 --db $DIR/dbs/refseq-viral-k$K --fastq $FQ --report-file $KFILE.report > $KFILE 2> $KFILE.log + [[ "$DAT" == "viral" ]] && SEQMAP=$DIR/dbs/refseq-viral-k$K/seqid2taxid.map || SEQMAP=$DIR/data/all-$DAT.map + [[ -s $KFILE.results.stats ]] || $DIR/install/grade_classification $DIR/dbs/refseq-viral-k$K/taxDB $SEQMAP $KFILE $KFILE.results > $KFILE.results.stats +} + +run_kraken_viral() { + FQ=$1 + NAM=$2 + K=$3 + DAT=$4 + + KFILE=$CDIR/$NAM.k$K.kraken + [[ -s $KFILE ]] || time kraken --threads 4 --db $DIR/dbs/refseq-viral-k$K --fastq $FQ > $KFILE 2> $KFILE.log + [[ "$DAT" == "viral" ]] && SEQMAP=$DIR/dbs/refseq-viral-k$K/seqid2taxid.map || SEQMAP=$DIR/data/all-$DAT.map + #[[ -s $KFILE.results.stats ]] || + $DIR/install/grade_classification $DIR/dbs/refseq-viral-k$K/taxDB $SEQMAP $KFILE $KFILE.results > $KFILE.results.stats +} + + + +AB=1m +for i in 1 2 3; do + for dat in viral viral-neighbors bacteria archaea; do + for len in 75 100 150; do + NAM=$dat.$AB${len}bp.$i + FQ=$SDIR/$NAM.fq + [[ -f $FQ ]] || randomreads.sh -Xmx40g ref=$DIR/data/all-$dat.fna out=$FQ reads=$AB len=$len seed=$i + for K in 21 26 31; do + run_krakenu_viral $FQ $NAM $K $dat + run_kraken_viral $FQ $NAM $K $dat + done + done + done +done From 3898b25c75d47a2f74fc28cef6c02eeca56d3c4b Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 03:01:56 -0400 Subject: [PATCH 058/105] Added script to dump taxDB to NCBI dump format --- src/Makefile | 2 +- src/dump_taxdb.cpp | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 src/dump_taxdb.cpp diff --git a/src/Makefile b/src/Makefile index f127108..0ed70b3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@ CXX = g++ FOPENMP?=-fopenmp CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O2 -g -Wfatal-errors -PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification +PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb LIBFLAGS = -L. 
-I./gzstream -L./gzstream -lz -lgzstream .PHONY: all install clean diff --git a/src/dump_taxdb.cpp b/src/dump_taxdb.cpp new file mode 100644 index 0000000..b2c73f0 --- /dev/null +++ b/src/dump_taxdb.cpp @@ -0,0 +1,34 @@ +#include "taxdb.h" +#include "quickfile.hpp" +#include <iostream> +#include <fstream> +#include <string> + +using namespace std; + +int main(int argc, char **argv) { + if (argc != 3) { + std::cerr << "Usage: build_taxdb taxDB names.dmp nodes.dmp\n"; + return 1; + } + TaxonomyDB taxdb {(string)argv[1]}; + ofstream names_file(argv[2]); + names_file.exceptions(ifstream::failbit | ifstream::badbit); + ofstream nodes_file(argv[3]); + nodes_file.exceptions(ifstream::failbit | ifstream::badbit); + + for (const auto &taxon : taxdb.taxIDsAndEntries) { + std::string scientificName; + nodes_file << taxon.second.taxonomyID + << "\t|\t" << taxon.second.parentTaxonomyID + << "\t|\t" << taxon.second.rank + << "\t|\n"; // there are further columns, but Kraken does not care about them + + names_file << taxon.second.taxonomyID + << "\t|\t" << taxon.second.scientificName + << "\t|\t" + << "\t|\t" << "scientific name" << "\t|\n"; + } + names_file.close(); + nodes_file.close(); +} From 94b4326d5878490eb7336fd03a1b7589eecb089f Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 03:03:27 -0400 Subject: [PATCH 059/105] Added dusting and 'standard' db to testing --- tests/build-dbs.sh | 16 +++++++--------- tests/init.sh | 21 ++++++++++++--------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index 8002fe9..082bac5 100755 --- a/tests/build-dbs.sh +++ b/tests/build-dbs.sh @@ -4,18 +4,16 @@ set -xeu [[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 -mkdir -p $DIR/dbs/refseq-viral-plus/library -[[ -L $DIR/dbs/refseq-viral-plus/library/viral ]] || ln -s $DIR/data/library/viral/ $DIR/dbs/refseq-viral-plus/library/ -[[ -L $DIR/dbs/refseq-viral-plus/library/viral-neighbors ]] || ln -s $DIR/data/library/viral-neighbors/ $DIR/dbs/refseq-viral-plus/library/ - export PATH="$DIR/install:$PATH" -for K in 21 26 31; do - mkdir -p $DIR/dbs/refseq-viral-k$K - krakenu-build --kmer-len $K --minimizer-len 12 --threads 4 --db $DIR/dbs/refseq-viral-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy +for K in 31 26 21; do + #mkdir -p $DIR/dbs/refseq-viral-k$K + #krakenu-build --kmer-len $K --minimizer-len 12 --threads 4 --db $DIR/dbs/refseq-viral-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy if [[ `uname` != "Darwin" ]]; then - krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --taxonomy-dir=$DIR/data/taxonomy - + #mkdir -p $DIR/dbs/refseq-bacteria-k$K + #krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --library-dir=$DIR/data/library/archaea --taxonomy-dir=$DIR/data/taxonomy + mkdir -p $DIR/dbs/refseq-oct2017-k$K + krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-oct2017-k$K --build --taxids-for-genomes --library-dir=$DIR/data/library/viral-dusted --library-dir=$DIR/data/library/viral-neighbors-dusted --library-dir=$DIR/data/library/bacteria-dusted --library-dir=$DIR/data/library/archaea-dusted --library-dir=$DIR/data/library/vertebrate_mammalian --taxonomy-dir=$DIR/data/taxonomy fi
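+ # The refseq-oct2017 'standard' DB above is built from the *-dusted libraries
+ # prepared in init.sh: dustmasker output with non-AGCT bases turned into N,
+ # so low-complexity regions contribute no (non-ambiguous) k-mers.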
done diff --git a/tests/init.sh b/tests/init.sh index 0c341fa..f4c73d3 100755 --- a/tests/init.sh +++ b/tests/init.sh @@ -1,19 +1,22 @@ #!/bin/bash -DIR=$1 -[[ "$DIR" == "" ]] && DIR=`pwd` +set -xeu + +[[ $# -eq 1 ]] && DIR=$1 || DIR=`pwd` ## Install KrakenU locally into install/ -$(dirname $0)/../install_kraken.sh --install-jellyfish $DIR/install +#$(dirname $0)/../install_kraken.sh --install-jellyfish $DIR/install ## Download taxonomy and genomic data into data/ #$DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any +#$DIR/install/krakenu-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606 for i in viral viral-neighbors archaea bacteria; do - if [[ ! -f "$DIR/data/all-$i.fna" ]]; then - find $DIR/data/library/$i -name '*.fna' -exec cat {} \; > $DIR/data/all-$i.fna - fi - if [[ ! -f "$DIR/data/all-$i.map" ]]; then - find $DIR/data/library/$i -name '*.map' -exec cat {} \; > $DIR/data/all-$i.map - fi + [[ -s "$DIR/data/all-$i.fna" ]] || find $DIR/data/library/$i -name '*.fna' -exec cat {} \; > $DIR/data/all-$i.fna + [[ -s "$DIR/data/all-$i.map" ]] || find $DIR/data/library/$i -name '*.map' -exec cat {} \; > $DIR/data/all-$i.map + DUSTED_F="$DIR/data/all-$i-dusted.fna" + [[ -s $DUSTED_F ]] || dustmasker -infmt fasta -in $DIR/data/all-$i.fna -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]/N/g' > "$DUSTED_F" + mkdir -p $DIR/data/library/$i-dusted + [[ -f "$DIR/data/library/$i-dusted/all-$i-dusted.fna" ]] || ln "$DUSTED_F" "$DIR/data/library/$i-dusted/all-$i-dusted.fna" + [[ -f "$DIR/data/library/$i-dusted/all-$i-dusted.fna.map" ]] || ln "$DIR/data/all-$i.map" "$DIR/data/library/$i-dusted/all-$i.map" done From 4130390f5db074de0fa03df9e6cfccfc4f3663f8 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 03:05:11 -0400 Subject: [PATCH 060/105] Make building work for OSX and Linux --- scripts/krakenu-build_db.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index fb79fac..00e708a 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -24,6 +24,7 @@ set -u # Protect against uninitialized vars. set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands +set -x function report_time_elapsed() { set -x @@ -90,8 +91,9 @@ if [ ! 
-s "library-files.txt" ]; then find $FIND_OPTS $LIBRARY_DIR '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' > library-files.txt fi -files0() { - cat library-files.txt | tr '\n' '\0' +file_sizes() { + ## stat -c is for Linux, stat -f is for BSD/OSX + cat library-files.txt | tr '\n' '\0' | xargs -0 -I '{}' sh -c "stat -c '%s\n' {} 2> /dev/null || stat -f '%z' {}" } cat_library() { cat library-files.txt | tr '\n' '\0' | xargs -0 cat @@ -117,7 +119,7 @@ else # Estimate hash size as 1.15 * chars in library FASTA files if [ -z "$KRAKEN_HASH_SIZE" ] then - KRAKEN_HASH_SIZE=$( files0 | xargs -0 stat -f%z | perl -nle '$sum += $_; END {print int(1.15 * $sum)}') + KRAKEN_HASH_SIZE=$( file_sizes | perl -nle '$sum += $_; END {print int(1.15 * $sum)}') echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'" fi From cfd04227198252437d6874c9c0854d1ae4b882a0 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 03:05:29 -0400 Subject: [PATCH 061/105] Fixes for building and downloading --- scripts/krakenu-download | 22 ++++++++++++---------- src/set_lcas.cpp | 15 ++++++++------- src/taxdb.h | 21 +++++++++++++-------- 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/scripts/krakenu-download b/scripts/krakenu-download index 7cf6fd3..4508f98 100755 --- a/scripts/krakenu-download +++ b/scripts/krakenu-download @@ -40,7 +40,6 @@ my $BASE_DIR; my $DB_DIR; my $N_PROC=5; my $CHANGE_HEADER=0; -my $DOWNLOAD_RNA=0; my $DO_DUST=0; my $FILTER_UNPLACED=0; my $VERBOSE=0; @@ -80,6 +79,7 @@ WHEN USING database refseq OR genbank: -c Only download genomes in the specified refseq category. Default: any. -t Only download the specified taxonomy IDs, comma separated. Default: any. --fna Comma-separated list of sequence types, including genomic, rna, rna_from_genomic, cds_from_genomic. Default: $FNA_FILES. + See the assembly project FTP site for available sequences -u Filter unplaced sequences. -m Mask low-complexity regions using dustmasker. -l Modify sequence header to include taxonomy ID for Kraken (i.e. add '>kraken:taxid|TAXID' to each sequence). @@ -191,12 +191,11 @@ if ($INCLUDE_VIRAL_NEIGHBORS) { sub download(@) { my ($url, $file, $gunzipped_filename) = @_; - if (-s $file && !$OVERWRITE_FILES) { + if (!$OVERWRITE_FILES && (( defined $gunzipped_filename && -s $gunzipped_filename) || (!defined $gunzipped_filename && -s $file))) { print STDERR "Not fetching $url - file $file exists.\n" if $VERBOSE; return 1; } - start_fork() and return; if ($url =~ /^http/) { print STDERR "Fetching $url to $file ..." 
if $VERBOSE; if (!-d dirname($file)) { @@ -206,7 +205,6 @@ sub download(@) { if (!$response->is_success) { print STDERR "\nFAIL: Error downloading $url!\n"; print STDERR $response->status_line."\n"; - exit; } else { print STDERR "SUCCESS\n" if $VERBOSE; } @@ -221,14 +219,13 @@ sub download(@) { move($where, $file); if (defined $gunzipped_filename) { - print STDERR " GUNZIPPING"; + print STDERR " GUNZIPPING" if $VERBOSE; gunzip $file => $gunzipped_filename or die "gunzip failed: $GunzipError"; unlink $file; $file = $gunzipped_filename; } print STDERR " SUCCESS\n" if $VERBOSE; } - exit; #my $where = $ff->fetch(to=> dirname($file)) or die "\n$ff->error for $url!"; return -s $file; } @@ -261,7 +258,7 @@ sub wait_children() { } sub end_fork() { - exit() unless $N_PROC == 1; + exit() unless $N_PROC <= 1; } sub download_viral_neighbors(@) { @@ -312,12 +309,14 @@ sub download_viral_neighbors(@) { $name1 =~ s/__/_/g; my $file = "$nbr_dir/$name1-tax$taxid/$nbr_ac.fna"; my $url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&retmode=text&id=$nbr_ac"; + start_fork() and next; if (download($url,$file)) { print_header_lines($file, $taxid, "$nname $sname neighbor $nbr_ac"); } + end_fork(); } print STDERR "\n"; - wait_children; + wait_children(); # $pm->wait_all_children(); } @@ -396,7 +395,7 @@ sub download_taxonomy(@) { sub download_domain(@) { my ($domain_dir, $domain, $_assembly_level, $_taxid) = @_; - print STDERR "Downloading assembly summary file for $domain genomes.\n"; + print STDERR "Downloading assembly summary file for $domain genomes, and filtering to assembly level $_assembly_level and taxid $_taxid.\n"; die unless defined $domain_dir && defined $domain; if (-d $domain_dir) { print STDERR "WARNING: $domain_dir already exists - potentially overwriting files.\n"; @@ -451,6 +450,7 @@ sub download_domain(@) { my $bname1 = "${organism_name1}-tax${taxid}-${bname}"; foreach my $ext (split(/,/, $FNA_FILES)) { + start_fork() and next; my $full_ftp_path = "$ftp_path/${bname}_${ext}.fna.gz"; my $bfname = $bname1."_".$ext; my $fname = $bfname.".fna"; @@ -477,9 +477,11 @@ sub download_domain(@) { system("dustmasker -infmt fasta -in '$domain_dir/$fname' -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]/N/g' > '$domain_dir/${bfname}_dustmasked.fna'"); unlink("$domain_dir/$fname"); } + end_fork(); } } -# $pm->wait_all_children; + wait_children(); + print STDERR "\n"; } diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 8db1033..7e69bab 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -173,7 +173,7 @@ int main(int argc, char **argv) { } inline -uint32_t get_taxid( +uint32_t get_new_taxid( unordered_map& name_to_taxid_map, unordered_map& Parent_map, string name, uint32_t parent_taxid, const string & rank_name) { @@ -182,8 +182,9 @@ uint32_t get_taxid( if (it == name_to_taxid_map.end()) { uint32_t new_taxid = ++New_taxid_start; bool insert_res = taxdb.insert(new_taxid, parent_taxid, rank_name, name); - if (!insert_res) - cerr << "Taxonomy ID " << new_taxid << " already in Taxonomy DB? Shouldn't happen - run set_lcas without the -a option." 
<< endl; + if (!insert_res) { + return 0; + } // insert_res shows if insert failed, but we don't care // cerr << "Adding assembly: " << name << " with taxid " << new_taxid << endl; Parent_map[new_taxid] = parent_taxid; @@ -205,7 +206,7 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi if (map_file.rdstate() & ifstream::failbit) { err(EX_NOINPUT, "can't open %s", ID_to_taxon_map_filename.c_str()); } - string line, seq_id; + string line, seq_id, name; uint32_t taxid; // Used when adding new taxids for assembly or sequence @@ -226,13 +227,13 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi if (Add_taxIds_for_Assembly && iss.good()) { iss.get(); - string name; getline(iss, name); - taxid = get_taxid(name_to_taxid_map, Parent_map, name, taxid, "assembly"); + if (!name.empty()) + taxid = get_new_taxid(name_to_taxid_map, Parent_map, name, taxid, "assembly"); } if (Add_taxIds_for_Sequences) { - taxid = get_taxid(name_to_taxid_map, Parent_map, seq_id, taxid, "sequence"); + taxid = get_new_taxid(name_to_taxid_map, Parent_map, seq_id, taxid, "sequence"); } if (Add_taxIds_for_Assembly || Add_taxIds_for_Sequences) { cout << seq_id << '\t' << taxid << '\n'; diff --git a/src/taxdb.h b/src/taxdb.h index 28313f7..eb8fe78 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -561,9 +561,8 @@ std::unordered_map > taxonomyID, newEntry }); } - taxIDsAndEntries.insert({ - 0, {0, 0, "no rank", "unclassified" } - }); + taxIDsAndEntries.insert({0, {0, 0, "no rank", "unclassified" }}); + //taxIDsAndEntries.insert({-1, {-1, 0, "no rank", "uncategorized" }}); createPointers(taxIDsAndEntries); log_msg("done reading TaxDB, read " + std::to_string(taxIDsAndEntries.size()) + " taxa"); return(taxIDsAndEntries); @@ -671,15 +670,18 @@ TAXID TaxonomyDB::getLowestCommonAncestor( template bool TaxonomyDB::insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_) { + if (parentTaxonomyID_ == taxonomyID_) { + return false; + } - TaxonomyEntry newEntry(taxonomyID_, parentTaxonomyID_, rank_, scientificName_, 0, 0); - auto parentIt = taxIDsAndEntries.find(parentTaxonomyID_); - if (parentIt == taxIDsAndEntries.end() || parentTaxonomyID_ == taxonomyID_) { - cerr << "ERROR while inserting taxonomy entry - taxonomy ID " << taxonomyID_ <<"; parent taxonomy ID " << parentTaxonomyID_ << "!" << endl; + if (parentIt == taxIDsAndEntries.end()) { + cerr << "ERROR with taxon [" << taxonomyID_ <<";"< newEntry(taxonomyID_, parentTaxonomyID_, rank_, scientificName_, 0, 0); + newEntry.parent = &(parentIt->second); auto insert_res = taxIDsAndEntries.insert({taxonomyID_, newEntry}); parentIt->second.children.push_back(&insert_res.first->second); @@ -933,7 +935,10 @@ void TaxReport::printReport(std::string format, std::string ra // B: print normal results printReport(_taxdb.taxIDsAndEntries.at(1),0u); // C: Print Unclassified stuff - //printReport(_taxdb.taxIDsAndEntries.at(-1),0u); + auto it = _taxdb.taxIDsAndEntries.find(-1); + if (it != _taxdb.taxIDsAndEntries.end()) { + printReport(it->second,0u); + } } else { // print stuff at a certain level .. 
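+ // (left unimplemented: a rank-filtered report would walk taxIDsAndEntries
+ // and print only entries at the requested rank, roughly what the
+ // commented-out fields below were meant for)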
//_uid_abundance; From a88535a01bb3ce8bb22ea26b8d2c0aa4bc11901b Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 13:12:00 -0400 Subject: [PATCH 062/105] Fixed contaminants download --- scripts/krakenu-download | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/krakenu-download b/scripts/krakenu-download index 4508f98..1d67438 100755 --- a/scripts/krakenu-download +++ b/scripts/krakenu-download @@ -344,9 +344,9 @@ sub download_contaminats(@) { # download UniVec and EmVec database download("ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec","$CONTAMINANT_DIR/UniVec.fna"); - download("ftp://ftp.ebi.ac.uk/pub/databases/emvec/emvec.dat.gz","$CONTAMINANT_DIR/emvec.dat.gz"); + download("ftp://ftp.ebi.ac.uk/pub/databases/emvec/emvec.dat.gz","$CONTAMINANT_DIR/emvec.dat.gz", "$CONTAMINANT_DIR/emvec.dat"); - open(my $E1, "|-", "gunzip -c emvec.dat.gz"); + open(my $E1, "<", "$CONTAMINANT_DIR/emvec.dat"); open(my $E2, ">", "$CONTAMINANT_DIR/EmVec.fna"); my ($ac,$de); @@ -360,12 +360,11 @@ sub download_contaminats(@) { } elsif (/^SQ/) { $in_seq = 1; print $E2 ">$ac $de\n"; - print "$ac\t$CONTAMINANT_TAXID\tEmVec\n"; } elsif ($in_seq) { if (/^\s+[agct]/) { s/\s+[0-9]+$//; s/ //g; - print $_; + print $E2 $_; } else { $in_seq = 0; } @@ -373,13 +372,14 @@ sub download_contaminats(@) { } close($E2); close($E1); - unlink("emvec.dat.gz"); + unlink("$CONTAMINANT_DIR/emvec.dat"); if ( $CHANGE_HEADER ) { system("sed -i 's/^>/>taxid|$CONTAMINANT_TAXID /' $CONTAMINANT_DIR/UniVec.fna"); system("sed -i 's/^>/>taxid|$CONTAMINANT_TAXID /' $CONTAMINANT_DIR/EmVec.fna"); } else { print_header_lines("$CONTAMINANT_DIR/UniVec.fna", $CONTAMINANT_TAXID, "UniVec"); + print_header_lines("$CONTAMINANT_DIR/EmVec.fna", $CONTAMINANT_TAXID, "EmVec"); } } From 0145ef457994e1f2540bfb90f4da02c955c905de Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 14:04:54 -0400 Subject: [PATCH 063/105] Fix for Linux/OSX building --- scripts/krakenu-build_db.sh | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index 00e708a..a39432f 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -92,8 +92,11 @@ if [ ! -s "library-files.txt" ]; then fi file_sizes() { - ## stat -c is for Linux, stat -f is for BSD/OSX - cat library-files.txt | tr '\n' '\0' | xargs -0 -I '{}' sh -c "stat -c '%s\n' {} 2> /dev/null || stat -f '%z' {}" + if [[ `uname` == "Darwin" ]]; then + cat library-files.txt | tr '\n' '\0' | xargs -0 stat -f '%z' + else + cat library-files.txt | tr '\n' '\0' | xargs -0 stat -c '%s\n' + fi } cat_library() { cat library-files.txt | tr '\n' '\0' | xargs -0 cat @@ -234,7 +237,7 @@ else fi if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then - if [ -e "database.kdb" ] + if [ -s "database.kdb" ] then echo "Skipping step 6, LCAs already set." else @@ -262,15 +265,16 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then echo "LCA database created. [$(report_time_elapsed $start_time1)]" fi ## Make a classification report - if [[ ! -s $(basename `pwd`).report ]]; then - echo "Creating database summary report ..." - krakenu --db . --report-file $(basename `pwd`).report --threads $KRAKEN_THREAD_CT --fasta-input <( cat_library ) > $(basename `pwd`).kraken + REPNAME=database + if [[ ! -s $REPNAME.report.tsv ]]; then + echo "Creating database summary report $REPNAME.report.tsv ..." + krakenu --db . 
--report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --fasta-input <( cat_library ) > $REPNAME.kraken.tsv fi fi if [ "$KRAKEN_UID_DATABASE" != "0" ]; then - if [ -e "uid_database.complete" ] + if [ -s "uid_database.kdb" ] then echo "Skipping step 6.3, UID database already generated." else @@ -289,15 +293,15 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then start_time1=$(date "+%s.%N") set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -I uid_to_taxid.map -o uid_database.kdb -i database.idx -v \ -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c uid_database.kmer_count -F <( cat_library ) - touch "uid_database.complete" echo "UID Database created. [$(report_time_elapsed $start_time1)]" fi ## Make a classification report - if [[ ! -s $(basename `pwd`).uid_report ]]; then - echo "Creating database summary report ..." - krakenu --db . --report-file $(basename `pwd`).uid_report --threads $KRAKEN_THREAD_CT --fasta-input <(cat_library) > $(basename `pwd`).uid_kraken + REPNAME=uid_database + if [[ ! -s $REPNAME.report.tsv ]]; then + echo "Creating UID database summary report $REPNAME.report.tsv ..." + krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --fasta-input <(cat_library) > $REPNAME.kraken.tsv fi fi From bb1e65da3968fbc583661e32aba405724fe4a910 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 14:06:01 -0400 Subject: [PATCH 064/105] Minor speed improvements in classify --- src/classify.cpp | 113 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 27 deletions(-) diff --git a/src/classify.cpp b/src/classify.cpp index b5e196f..703b5d8 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -40,6 +40,10 @@ void process_file(char *filename); bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, unordered_map&); +inline void print_sequence(ostringstream* oss_ptr, const DNASequence& dna); +string hitlist_string(const vector<uint32_t> &taxa); + + set<uint32_t> get_ancestry(uint32_t taxon); void report_stats(struct timeval time1, struct timeval time2); unordered_map taxon_counts; // stats per taxon @@ -350,6 +354,20 @@ uint32_t get_taxon_for_kmer(KrakenDB& database, uint64_t* kmer_ptr, uint64_t& cu return taxon; } + +inline void print_sequence(ostringstream* oss_ptr, const DNASequence& dna) { + if (Fastq_input) { + (*oss_ptr) << "@" << dna.header_line << endl + << dna.seq << endl + << "+" << endl + << dna.quals << endl; + } + else { + (*oss_ptr) << ">" << dna.header_line << endl + << dna.seq << endl; + } +} + inline void append_hitlist_string(string& hitlist_string, uint32_t& last_taxon, uint32_t& last_counter, uint32_t current_taxon) { if (last_taxon == current_taxon) { @@ -367,10 +385,47 @@ void append_hitlist_string(string& hitlist_string, uint32_t& last_taxon, uint32_ } } +string hitlist_string(const vector<uint32_t> &taxa) +{ + uint32_t last_code = taxa[0]; + int code_count = 1; + ostringstream hitlist; + + for (size_t i = 1; i < taxa.size(); i++) { + uint32_t code = taxa[i]; + + if (code == last_code) { + code_count++; + } + else { + if (last_code >= 0) { + hitlist << last_code << ":" << code_count << " "; + } + else { + hitlist << "A:" << code_count << " "; + } + code_count = 1; + last_code = code; + } + } + if (last_code == -1) { + hitlist << "A:" << code_count; + } + else { + hitlist << last_code << ":" << code_count; + } + return hitlist.str(); +} + + bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, unordered_map& my_taxon_counts) {
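+ // Approach in this revision: buffer one taxon per k-mer in `taxa` (ambiguous
+ // k-mers are pushed as -1) and render the hit list once at the end via
+ // hitlist_string(), instead of appending to a string after every k-mer lookup.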
- // TODO: use vector::reserve + size_t n_kmers = dna.seq.size()-KrakenDatabases[0]->get_k()+1; + vector taxa; + taxa.reserve(n_kmers); + //vector ambig_list; + //ambig_list.reserve(n_kmers); unordered_map hit_counts; uint64_t *kmer_ptr; uint32_t taxon = 0; @@ -383,7 +438,7 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, int64_t current_max_pos = 0; }; - string hitlist_string; + //string hitlist_string; uint32_t last_taxon; uint32_t last_counter; @@ -394,9 +449,12 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, while ((kmer_ptr = scanner.next_kmer()) != NULL) { taxon = 0; if (scanner.ambig_kmer()) { - append_hitlist_string(hitlist_string, last_taxon, last_counter, ambig_taxon); + //append_hitlist_string(hitlist_string, last_taxon, last_counter, ambig_taxon); + //ambig_list.push_back(1); + taxa.push_back(-1); } else { + //ambig_list.push_back(0); // go through multiple databases to map k-mer for (size_t i=0; i= Minimum_hit_count) break; } + taxa.push_back(taxon); } - append_hitlist_string(hitlist_string, last_taxon, last_counter, taxon); + //append_hitlist_string(hitlist_string, last_taxon, last_counter, taxon); } } @@ -434,24 +497,16 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, call = resolve_tree(hit_counts, Parent_map); } + + #pragma omp atomic ++(my_taxon_counts[call].n_reads); - if (Print_unclassified || Print_classified) { - ostringstream *oss_ptr = call ? &coss : &uoss; - bool print = call ? Print_classified : Print_unclassified; - if (print) { - if (Fastq_input) { - (*oss_ptr) << "@" << dna.header_line << endl - << dna.seq << endl - << "+" << endl - << dna.quals << endl; - } - else { - (*oss_ptr) << ">" << dna.header_line << endl - << dna.seq << endl; - } - } - } + if (Print_unclassified && !call) + print_sequence(&uoss, dna); + + if (Print_classified && call) + print_sequence(&coss, dna); + if (! Print_kraken) return call; @@ -464,19 +519,23 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, return false; koss << "U\t"; } - koss << dna.id << "\t" << call << "\t" << dna.seq.size() << "\t"; + koss << dna.id << '\t' << call << '\t' << dna.seq.size() << '\t'; if (Quick_mode) { koss << "Q:" << hits; } else { - if (hitlist_string.empty() && last_counter == 0) + if (taxa.empty()) koss << "0:0"; - else { - koss << hitlist_string - << (last_taxon == ambig_taxon? "A" : std::to_string(last_taxon)) - << ':' << std::to_string(last_counter); - } + else + koss << hitlist_string(taxa); + //if (hitlist_string.empty() && last_counter == 0) + // koss << "0:0"; + //else { + // koss << hitlist_string + // << (last_taxon == ambig_taxon? 
"A" : std::to_string(last_taxon)) + // << ':' << std::to_string(last_counter); + //} } if (Print_sequence) From 7bbd9862eeb7567da6f89a35074e3bcf381023d7 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 14:06:24 -0400 Subject: [PATCH 065/105] Correct number of arguments --- src/dump_taxdb.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dump_taxdb.cpp b/src/dump_taxdb.cpp index b2c73f0..79e668f 100644 --- a/src/dump_taxdb.cpp +++ b/src/dump_taxdb.cpp @@ -7,8 +7,8 @@ using namespace std; int main(int argc, char **argv) { - if (argc != 3) { - std::cerr << "Usage: build_taxdb taxDB names.dmp nodes.dmp\n"; + if (argc != 4) { + std::cerr << "Usage: dump_taxdb taxDB names.dmp nodes.dmp\n"; return 1; } TaxonomyDB taxdb {(string)argv[1]}; From 10213696ebaea78e229325ba5e61ccacfbc51c8c Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 14:06:53 -0400 Subject: [PATCH 066/105] Ignore taxa not in taxonomy DB --- src/grade_classification.cpp | 5 +++++ src/taxdb.h | 17 +++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp index f787065..edfc999 100644 --- a/src/grade_classification.cpp +++ b/src/grade_classification.cpp @@ -94,6 +94,11 @@ int main(int argc, char **argv) { exit(1); } else { seq_taxid = it->second; + if (!taxdb.hasTaxon(seq_taxid)) { + cerr << "Ignoring taxon " << seq_taxid << " - not in database" << endl; + continue; + } + //cerr <<"seqid" << seq_taxid; // go up to species level or next proper (i.e. not 'no rank') rank for // both real and assigned taxon diff --git a/src/taxdb.h b/src/taxdb.h index eb8fe78..df45568 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -214,6 +214,7 @@ class TaxonomyDB { TaxonomyEntry getEntry(TAXID taxID) const; bool insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_); + bool hasTaxon(TAXID taxonomyID_); size_t distance(TAXID taxID1, TAXID taxID2) const; @@ -667,6 +668,12 @@ TAXID TaxonomyDB::getLowestCommonAncestor( return consensus; } + +template +bool TaxonomyDB::hasTaxon(TAXID taxonomyID_) { + return taxIDsAndEntries.find(taxonomyID_) != taxIDsAndEntries.end(); +} + template bool TaxonomyDB::insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_) { @@ -684,8 +691,9 @@ bool TaxonomyDB::insert(TAXID taxonomyID_, TAXID parentTaxono newEntry.parent = &(parentIt->second); auto insert_res = taxIDsAndEntries.insert({taxonomyID_, newEntry}); - parentIt->second.children.push_back(&insert_res.first->second); - + if (insert_res.second) { + parentIt->second.children.push_back(&insert_res.first->second); + } return insert_res.second; } @@ -784,8 +792,9 @@ TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, return 0; auto entry = taxIDsAndEntries.find(taxID); // cerr << "getTaxIDAtRank(" << taxID << "," << rank << ")" << endl; - while (entry != taxIDsAndEntries.end() && - entry->second.parentTaxonomyID != 1) { + while (entry != taxIDsAndEntries.end() + && entry->second.parentTaxonomyID != 1 + && entry->second.parentTaxonomyID != entry->first) { // cerr << "Checking rank of " << entry->second.taxonomyID << ": " << entry->second.rank << endl; if (entry->second.rank == rank) { return entry->second.taxonomyID; From c6b2d04ca59e95e08bc8f39221584cfdca16dfbc Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 14:08:08 -0400 Subject: [PATCH 067/105] Add comments --- src/set_lcas.cpp | 7 +++++++ 1 file changed, 7 
insertions(+) diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 7e69bab..7a20dc5 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -274,6 +274,7 @@ void process_single_file() { if (it != ID_to_taxon_map.end()) { taxid = it->second; } else if (dna.id.size() >= prefix.size() && dna.id.substr(0,prefix.size()) == prefix) { + // if the AC is not in the map, check if the fasta entry starts with '>kraken:taxid' taxid = std::stol(dna.id.substr(prefix.size())); if (taxid == 0) { cerr << "Error: taxonomy ID is zero for sequence '" << dna.id << "'?!" << endl; @@ -288,6 +289,7 @@ } if (Add_taxIds_for_Sequences) { + // Update entry based on header line auto entryIt = taxdb.taxIDsAndEntries.find(taxid); if (entryIt == taxdb.taxIDsAndEntries.end()) { cerr << "Error! Didn't find taxid " << taxid << " in TaxonomyDB - can't update it!! ["< 0) { + // exclude taxid! + //} + if (taxid) { #pragma omp parallel for schedule(dynamic) for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN) From 131daed7c77e6860d963bc0674609734a0a457eb Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 14:11:49 -0400 Subject: [PATCH 068/105] Update to read sim --- tests/classify-reads.sh | 16 ---------------- ...ulate-reads.sh => test-on-simulated-reads.sh} | 0 2 files changed, 16 deletions(-) delete mode 100755 tests/classify-reads.sh rename tests/{simulate-reads.sh => test-on-simulated-reads.sh} (100%) diff --git a/tests/classify-reads.sh b/tests/classify-reads.sh deleted file mode 100755 index 802b29b..0000000 --- a/tests/classify-reads.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -set -xeu - -[[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 -SDIR=$DIR/simulated_reads -CDIR=$DIR/classification-results -mkdir -p $CDIR - -NAM=viral-neighbors-10m -for K in 21 26 31; do - KFILE=$CDIR/$NAM.k$K.krakenu - [[ -s $KFILE ]] || time $DIR/install/krakenu --threads 4 --db $DIR/dbs/refseq-viral-k$K --fastq ~/kraken-hll-test/simulated_reads/$NAM.fq --report-file $KFILE.report > $KFILE 2> $KFILE.log - [[ -s $KFILE.results ]] || $DIR/install/grade_classification $DIR/dbs/refseq-viral-k$K/taxDB $DIR/data/all-viral-neighbors.map $KFILE > $KFILE.results - [[ -s $KFILE.results.stats ]] || cut -f 4 $KFILE.results | sort | uniq -c | sort -n > $KFILE.results.stats - -done diff --git a/tests/simulate-reads.sh b/tests/test-on-simulated-reads.sh similarity index 100% rename from tests/simulate-reads.sh rename to tests/test-on-simulated-reads.sh From a88535a01bb3ce8bb22ea26b8d2c0aa4bc11901b Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 22:29:28 -0400 Subject: [PATCH 070/105] Various improvements and fixes for building and classification --- scripts/krakenu | 7 +- scripts/krakenu-build_db.sh | 8 +++++--- src/Makefile | 16 +++++-- src/classify.cpp | 117 +++++++++++++++++++++++++++++++++++----------- src/taxdb.h | 32 ++++++++- src/uid_mapping.cpp | 62 ++++++++++------ src/uid_mapping.hpp | 7 +- tests/build-dbs.sh | 11 +-- tests/test-on-simulated-reads.sh | 58 ++++++++------- 9 files changed, 214 insertions(+), 104 deletions(-) diff --git a/scripts/krakenu b/scripts/krakenu index 243bcda..006a078 100755 --- a/scripts/krakenu +++ b/scripts/krakenu @@ -97,7 +97,9 @@ if ($@) { die "$PROG: $@"; } -my @kdb_files = map { "$_/database.kdb" } @db_prefix; +my $database = $uid_mapping? "uid_database.kdb" : "database.kdb"; +my @kdb_files = map { "$_/$database" } @db_prefix; + my @idx_files = map { "$_/database.idx" } @db_prefix; foreach my $file (@kdb_files,@idx_files) { @@ -148,7 +150,7 @@ push @flags, "-r", $report_file if defined $report_file; push @flags, "-a", $db_prefix[0]."/taxDB"; push @flags, "-s" if $print_sequence; if ($uid_mapping) { - my $uid_mapping_file = "$db_prefix[0]/uid_to_taxid"; + my $uid_mapping_file = "$db_prefix[0]/uid_to_taxid.map"; if (!-f $uid_mapping_file) { print STDERR "Missing required file $uid_mapping_file for UID mapping.\n"; exit(1); } @@ -220,6 +222,7 @@ Usage: $PROG [options] Options: --db NAME Name for Kraken DB (default: $default_db) --report-file FILENAME Write Kraken report to FILENAME + --uid-mapping Map using UID database --threads NUM Number of threads (default: $def_thread_ct) --fasta-input Input is FASTA format --fastq-input Input is FASTQ format diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index a39432f..96f1aa8 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -254,7 +254,7 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then start_time1=$(date "+%s.%N") set -x set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -o database.kdb -i database.idx -v \ - -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c database.kmer_count \ + -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c database.kdb.counts \ -F <( cat_library ) > seqid2taxid-plus.map set +x if [ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ] || [ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]; then @@ -292,7 +292,7 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then fi start_time1=$(date "+%s.%N") set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -I uid_to_taxid.map -o uid_database.kdb -i database.idx -v \ - -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c uid_database.kmer_count -F <( cat_library ) + -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c uid_database.kdb.counts -F <( cat_library ) echo "UID Database created. [$(report_time_elapsed $start_time1)]" fi @@ -300,8 +300,8 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then ## Make a classification report REPNAME=uid_database if [[ ! -s $REPNAME.report.tsv ]]; then - echo "Creating UID database summary report $REPNAME.report.tsv ..." - krakenu --db .
--report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --fasta-input <(cat_library) > $REPNAME.kraken.tsv + #echo "Creating UID database summary report $REPNAME.report.tsv ..." + #krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv fi fi diff --git a/src/Makefile b/src/Makefile index 0ed70b3..38e8e21 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,8 @@ CXX = g++ FOPENMP?=-fopenmp CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O2 -g -Wfatal-errors -PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb +#CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors +PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream .PHONY: all install clean @@ -18,17 +19,21 @@ db_shrink: krakendb.o quickfile.o db_sort: krakendb.o quickfile.o -set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.cpp +set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o grade_classification: taxdb.h -classify: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.cpp - $(CXX) $(CXXFLAGS) -o classify classify.cpp $^ $(LIBFLAGS) +read_uid_mapping: quickfile.o + +classify: classify.cpp krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o hyperloglogplus.h + $(CXX) $(CXXFLAGS) -o classify $^ $(LIBFLAGS) build_taxdb: taxdb.h make_seqid_to_taxid_map: quickfile.o +read_uid_mapping: quickfile.o krakenutil.o uid_mapping.o + krakenutil.o: krakenutil.cpp krakenutil.hpp taxdb.h $(CXX) $(CXXFLAGS) -c krakenutil.cpp @@ -40,3 +45,6 @@ seqreader.o: seqreader.cpp seqreader.hpp quickfile.hpp quickfile.o: quickfile.cpp quickfile.hpp $(CXX) $(CXXFLAGS) -c quickfile.cpp + +uid_mapping.o: krakenutil.hpp uid_mapping.hpp uid_mapping.cpp + $(CXX) $(CXXFLAGS) -c uid_mapping.cpp diff --git a/src/classify.cpp b/src/classify.cpp index 703b5d8..a6076d3 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -41,7 +41,7 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, unordered_map&); inline void print_sequence(ostringstream* oss_ptr, const DNASequence& dna); -string hitlist_string(const vector &taxa); +string hitlist_string(const vector &taxa, const vector& ambig_list); set get_ancestry(uint32_t taxon); @@ -211,7 +211,7 @@ int main(int argc, char **argv) { Report_output = cout_or_file(Report_output_file); } - cerr << "Print_kraken: " << Print_kraken << "; Print_kraken_report: " << Print_kraken_report << "; k: " << uint32_t(KrakenDatabases[0]->get_k()) << endl; + //cerr << "Print_kraken: " << Print_kraken << "; Print_kraken_report: " << Print_kraken_report << "; k: " << uint32_t(KrakenDatabases[0]->get_k()) << endl; struct timeval tv1, tv2; gettimeofday(&tv1, NULL); @@ -222,21 +222,26 @@ int main(int argc, char **argv) { std::cerr << "Finishing up ..\n"; if (Print_kraken_report) { + for (auto fname : DB_filenames) { + ifstream ifs(fname + ".counts"); + if (ifs.good()) { + ifs.close(); + taxdb.readGenomeSizes(fname+".counts"); + } + } + taxdb.setReadCounts(taxon_counts); TaxReport rep = TaxReport(*Report_output, taxdb, false); rep.setReportCols({ - "percReadsClade", - "numReadsClade", - "numReadsTaxon", - "numUniqueKmersClade", - "numUniqueKmersTaxon", - "numKmersClade", - "numKmersTaxon", - "numKmersInDatabaseClade", - "numKmersInDatabaseTaxon", + "%", + "reads", + 
"taxReads", + "kmers", + "dup", + "cov", "taxID", - "taxRank", - "indentedName"}); + "rank", + "taxName"}); rep.printReport("kraken","blu"); } @@ -336,8 +341,9 @@ void process_file(char *filename) { total_sequences += work_unit.size(); total_bases += total_nt; //if (Print_Progress && total_sequences % 100000 < work_unit.size()) - if (Print_Progress && total_sequences % 100000 < work_unit.size()) + if (Print_Progress) { cerr << "\rProcessed " << total_sequences << " sequences (" << total_classified << " classified) ..."; + } } } } // end parallel section @@ -345,13 +351,13 @@ void process_file(char *filename) { delete reader; } +inline uint32_t get_taxon_for_kmer(KrakenDB& database, uint64_t* kmer_ptr, uint64_t& current_bin_key, int64_t& current_min_pos, int64_t& current_max_pos) { uint32_t* val_ptr = database.kmer_query( database.canonical_representation(*kmer_ptr), ¤t_bin_key, ¤t_min_pos, ¤t_max_pos); - uint32_t taxon = val_ptr ? *val_ptr : 0; - return taxon; + return val_ptr ? *val_ptr : 0; } @@ -385,7 +391,45 @@ void append_hitlist_string(string& hitlist_string, uint32_t& last_taxon, uint32_ } } -string hitlist_string(const vector &taxa) +string hitlist_string(const vector &taxa, const vector &ambig) +{ + int64_t last_code; + int code_count = 1; + ostringstream hitlist; + + if (ambig[0]) { last_code = -1; } + else { last_code = taxa[0]; } + + for (size_t i = 1; i < taxa.size(); i++) { + int64_t code; + if (ambig[i]) { code = -1; } + else { code = taxa[i]; } + + if (code == last_code) { + code_count++; + } + else { + if (last_code >= 0) { + hitlist << last_code << ":" << code_count << " "; + } + else { + hitlist << "A:" << code_count << " "; + } + code_count = 1; + last_code = code; + } + } + if (last_code >= 0) { + hitlist << last_code << ":" << code_count; + } + else { + hitlist << "A:" << code_count; + } + return hitlist.str(); +} + + +string hitlist_string_depr(const vector &taxa) { uint32_t last_code = taxa[0]; int code_count = 1; @@ -421,11 +465,8 @@ string hitlist_string(const vector &taxa) bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, unordered_map& my_taxon_counts) { - size_t n_kmers = dna.seq.size()-KrakenDatabases[0]->get_k()+1; vector taxa; - taxa.reserve(n_kmers); - //vector ambig_list; - //ambig_list.reserve(n_kmers); + vector ambig_list; unordered_map hit_counts; uint64_t *kmer_ptr; uint32_t taxon = 0; @@ -445,39 +486,40 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, vector db_statuses(KrakenDatabases.size()); if (dna.seq.size() >= KrakenDatabases[0]->get_k()) { + size_t n_kmers = dna.seq.size()-KrakenDatabases[0]->get_k()+1; + taxa.reserve(n_kmers); + ambig_list.reserve(n_kmers); KmerScanner scanner(dna.seq); while ((kmer_ptr = scanner.next_kmer()) != NULL) { taxon = 0; if (scanner.ambig_kmer()) { //append_hitlist_string(hitlist_string, last_taxon, last_counter, ambig_taxon); - //ambig_list.push_back(1); - taxa.push_back(-1); + ambig_list.push_back(1); } else { - //ambig_list.push_back(0); - + ambig_list.push_back(0); // go through multiple databases to map k-mer for (size_t i=0; ikmer_query( + // KrakenDatabases[i]->canonical_representation(*kmer_ptr), &db_statuses[i].current_bin_key, + // &db_statuses[i].current_min_pos, &db_statuses[i].current_max_pos); + //taxon = val_ptr ? 
*val_ptr : 0; if (taxon) break; } - //cerr << "taxon for " << *kmer_ptr << " is " << taxon << endl; - + // cerr << "taxon for " << *kmer_ptr << " is " << taxon << endl; my_taxon_counts[taxon].add_kmer(*kmer_ptr); if (taxon) { - if (taxon == -1) { - cerr << "ERROR: Invalid taxon (-1)" << endl; - exit(1); - } hit_counts[taxon]++; if (Quick_mode && ++hits >= Minimum_hit_count) break; } - taxa.push_back(taxon); } + taxa.push_back(taxon); //append_hitlist_string(hitlist_string, last_taxon, last_counter, taxon); } } @@ -488,7 +530,8 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, cerr << "Quick mode not available when mapping UIDs" << endl; exit(1); } else { - call = resolve_uids2(hit_counts, Parent_map, (const uint32_t *)UID_to_TaxID_map_file.ptr(), UID_to_TaxID_map_file.size()); + call = resolve_uids2(hit_counts, Parent_map, + UID_to_TaxID_map_file.ptr(), UID_to_TaxID_map_file.size()); } } else { if (Quick_mode) @@ -497,8 +540,6 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, call = resolve_tree(hit_counts, Parent_map); } - - #pragma omp atomic ++(my_taxon_counts[call].n_reads); if (Print_unclassified && !call) @@ -528,7 +569,7 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, if (taxa.empty()) koss << "0:0"; else - koss << hitlist_string(taxa); + koss << hitlist_string(taxa, ambig_list); //if (hitlist_string.empty() && last_counter == 0) // koss << "0:0"; //else { diff --git a/src/taxdb.h b/src/taxdb.h index df45568..f49d9dc 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -29,6 +29,7 @@ #include #include #include +#include #include "report-cols.h" using namespace std; @@ -223,6 +224,7 @@ class TaxonomyDB { void setGenomeSizes(const std::unordered_map & genomeSizes); void setReadCounts(const std::unordered_map& readCounts); + void readGenomeSizes(string file); void setGenomeSize(const TAXID taxid, const uint64_t genomeSize); void addReadCount(const TAXID taxid, const READCOUNTS& readCounts_); @@ -876,6 +878,23 @@ void TaxonomyDB::setGenomeSize(const TAXID taxid, const uint64 } +template +void TaxonomyDB::readGenomeSizes(string file) { + for (auto& entry : taxIDsAndEntries) { + entry.second.genomeSize = 0; + entry.second.genomeSizeOfChildren = 0; + } + log_msg("Reading genome sizes from " + file); + std::ifstream inFile(file); + if (!inFile.is_open()) + throw std::runtime_error("unable to open file " + file); + TAXID taxonomyID; + uint64_t size; + while (!inFile.eof()) { + inFile >> taxonomyID >> size; + setGenomeSize(taxonomyID, size); + } +} template void TaxonomyDB::setReadCounts(const unordered_map& readCounts) { @@ -967,22 +986,29 @@ void TaxReport::printReport(TaxonomyEntry& t template void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) { + + long long unique_kmers_for_clade = ( tax.readCounts.kmers.cardinality() + tax.readCountsOfChildren.kmers.cardinality()); + double genome_size = double(tax.genomeSize+tax.genomeSizeOfChildren); + for (auto& col : _report_cols) { switch (col) { case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << 100.0*(reads(tax.readCounts) + reads(tax.readCountsOfChildren))/_total_n_reads; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << setprecision(4) << 100.0*(reads(tax.readCounts) + reads(tax.readCountsOfChildren))/_total_n_reads; break; //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.readCounts); break; case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.readCounts) + reads(tax.readCountsOfChildren)); break; case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.readCounts.kmers.cardinality(); break; - case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << (tax.readCounts.kmers.cardinality() + tax.readCountsOfChildren.kmers.cardinality()); break; + case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << unique_kmers_for_clade; break; case REPORTCOLS::NUM_KMERS: _reportOfb << tax.readCounts.n_kmers; break; case REPORTCOLS::NUM_KMERS_CLADE: _reportOfb << tax.readCounts.n_kmers + tax.readCountsOfChildren.n_kmers; break; - case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break; + case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break; + case REPORTCOLS::CLADE_KMER_COVERAGE: if (genome_size == 0) { _reportOfb << "NA"; } else { + _reportOfb << setprecision(4) << (unique_kmers_for_clade / genome_size); }; break; + case REPORTCOLS::CLADE_KMER_DUPLICITY: _reportOfb << setprecision(3) << ( double(tax.readCounts.n_kmers + tax.readCountsOfChildren.n_kmers) / unique_kmers_for_clade ); break; case REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break; //case REPORTCOLS::GENOME_SIZE: ; break; //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; diff --git a/src/uid_mapping.cpp b/src/uid_mapping.cpp index 966a685..4a80946 100644 --- a/src/uid_mapping.cpp +++ b/src/uid_mapping.cpp @@ -122,7 +122,7 @@ namespace kraken { uint32_t resolve_uids2( const unordered_map &uid_hit_counts, const unordered_map &parent_map, - const uint32_t* fptr, const size_t fsize) { + const char* fptr, const size_t fsize) { unordered_map taxid_counts; unordered_map frac_taxid_counts; @@ -131,33 +131,17 @@ namespace kraken { return(0); } - for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { - uint32_t next_uid = it->first; - if (next_uid == 0) { + for (const auto& it : uid_hit_counts) { + if (it.first == 0) { continue; } - uint32_t taxid; // TODO: Just get a uint64_t and shift the bits, probably faster - vector taxids; - do { - // Check if the accessed memory is out of range - // -- move this to a DEBUG-only assert - // UID-1 is used because UIDs start at 1 - uint32_t offset = (next_uid-1)*UID_BLOCK_SIZE; - if (offset >= fsize) { - cerr << "It seems you are trying to access a block after the file end: \n" << - " fptr: " << fptr << "; uid: " << next_uid << "; " << " addr: " << (offset + INT_SIZE) << endl; - exit(1); - } - taxid = *(fptr + offset); - next_uid = *(fptr+ offset + INT_SIZE); - taxid_counts[taxid] += it->second; - taxids.push_back(taxid); - } while (next_uid != 0); + vector taxids = get_taxids_for_uid(it.first, fptr); - double frac_count = (double)it->second / (double)taxids.size(); + double frac_count = (double)it.second / (double)taxids.size(); for (uint32_t taxid : taxids) { frac_taxid_counts[taxid] += frac_count; + 
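      // Illustration of the two tallies kept here (hypothetical numbers): a
      // UID hit 6 times whose chain expands to the taxid set {562, 561, 543}
      // adds 6 to taxid_counts[t] for each of the three taxids, but only
      // 6/3 = 2.0 to frac_taxid_counts[t], so k-mers shared across many taxa
      // weigh less in the fractional tally than taxon-specific hits.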
taxid_counts[taxid] += it.second;
       }
     }
@@ -194,3 +178,37 @@ namespace kraken {
   }
 }
+
+vector get_taxids_for_uid(const uint32_t uid, const char* fptr) {
+  size_t int_size = sizeof(int);
+  size_t block_size = sizeof(int)*2;
+  // TODO: Just get a uint64_t and shift the bits, probably faster
+  uint32_t taxid = *(uint32_t*)(fptr+(uid-1)*block_size);
+  uint32_t parent_uid = *(uint32_t*)(fptr+(uid-1)*block_size + int_size);
+
+  vector taxids = {taxid};
+  while (parent_uid != 0) {
+    // TODO: Consider checking if the accessed memory is out of range.
+    // if (offset >= fsize) {
+    //   cerr << "It seems you are trying to access a block after the file end: \n" <<
+    //     " fptr: " << fptr << "; uid: " << next_uid << "; " << " addr: " << (offset + INT_SIZE) << endl;
+    //   exit(1);
+    //}
+    taxid = *(uint32_t*)(fptr+(parent_uid-1)*block_size);
+    parent_uid = *(uint32_t*)(fptr+(parent_uid-1)*block_size + int_size);
+    taxids.push_back(taxid);
+  }
+  //std::sort(taxids.begin(), taxids.end());
+  return(taxids);
+}
+
+vector get_taxids_for_uid_from_map(uint32_t uid, char* fptr, unordered_map >& uid_map ) {
+  auto it = uid_map.find(uid);
+  if (it != uid_map.end()) {
+    return it->second;
+  }
+  vector taxids = get_taxids_for_uid(uid, fptr);
+  uid_map[uid] = taxids;
+  return(taxids);
+}
+
diff --git a/src/uid_mapping.hpp b/src/uid_mapping.hpp
index 7c7d0fa..1f84c40 100644
--- a/src/uid_mapping.hpp
+++ b/src/uid_mapping.hpp
@@ -40,6 +40,11 @@ uint32_t resolve_uids(
 uint32_t resolve_uids2(
     const unordered_map &uid_hit_counts,
     const unordered_map &parent_map,
-    const uint32_t* fptr, const size_t fsize);
+    const char* fptr, const size_t fsize);
 }
+
+vector get_taxids_for_uid(const uint32_t uid, const char* fptr);
+
+vector get_taxids_for_uid_from_map(uint32_t uid, char* fptr, unordered_map >& uid_map );
+
 #endif
diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh
index 082bac5..9922df7 100755
--- a/tests/build-dbs.sh
+++ b/tests/build-dbs.sh
@@ -6,12 +6,15 @@ set -xeu
 export PATH="$DIR/install:$PATH"
 for K in 31 26 21; do
-  #mkdir -p $DIR/dbs/refseq-viral-k$K
-  #krakenu-build --kmer-len $K --minimizer-len 12 --threads 4 --db $DIR/dbs/refseq-viral-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy
+  mkdir -p $DIR/dbs/refseq-viral-k$K
+  time krakenu-build --kmer-len $K --minimizer-len 12 --threads 4 --db $DIR/dbs/refseq-viral-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy 2>&1 | tee $DIR/dbs/refseq-viral-k$K/build.log
+
+  mkdir -p $DIR/dbs/refseq-viral-k$K/taxonomy
+  dump_taxdb $DIR/dbs/refseq-viral-k$K/taxDB $DIR/dbs/refseq-viral-k$K/taxonomy/names.dmp $DIR/dbs/refseq-viral-k$K/taxonomy/nodes.dmp
 
   if [[ `uname` != "Darwin" ]]; then
-    #mkdir -p $DIR/dbs/refseq-bacteria-k$K
-    #krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --library-dir=$DIR/data/library/archaea --taxonomy-dir=$DIR/data/taxonomy
+    mkdir -p $DIR/dbs/refseq-bacteria-k$K
+    krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --library-dir=$DIR/data/library/archaea --taxonomy-dir=$DIR/data/taxonomy
     mkdir -p $DIR/dbs/refseq-oct2017-k$K
     krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-oct2017-k$K --build --taxids-for-genomes --library-dir=$DIR/data/library/viral-dusted 
--library-dir=$DIR/data/library/viral-neighbors-dusted --library-dir=$DIR/data/library/bacteria-dusted --library-dir=$DIR/data/library/archaea-dusted --library-dir=$DIR/data/libray/vertebrate_mammalia --taxonomy-dir=$DIR/data/taxonomy fi diff --git a/tests/test-on-simulated-reads.sh b/tests/test-on-simulated-reads.sh index 09d7db7..df18b14 100755 --- a/tests/test-on-simulated-reads.sh +++ b/tests/test-on-simulated-reads.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -xeu +set -eu [[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 SDIR=$DIR/simulated_reads @@ -8,33 +8,38 @@ CDIR=$DIR/classification-results mkdir -p $CDIR mkdir -p $SDIR -run_krakenu_viral() { - FQ=$1 - NAM=$2 - K=$3 - DAT=$4 - - KFILE=$CDIR/$NAM.k$K.krakenu - [[ -s $KFILE ]] || time $DIR/install/krakenu --threads 4 --db $DIR/dbs/refseq-viral-k$K --fastq $FQ --report-file $KFILE.report > $KFILE 2> $KFILE.log - [[ "$DAT" == "viral" ]] && SEQMAP=$DIR/dbs/refseq-viral-k$K/seqid2taxid.map || SEQMAP=$DIR/data/all-$DAT.map - [[ -s $KFILE.results.stats ]] || $DIR/install/grade_classification $DIR/dbs/refseq-viral-k$K/taxDB $SEQMAP $KFILE $KFILE.results > $KFILE.results.stats -} +[[ `uname` == "Darwin" ]] && THREADS=4 || THREADS=10 -run_kraken_viral() { +run_kraken() { FQ=$1 NAM=$2 - K=$3 - DAT=$4 - - KFILE=$CDIR/$NAM.k$K.kraken - [[ -s $KFILE ]] || time kraken --threads 4 --db $DIR/dbs/refseq-viral-k$K --fastq $FQ > $KFILE 2> $KFILE.log - [[ "$DAT" == "viral" ]] && SEQMAP=$DIR/dbs/refseq-viral-k$K/seqid2taxid.map || SEQMAP=$DIR/data/all-$DAT.map - #[[ -s $KFILE.results.stats ]] || - $DIR/install/grade_classification $DIR/dbs/refseq-viral-k$K/taxDB $SEQMAP $KFILE $KFILE.results > $KFILE.results.stats + DAT=$3 + DB_DAT=$4 + DB_K=$5 + PROG=$6 + DB=refseq-$DB_DAT-k$K + mkdir -p $CDIR/against-$DB + KFILE=$CDIR/against-$DB/$NAM.against-$DB.$PROG + + if [[ "$PROG" == "kraken" ]]; then + CMD="kraken" + elif [[ "$PROG" == "krakenu" ]]; then + CMD="$DIR/install/krakenu --report-file $KFILE.report" + elif [[ "$PROG" == "krakenuid" ]]; then + CMD="$DIR/install/krakenu --report-file $KFILE.report --uid-mapping" + else + echo "Unknown $PROG" + return; + fi + + if [[ ! 
-s $KFILE ]]; then
+    echo "$CMD --threads $THREADS --db $DIR/dbs/$DB --fastq $FQ --output $KFILE"
+    time $CMD --threads $THREADS --db $DIR/dbs/$DB --fastq $FQ --output $KFILE 2>&1 | tee $KFILE.log
+  fi
+  #[[ "$DAT" == "$DB_DAT" ]] && SEQMAP=$DIR/dbs/$DB/seqid2taxid.map || SEQMAP=$DIR/data/all-$DAT.map
+  #[[ -s $KFILE.results.stats ]] || $DIR/install/grade_classification $DIR/dbs/$DB/taxDB $SEQMAP $KFILE $KFILE.results > $KFILE.results.stats
 }
-
-
 AB=1m
 for i in 1 2 3; do
   for dat in viral viral-neighbors bacteria archaea; do
     for len in 75 100 150; do
       NAM=$dat.$AB${len}bp.$i
       FQ=$SDIR/$NAM.fq
       [[ -f $FQ ]] || randomreads.sh -Xmx40g ref=$DIR/data/all-$dat.fna out=$FQ reads=$AB len=$len seed=$i
-      for K in 21 26 31; do
-        run_krakenu_viral $FQ $NAM $K $dat
-        run_kraken_viral $FQ $NAM $K $dat
+      for K in 31; do
+        run_kraken $FQ $NAM $dat viral $K kraken
+        run_kraken $FQ $NAM $dat viral $K krakenu
+        #run_kraken $FQ $NAM $dat viral $K krakenuid
       done
     done
   done
done

From 69815b814cef06b39fe0dc43fd8ee31c7805bfa1 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Sun, 1 Oct 2017 22:30:54 -0400
Subject: [PATCH 071/105] Special treatment for host and contaminant taxids

---
 src/set_lcas.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp
index 7e69bab..653089c 100644
--- a/src/set_lcas.cpp
+++ b/src/set_lcas.cpp
@@ -77,6 +77,8 @@ KrakenDB Database;
 TaxonomyDB taxdb;
 const string prefix = "kraken:taxid|";
 
+unordered_set host_taxids = {9606};
+uint32_t contaminant_taxids = {32630};
 
 int main(int argc, char **argv) {
 
@@ -232,7 +234,7 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi
       taxid = get_new_taxid(name_to_taxid_map, Parent_map, name, taxid, "assembly");
     }
 
-    if (Add_taxIds_for_Sequences) {
+    if (Add_taxIds_for_Sequences && taxid != 9606) {
       taxid = get_new_taxid(name_to_taxid_map, Parent_map, seq_id, taxid, "sequence");
     }
     if (Add_taxIds_for_Assembly || Add_taxIds_for_Sequences) {
@@ -287,7 +289,7 @@ void process_single_file() {
       continue;
     }
 
-    if (Add_taxIds_for_Sequences) {
+    if (Add_taxIds_for_Sequences && taxid != 9606) {
       auto entryIt = taxdb.taxIDsAndEntries.find(taxid);
       if (entryIt == taxdb.taxIDsAndEntries.end()) {
        cerr << "Error! Didn't find taxid " << taxid << " in TaxonomyDB - can't update it!! 
["< Date: Sun, 1 Oct 2017 22:31:18 -0400 Subject: [PATCH 072/105] Added build --- tests/build-dbs.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index 082bac5..677d1c8 100755 --- a/tests/build-dbs.sh +++ b/tests/build-dbs.sh @@ -10,10 +10,10 @@ for K in 31 26 21; do #krakenu-build --kmer-len $K --minimizer-len 12 --threads 4 --db $DIR/dbs/refseq-viral-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy if [[ `uname` != "Darwin" ]]; then - #mkdir -p $DIR/dbs/refseq-bacteria-k$K - #krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --library-dir=$DIR/data/library/archaea --taxonomy-dir=$DIR/data/taxonomy mkdir -p $DIR/dbs/refseq-oct2017-k$K - krakenu-build --kmer-len $K --threads 4 --db $DIR/dbs/refseq-oct2017-k$K --build --taxids-for-genomes --library-dir=$DIR/data/library/viral-dusted --library-dir=$DIR/data/library/viral-neighbors-dusted --library-dir=$DIR/data/library/bacteria-dusted --library-dir=$DIR/data/library/archaea-dusted --library-dir=$DIR/data/libray/vertebrate_mammalia --taxonomy-dir=$DIR/data/taxonomy + krakenu-build --kmer-len $K --threads 20 --db $DIR/dbs/refseq-oct2017-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral-dusted --library-dir=$DIR/data/library/viral-neighbors-dusted --library-dir=$DIR/data/library/bacteria-dusted --library-dir=$DIR/data/library/archaea-dusted --library-dir=$DIR/data/library/vertebrate_mammalian --library-dir=$DIR/data/library/contaminants --taxonomy-dir=$DIR/data/taxonomy + mkdir -p $DIR/dbs/refseq-bacteria-k$K + krakenu-build --kmer-len $K --threads 20 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --library-dir=$DIR/data/library/archaea --taxonomy-dir=$DIR/data/taxonomy fi done From 4206549d90ef449450beccfb6257f6f364967882 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 22:36:08 -0400 Subject: [PATCH 073/105] Add new columns --- src/report-cols.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/report-cols.h b/src/report-cols.h index 2392bd8..e5fa0a5 100644 --- a/src/report-cols.h +++ b/src/report-cols.h @@ -26,6 +26,8 @@ enum class REPORTCOLS : uint8_t { NUM_UNIQUE_KMERS_CLADE, NUM_KMERS_IN_DATABASE, NUM_KMERS_IN_DATABASE_CLADE, + CLADE_KMER_COVERAGE, + CLADE_KMER_DUPLICITY, TOTAL_SCORE, TOTAL_HIT_LENGTH, ABUNDANCE, @@ -37,8 +39,10 @@ enum class REPORTCOLS : uint8_t { static const std::map report_col_name_map = { {"name", REPORTCOLS::NAME}, {"indentedName", REPORTCOLS::SPACED_NAME}, + {"taxName", REPORTCOLS::SPACED_NAME}, {"taxID", REPORTCOLS::TAX_ID}, {"taxRank", REPORTCOLS::TAX_RANK}, + {"rank", REPORTCOLS::TAX_RANK}, {"depth", REPORTCOLS::DEPTH}, {"genomeSize", REPORTCOLS::GENOME_SIZE}, {"numReadsTaxon", REPORTCOLS::NUM_READS}, @@ -54,8 +58,25 @@ static const std::map report_col_name_map = { {"abundance", REPORTCOLS::ABUNDANCE}, {"abundance_len", REPORTCOLS::ABUNDANCE_LEN}, + {"taxReads", REPORTCOLS::NUM_READS}, + {"reads", REPORTCOLS::NUM_READS_CLADE}, + {"cladeReads", REPORTCOLS::NUM_READS_CLADE}, + {"taxKmers", REPORTCOLS::NUM_KMERS}, + {"cladeKmers", REPORTCOLS::NUM_KMERS_CLADE}, + {"kmers", REPORTCOLS::NUM_UNIQUE_KMERS_CLADE}, + {"kmerDup", REPORTCOLS::CLADE_KMER_DUPLICITY}, + {"dup", 
REPORTCOLS::CLADE_KMER_DUPLICITY}, + {"kmerCov", REPORTCOLS::CLADE_KMER_COVERAGE}, + {"cov", REPORTCOLS::CLADE_KMER_COVERAGE}, + {"specificTaxKmers", REPORTCOLS::NUM_UNIQUE_KMERS}, + {"specificCladeKmers", REPORTCOLS::NUM_UNIQUE_KMERS_CLADE}, + {"taxKmersInDB", REPORTCOLS::NUM_KMERS_IN_DATABASE}, + {"cladeKmersInDB", REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE}, + + {"cladePerc", REPORTCOLS::PERCENTAGE}, {"percReadsClade", REPORTCOLS::PERCENTAGE}, {"percent", REPORTCOLS::PERCENTAGE}, + {"%", REPORTCOLS::PERCENTAGE}, {"taxId", REPORTCOLS::TAX_ID}, {"reads_clade", REPORTCOLS::NUM_READS_CLADE}, // Change to clade reads! {"reads_stay", REPORTCOLS::NUM_READS}, // Change to clade reads! From e84091ad848bbfb78b3164d69c85602842ed6a19 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 22:36:26 -0400 Subject: [PATCH 074/105] Up default precision to 12 --- src/hyperloglogplus.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h index 8cd2bdc..b4d9a81 100644 --- a/src/hyperloglogplus.h +++ b/src/hyperloglogplus.h @@ -237,7 +237,7 @@ class HyperLogLogPlusMinus { * @param precision * @param sparse */ - HyperLogLogPlusMinus(uint8_t precision=10, bool sparse=true):p(precision),sparse(sparse) { + HyperLogLogPlusMinus(uint8_t precision=12, bool sparse=true):p(precision),sparse(sparse) { if (precision > 18 || precision < 4) { throw std::invalid_argument("precision (number of register = 2^precision) must be between 4 and 18"); } From 0afdcf769a05f92240fabcd0af2febd48f93590a Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 22:36:53 -0400 Subject: [PATCH 075/105] use unordered maps --- src/grade_classification.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp index edfc999..8159ca8 100644 --- a/src/grade_classification.cpp +++ b/src/grade_classification.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include using namespace std; @@ -45,13 +46,14 @@ int main(int argc, char **argv) { cerr << "Read " << seqid_map.size() << " taxa mappings" << endl; ofstream out_file(argv[4]); - set all_ranks; + unordered_set all_ranks; unordered_map< string, size_t > rank_counts; - map< int, set > simulated_taxids_at_rank; - map< int, set > identified_taxids_at_rank; - map< int, size_t > correct_reads_at_rank; - map< int, size_t > incorrect_reads_at_rank; - map< int, size_t > reads_at_higher_rank; + unordered_map< int, set > simulated_taxids_at_rank; + unordered_map< int, set > identified_taxids_at_rank; + unordered_map< int, size_t > correct_reads_at_rank; + unordered_map< int, size_t > incorrect_reads_at_rank; + unordered_map< int, size_t > reads_at_higher_rank; + unordered_set ignored_taxa; size_t total_reads = 0; size_t unidentified_reads = 0; @@ -95,7 +97,10 @@ int main(int argc, char **argv) { } else { seq_taxid = it->second; if (!taxdb.hasTaxon(seq_taxid)) { - cerr << "Ignoring taxon " << seq_taxid << " - not in database" << endl; + if (ignored_taxa.count(seq_taxid) == 0) { + cerr << "Ignoring taxon " << seq_taxid << " - not in database" << endl; + ignored_taxa.insert(seq_taxid); + } continue; } //cerr <<"seqid" << seq_taxid; From 13adecff1625843aff578a2c5f0aff0fdb482766 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 22:38:16 -0400 Subject: [PATCH 076/105] Check library directories exist --- scripts/krakenu-build | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git 
a/scripts/krakenu-build b/scripts/krakenu-build index 8f72697..e90b353 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenu-build @@ -321,6 +321,17 @@ sub standard_installation { } sub build_database { + foreach (@library_dirs) { + if (!-d $_) { + print STDERR "Library directory $_ does not exist!\n"; + exit(1); + } + } + if (! -d $taxonomy_dir) { + print STDERR "Taxonomy directory $taxonomy_dir does not exist!\n"; + exit(1); + } + $ENV{"KRAKEN_REBUILD_DATABASE"} = (defined $rebuild? 1 : 0); $ENV{"KRAKEN_ADD_TAXIDS_FOR_SEQ"} = $add_taxonomy_ids_for_seq; $ENV{"KRAKEN_ADD_TAXIDS_FOR_GENOME"} = $add_taxonomy_ids_for_genome; From 1eda03ee57e15c094969c06bd2abfde6e1b1acdc Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 1 Oct 2017 22:39:24 -0400 Subject: [PATCH 077/105] Use uid_mapping headers --- src/read_uid_mapping.cpp | 36 ++++-------------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) diff --git a/src/read_uid_mapping.cpp b/src/read_uid_mapping.cpp index 0ac84db..6c40d65 100644 --- a/src/read_uid_mapping.cpp +++ b/src/read_uid_mapping.cpp @@ -1,4 +1,5 @@ +#include "uid_mapping.hpp" #include "kraken_headers.hpp" #include "quickfile.hpp" #include @@ -7,39 +8,10 @@ using namespace std; using namespace kraken; -inline -vector get_taxids_for_uid(uint32_t uid, char* fptr) { - size_t int_size = sizeof(int); - size_t block_size = sizeof(int)*2; - // TODO: Just get a uint64_t and shift the bits, probably faster - uint32_t taxid = *(uint32_t*)(fptr+(uid-1)*block_size); - uint32_t parent_uid = *(uint32_t*)(fptr+(uid-1)*block_size + int_size); - - vector taxids = {taxid}; - while (parent_uid != 0) { - taxid = *(uint32_t*)(fptr+(parent_uid-1)*block_size); - parent_uid = *(uint32_t*)(fptr+(parent_uid-1)*block_size + int_size); - taxids.push_back(taxid); - } - std::sort(taxids.begin(), taxids.end()); - return(taxids); -} - -inline -vector get_taxids_for_uid_from_map(uint32_t uid, char* fptr, unordered_map >& uid_map ) { - auto it = uid_map.find(uid); - if (it != uid_map.end()) { - return it->second; - } - vector taxids = get_taxids_for_uid(uid, fptr); - uid_map[uid] = taxids; - return(taxids); -} - int main(int argc, char **argv) { if (argc < 2) { std::cerr << "Usage: read_uid_mapping []" - "The file is supposed to have lines terminated by '\n'." 
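// Sketch of the intended command-line use (the map file name is illustrative):
//   read_uid_mapping uid_to_taxid.map       -> walk and print every UID block
//   read_uid_mapping uid_to_taxid.map 42    -> print the taxid chain for UID 42
// Each block stores two 32-bit ints (taxid, parent UID), so UID n starts at
// byte offset (n-1)*8; this matches get_taxids_for_uid in uid_mapping.cpp.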
+ "The file is supposed to have lines terminated by '\n'.it.second" << std::endl; return 1; } @@ -61,10 +33,10 @@ int main(int argc, char **argv) { cout << ++i << '\t' << *taxid_ptr << '\t' << *parent_uid << endl; } } else { - unordered_map > UID_to_TaxID_map; + //unordered_map > UID_to_TaxID_map; for (int i=2; i taxids = get_taxids_for_uid(UID, UID_to_TaxID_map, fptr); + vector taxids = get_taxids_for_uid(UID, fptr); cout << UID << '\t'; for (auto t : taxids) { cout << t << ' '; From b84380b234d75eb9ade47f250127f7d5b34640dd Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 2 Oct 2017 14:01:01 -0400 Subject: [PATCH 078/105] update init.sh --- tests/init.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/init.sh b/tests/init.sh index f4c73d3..7d74c10 100755 --- a/tests/init.sh +++ b/tests/init.sh @@ -10,6 +10,7 @@ set -xeu ## Download taxonomy and genomic data into data/ #$DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any #$DIR/install/krakenu-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606 +$DIR/install/krakenu-download --db $DIR/data -R contaminants for i in viral viral-neighbors archaea bacteria; do [[ -s "$DIR/data/all-$i.fna" ]] || find $DIR/data/library/$i -name '*.fna' -exec cat {} \; > $DIR/data/all-$i.fna @@ -18,5 +19,5 @@ for i in viral viral-neighbors archaea bacteria; do [[ -s $DUSTED_F ]] || dustmasker -infmt fasta -in $DIR/data/all-$i.fna -level 20 -outfmt fasta | sed '/^>/! s/[^AGCT]/N/g' > "$DUSTED_F" mkdir -p $DIR/data/library/$i-dusted [[ -f "$DIR/data/library/$i-dusted/all-$i-dusted.fna" ]] || ln "$DUSTED_F" "$DIR/data/library/$i-dusted/all-$i-dusted.fna" - [[ -f "$DIR/data/library/$i-dusted/all-$i-dusted.fna.map" ]] || ln "$DIR/data/all-$i.map" "$DIR/data/library/$i-dusted/all-$i.map" + [[ -f "$DIR/data/library/$i-dusted/all-$i.map" ]] || ln "$DIR/data/all-$i.map" "$DIR/data/library/$i-dusted/all-$i.map" done From a352122ea68ec3f20f9bfff555b5ce98bf939b61 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 2 Oct 2017 15:33:26 -0400 Subject: [PATCH 079/105] Fix bug in reading names and nodes --- src/taxdb.h | 58 +++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/src/taxdb.h b/src/taxdb.h index f49d9dc..1593c10 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -445,25 +445,20 @@ void TaxonomyDB::parseNodesDump(const std::string nodesDumpFil TAXID taxonomyID; TAXID parentTaxonomyID; std::string rank; + char delim; - while (nodesDumpFile.good()) { - getline(nodesDumpFile, line); - std::vector tokens = tokenise(line, "\t|\t", 3, 2); - if (tokens.size() < 3) { - continue; - } - - taxonomyID = string_to_T(tokens[0]); - parentTaxonomyID = string_to_T(tokens[1]); - rank = tokens[2]; - + while (nodesDumpFile >> taxonomyID >> delim >> parentTaxonomyID) { + nodesDumpFile.ignore(3); + getline(nodesDumpFile, rank, '\t'); auto entryIt = taxIDsAndEntries.find(taxonomyID); - if (entryIt == taxIDsAndEntries.end()) { - taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); - } else { + if (entryIt == taxIDsAndEntries.end()) { + taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); + } else { entryIt->second.parentTaxonomyID = parentTaxonomyID; entryIt->second.rank = rank; } + + nodesDumpFile.ignore(2560, '\n'); } } @@ -475,22 +470,25 @@ void TaxonomyDB::parseNamesDump(const std::string namesDumpFil 
std::string line;
   TAXID taxonomyID;
-  std::string scientificName;
+  std::string scientificName, type;
 
   while (namesDumpFile.good()) {
-    getline(namesDumpFile, line);
-    std::vector tokens = tokenise(line, "\t|\t", 4, 2);
-    if (tokens.size() < 4 || tokens[3] != "scientific name") {
-      continue;
-    }
-    taxonomyID = string_to_T(tokens[0]);
-    scientificName = tokens[1];
-
-    auto entryIt = taxIDsAndEntries.find(taxonomyID);
-    if (entryIt == taxIDsAndEntries.end()) {
-      taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, scientificName);
-    } else {
-      entryIt->second.scientificName = scientificName;
+    namesDumpFile >> taxonomyID;
+    namesDumpFile.ignore(3);
+    getline(namesDumpFile, scientificName, '\t');
+    namesDumpFile.ignore(3);
+    namesDumpFile.ignore(256, '|');
+    namesDumpFile.ignore(1);
+    getline(namesDumpFile, type, '\t');
+
+    if (type == "scientific name") {
+      auto entryIt = taxIDsAndEntries.find(taxonomyID);
+      if (entryIt == taxIDsAndEntries.end()) {
+        taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, scientificName);
+      } else {
+        entryIt->second.scientificName = scientificName;
+      }
     }
+    namesDumpFile.ignore(2560, '\n');
   }
 }

@@ -549,6 +547,10 @@ std::unordered_map >
   std::string line;
   while (!inFile.eof()) {
     inFile >> taxonomyID >> parentTaxonomyID;
+    if (taxonomyID > 1 && taxonomyID == parentTaxonomyID) {
+      cerr << "ERROR: the parent of " << taxonomyID << " is itself. Should not happen!\n";
+      exit(1);
+    }
     inFile.get(); // read tab
     std::getline(inFile, scientificName, '\t');
     if (hasGenomeSizes) {

From f05a219624c58131edea7e7b6e8c2e3289e80ec3 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Wed, 18 Oct 2017 10:51:16 -0400
Subject: [PATCH 080/105] Dump taxdb without ending separators (for kraken-report)

---
 src/dump_taxdb.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/dump_taxdb.cpp b/src/dump_taxdb.cpp
index 79e668f..3e0d442 100644
--- a/src/dump_taxdb.cpp
+++ b/src/dump_taxdb.cpp
@@ -11,6 +11,8 @@ int main(int argc, char **argv) {
     std::cerr << "Usage: dump_taxdb taxDB names.dmp nodes.dmp\n";
     return 1;
   }
+
+  cerr << "Reading taxonomy database from " << argv[1] << ", writing nodes dump to " << argv[3] << " and names dump to " << argv[2] << "." 
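  // Example of the emitted dump format, using taxid 9606 for illustration:
  //   nodes.dmp: 9606\t|\t9605\t|\tspecies
  //   names.dmp: 9606\t|\tHomo sapiens\t|\t\t|\tscientific name
  // i.e. the NCBI-style "\t|\t"-separated layout, but each row now ends after
  // the last field that kraken-report actually parses.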
<< endl; TaxonomyDB taxdb {(string)argv[1]}; ofstream names_file(argv[2]); names_file.exceptions(ifstream::failbit | ifstream::badbit); @@ -22,12 +24,12 @@ int main(int argc, char **argv) { nodes_file << taxon.second.taxonomyID << "\t|\t" << taxon.second.parentTaxonomyID << "\t|\t" << taxon.second.rank - << "\t|\n"; // there are further columns, but Kraken does not care about them + << endl; // there are further columns, but Kraken does not care about them names_file << taxon.second.taxonomyID << "\t|\t" << taxon.second.scientificName << "\t|\t" - << "\t|\t" << "scientific name" << "\t|\n"; + << "\t|\t" << "scientific name" << endl; } names_file.close(); nodes_file.close(); From 2172caa02b029024bad88f8624b35e9e80d33bdc Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 18 Oct 2017 10:55:49 -0400 Subject: [PATCH 081/105] Allow to run build_taxdb with a taxdb as consistency check --- src/build_taxdb.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index 6f33763..763a8a0 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -26,11 +26,22 @@ using namespace std; int main(int argc, char **argv) { - if (argc < 3 || argc > 4) { - std::cerr << "Usage: build_taxdb names.dmp nodes.dmp [taxon-counts]\n"; + if (argc < 2 || argc > 4) { + std::cerr << "USAGE:\n" + << "With two or three arguments, echo taxDB based on NCBI taxonomy dump:\n" + << "build_taxdb names.dmp nodes.dmp [taxon-counts]\n" + << "\n" + << "With one argument, read in taxDB and echo it again for consistency checks:\n" + << "build_taxdb taxDB\n"; return 1; } - TaxonomyDB taxdb {(string)argv[1], (string)argv[2]}; + + TaxonomyDB taxdb; + if (argc == 2) { + taxdb = TaxonomyDB ((string)argv[1]); + } else { + taxdb = TaxonomyDB ((string)argv[1], (string)argv[2]); + } if (argc == 4) { ifstream ifs(argv[3]); uint32_t taxon; uint64_t count; From eb9447ebbb386853737455701b2a19c08627af60 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 18 Oct 2017 10:56:28 -0400 Subject: [PATCH 082/105] Update to classification grading --- src/grade_classification.cpp | 70 ++++++++++++++++++++------------ tests/build-dbs.sh | 49 ++++++++++++++++------ tests/test-on-simulated-reads.sh | 49 +++++++++++++++------- 3 files changed, 116 insertions(+), 52 deletions(-) diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp index 8159ca8..c4dec80 100644 --- a/src/grade_classification.cpp +++ b/src/grade_classification.cpp @@ -24,15 +24,12 @@ unordered_map read_seqid_mapping(string filename) { string line, seq_id; uint32_t taxid; - while (map_file.good()) { - getline(map_file, line); - if (line.empty()) - break; - istringstream iss(line); - iss >> seq_id >> taxid; + while (map_file >> seq_id >> taxid) { ID_to_taxon_map[seq_id] = taxid; + map_file.ignore(std::numeric_limits::max(), '\n'); } map_file.close(); + cerr << "Read " << ID_to_taxon_map.size() << " taxa mappings" << endl; return ID_to_taxon_map; } @@ -43,7 +40,6 @@ int main(int argc, char **argv) { } TaxonomyDB taxdb = TaxonomyDB(argv[1], false); unordered_map seqid_map = read_seqid_mapping(argv[2]); - cerr << "Read " << seqid_map.size() << " taxa mappings" << endl; ofstream out_file(argv[4]); unordered_set all_ranks; @@ -74,7 +70,11 @@ int main(int argc, char **argv) { if (line.empty()) continue; istringstream iss(line); - iss >> classification_state >> read_id >> identified_taxid; + string l; + string classi; + iss >> classification_state >> read_id >> identified_taxid >> l; + 
iss.get(); + getline(iss,classi); ++total_reads; if (identified_taxid == 0) { @@ -117,7 +117,7 @@ int main(int argc, char **argv) { } } - string seq_species = taxdb.getScientificName(seq_taxid); + string seq_name = taxdb.getScientificName(seq_taxid); // getLowestCommonAncestor returns lca taxon as well as distance between the taxa pair lca_taxid_dist = taxdb.getLowestCommonAncestor(seq_taxid, identified_taxid); string lca_rank_string = taxdb.getNextProperRank(lca_taxid_dist.first); @@ -144,31 +144,49 @@ int main(int argc, char **argv) { if (identified_taxid == 0) lca_rank_string = "unidentified"; + ++rank_counts[lca_rank_string]; - out_file << seq_species << '\t' << seq_taxid << '\t' << identified_taxid << '\t' << lca_rank_string << '\t' << lca_taxid_dist.first << '\t' << lca_taxid_dist.second << '\n'; + out_file + << read_id << '\t' << seq_name << '\t' << seq_taxid << '\t' + << identified_taxid << '\t' << taxdb.getRank(taxdb.getTaxIDAtNextProperRank(identified_taxid)) << '\t' + << lca_rank_string << '\t' << lca_taxid_dist.first << '\t' << lca_taxid_dist.second << '\t' << classi << '\n'; } } k_file.close(); - cout << "#LCA_RANK_READ_COUNTS" << endl; - for (const auto & kv : rank_counts) { - cout << kv.first << '\t' << kv.second << endl; + char delim = '\t'; + + if (0) { + cout << "#LCA_RANK_READ_COUNTS" << endl; + for (const auto & kv : rank_counts) { + cout << kv.first << delim << kv.second << endl; + } + cout << endl; } - cout << "\n#rank; total_reads; correct; incorrect; at_higher_rank; unidentified" << endl; + + cout << "#rank" << delim << "total_reads" << delim << "correct"<< delim << "incorrect"<< delim << "sensitivity" << delim << "precision" << delim << "higher_rank" << delim << "unidentified" << endl; for (TaxRank::RANK rank : ranks_of_interest) { - cout << TaxRank::toString(rank) << '\t' << total_reads - << '\t' << correct_reads_at_rank[rank] - << '\t' << incorrect_reads_at_rank[rank] - << '\t' << reads_at_higher_rank[rank] - << '\t' << unidentified_reads + size_t true_positives = correct_reads_at_rank.at(rank); + size_t false_positives = incorrect_reads_at_rank.at(rank); + double sensitivity = 100.0*(double)true_positives/(double)total_reads; + double specificity = 100.0*(double)true_positives/(double)(true_positives+false_positives); + cout << TaxRank::toString(rank) << delim << total_reads + << delim << true_positives + << delim << false_positives + << delim << sensitivity << '%' + << delim << specificity << '%' + << delim << reads_at_higher_rank.at(rank) + << delim << unidentified_reads + << setprecision(2) << std::fixed << '\n'; } - cout << "\n#rank;P;TP;FP;sens;prec" << endl; + cout << "#rank" << delim << "true_count" << delim << "correct" << delim << "incorrect" << delim << "recall" << delim << "precision" << endl; for (TaxRank::RANK rank : ranks_of_interest) { size_t true_positives = 0; size_t false_positives = 0; + if (identified_taxids_at_rank.find(rank) != identified_taxids_at_rank.end()) { for (const auto & tid : identified_taxids_at_rank[rank]) { if (simulated_taxids_at_rank[rank].count(tid) == 1) { ++true_positives; @@ -176,16 +194,18 @@ int main(int argc, char **argv) { ++false_positives; } } + } double sensitivity = 100.0*(double)true_positives/(double)simulated_taxids_at_rank[rank].size(); double specificity = 100.0*(double)true_positives/(double)(true_positives+false_positives); cout << TaxRank::toString(rank) - << '\t' << simulated_taxids_at_rank[rank].size() - << '\t' << true_positives - << '\t' << false_positives << setprecision(2) << std::fixed - << '\t' << 
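    // Sanity check of the sensitivity/precision arithmetic used in these
    // tables (illustrative numbers): with 1000 positives overall, 950 correct
    // calls and 30 incorrect calls,
    //   sensitivity = 100 * 950 / 1000       = 95.00%
    //   precision   = 100 * 950 / (950 + 30) = 96.94%
    // The denominators differ: sensitivity divides by everything that should
    // have been found, precision only by items that received a call.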
sensitivity << '%' - << '\t' << specificity << '%' + << delim << simulated_taxids_at_rank[rank].size() + << delim << true_positives + << delim << false_positives + << setprecision(2) << std::fixed + << delim << sensitivity << '%' + << delim << specificity << '%' << '\n'; } } diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index 864cb0c..affc323 100755 --- a/tests/build-dbs.sh +++ b/tests/build-dbs.sh @@ -1,22 +1,47 @@ #!/bin/bash -set -xeu +set -eu [[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 +[[ `uname` == "Darwin" ]] && THREADS=4 || THREADS=10 -export PATH="$DIR/install:$PATH" -for K in 31 26 21; do - mkdir -p $DIR/dbs/refseq-viral-k$K - time krakenu-build --kmer-len $K --minimizer-len 12 --threads 4 --db $DIR/dbs/refseq-viral-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral --taxonomy-dir=$DIR/data/taxonomy 2>&1 | tee $DIR/dbs/refseq-viral-k$K/build.log - mkdir -p $DIR/dbs/refseq-viral-k$K/taxonomy - dump_taxdb $DIR/dbs/refseq-viral-k$K/taxDB $DIR/dbs/refseq-viral-k$K/taxonomy/names.dmp $DIR/dbs/refseq-viral-k$K/taxonomy/nodes.dmp +build_db() { + K=$1; shift + NAM=$1; shift - if [[ `uname` != "Darwin" ]]; then - mkdir -p $DIR/dbs/refseq-oct2017-k$K - krakenu-build --kmer-len $K --threads 20 --db $DIR/dbs/refseq-oct2017-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/viral-dusted --library-dir=$DIR/data/library/viral-neighbors-dusted --library-dir=$DIR/data/library/bacteria-dusted --library-dir=$DIR/data/library/archaea-dusted --library-dir=$DIR/data/library/vertebrate_mammalian --library-dir=$DIR/data/library/contaminants --taxonomy-dir=$DIR/data/taxonomy - mkdir -p $DIR/dbs/refseq-bacteria-k$K - krakenu-build --kmer-len $K --threads 20 --db $DIR/dbs/refseq-bacteria-k$K --build --taxids-for-genomes --taxids-for-sequences --library-dir=$DIR/data/library/bacteria --library-dir=$DIR/data/library/archaea --taxonomy-dir=$DIR/data/taxonomy + DB_NAM=refseq-$NAM-k$K + DB_DIR=$DIR/dbs/$DB_NAM + + mkdir -p $DB_DIR + CMD="krakenu-build --kmer-len $K --minimizer-len 12 --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database" + for L in $@; do + CMD="$CMD --library-dir=$DIR/data/library/$L" + done + if [[ ! -f "$DB_DIR/is.busy" ]]; then + echo "EXECUTING $CMD" + touch $DB_DIR/is.busy + $CMD 2>&1 | tee $DIR/dbs/$DB_NAM/build.log + if [[ ! 
-f "$DB_DIR/taxonomy/nodes.dmp" ]]; then + mkdir -p $DB_DIR/taxonomy + echo "EXECUTING dump_taxdb $DB_DIR/taxDB $DB_DIR/taxonomy/names.dmp $DB_DIR/nodes.dmp" + dump_taxdb $DB_DIR/taxDB $DB_DIR/taxonomy/names.dmp $DB_DIR/nodes.dmp + fi + rm $DB_DIR/is.busy + else + echo "IGNORING $DB_DIR" + fi +} + +#export PATH="$DIR/install:$PATH" +for K in 31 21; do + if [[ `uname` == "Darwin" ]]; then + build_db $K viral viral + build_db $K all-viral viral viral-neighbors + else + build_db $K oct2017 archaea-dusted bacteria-dusted viral-dusted viral-neighbors-dusted \ + vertebrate_mammalian contaminants + #build_db $K bacteria bacteria archaea fi done diff --git a/tests/test-on-simulated-reads.sh b/tests/test-on-simulated-reads.sh index df18b14..580f218 100755 --- a/tests/test-on-simulated-reads.sh +++ b/tests/test-on-simulated-reads.sh @@ -5,21 +5,27 @@ set -eu [[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 SDIR=$DIR/simulated_reads CDIR=$DIR/classification-results +CCDIR=$DIR/classification-stats mkdir -p $CDIR +mkdir -p $CCDIR mkdir -p $SDIR [[ `uname` == "Darwin" ]] && THREADS=4 || THREADS=10 run_kraken() { - FQ=$1 - NAM=$2 - DAT=$3 - DB_DAT=$4 - DB_K=$5 - PROG=$6 - DB=refseq-$DB_DAT-k$K + local FQ=$1 + local NAM=$2 + local DAT=$3 + local DB_DAT=$4 + local DB_K=$5 + local PROG=$6 + local ALWAYS_SEQMAP=$7; + local DB=refseq-$DB_DAT-k$K + mkdir -p $CDIR/against-$DB - KFILE=$CDIR/against-$DB/$NAM.against-$DB.$PROG + mkdir -p $CCDIR/against-$DB + local KFILE=$CDIR/against-$DB/$NAM.against-$DB.$PROG + local KKFILE=$CCDIR/against-$DB/$NAM.against-$DB.$PROG if [[ "$PROG" == "kraken" ]]; then CMD="kraken" @@ -36,21 +42,34 @@ run_kraken() { echo "$CMD --threads $THREADS --db $DIR/dbs/$DB --fastq $FQ --output $KFILE" time $CMD --threads $THREADS --db $DIR/dbs/$DB --fastq $FQ --output $KFILE 2>&1 | tee $KFILE.log fi - #[[ "$DAT" == "$DB_DAT" ]] && SEQMAP=$DIR/dbs/$DB/seqid2taxid.map || SEQMAP=$DIR/data/all-$DAT.map - #[[ -s $KFILE.results.stats ]] || $DIR/install/grade_classification $DIR/dbs/$DB/taxDB $SEQMAP $KFILE $KFILE.results > $KFILE.results.stats + + [[ "$DAT" == "$DB_DAT" ]] && SEQMAP=$DIR/dbs/$DB/seqid2taxid.map || SEQMAP=$DIR/data/all-$DAT.map + [[ "$ALWAYS_SEQMAP" == "ALWAYS_SEQMAP" ]] && SEQMAP=$DIR/dbs/$DB/seqid2taxid.map + + if [[ ! 
-s "$KKFILE.results.stats" ]]; then + $DIR/install/grade_classification $DIR/dbs/$DB/taxDB $SEQMAP $KFILE $KKFILE.results > $KKFILE.results.stats + else + echo "$KKFILE.results.stats exist" + fi } AB=1m -for i in 1 2 3; do +for i in 1; do # 2 3 for dat in viral viral-neighbors bacteria archaea; do - for len in 75 100 150; do + for len in 100; do ## 75 150 NAM=$dat.$AB${len}bp.$i FQ=$SDIR/$NAM.fq [[ -f $FQ ]] || randomreads.sh -Xmx40g ref=$DIR/data/all-$dat.fna out=$FQ reads=$AB len=$len seed=$i for K in 31; do - run_kraken $FQ $NAM $dat viral $K kraken - run_kraken $FQ $NAM $dat viral $K krakenu - #run_kraken $FQ $NAM $dat viral $K krakenuid + # run_kraken $FQ $NAM $dat viral $K krakenuid + if [[ `uname` != "Darwin" ]]; then + run_kraken $FQ $NAM $dat oct2017 $K kraken ALWAYS_SEQMAP + run_kraken $FQ $NAM $dat oct2017 $K krakenu ALWAYS_SEQMAP + run_kraken $FQ $NAM $dat oct2017 $K krakenuid ALWAYS_SEQMAP + else + run_kraken $FQ $NAM $dat viral $K kraken + run_kraken $FQ $NAM $dat viral $K krakenu + fi done done done From 75b24aefb9db83addd023215bb6883958c44f0e4 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 18 Oct 2017 10:57:25 -0400 Subject: [PATCH 083/105] Do not give separate taxids for human sequences (fixed now) --- src/set_lcas.cpp | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index a1a545a..3872c4d 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -227,6 +227,8 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi continue; } + uint32_t orig_taxid = taxid; + if (Add_taxIds_for_Assembly && iss.good()) { iss.get(); getline(iss, name); @@ -234,7 +236,7 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi taxid = get_new_taxid(name_to_taxid_map, Parent_map, name, taxid, "assembly"); } - if (Add_taxIds_for_Sequences && taxid != 9606) { + if (Add_taxIds_for_Sequences && orig_taxid != 9606) { taxid = get_new_taxid(name_to_taxid_map, Parent_map, seq_id, taxid, "sequence"); } if (Add_taxIds_for_Assembly || Add_taxIds_for_Sequences) { @@ -289,6 +291,12 @@ void process_single_file() { ++seqs_skipped; continue; } + + if (Parent_map.find(taxid) == Parent_map.end()) { + cerr << "Skipping sequence " << dna.id << " since taxonomy ID " << taxid << " is not in taxonomy database!" 
<< endl; + ++ seqs_skipped; + continue; + } if (Add_taxIds_for_Sequences && taxid != 9606) { // Update entry based on header line @@ -308,12 +316,12 @@ void process_single_file() { if (taxid) { if (Parent_map.find(taxid) == Parent_map.end()) { cerr << "Ignoring sequence for taxID " << taxid << " - not in taxDB\n"; + } else { + #pragma omp parallel for schedule(dynamic) + for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN) + set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1); + ++seqs_processed; } - #pragma omp parallel for schedule(dynamic) - for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN) - set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1); - - ++seqs_processed; } else { if (verbose) cerr << "Skipping sequence with header [" << dna.header_line << "] - no taxid" << endl; @@ -396,7 +404,9 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { #pragma omp critical(new_uid) *val_ptr = uid_mapping(Taxids_to_UID_map, UID_to_taxids_vec, taxid, *val_ptr, current_uid, UID_map_file); } else if (!force_taxid && taxid != contaminant_taxids) { - *val_ptr = lca(Parent_map, taxid, *val_ptr); + if (Parent_map.find(taxid) != Parent_map.end()) { + *val_ptr = lca(Parent_map, taxid, *val_ptr); + } } else { // When force_taxid is set, do not compute lca, but assign the taxid // of the (last) sequence to k-mers From f15112d5002abb349d956d849f6efec15982382e Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 18 Oct 2017 10:57:49 -0400 Subject: [PATCH 084/105] Add species subgroup and group ranks --- src/taxdb.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/taxdb.h b/src/taxdb.h index 1593c10..b522866 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -63,7 +63,7 @@ struct TaxRank { //"superkingdom", "root"}; enum RANK { unknown, no_rank, sequence, assembly, - subspecies, species, subgenus, genus, tribe, subfamily, + subspecies, species, species_subgroup, species_group, subgenus, genus, tribe, subfamily, family, superfamily, parvorder, infraorder, suborder, order, superorder, parvclass, infraclass, subclass, class_, superclass, subphylum, phylum, kingdom, @@ -73,7 +73,12 @@ struct TaxRank { static const unordered_map string_to_rank; static const RANK toRank(const string& rank) { - return string_to_rank.at(rank); + const auto& it = string_to_rank.find(rank); + if (it == string_to_rank.end()) { + cerr << "ERROR: Could not find rank " << rank << endl; + exit(1); + } + return it->second; } static const char* toString(const TaxRank::RANK& rank) { @@ -84,6 +89,8 @@ struct TaxRank { case RANK::assembly: return "assembly"; case RANK::subspecies: return "subspecies"; case RANK::species: return "species"; + case RANK::species_subgroup: return "species subgroup"; + case RANK::species_group: return "species group"; case RANK::subgenus: return "subgenus"; case RANK::genus: return "genus"; case RANK::tribe: return "tribe"; @@ -120,6 +127,8 @@ const unordered_map TaxRank::string_to_rank = { {"assembly", TaxRank::assembly}, {"subspecies", TaxRank::subspecies}, {"species", TaxRank::species}, + {"species subgroup", TaxRank::species_subgroup}, + {"species group", TaxRank::species_group}, {"subgenus", TaxRank::subgenus}, {"genus", TaxRank::genus}, {"tribe", TaxRank::tribe}, From 3090304362b635866a211fe91d68c5462117de4b Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 18 Oct 2017 11:00:02 -0400 Subject: [PATCH 085/105] Don't use .at() for vector access --- src/uid_mapping.cpp | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/uid_mapping.cpp b/src/uid_mapping.cpp index 4a80946..d2100d3 100644 --- a/src/uid_mapping.cpp +++ b/src/uid_mapping.cpp @@ -29,7 +29,7 @@ namespace kraken { cerr << "kmer_uid ("<< kmer_uid <<") greater than UID vector size ("<< UID_to_taxids_vec.size()<<")!!" << endl; exit(1); } - taxid_set = *(UID_to_taxids_vec.at(kmer_uid-1)); + taxid_set = *(UID_to_taxids_vec[kmer_uid-1]); auto it = std::lower_bound( taxid_set.begin(), taxid_set.end(), taxid); // find proper position in descending order if (it == taxid_set.end() || *it != taxid) { // add the taxid to the set, in the right position such that it remains sorted From 9c11d877d49e5efee7288e2eb385946c94d2de78 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 18 Oct 2017 11:34:49 -0400 Subject: [PATCH 086/105] Test modification of lca --- src/krakenutil.cpp | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index 2da433e..365238f 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -55,20 +55,23 @@ namespace kraken { return a ? a : b; // create a path from a to the root - std::unordered_set a_path; - while (a > 0 && a != parent_map.at(a)) { - if (a == b) - return a; + std::vector a_path; + do { + if (a == b) + return a; a_path.insert(a); a = parent_map.at(a); - } + } while (a != a_path.back()) // search for b in the path from a to the root - while (b > 0 && b != parent_map.at(b)) { - if (a_path.count(b) > 0) + uint32_t last_b = 0; + do { + if (a_path.find(b) != a_path.end()) return b; + + last_b = b; b = parent_map.at(b); - } + } while (last_b != b) return 1; } @@ -77,8 +80,8 @@ namespace kraken { // Tree resolution: take all hit taxa (plus ancestors), then // return leaf of highest weighted leaf-to-root path. uint32_t resolve_tree(const unordered_map &hit_counts, - const unordered_map &parent_map) - { + const unordered_map &parent_map) { + set max_taxa; uint32_t max_taxon = 0, max_score = 0; @@ -93,6 +96,10 @@ namespace kraken { if (it2 != hit_counts.end()) { score += it2->second; } + if (node == parent_map.at(node)) { + cerr << "Taxon " << node << " has itself as parent!" << endl; + break; + } node = parent_map.at(node); } From 7d1ab245ada965bae731f0505d3dfbe46fa9115d Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 23 Oct 2017 10:10:50 -0400 Subject: [PATCH 087/105] Update --- scripts/krakenu-build_db.sh | 6 +++--- src/Makefile | 6 +++--- src/krakenutil.cpp | 30 +++++++++++++++++++++++----- src/set_lcas.cpp | 39 +++++++++++++++++++++++-------------- tests/build-dbs.sh | 16 ++++++++++----- tests/init.sh | 5 +++-- 6 files changed, 69 insertions(+), 33 deletions(-) diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index 96f1aa8..e4707bd 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -255,7 +255,7 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then set -x set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -o database.kdb -i database.idx -v \ -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c database.kdb.counts \ - -F <( cat_library ) > seqid2taxid-plus.map + -F <( cat_library ) -T > seqid2taxid-plus.map set +x if [ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ] || [ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]; then mv seqid2taxid.map seqid2taxid.map.orig @@ -300,8 +300,8 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then ## Make a classification report REPNAME=uid_database if [[ ! 
-s $REPNAME.report.tsv ]]; then - #echo "Creating UID database summary report $REPNAME.report.tsv ..." - #krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv + echo "Creating UID database summary report $REPNAME.report.tsv ..." + krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv fi fi diff --git a/src/Makefile b/src/Makefile index 2145bd7..2603974 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,8 +1,8 @@ CXX = g++ FOPENMP?=-fopenmp -CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O2 -g -Wfatal-errors +CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -g -Wfatal-errors #CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors -PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping +PROGS = classify db_sort set_lcas make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream .PHONY: all install clean @@ -28,7 +28,7 @@ read_uid_mapping: quickfile.o classify: classify.cpp krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o hyperloglogplus.h taxdb.h report-cols.h $(CXX) $(CXXFLAGS) -o classify $^ $(LIBFLAGS) -build_taxdb: taxdb.h report-cols.h +build_taxdb: taxdb.h report-cols.h quickfile.o make_seqid_to_taxid_map: quickfile.o diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index 365238f..d58cf39 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -21,6 +21,7 @@ #include "kraken_headers.hpp" #include "krakenutil.hpp" #include +#include using namespace std; @@ -50,7 +51,26 @@ namespace kraken { // Return lowest common ancestor of a and b // LCA(0,x) = LCA(x,0) = x // Default ancestor is 1 (root of tree) - uint32_t lca(const unordered_map &parent_map, uint32_t a, uint32_t b) { + uint32_t lca(const unordered_map &parent_map, + uint32_t a, uint32_t b) + { + if (a == 0 || b == 0) + return a ? a : b; + + unordered_set a_path; + while (a > 0) { + a_path.insert(a); + a = parent_map.at(a); + } + while (b > 0) { + if (a_path.count(b) > 0) + return b; + b = parent_map.at(b); + } + return 1; + } + + uint32_t lca_vec(const unordered_map &parent_map, uint32_t a, uint32_t b) { if (a == 0 || b == 0) return a ? 
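    // Walk-through of the path-climbing scheme shared by lca() above and
    // lca_vec() here, on a toy parent_map (hypothetical taxids):
    //   parent_map = { 562->561, 561->543, 543->1, 1->0, 1280->543 }
    //   lca(562, 1280): a_path = {562, 561, 543, 1}; climbing from b = 1280
    //   reaches 543, the first node already in a_path, so the LCA is 543.
    // lca(0, x) = x by the zero guard, unrelated taxa fall through to the
    // root taxon 1, and termination relies on the root's parent being 0.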
a : b;
@@ -59,19 +79,19 @@ namespace kraken {
     do {
       if (a == b)
         return a;
-      a_path.insert(a);
+      a_path.push_back(a);
       a = parent_map.at(a);
-    } while (a != a_path.back())
+    } while (a != a_path.back());
 
     // search for b in the path from a to the root
     uint32_t last_b = 0;
     do {
-      if (a_path.find(b) != a_path.end())
+      if (std::find(a_path.begin(), a_path.end(), b) != a_path.end())
         return b;
 
       last_b = b;
       b = parent_map.at(b);
-    } while (last_b != b)
+    } while (last_b != b);
     return 1;
   }
 
diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp
index 3872c4d..2de456a 100644
--- a/src/set_lcas.cpp
+++ b/src/set_lcas.cpp
@@ -39,7 +39,7 @@ void usage(int exit_code=EX_USAGE);
 void process_files();
 void process_single_file();
 void process_file(string filename, uint32_t taxid);
-void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish);
+void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish, bool is_contaminant_taxid = false);
 
 int Num_threads = 1;
 string DB_filename, Index_filename,
@@ -47,8 +47,8 @@ string DB_filename, Index_filename,
   Kmer_count_filename,
   File_to_taxon_map_filename,
   ID_to_taxon_map_filename, Multi_fasta_filename;
-bool force_taxid = false;
-int New_taxid_start = 1000000000;
+bool force_contaminant_taxid = false;
+uint32_t New_taxid_start = 1000000000;
 
 bool Allow_extra_kmers = false;
 bool verbose = false;
@@ -88,7 +88,7 @@ int main(int argc, char **argv) {
 
   parse_command_line(argc, argv);
 
-  if (!TaxDB_filename.empty() && !force_taxid) {
+  if (!TaxDB_filename.empty()) {
     taxdb = TaxonomyDB(TaxDB_filename);
     for (const auto & tax : taxdb.taxIDsAndEntries) {
       if (tax.first != 0)
@@ -211,6 +211,15 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi
   string line, seq_id, name;
   uint32_t taxid;
 
+  if (Add_taxIds_for_Assembly && Add_taxIds_for_Sequences) {
+    for (const auto& k : taxdb.taxIDsAndEntries) {
+      if (k.first >= New_taxid_start) {
+        New_taxid_start = k.first;
+      }
+    }
+    cerr << "Starting new taxonomy IDs with " << (New_taxid_start+1) << endl;
+  }
+
   // Used when adding new taxids for assembly or sequence
   unordered_map name_to_taxid_map;
 
@@ -298,6 +307,8 @@ void process_single_file() {
       continue;
     }
 
+    bool is_contaminant_taxid = taxid == 32630 || taxid == 81077;
+
     if (Add_taxIds_for_Sequences && taxid != 9606) {
       // Update entry based on header line
       auto entryIt = taxdb.taxIDsAndEntries.find(taxid);
@@ -319,7 +330,7 @@ void process_single_file() {
       } else {
         #pragma omp parallel for schedule(dynamic)
         for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN)
-          set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1);
+          set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1, is_contaminant_taxid);
         ++seqs_processed;
       }
     } else {
@@ -378,7 +389,7 @@ void process_sequence(DNASequence dna) {
   // Or maybe assembly_summary file? 
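  // Preview of the contaminant handling in set_lcas below (k-mer values are
  // hypothetical): for sequences whose taxid is one of the contaminant taxa
  // (32630 "synthetic construct", 81077 "artificial sequences"), the -T switch
  // pins the k-mer instead of computing an LCA: with *val_ptr == 562 and
  // taxid == 32630, the stored value becomes 32630, so known contaminant
  // k-mers are never generalized up the tree.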
} -void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { +void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish, bool is_contaminant_taxid) { KmerScanner scanner(seq, start, finish); uint64_t *kmer_ptr; uint32_t *val_ptr; @@ -403,14 +414,12 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) { if (Use_uids_instead_of_taxids) { #pragma omp critical(new_uid) *val_ptr = uid_mapping(Taxids_to_UID_map, UID_to_taxids_vec, taxid, *val_ptr, current_uid, UID_map_file); - } else if (!force_taxid && taxid != contaminant_taxids) { - if (Parent_map.find(taxid) != Parent_map.end()) { - *val_ptr = lca(Parent_map, taxid, *val_ptr); - } - } else { - // When force_taxid is set, do not compute lca, but assign the taxid + } else if (force_contaminant_taxid && is_contaminant_taxid) { + // When force_contaminant_taxid is set, do not compute lca, but assign the taxid // of the (last) sequence to k-mers *val_ptr = taxid; + } else { + *val_ptr = lca(Parent_map, taxid, *val_ptr); } } } @@ -454,7 +463,7 @@ void parse_command_line(int argc, char **argv) { #endif break; case 'T' : - force_taxid = true; + force_contaminant_taxid = true; break; case 'v' : verbose = true; @@ -516,8 +525,8 @@ void usage(int exit_code) { << " -f filename File to taxon map" << endl << " -F filename Multi-FASTA file with sequence data" << endl << " -m filename Sequence ID to taxon map" << endl - << " -a Add taxonomy IDs (starting with "< $DIR/data/all-$i.fna [[ -s "$DIR/data/all-$i.map" ]] || find $DIR/data/library/$i -name '*.map' -exec cat {} \; > $DIR/data/all-$i.map DUSTED_F="$DIR/data/all-$i-dusted.fna" From 5ea9f0602e37a44c9b9806405e70468639a84aa8 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 25 Oct 2017 12:51:41 -0400 Subject: [PATCH 088/105] Update --- scripts/krakenu-build_db.sh | 1 - scripts/krakenu-download | 136 +++++++++++++++++++++++++++--------- src/classify.cpp | 4 +- src/readcounts.hpp | 34 +++++---- src/set_lcas.cpp | 27 ++++--- tests/build-dbs.sh | 45 ++++++------ tests/init.sh | 13 ++-- 7 files changed, 173 insertions(+), 87 deletions(-) diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh index e4707bd..f8c1450 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenu-build_db.sh @@ -24,7 +24,6 @@ set -u # Protect against uninitialized vars. set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands -set -x function report_time_elapsed() { set -x diff --git a/scripts/krakenu-download b/scripts/krakenu-download index 1d67438..b70a24f 100755 --- a/scripts/krakenu-download +++ b/scripts/krakenu-download @@ -23,6 +23,7 @@ sub download(@); sub print_header_lines(@); sub download_domain(@); sub download_viral_neighbors(@); +sub download_viral_neighbors2(@); my $FTP="ftp://ftp.ncbi.nih.gov"; my @ALL_GENOMES=qw/bacteria viral archaea fungi protozoa invertebrate plant vertebrate_mammalian vertebrate_other/; @@ -68,7 +69,7 @@ ARGUMENT COMMON OPTIONS -o Folder to which the files are downloaded. Default: '.' --db Alternative to -o: Download to /{library,taxonomy}. - -P <# of threads> Number of processes when downloading (uses xargs). Default: '$N_PROC' + --threads <# of threads> Number of processes when downloading (uses xargs). Default: '$N_PROC' --rsync, -R Download using rsync. --overwrite Redownload and overwrite files with the same name. -v Verbose. 
@@ -174,10 +175,13 @@ foreach my $DATABASE (@ARGV) { } } +my %taxid_name_map; + if ($INCLUDE_VIRAL_NEIGHBORS) { if (!$downloaded_viral_refseq) { print STDERR "--include-viral-neighbors only works when RefSeq viral is downloaded in the same session!"; } else { + my $nbr_lib_dir = $add_dir? "$BASE_DIR/library/viral-neighbors" : "$BASE_DIR/viral-neighbors"; my $viral_lib_dir = $add_dir? "$BASE_DIR/library/viral" : "$BASE_DIR/viral"; download_viral_neighbors($viral_lib_dir, $nbr_lib_dir); @@ -261,25 +265,64 @@ sub end_fork() { exit() unless $N_PROC <= 1; } +sub download_viral_neighbors2(@) { + my ($viral_dir, $nbr_dir) = @_; + my $dir = get_dir($BASE_DIR,"taxonomy"); + print STDERR "Reading names file ...\n"; + my $names_file = "$dir/names.dmp"; + open (my $N, "<", $names_file); + while (<$N>) { + my ($taxid, undef, $name, undef, $type) = split /\t|\t/; + next unless $type eq "scientific name"; + $taxid_name_map{$taxid} = $name; + } + close($N); + + print STDERR "Downloading nucl_gb.accession2taxid ...\n"; + my $url = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz"; + download($url, "$dir/nucl_gb.accession2taxid.gz"); + + print STDERR "Sorting mapping file ...\n"; + system("gunzip -c $dir/nucl_gb.accession2taxid.gz | cut -f 2,3 | sort --parallel $N_PROC > $dir/nucl_gb.accession2taxid.sorted") unless -s "$dir/nucl_gb.accession2taxid.sorted"; + + if (!-f "$nbr_dir/all-nbrs.fa"){ + my $FMT="fasta"; + my $TERM="Viruses[Organism]+NOT+cellular+organisms[ORGN]+NOT+wgs[PROP]+NOT+AC_000001:AC_999999[pacc]+NOT+gbdiv+syn[prop]+AND+nuccore+genome+samespecies[Filter]"; + my $ESEARCH_URL="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"; + ## TODO: Go through it 10,000 entries at a time + my $URL_PARAMS=`curl -g "$ESEARCH_URL?db=nuccore&usehistory=y&retmax=1&retmode=json&term=$TERM" | grep -e 'querykey' -e 'webenv' | sed -e 's/^ *"querykey": "/query_key=/' -e 's/^ *"webenv": "/WebEnv=/' -e 's/",//' | paste -sd\\&`; + chomp $URL_PARAMS; + download("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&query_key=1&$URL_PARAMS&rettype=fasta", "$nbr_dir/all-nbrs.fa"); +} + +} + sub download_viral_neighbors(@) { my ($viral_dir, $nbr_dir) = @_; - print STDERR "Reading map files from $viral_dir ... 
\n"; - my %ac_to_taxid; - foreach my $f (glob("$viral_dir/*.map")) { - open (my $F, "<", $f); - while (<$F>) { - chomp; - my ($ac, $taxid, $name) = split(/\t/); - $ac =~ s/\.[0-9]*$//; - $ac_to_taxid{$ac} = [$name, $taxid]; - } - close ($F); + + print STDERR "Reading names file ...\n"; + my $dir = get_dir($BASE_DIR,"taxonomy"); + my $names_file = "$dir/names.dmp"; + open (my $N, "<", $names_file); + while (<$N>) { + next unless /scientific name/; + my ($taxid, $name) = split /\t\|\t/; + $taxid_name_map{$taxid} = $name; } + close($N); + + print STDERR "Downloading nucl_gb.accession2taxid ...\n"; + my $url = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz"; + download($url, "$dir/nucl_gb.accession2taxid.gz"); + + my $sorted_map_f = "$dir/nucl_gb.accession2taxid.sorted"; + print STDERR "Sorting mapping file ...\n"; + system("gunzip -c $dir/nucl_gb.accession2taxid.gz | cut -f 2,3 | sort --parallel $N_PROC > $sorted_map_f") unless -s $sorted_map_f; print STDERR "Downloading viral neighbors into $nbr_dir ...\n"; - my $url = "https://www.ncbi.nlm.nih.gov/genomes/GenomesGroup.cgi?taxid=10239&cmd=download2"; + my $url1 = "https://www.ncbi.nlm.nih.gov/genomes/GenomesGroup.cgi?taxid=10239&cmd=download2"; my $nbr_file = "$nbr_dir/viral_neighbors-taxid10239.nbr"; - download($url, $nbr_file); + download($url1, $nbr_file); open(my $F, "<", $nbr_file); my @file = <$F>; close($F); @@ -292,28 +335,37 @@ sub download_viral_neighbors(@) { ++$i; print STDERR "\r Downloading viral neighbor sequence $i/$n_genomes ..." unless $VERBOSE; # my $pid = $pm->start and next; - + chomp; my ($rep_acs, $nbr_ac, undef, undef, $nname, $sname) = split /\t/; - my ($name, $taxid); - foreach my $rep_ac (split (/,/, $rep_acs)) { - if (defined $ac_to_taxid{$rep_ac}) { - ($name, $taxid) = @{$ac_to_taxid{$rep_ac}}; - last; - } + my $taxid = `look $nbr_ac $sorted_map_f | cut -f 2`; + chomp $taxid; + + if (!defined $taxid || !defined $taxid_name_map{$taxid}) { + my $res = `curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=$nbr_ac&rettype=fasta&retmode=xml" | head -n 12 | egrep '|' | sed -e 's###' -e 's#.*<.*>##' | paste -sd\$'\\t'`; + chomp $res; + ($taxid) = split /\t/, $res; } - if (!defined $taxid) { - print STDERR "\nNo mapping for viral neighbor $nbr_ac [rep: $rep_acs, $nname]!\n"; - next; + + my $name = $taxid_name_map{$taxid}; + if (!defined $taxid || !defined $name) { + print STDERR "\nNo mapping for viral neighbor $nbr_ac [rep: $rep_acs, $nname, $taxid]!\n"; + next; } (my $name1 = $name) =~ s/[^a-zA-Z0-9_]/_/g; $name1 =~ s/__/_/g; my $file = "$nbr_dir/$name1-tax$taxid/$nbr_ac.fna"; + system("mkdir -p $nbr_dir/$name1-tax$taxid"); + if (-s "$nbr_dir/$nbr_ac.fna") { + system("mv -v $nbr_dir/$nbr_ac.fna $nbr_dir/$name1-tax$taxid/$nbr_ac.fna"); + } my $url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&retmode=text&id=$nbr_ac"; - start_fork() and next; - if (download($url,$file)) { - print_header_lines($file, $taxid, "$nname $sname neighbor $nbr_ac"); + if (! -s $file || ! 
-s "$file.map") { + start_fork() and next; + if (download($url,$file)) { + print_header_lines($file, $taxid, "$nname neighbors"); + } + end_fork(); } - end_fork(); } print STDERR "\n"; wait_children(); @@ -330,7 +382,11 @@ sub print_header_lines(@) { while (<$G>) { next unless /^>([^ ]*)/; my $ac = $1; - print $F "$ac\t$taxid\t$name\n"; + if (defined $name) { + print $F "$ac\t$taxid\t$name\n"; + } else { + print $F "$ac\t$taxid\n"; + } } close($G); close($F); @@ -395,7 +451,8 @@ sub download_taxonomy(@) { sub download_domain(@) { my ($domain_dir, $domain, $_assembly_level, $_taxid) = @_; - print STDERR "Downloading assembly summary file for $domain genomes, and filtering to assembly level $_assembly_level and taxid $_taxid.\n"; + print STDERR "Downloading assembly summary file for $domain genomes, and filtering to assembly level $_assembly_level"; + print STDERR (defined $_taxid? "and taxid $_taxid.\n" : ".\n"); die unless defined $domain_dir && defined $domain; if (-d $domain_dir) { print STDERR "WARNING: $domain_dir already exists - potentially overwriting files.\n"; @@ -430,6 +487,9 @@ sub download_domain(@) { close $A2; close $A1; + my $downloaded_files = 0; + my $existing_files = 0; + my $i = 0; foreach my $g (@genomes_to_dl) { my ($ftp_path, $taxid, $organism_name, $infraspecific_name, $assembly_accession) = @$g; @@ -450,15 +510,22 @@ sub download_domain(@) { my $bname1 = "${organism_name1}-tax${taxid}-${bname}"; foreach my $ext (split(/,/, $FNA_FILES)) { - start_fork() and next; my $full_ftp_path = "$ftp_path/${bname}_${ext}.fna.gz"; my $bfname = $bname1."_".$ext; my $fname = $bfname.".fna"; + if (!$OVERWRITE_FILES && -s "$domain_dir/$fname") { + ++$existing_files; + } else { + ++$downloaded_files; + } + if (!$OVERWRITE_FILES && -s "$domain_dir/$fname") { print STDERR "$domain_dir/$fname exists - not downloading.. \n" if $VERBOSE; } else { + start_fork() and next; download($full_ftp_path, "$domain_dir/$fname.gz", "$domain_dir/$fname"); + end_fork(); } if ($CHANGE_HEADER) { @@ -470,18 +537,19 @@ sub download_domain(@) { ## Output sequenceID to taxonomy ID map to STDOUT - print_header_lines("$domain_dir/$fname", $taxid, "$organism_name $assembly_accession"); + print_header_lines("$domain_dir/$fname", $taxid, "$assembly_accession $organism_name"); if ($DO_DUST) { + start_fork() and next; ## TODO: Consider hard-masking only low-complexity stretches with 10 or more bps system("dustmasker -infmt fasta -in '$domain_dir/$fname' -level 20 -outfmt fasta | sed '/^>/! 
s/[^AGCT]/N/g' > '$domain_dir/${bfname}_dustmasked.fna'"); unlink("$domain_dir/$fname"); + end_fork(); } - end_fork(); } } wait_children(); - print STDERR "\n"; + print STDERR " downloaded $downloaded_files files, $existing_files already existed.\n"; } diff --git a/src/classify.cpp b/src/classify.cpp index a6076d3..1652b45 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -480,8 +480,8 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, }; //string hitlist_string; - uint32_t last_taxon; - uint32_t last_counter; + //uint32_t last_taxon; + //uint32_t last_counter; vector db_statuses(KrakenDatabases.size()); diff --git a/src/readcounts.hpp b/src/readcounts.hpp index 486edbd..e878381 100644 --- a/src/readcounts.hpp +++ b/src/readcounts.hpp @@ -7,24 +7,30 @@ namespace kraken { struct ReadCounts { - uint64_t n_reads = 0; - uint64_t n_kmers = 0; + uint64_t n_reads = 0; + uint64_t n_kmers = 0; HyperLogLogPlusMinus kmers; // unique k-mer count per taxon - void add_kmer(uint64_t kmer) { - ++ n_kmers; - kmers.add(kmer); - } - ReadCounts& operator+=(const ReadCounts& b) { - n_reads += b.n_reads; - n_kmers += b.n_kmers; - kmers += b.kmers; - return *this; - } + + ReadCounts() { } + + ReadCounts(size_t precision) : kmers(HyperLogLogPlusMinus(precision)) { + } + + void add_kmer(uint64_t kmer) { + ++ n_kmers; + kmers.add(kmer); + } + + ReadCounts& operator+=(const ReadCounts& b) { + n_reads += b.n_reads; + n_kmers += b.n_kmers; + kmers += b.kmers; + return *this; + } }; - inline uint64_t reads(const ReadCounts& read_count) { - return(read_count.n_reads); + return(read_count.n_reads); } } #endif diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 2de456a..7a5c6e0 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -74,6 +74,7 @@ map< TaxidSet, uint32_t> Taxids_to_UID_map; unordered_map ID_to_taxon_map; unordered_map SeqId_added; KrakenDB Database; +const size_t hll_prec = 10; TaxonomyDB taxdb; const string prefix = "kraken:taxid|"; @@ -211,7 +212,7 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi string line, seq_id, name; uint32_t taxid; - if (Add_taxIds_for_Assembly && Add_taxIds_for_Sequences) { + if (Add_taxIds_for_Assembly || Add_taxIds_for_Sequences) { for (const auto& k : taxdb.taxIDsAndEntries) { if (k.first >= New_taxid_start) { New_taxid_start = k.first; @@ -301,15 +302,15 @@ void process_single_file() { continue; } - if (Parent_map.find(taxid) == Parent_map.end()) { + auto it_p = Parent_map.find(taxid); + if (it_p == Parent_map.end()) { cerr << "Skipping sequence " << dna.id << " since taxonomy ID " << taxid << " is not in taxonomy database!" 
<< endl; ++ seqs_skipped; continue; } bool is_contaminant_taxid = taxid == 32630 || taxid == 81077; - - if (Add_taxIds_for_Sequences && taxid != 9606) { + if (Add_taxIds_for_Sequences && taxid != 9606 && it_p->second != 9606) { // Update entry based on header line auto entryIt = taxdb.taxIDsAndEntries.find(taxid); if (entryIt == taxdb.taxIDsAndEntries.end()) { @@ -414,12 +415,20 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish, bool is_ if (Use_uids_instead_of_taxids) { #pragma omp critical(new_uid) *val_ptr = uid_mapping(Taxids_to_UID_map, UID_to_taxids_vec, taxid, *val_ptr, current_uid, UID_map_file); - } else if (force_contaminant_taxid && is_contaminant_taxid) { - // When force_contaminant_taxid is set, do not compute lca, but assign the taxid - // of the (last) sequence to k-mers - *val_ptr = taxid; } else { - *val_ptr = lca(Parent_map, taxid, *val_ptr); + if (!force_contaminant_taxid) { + *val_ptr = lca(Parent_map, taxid, *val_ptr); + } else { + if (*val_ptr == 32630 || *val_ptr == 81077) { + // keep value + } else if (is_contaminant_taxid) { + // When force_contaminant_taxid is set, do not compute lca, but assign the taxid + // of the (last) sequence to k-mers + *val_ptr = taxid; + } else { + *val_ptr = lca(Parent_map, taxid, *val_ptr); + } + } } } } diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index d1ca09c..bfbd3f8 100755 --- a/tests/build-dbs.sh +++ b/tests/build-dbs.sh @@ -2,14 +2,15 @@ set -eu -[[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 +#[[ "$#" -ne 1 ]] && DIR=`pwd` || DIR=$1 +DIR=`pwd` [[ `uname` == "Darwin" ]] && THREADS=4 || THREADS=10 build_db() { local K=$1; shift + local MIN=$1; shift local NAM=$1; shift - local MIN=15 local DB_NAM=refseq-$NAM-k$K DB_DIR=$DIR/dbs/$DB_NAM @@ -19,7 +20,7 @@ build_db() { for L in $@; do CMD="$CMD --library-dir=$DIR/data/library/$L" done - if [[ ! -f "$DB_DIR/is.busy" ]]; then + #if [[ ! 
-f "$DB_DIR/is.busy" ]]; then echo "EXECUTING $CMD" touch $DB_DIR/is.busy $CMD 2>&1 | tee $DIR/dbs/$DB_NAM/build.log @@ -29,25 +30,27 @@ build_db() { dump_taxdb $DB_DIR/taxDB $DB_DIR/taxonomy/names.dmp $DB_DIR/nodes.dmp fi rm $DB_DIR/is.busy - else - echo "IGNORING $DB_DIR" - fi + #else + # echo "IGNORING $DB_DIR" + #fi } -#export PATH="$DIR/install:$PATH" -for K in 31; do - if [[ `uname` == "Darwin" ]]; then - build_db $K viral viral - build_db $K all-viral viral viral-neighbors - else - build_db $K oct2017 archaea-dusted bacteria-dusted viral-dusted viral-neighbors-dusted \ - vertebrate_mammalian contaminants - - EUKD=$DIR/dbs/refseq-euk-oct2017-k31 - [[ -d $EUKD ]] || mkdir -p $EUKD - [[ -f $EUKD/taxDB ]] || cp -v $DB_DIR/taxDB $EUKD - build_db $K euk-oct2017 fungi protozoa - #build_db $K bacteria bacteria archaea - fi +K=$1; shift; + +for VAR in $@; do + case "$VAR" in + viral) build_db $K 12 viral viral ;; + all-viral) build_db $K 12 all-viral viral viral-neighbors ;; + prok) build_db $K 15 prok archaea-dusted bacteria-dusted ;; + oct2017) build_db $K 15 oct2017 archaea-dusted bacteria-dusted viral-dusted viral-neighbors-dusted \ + vertebrate_mammalian contaminants ;; + euk-oct2017) + EUKD=$DIR/dbs/refseq-euk-oct2017-k31 + [[ -d $EUKD ]] || mkdir -p $EUKD + [[ -f $EUKD/taxDB ]] || cp -v $DB_DIR/taxDB $EUKD + build_db $K euk-oct2017 fungi protozoa ;; + *) echo "Usage: $0 K {viral|all-viral|prok|oct2017|euk-oct2017}" + exit 1 ;; + esac done diff --git a/tests/init.sh b/tests/init.sh index 6ec60a0..d029fb1 100755 --- a/tests/init.sh +++ b/tests/init.sh @@ -8,14 +8,15 @@ set -xeu #$(dirname $0)/../install_kraken.sh --install-jellyfish $DIR/install ## Download taxonomy and genomic data into data/ -#$DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any -$DIR/install/krakenu-download --db $DIR/data -R refseq/fungi refseq/fungi/Chromosome refseq/protozoa refseq/protozoa/Chromosome -#$DIR/install/krakenu-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606 -#$DIR/install/krakenu-download --db $DIR/data -R contaminants +time $DIR/install/krakenu-download --db $DIR/data -R taxonomy refseq/archaea refseq/bacteria +time $DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors refseq/viral/Any +time $DIR/install/krakenu-download --db $DIR/data -R refseq/fungi refseq/fungi/Chromosome refseq/protozoa refseq/protozoa/Chromosome +time $DIR/install/krakenu-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606 +time $DIR/install/krakenu-download --db $DIR/data -R contaminants for i in fungi protozoa viral viral-neighbors archaea bacteria; do - [[ -s "$DIR/data/all-$i.fna" ]] || find $DIR/data/library/$i -name '*.fna' -exec cat {} \; > $DIR/data/all-$i.fna - [[ -s "$DIR/data/all-$i.map" ]] || find $DIR/data/library/$i -name '*.map' -exec cat {} \; > $DIR/data/all-$i.map + [[ -s "$DIR/data/all-$i.fna" ]] || find $DIR/data/library/$i -name '*.fna' -print0 | xargs -0 -n 100 cat > $DIR/data/all-$i.fna + [[ -s "$DIR/data/all-$i.map" ]] || find $DIR/data/library/$i -name '*.map' -print0 | xargs -0 -n 100 cat > $DIR/data/all-$i.map DUSTED_F="$DIR/data/all-$i-dusted.fna" [[ -s $DUSTED_F ]] || dustmasker -infmt fasta -in $DIR/data/all-$i.fna -level 20 -outfmt fasta | sed '/^>/! 
s/[^AGCT]/N/g' > "$DUSTED_F" mkdir -p $DIR/data/library/$i-dusted From 970c4cac37a8beee4cf8061e3a58d03ce7677996 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 25 Oct 2017 17:15:05 -0400 Subject: [PATCH 089/105] Many small changes to make it working for GCC 4.4 / C++0x --- src/Makefile | 2 +- src/build_taxdb.cpp | 6 +- src/classify.cpp | 49 ++-- src/dump_taxdb.cpp | 8 +- src/grade_classification.cpp | 22 +- src/read_uid_mapping.cpp | 4 +- src/readcounts.hpp | 6 +- src/set_lcas.cpp | 27 +- src/taxdb.h | 554 ++++++++++++++++++----------------- src/uid_mapping.cpp | 21 +- src/uid_mapping.hpp | 5 +- 11 files changed, 373 insertions(+), 331 deletions(-) diff --git a/src/Makefile b/src/Makefile index 2603974..e51a28f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,6 @@ CXX = g++ FOPENMP?=-fopenmp -CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -g -Wfatal-errors +CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -g -Wfatal-errors #CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors PROGS = classify db_sort set_lcas make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp index 763a8a0..fc81cec 100644 --- a/src/build_taxdb.cpp +++ b/src/build_taxdb.cpp @@ -36,11 +36,11 @@ int main(int argc, char **argv) { return 1; } - TaxonomyDB taxdb; + TaxonomyDB taxdb; if (argc == 2) { - taxdb = TaxonomyDB ((string)argv[1]); + taxdb = TaxonomyDB ((string)argv[1]); } else { - taxdb = TaxonomyDB ((string)argv[1], (string)argv[2]); + taxdb = TaxonomyDB ((string)argv[1], (string)argv[2]); } if (argc == 4) { ifstream ifs(argv[3]); diff --git a/src/classify.cpp b/src/classify.cpp index 1652b45..f2ac91b 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -77,9 +77,16 @@ ostream *Report_output; vector Open_fstreams; vector Open_gzstreams; size_t Work_unit_size = DEF_WORK_UNIT_SIZE; -TaxonomyDB taxdb; +TaxonomyDB taxdb; static vector KrakenDatabases (DB_filenames.size()); +struct db_status { + db_status() : current_bin_key(0), current_min_pos(1), current_max_pos(0) {} + uint64_t current_bin_key; + int64_t current_min_pos; + int64_t current_max_pos; +}; + uint64_t total_classified = 0; uint64_t total_sequences = 0; uint64_t total_bases = 0; @@ -146,12 +153,8 @@ int main(int argc, char **argv) { if (!TaxDB_file.empty()) { // TODO: Define if the taxDB has read counts or not!! - taxdb = TaxonomyDB(TaxDB_file, false); - for (const auto & tax : taxdb.taxIDsAndEntries) { - if (tax.first != 0) - Parent_map[tax.first] = tax.second.parentTaxonomyID; - } - Parent_map[1] = 0; + taxdb = TaxonomyDB(TaxDB_file, false); + Parent_map = taxdb.getParentMap(); } else { cerr << "TaxDB argument is required!" 
<< endl; return 1; @@ -222,7 +225,8 @@ int main(int argc, char **argv) { std::cerr << "Finishing up ..\n"; if (Print_kraken_report) { - for (auto fname : DB_filenames) { + for (size_t i = 0; i < DB_filenames.size(); ++i) { + const auto& fname = DB_filenames[i]; ifstream ifs(fname + ".counts"); if (ifs.good()) { ifs.close(); @@ -230,9 +234,8 @@ int main(int argc, char **argv) { } } - taxdb.setReadCounts(taxon_counts); - TaxReport rep = TaxReport(*Report_output, taxdb, false); - rep.setReportCols({ + TaxReport rep = TaxReport(*Report_output, taxdb, taxon_counts, false); + rep.setReportCols(vector { "%", "reads", "taxReads", @@ -245,10 +248,13 @@ int main(int argc, char **argv) { rep.printReport("kraken","blu"); } - for (ofstream* ofs : Open_fstreams) { + for (size_t i = 0; i < Open_fstreams.size(); ++i) { + ofstream* ofs = Open_fstreams[i]; ofs->close(); } - for (ogzstream* ogzs : Open_gzstreams) { + + for (size_t i = 0; i < Open_gzstreams.size(); ++i) { + ogzstream* ogzs = Open_gzstreams[i]; ogzs->close(); } @@ -328,8 +334,8 @@ void process_file(char *filename) { #pragma omp critical(write_output) { total_classified += my_total_classified; - for (auto &it : my_taxon_counts) { - taxon_counts[it.first] += it.second; + for (auto it = my_taxon_counts.begin(); it != my_taxon_counts.end(); ++it) { + taxon_counts[it->first] += it->second; } if (Print_kraken) @@ -374,6 +380,7 @@ inline void print_sequence(ostringstream* oss_ptr, const DNASequence& dna) { } } +/* inline void append_hitlist_string(string& hitlist_string, uint32_t& last_taxon, uint32_t& last_counter, uint32_t current_taxon) { if (last_taxon == current_taxon) { @@ -390,6 +397,7 @@ void append_hitlist_string(string& hitlist_string, uint32_t& last_taxon, uint32_ last_taxon = current_taxon; } } +*/ string hitlist_string(const vector &taxa, const vector &ambig) { @@ -428,7 +436,7 @@ string hitlist_string(const vector &taxa, const vector &ambig return hitlist.str(); } - +/* string hitlist_string_depr(const vector &taxa) { uint32_t last_code = taxa[0]; @@ -460,7 +468,7 @@ string hitlist_string_depr(const vector &taxa) } return hitlist.str(); } - +*/ bool classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss, @@ -472,13 +480,6 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, uint32_t taxon = 0; uint32_t hits = 0; // only maintained if in quick mode - - struct db_status { - uint64_t current_bin_key; - int64_t current_min_pos = 1; - int64_t current_max_pos = 0; - }; - //string hitlist_string; //uint32_t last_taxon; //uint32_t last_counter; diff --git a/src/dump_taxdb.cpp b/src/dump_taxdb.cpp index 3e0d442..76246e4 100644 --- a/src/dump_taxdb.cpp +++ b/src/dump_taxdb.cpp @@ -13,16 +13,18 @@ int main(int argc, char **argv) { } cerr << "Reading taxonomy database from " << argv[1] << ", writing nodes dump to " << argv[3] << " and names dump to " << argv[2] << "." << endl; - TaxonomyDB taxdb {(string)argv[1]}; + TaxonomyDB taxdb {(string)argv[1]}; ofstream names_file(argv[2]); names_file.exceptions(ifstream::failbit | ifstream::badbit); ofstream nodes_file(argv[3]); nodes_file.exceptions(ifstream::failbit | ifstream::badbit); - for (const auto &taxon : taxdb.taxIDsAndEntries) { + for (auto it = taxdb.entries.begin(); it != taxdb.entries.end(); ++it) { + const auto &taxon = *it; std::string scientificName; + uint32_t parentTaxonomyID = taxon.second.parent == NULL? 
taxon.first : taxon.second.parent->taxonomyID; nodes_file << taxon.second.taxonomyID - << "\t|\t" << taxon.second.parentTaxonomyID + << "\t|\t" << parentTaxonomyID << "\t|\t" << taxon.second.rank << endl; // there are further columns, but Kraken does not care about them diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp index c4dec80..5ea7922 100644 --- a/src/grade_classification.cpp +++ b/src/grade_classification.cpp @@ -10,10 +10,12 @@ #include #include #include +#include using namespace std; -using TAXID = uint32_t; +//using TAXID = uint32_t; +typedef uint32_t TAXID; unordered_map read_seqid_mapping(string filename) { unordered_map ID_to_taxon_map; @@ -38,7 +40,7 @@ int main(int argc, char **argv) { std::cerr << "Usage: grade_classification taxDB seqid2taxid.map classification_file result_file\n"; return 1; } - TaxonomyDB taxdb = TaxonomyDB(argv[1], false); + TaxonomyDB taxdb = TaxonomyDB(argv[1], false); unordered_map seqid_map = read_seqid_mapping(argv[2]); ofstream out_file(argv[4]); @@ -124,7 +126,8 @@ int main(int argc, char **argv) { TaxRank::RANK lca_rank = TaxRank::toRank(lca_rank_string); TaxRank::RANK identified_rank = TaxRank::toRank(taxdb.getRank(identified_taxid)); - for (TaxRank::RANK rank : ranks_of_interest) { + for (size_t i=0; i < ranks_of_interest.size(); ++i) { + TaxRank::RANK rank = ranks_of_interest[i]; TAXID simulated_taxid_at_rank = taxdb.getTaxIDAtRank(seq_taxid, TaxRank::toString(rank)); TAXID identified_taxid_at_rank = taxdb.getTaxIDAtRank(identified_taxid, TaxRank::toString(rank)); simulated_taxids_at_rank[rank].insert(simulated_taxid_at_rank); @@ -158,14 +161,15 @@ int main(int argc, char **argv) { if (0) { cout << "#LCA_RANK_READ_COUNTS" << endl; - for (const auto & kv : rank_counts) { - cout << kv.first << delim << kv.second << endl; + for (auto it = rank_counts.begin(); it != rank_counts.end(); ++it) { + cout << it->first << delim << it->second << endl; } cout << endl; } cout << "#rank" << delim << "total_reads" << delim << "correct"<< delim << "incorrect"<< delim << "sensitivity" << delim << "precision" << delim << "higher_rank" << delim << "unidentified" << endl; - for (TaxRank::RANK rank : ranks_of_interest) { + for (size_t i=0; i < ranks_of_interest.size(); ++i) { + TaxRank::RANK rank = ranks_of_interest[i]; size_t true_positives = correct_reads_at_rank.at(rank); size_t false_positives = incorrect_reads_at_rank.at(rank); double sensitivity = 100.0*(double)true_positives/(double)total_reads; @@ -182,12 +186,14 @@ int main(int argc, char **argv) { } cout << "#rank" << delim << "true_count" << delim << "correct" << delim << "incorrect" << delim << "recall" << delim << "precision" << endl; - for (TaxRank::RANK rank : ranks_of_interest) { + for (size_t i=0; i < ranks_of_interest.size(); ++i) { + TaxRank::RANK rank = ranks_of_interest[i]; size_t true_positives = 0; size_t false_positives = 0; if (identified_taxids_at_rank.find(rank) != identified_taxids_at_rank.end()) { - for (const auto & tid : identified_taxids_at_rank[rank]) { + for (auto it = identified_taxids_at_rank[rank].begin(); it != identified_taxids_at_rank[rank].end(); ++it) { + const auto & tid = *it; if (simulated_taxids_at_rank[rank].count(tid) == 1) { ++true_positives; } else { diff --git a/src/read_uid_mapping.cpp b/src/read_uid_mapping.cpp index 6c40d65..8f83742 100644 --- a/src/read_uid_mapping.cpp +++ b/src/read_uid_mapping.cpp @@ -38,8 +38,8 @@ int main(int argc, char **argv) { uint32_t UID = atol(argv[i]); vector taxids = get_taxids_for_uid(UID, fptr); cout << UID 
<< '\t'; - for (auto t : taxids) { - cout << t << ' '; + for (auto it = taxids.begin(); it != taxids.end(); ++it) { + cout << *it << ' '; } cout << endl; } diff --git a/src/readcounts.hpp b/src/readcounts.hpp index e878381..eddca78 100644 --- a/src/readcounts.hpp +++ b/src/readcounts.hpp @@ -7,11 +7,11 @@ namespace kraken { struct ReadCounts { - uint64_t n_reads = 0; - uint64_t n_kmers = 0; + uint64_t n_reads; + uint64_t n_kmers; HyperLogLogPlusMinus kmers; // unique k-mer count per taxon - ReadCounts() { } + ReadCounts() : n_reads(0), n_kmers(0) { } ReadCounts(size_t precision) : kmers(HyperLogLogPlusMinus(precision)) { } diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index 7a5c6e0..1b32721 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -75,7 +75,7 @@ unordered_map ID_to_taxon_map; unordered_map SeqId_added; KrakenDB Database; const size_t hll_prec = 10; -TaxonomyDB taxdb; +TaxonomyDB taxdb; const string prefix = "kraken:taxid|"; unordered_set host_taxids = {9606}; @@ -90,13 +90,8 @@ int main(int argc, char **argv) { parse_command_line(argc, argv); if (!TaxDB_filename.empty()) { - taxdb = TaxonomyDB(TaxDB_filename); - for (const auto & tax : taxdb.taxIDsAndEntries) { - if (tax.first != 0) - Parent_map[tax.first] = tax.second.parentTaxonomyID; -// Children_map[tax.second.parentTaxonomyID].insert(tax.first); - } - Parent_map[1] = 0; + taxdb = TaxonomyDB(TaxDB_filename); + Parent_map = taxdb.getParentMap(); } else { cerr << "TaxDB argument is required!" << endl; return 1; @@ -145,8 +140,8 @@ int main(int argc, char **argv) { ofstream ofs(Kmer_count_filename.c_str()); cerr << "Writing kmer counts to " << Kmer_count_filename << "..." << endl; auto counts = Database.count_taxons(); - for (auto const & kv : counts) { - ofs << kv.first << '\t' << kv.second << '\n'; + for (auto it = counts.begin(); it != counts.end(); ++it) { + ofs << it->first << '\t' << it->second << '\n'; } ofs.close(); } @@ -199,7 +194,7 @@ uint32_t get_new_taxid( } unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_filename, - TaxonomyDB& taxdb, unordered_map& Parent_map, + TaxonomyDB& taxdb, unordered_map& Parent_map, bool Add_taxIds_for_Assembly, bool Add_taxIds_for_Sequences) { cerr << "Reading sequence ID to taxonomy ID mapping ... "; @@ -213,9 +208,9 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi uint32_t taxid; if (Add_taxIds_for_Assembly || Add_taxIds_for_Sequences) { - for (const auto& k : taxdb.taxIDsAndEntries) { - if (k.first >= New_taxid_start) { - New_taxid_start = k.first; + for (auto it = taxdb.entries.begin(); it != taxdb.entries.end(); ++it) { + if (it->first >= New_taxid_start) { + New_taxid_start = it->first+100; } } cerr << "Starting new taxonomy IDs with " << (New_taxid_start+1) << endl; @@ -312,8 +307,8 @@ void process_single_file() { bool is_contaminant_taxid = taxid == 32630 || taxid == 81077; if (Add_taxIds_for_Sequences && taxid != 9606 && it_p->second != 9606) { // Update entry based on header line - auto entryIt = taxdb.taxIDsAndEntries.find(taxid); - if (entryIt == taxdb.taxIDsAndEntries.end()) { + auto entryIt = taxdb.entries.find(taxid); + if (entryIt == taxdb.entries.end()) { cerr << "Error! Didn't find taxid " << taxid << " in TaxonomyDB - can't update it!! 
["<second.scientificName = dna.header_line; diff --git a/src/taxdb.h b/src/taxdb.h index b522866..608bd33 100644 --- a/src/taxdb.h +++ b/src/taxdb.h @@ -30,9 +30,25 @@ #include #include #include +#include +#include #include "report-cols.h" +//#include "readcounts.hpp" + using namespace std; +//using kraken::ReadCounts; + +namespace patch +{ + template < typename T > std::string to_string( const T& n ) + { + std::ostringstream stm ; + stm << n ; + return stm.str() ; + } +} + void log_msg (const std::string& s); @@ -113,7 +129,7 @@ struct TaxRank { case RANK::superkingdom: return "superkingdom"; case RANK::root: return "root"; default: - log_msg("Invalid rank!"); + log_msg("Invalid rank!\n"); } return "NA"; } @@ -153,37 +169,32 @@ const unordered_map TaxRank::string_to_rank = { }; -template +template class TaxonomyEntry { public: - TAXID taxonomyID = 0; - TAXID parentTaxonomyID = 0; + TAXID taxonomyID; + TaxonomyEntry* parent; + std::vector children; + string rank; std::string scientificName; + uint64_t genomeSize; + uint64_t genomeSizeOfChildren; - TaxonomyEntry() {} + TaxonomyEntry() : taxonomyID(0), parent(NULL), genomeSize(0), genomeSizeOfChildren(0) {} - TaxonomyEntry(TAXID taxonomyID_, std::string scientificName_) : - taxonomyID(taxonomyID_), scientificName(scientificName_) {} + TaxonomyEntry(TAXID taxonomyID_, TaxonomyEntry* parent_, std::string rank_, std::string scientificName_, uint64_t genomeSize_ = 0, uint64_t genomeSizeOfChildren_ = 0) : + taxonomyID(taxonomyID_), parent(parent_), rank(rank_), scientificName(scientificName_), + genomeSize(genomeSize_), genomeSizeOfChildren(genomeSizeOfChildren_) { + + if (parent_ != NULL) { + parent->children.push_back(this); + } - TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_) : - taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_) {} - - TaxonomyEntry(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_, uint64_t genomeSize_ = 0, uint64_t genomeSizeOfChildren_ = 0) : - taxonomyID(taxonomyID_), parentTaxonomyID(parentTaxonomyID_), rank(rank_), scientificName(scientificName_), - genomeSize(genomeSize_), genomeSizeOfChildren(genomeSizeOfChildren_) {} + } inline bool operator==(const TaxonomyEntry& other) const; - TaxonomyEntry* parent = nullptr; - std::vector children; - - READCOUNTS readCounts = READCOUNTS(); - READCOUNTS readCountsOfChildren = READCOUNTS(); - bool used = false; - uint64_t genomeSize = 0; - uint64_t genomeSizeOfChildren = 0; - uint64_t numBelow = 0; }; //template<> @@ -192,13 +203,15 @@ class TaxonomyEntry { // readCountsOfChildren = 0; //} -template +/* +template struct TaxonomyEntryPtr_comp { - bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const; + bool operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const; }; +*/ -template +template class TaxonomyDB { public: TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName); @@ -221,7 +234,7 @@ class TaxonomyDB { std::unordered_map getScientificNameMap() const; std::string getLineage(TAXID taxonomyID) const; std::string getMetaPhlAnLineage(TAXID taxonomyID) const; - TaxonomyEntry getEntry(TAXID taxID) const; + TaxonomyEntry getEntry(TAXID taxID) const; bool insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_); bool hasTaxon(TAXID taxonomyID_); @@ -232,23 +245,24 @@ class TaxonomyDB { int isBelowInTree(TAXID upper, TAXID lower) const; void setGenomeSizes(const std::unordered_map & 
genomeSizes); - void setReadCounts(const std::unordered_map& readCounts); void readGenomeSizes(string file); void setGenomeSize(const TAXID taxid, const uint64_t genomeSize); - void addReadCount(const TAXID taxid, const READCOUNTS& readCounts_); void printReport(); - std::unordered_map > taxIDsAndEntries; - bool genomeSizes_are_set = false; + std::unordered_map > entries; + bool genomeSizes_are_set; private: - std::unordered_map > + std::unordered_map > readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes); void parseNamesDump(const std::string namesDumpFileName); - void parseNodesDump(const std::string nodesDumpFileName); - void createPointers(std::unordered_map >& taxIDsAndEntries); + std::unordered_map parseNodesDump(const std::string nodesDumpFileName); + void createPointers( + std::unordered_map >& entries, + const std::unordered_map& parentMap + ); }; @@ -256,15 +270,17 @@ template class TaxReport { private: std::ostream& _reportOfb; - TaxonomyDB & _taxdb; + TaxonomyDB & _taxdb; + std::unordered_map _readCounts; + std::unordered_map _readCountsIncludingChildren; uint64_t _total_n_reads; bool _show_zeros; - void printLine(TaxonomyEntry& tax, unsigned depth); + void printLine(TaxonomyEntry& tax, unsigned depth); public: - TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, bool _show_zeros); + TaxReport(std::ostream& _reportOfb, TaxonomyDB & taxdb, std::unordered_map, bool _show_zeros); void printReport(std::string format, std::string rank); - void printReport(TaxonomyEntry& tax, unsigned depth); + void printReport(TaxonomyEntry& tax, unsigned depth); void setReportCols(std::vector names); std::vector _report_col_names; @@ -277,7 +293,7 @@ V find_or_use_default(const std::unordered_map& my_map, const K& query, co //////////////////////////// DEFINITIONS void log_msg (const std::string& s) { - std::cerr << s << "\n"; + std::cerr << s; } template @@ -311,8 +327,10 @@ std::vector in_betweens(const std::string &s, const char start_char next_start = s.find(start_char, next_end + 1), ++i) { next_end = s.find(end_char, next_start + 1); - if (next_end == string::npos) - throw std::runtime_error("unmatched start and end!"); + if (next_end == string::npos) { + cerr << "unmatched start and end!"; + exit(1); + } tokens.push_back(s.substr(next_start+1, next_end-1)); } @@ -368,14 +386,18 @@ std::vector get_fields(const std::string &s, const std::string& del // readCounts = 0; // readCountsOfChildren = 0; //} -template -bool TaxonomyEntryPtr_comp::operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { - return ((reads(a->readCounts)+reads(a->readCountsOfChildren)) > (reads(b->readCounts)+reads(b->readCountsOfChildren))); - } +/* +template +bool TaxonomyEntryPtr_comp::operator() ( const TaxonomyEntry* a, const TaxonomyEntry* b) const { -template -TAXID TaxonomyDB::getByScientificName(string name) const { - for (const auto & tax : taxIDsAndEntries) { + return ( + (reads(a->readCounts)+reads(a->readCountsOfChildren)) > (reads(b->readCounts)+reads(b->readCountsOfChildren))); + } +*/ +/* +template +TAXID TaxonomyDB::getByScientificName(string name) const { + for (const auto & tax : entries) { if (tax.second.scientificName == name) { return tax.first; } @@ -383,96 +405,112 @@ TAXID TaxonomyDB::getByScientificName(string name) const { return 0; } -template -std::unordered_map TaxonomyDB::getScientificNameMap() const { +template +std::unordered_map TaxonomyDB::getScientificNameMap() const { std::unordered_map scientificNameMap; - for (const auto & tax : taxIDsAndEntries) { + 
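+    // One linear pass over all taxonomy entries inverts the taxid->name
+    // relation; callers should cache the returned map rather than rebuilding it.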
for (const auto & tax : entries) { scientificNameMap[tax.second.scientificName] = tax.first; } return scientificNameMap; } +*/ -template -unordered_map TaxonomyDB::getParentMap() const { +template +unordered_map TaxonomyDB::getParentMap() const { unordered_map Parent_map; - for (const auto & tax : taxIDsAndEntries) { - if (tax.first != 0) - Parent_map[tax.first] = tax.second.parentTaxonomyID; + //for (const auto & tax : entries) { + for (auto it = entries.begin(); it != entries.end(); ++it) { + const auto&tax = *it; + if (tax.first != 0) + continue; + if (tax.second.parent == NULL) + Parent_map[tax.first] = 0; // for kraken::lca + else + Parent_map[tax.first] = tax.second.parent->taxonomyID; } - Parent_map[1] = 1; return Parent_map; } -template -TaxonomyEntry TaxonomyDB::getEntry(TAXID taxID) const { - auto it = taxIDsAndEntries.find(taxID); - if (it == taxIDsAndEntries.end()) { - TaxonomyEntry ti { 0, 0, "NA"}; +template +TaxonomyEntry TaxonomyDB::getEntry(TAXID taxID) const { + auto it = entries.find(taxID); + if (it == entries.end()) { + TaxonomyEntry ti { 0, 0, "NA"}; return ti; } else { return it->second; } } -template -void TaxonomyDB::createPointers(std::unordered_map >& taxIDsAndEntries) { - for (auto& tax : taxIDsAndEntries) { - if (tax.second.parentTaxonomyID != tax.first) { - auto parentIt = taxIDsAndEntries.find(tax.second.parentTaxonomyID); - if (parentIt != taxIDsAndEntries.end()) { - tax.second.parent = &(parentIt->second); - parentIt->second.children.push_back(&tax.second); - } - } +template +void TaxonomyDB::createPointers( + std::unordered_map >& entries, + const std::unordered_map& parentMap) { + for (auto it = entries.begin(); it != entries.end(); ++it) { + TAXID taxonomyID = it->first; + TAXID parentTaxonomyID = parentMap.at(taxonomyID); + if (taxonomyID != parentTaxonomyID) { + auto parent_ptr = entries.find(parentTaxonomyID); + if (parent_ptr != entries.end()) { + it->second.parent = &parent_ptr->second; + parent_ptr->second.children.push_back(&it->second); + } else { + cerr << "Could not find parent with taxonomy ID " << parentTaxonomyID << " for taxonomy ID " << taxonomyID << endl; + } + } } } -template -TaxonomyDB::TaxonomyDB() { } +template +TaxonomyDB::TaxonomyDB() : genomeSizes_are_set(false) { } -template -TaxonomyDB::TaxonomyDB(const std::string inFileName, bool hasGenomeSizes) : - taxIDsAndEntries( readTaxonomyIndex_(inFileName, hasGenomeSizes) ), genomeSizes_are_set(hasGenomeSizes) +template +TaxonomyDB::TaxonomyDB(const std::string inFileName, bool hasGenomeSizes) : + entries( readTaxonomyIndex_(inFileName, hasGenomeSizes) ), genomeSizes_are_set(hasGenomeSizes) { } -template -TaxonomyDB::TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName) { +template +TaxonomyDB::TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName) { log_msg("Building taxonomy index from " + nodesDumpFileName + " and " + namesDumpFileName); - parseNodesDump(nodesDumpFileName); + unordered_map parentMap = parseNodesDump(nodesDumpFileName); parseNamesDump(namesDumpFileName); - createPointers(taxIDsAndEntries); - log_msg("Built a tree with " + std::to_string(taxIDsAndEntries.size()) + " taxa"); + createPointers(entries, parentMap); + log_msg(". 
Done, got " + patch::to_string(entries.size()) + " taxa\n"); } -template -void TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { +template +std::unordered_map TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { std::ifstream nodesDumpFile(nodesDumpFileName); if (!nodesDumpFile.is_open()) throw std::runtime_error("unable to open nodes file"); + std::string line; TAXID taxonomyID; TAXID parentTaxonomyID; std::string rank; char delim; + std::unordered_map parentMap; while (nodesDumpFile >> taxonomyID >> delim >> parentTaxonomyID) { nodesDumpFile.ignore(3); getline(nodesDumpFile, rank, '\t'); - auto entryIt = taxIDsAndEntries.find(taxonomyID); - if (entryIt == taxIDsAndEntries.end()) { - taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, parentTaxonomyID, rank); + auto entryIt = entries.find(taxonomyID); + if (entryIt == entries.end()) { + entries[taxonomyID] = TaxonomyEntry(taxonomyID, NULL, rank, ""); + parentMap[taxonomyID] = parentTaxonomyID; } else { - entryIt->second.parentTaxonomyID = parentTaxonomyID; + parentMap[taxonomyID] = parentTaxonomyID; entryIt->second.rank = rank; } nodesDumpFile.ignore(2560, '\n'); } + return parentMap; } -template -void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { +template +void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { std::ifstream namesDumpFile(namesDumpFileName); if (!namesDumpFile.is_open()) throw std::runtime_error("unable to open names file"); @@ -490,9 +528,9 @@ void TaxonomyDB::parseNamesDump(const std::string namesDumpFil getline(namesDumpFile, type, '\t'); if (type == "scientific name") { - auto entryIt = taxIDsAndEntries.find(taxonomyID); - if (entryIt == taxIDsAndEntries.end()) { - taxIDsAndEntries[taxonomyID] = TaxonomyEntry(taxonomyID, scientificName); + auto entryIt = entries.find(taxonomyID); + if (entryIt == entries.end()) { + entries[taxonomyID] = TaxonomyEntry(taxonomyID, NULL, "", scientificName); } else { entryIt->second.scientificName = scientificName; } @@ -502,21 +540,24 @@ void TaxonomyDB::parseNamesDump(const std::string namesDumpFil } template -std::vector getSortedKeys(const std::unordered_map& unordered) { +std::vector getSortedKeys(const std::unordered_map& my_unordered_map) { std::vector keys; - keys.reserve (unordered.size()); - for (auto& it : unordered) { - keys.push_back(it.first); + keys.reserve (my_unordered_map.size()); + for (auto it = my_unordered_map.begin(); it != my_unordered_map.end(); ++it) { + keys.push_back(it->first); } std::sort (keys.begin(), keys.end()); return keys; } -template -void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { - for (TAXID& key : getSortedKeys(taxIDsAndEntries)) { - const auto& entry = taxIDsAndEntries.at(key); - outs << key << '\t' << entry.parentTaxonomyID << '\t' +template +void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { + std::vector sorted_keys = getSortedKeys(entries); + for (size_t i = 0; i < sorted_keys.size(); ++i) { + TAXID taxonomyID = sorted_keys[i]; + const auto& entry = entries.at(taxonomyID); + TAXID parentTaxonomyID = (entry.parent==NULL? 
taxonomyID : entry.parent->taxonomyID); + outs << taxonomyID << '\t' << parentTaxonomyID << '\t' << entry.scientificName << '\t' << entry.rank; if (genomeSizes_are_set) { outs << '\t' << entry.genomeSize << '\t' << entry.genomeSizeOfChildren; @@ -526,29 +567,30 @@ void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const outs.flush(); } -template -void TaxonomyDB::setGenomeSizes(const std::unordered_map & genomeSizes) { - for (const auto& it : genomeSizes) { - setGenomeSize(it.first, it.second); +template +void TaxonomyDB::setGenomeSizes(const std::unordered_map & genomeSizes) { + for (auto it = genomeSizes.begin(); it != genomeSizes.end(); ++it) { + setGenomeSize(it->first, it->second); } genomeSizes_are_set = true; } -template -void TaxonomyDB::readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes) { - taxIDsAndEntries = readTaxonomyIndex_(inFileName, hasGenomeSizes); +template +void TaxonomyDB::readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes) { + entries = readTaxonomyIndex_(inFileName, hasGenomeSizes); genomeSizes_are_set = hasGenomeSizes; } -template -std::unordered_map > - TaxonomyDB::readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes) { +template +std::unordered_map > + TaxonomyDB::readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes) { log_msg("Reading taxonomy index from " + inFileName); std::ifstream inFile(inFileName); if (!inFile.is_open()) throw std::runtime_error("unable to open taxonomy index file " + inFileName); - std::unordered_map > taxIDsAndEntries; + std::unordered_map > entries; + std::unordered_map parentMap; TAXID taxonomyID, parentTaxonomyID; std::string scientificName, rank; uint64_t genomeSize, genomeSizeOfChildren = 0; @@ -557,7 +599,7 @@ std::unordered_map > while (!inFile.eof()) { inFile >> taxonomyID >> parentTaxonomyID; if (taxonomyID > 1 && taxonomyID == parentTaxonomyID) { - cerr << "ERROR: the parent of " << taxonomyID << " is itself. Should not happend!\n"; + cerr << "ERROR: the parent of " << taxonomyID << " is itself. Should not happend for taxa other than the root.\n"; exit(1); } inFile.get(); // read tab @@ -568,22 +610,20 @@ std::unordered_map > } else { std::getline(inFile, rank, '\n'); } - TaxonomyEntry newEntry(taxonomyID, parentTaxonomyID, rank, scientificName, genomeSize, genomeSizeOfChildren); + TaxonomyEntry newEntry(taxonomyID, NULL, rank, scientificName, genomeSize, genomeSizeOfChildren); - //cerr << "inserting " << taxonomyID << ";" << parentTaxonomyID << ";" << rank << ";" << scientificName << endl; - taxIDsAndEntries.insert({ - taxonomyID, newEntry - }); + auto insert_res = entries.insert({ taxonomyID, newEntry }); + parentMap[taxonomyID] = parentTaxonomyID; } - taxIDsAndEntries.insert({0, {0, 0, "no rank", "unclassified" }}); - //taxIDsAndEntries.insert({-1, {-1, 0, "no rank", "uncategorized" }}); - createPointers(taxIDsAndEntries); - log_msg("done reading TaxDB, read " + std::to_string(taxIDsAndEntries.size()) + " taxa"); - return(taxIDsAndEntries); + entries.insert({0, {0, NULL, "no rank", "unclassified" }}); + //entries.insert({-1, {-1, 0, "no rank", "uncategorized" }}); + createPointers(entries, parentMap); + log_msg(". 
Done, read " + patch::to_string(entries.size()) + " taxa.\n"); + return(entries); } -template -string TaxonomyDB::getNextProperRank(TAXID a) const { +template +string TaxonomyDB::getNextProperRank(TAXID a) const { if (a == 0) { return "NA"; } @@ -596,8 +636,8 @@ string TaxonomyDB::getNextProperRank(TAXID a) const { return getRank(a); } -template -TAXID TaxonomyDB::getTaxIDAtNextProperRank(TAXID a) const { +template +TAXID TaxonomyDB::getTaxIDAtNextProperRank(TAXID a) const { if (a == 0 || a == 1) { return 0; } @@ -607,8 +647,8 @@ TAXID TaxonomyDB::getTaxIDAtNextProperRank(TAXID a) const { return a; } -template -pair TaxonomyDB::getLowestCommonAncestor(TAXID a, TAXID b) const { +template +pair TaxonomyDB::getLowestCommonAncestor(TAXID a, TAXID b) const { if (a == 0 || b == 0) { return a ? pair(a,-1) : pair(b,-1); } @@ -637,10 +677,10 @@ pair TaxonomyDB::getLowestCommonAncestor(TAXID a, T return pair(1, distA+distB); } +/* - -template -TAXID TaxonomyDB::getLowestCommonAncestor( +template +TAXID TaxonomyDB::getLowestCommonAncestor( const std::vector& taxIDs) const { if (taxIDs.size() == 0) { return 0; @@ -680,30 +720,30 @@ TAXID TaxonomyDB::getLowestCommonAncestor( } return consensus; } +*/ - -template -bool TaxonomyDB::hasTaxon(TAXID taxonomyID_) { - return taxIDsAndEntries.find(taxonomyID_) != taxIDsAndEntries.end(); +template +bool TaxonomyDB::hasTaxon(TAXID taxonomyID_) { + return entries.find(taxonomyID_) != entries.end(); } -template -bool TaxonomyDB::insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, +template +bool TaxonomyDB::insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_) { if (parentTaxonomyID_ == taxonomyID_) { return false; } - auto parentIt = taxIDsAndEntries.find(parentTaxonomyID_); - if (parentIt == taxIDsAndEntries.end()) { + auto parentIt = entries.find(parentTaxonomyID_); + if (parentIt == entries.end()) { cerr << "ERROR with taxon [" << taxonomyID_ <<";"< newEntry(taxonomyID_, parentTaxonomyID_, rank_, scientificName_, 0, 0); + TaxonomyEntry newEntry(taxonomyID_, &parentIt->second, rank_, scientificName_, 0, 0); newEntry.parent = &(parentIt->second); - auto insert_res = taxIDsAndEntries.insert({taxonomyID_, newEntry}); + auto insert_res = entries.insert({taxonomyID_, newEntry}); if (insert_res.second) { parentIt->second.children.push_back(&insert_res.first->second); } @@ -711,35 +751,35 @@ bool TaxonomyDB::insert(TAXID taxonomyID_, TAXID parentTaxono } -template -TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { - auto entry = taxIDsAndEntries.find(taxID); - if (entry != taxIDsAndEntries.end() && entry->second.parentTaxonomyID != 1) - return entry->second.parentTaxonomyID; +template +TAXID TaxonomyDB::getParentTaxID(const TAXID taxID) const { + auto entry = entries.find(taxID); + if (entry != entries.end() && entry->second.parent != NULL) + return entry->second.parent->taxonomyID; else return 0; } -template -std::string TaxonomyDB::getScientificName(const TAXID taxID) const { - auto entry = taxIDsAndEntries.find(taxID); - if (entry != taxIDsAndEntries.end()) { +template +std::string TaxonomyDB::getScientificName(const TAXID taxID) const { + auto entry = entries.find(taxID); + if (entry != entries.end()) { return entry->second.scientificName; } else return std::string(); } -template -std::string TaxonomyDB::getRank(const TAXID taxID) const { - auto entry = taxIDsAndEntries.find(taxID); - if (entry != taxIDsAndEntries.end()) { +template +std::string TaxonomyDB::getRank(const TAXID taxID) const { + auto entry = 
entries.find(taxID); + if (entry != entries.end()) { return entry->second.rank; } else return std::string(); } -template -std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { +template +std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { std::string lineage; while (true) { // 131567 = Cellular organisms @@ -757,8 +797,8 @@ std::string TaxonomyDB::getLineage(TAXID taxonomyID) const { return lineage; } -template -std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { +template +std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const { std::string rank = getRank(taxonomyID); if (rank == "superphylum") return std::string(); std::string lineage; @@ -798,102 +838,51 @@ std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) return lineage; } -template -TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, +template +TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID, const std::string& rank) const { if (taxID == 0 || taxID == 1) return 0; - auto entry = taxIDsAndEntries.find(taxID); + auto entry_it = entries.find(taxID); // cerr << "getTaxIDAtRank(" << taxID << "," << rank << ")" << endl; - while (entry != taxIDsAndEntries.end() - && entry->second.parentTaxonomyID != 1 - && entry->second.parentTaxonomyID != entry->first) { + if (entry_it != entries.end()) { + const TaxonomyEntry* entry_ptr = &entry_it->second; + while (entry_ptr != NULL + && entry_ptr->parent != NULL) { // cerr << "Checking rank of " << entry->second.taxonomyID << ": " << entry->second.rank << endl; - if (entry->second.rank == rank) { - return entry->second.taxonomyID; + if (entry_ptr->rank == rank) { + return entry_ptr->taxonomyID; } else { - entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); + entry_ptr = entry_ptr->parent; } } + } return 0; } -template -int TaxonomyDB::isBelowInTree(TAXID upper, TAXID lower) const { - auto entry = taxIDsAndEntries.find(lower); - unsigned level = 0; - while (entry != taxIDsAndEntries.end() && - entry->second.parentTaxonomyID != 1) { - if (entry->first == upper) { - return level; - } else { - entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); - level++; - } - } - return -1; -} -template -bool TaxonomyDB::isSubSpecies(TAXID taxonomyID) const { - bool isSubSpecies = false; - auto entry = taxIDsAndEntries.find(taxonomyID); - int numLevels = 0; - while (entry != taxIDsAndEntries.end() && - entry->second.parentTaxonomyID != 1) { - if (entry->second.rank == "species") { - if (numLevels > 0) { - isSubSpecies = true; - } - break; - } else - entry = taxIDsAndEntries.find(entry->second.parentTaxonomyID); - numLevels++; - } - return isSubSpecies; -} - -template -void TaxonomyDB::addReadCount(const TAXID taxid, const READCOUNTS& readCounts_) { - auto it = taxIDsAndEntries.find(taxid); - if (it == taxIDsAndEntries.end()) { +template +void TaxonomyDB::setGenomeSize(const TAXID taxid, const uint64_t genomeSize) { + auto it = entries.find(taxid); + if (it == entries.end()) { cerr << "No taxonomy entry for " << taxid << "!!" 
<< endl;
     return;
   }
-  TaxonomyEntry* tax = &it->second;
+  TaxonomyEntry* tax = &it->second;
   tax->genomeSize += genomeSize;
 
-  while (tax->parent != nullptr) {
+  while (tax->parent != NULL) {
     tax = tax->parent;
     //std::cerr << "setting genomeSizeOfChildren of parent" << std::endl;
     tax->genomeSizeOfChildren += genomeSize;
   }
 }
 
-template
-void TaxonomyDB::readGenomeSizes(string file) {
-  for (auto& entry : taxIDsAndEntries) {
-    entry.second.genomeSize = 0;
-    entry.second.genomeSizeOfChildren = 0;
+template
+void TaxonomyDB::readGenomeSizes(string file) {
+  for (auto entry_it = entries.begin(); entry_it != entries.end(); ++entry_it) {
+    entry_it->second.genomeSize = 0;
+    entry_it->second.genomeSizeOfChildren = 0;
   }
   log_msg("Reading genome sizes from " + file);
   std::ifstream inFile(file);
@@ -907,21 +896,33 @@ void TaxonomyDB::readGenomeSizes(string file) {
   }
 }
 
+/*
 template
-void TaxonomyDB::setReadCounts(const unordered_map& readCounts) {
+void TaxonomyDB::setReadCounts(const unordered_map& readCounts) {
   for (auto& elem : readCounts) {
     addReadCount(elem.first, elem.second);
   }
 
-  for (auto& tax : taxIDsAndEntries) {
-    std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp());
+  for (auto& tax : entries) {
+    std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp());
   }
 }
+*/
 
 template
- TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb,
-   bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _show_zeros(show_zeros) {
+ TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb,
+   std::unordered_map readCounts,
+   bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _readCounts(readCounts), _show_zeros(show_zeros) {
+
+  for (auto it = _readCounts.begin(); it != _readCounts.end(); ++it) {
+    TaxonomyEntry* tax = &taxdb.entries.at(it->first);
+    while (tax != NULL) {
+      _readCountsIncludingChildren[tax->taxonomyID] += it->second;
+      tax = tax->parent;
+    }
+  }
+
   _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_KMERS_CLADE, REPORTCOLS::NUM_UNIQUE_KMERS_CLADE, REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE, REPORTCOLS::TAX_RANK, REPORTCOLS::TAX_ID,
@@ -932,7 +933,8 @@ template
 template
 void TaxReport::setReportCols(std::vector names) {
   _report_cols.clear();
-  for (auto& s : names) {
+  for (size_t i = 0; i< names.size(); ++i) {
+    auto& s = names[i];
     auto it = report_col_name_map.find(s);
     if (it == report_col_name_map.end()) {
       throw std::runtime_error(s + " is not a valid report column name");
@@ -945,11 +947,7 @@ void TaxReport::setReportCols(std::vector names)
 
 template
 void TaxReport::printReport(std::string format, std::string rank) {
-  _total_n_reads =
-    reads(_taxdb.taxIDsAndEntries.at(0).readCounts) +
-    reads(_taxdb.taxIDsAndEntries.at(0).readCountsOfChildren) +
-    reads(_taxdb.taxIDsAndEntries.at(1).readCounts) +
-    reads(_taxdb.taxIDsAndEntries.at(1).readCountsOfChildren);// +
+  _total_n_reads = 
reads(_readCountsIncludingChildren[0]) + reads(_readCountsIncludingChildren[1]);
   if (_total_n_reads == 0) {
     std::cerr << "total number of reads is zero - not creating a report!" << endl;
     return;
   }
@@ -957,7 +955,8 @@ void TaxReport::printReport(std::string format, std::string ra
   if (_report_cols.size() == _report_col_names.size()) {
     // print header
     bool first_one = true;
-    for (std::string s : _report_col_names) {
+    for (size_t i=0; i < _report_col_names.size(); ++i) {
+      const std::string& s = _report_col_names[i];
       if (first_one) {
         first_one = false;
       } else {
@@ -970,12 +969,12 @@ void TaxReport::printReport(std::string format, std::string ra
 
   if (format == "kraken") {
     // A: print number of unidentified reads
-    printReport(_taxdb.taxIDsAndEntries.at(0),0u);
+    printReport(_taxdb.entries.at(0),0u);
     // B: print normal results
-    printReport(_taxdb.taxIDsAndEntries.at(1),0u);
+    printReport(_taxdb.entries.at(1),0u);
     // C: Print Unclassified stuff
-    auto it = _taxdb.taxIDsAndEntries.find(-1);
-    if (it != _taxdb.taxIDsAndEntries.end()) {
+    auto it = _taxdb.entries.find(-1);
+    if (it != _taxdb.entries.end()) {
       printReport(it->second,0u);
     }
   } else {
@@ -986,40 +985,69 @@ void TaxReport::printReport(std::string format, std::string ra
   }
 }
 
+template
+struct CompareReadCounts : std::binary_function {
+  CompareReadCounts(std::vector& counts_) : counts(counts_) {}
+
+  bool operator()(size_t a, size_t b) const {
+    if (counts[a]->n_reads == counts[b]->n_reads) {
+      return counts[a]->n_kmers < counts[b]->n_kmers;
+    } else {
+      return counts[a]->n_reads < counts[b]->n_reads;
+    }
+  }
+
+  std::vector& counts;
+};
+
 template
-void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) {
-  if (_show_zeros || (reads(tax.readCounts)+reads(tax.readCountsOfChildren)) > 0) {
+void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) {
+  if (_show_zeros || reads(_readCountsIncludingChildren[tax.taxonomyID]) > 0) {
     printLine(tax, depth);
 
-    for (auto child : tax.children)
-      printReport(*child, depth+1);
+    // Order children by read count before printing them
+
+    std::vector pos(tax.children.size());
+    std::vector counts(tax.children.size());
+    for (size_t i=0; i < tax.children.size(); ++i) {
+      pos[i] = i;
+      counts[i] = &_readCountsIncludingChildren[ tax.children[i]->taxonomyID ];
+    }
+
+    std::sort(pos.begin(), pos.end(), CompareReadCounts(counts));
+
+    for (size_t i=0; i < tax.children.size(); ++i) {
+      auto child_it = tax.children[ pos[i] ];
+      printReport(*child_it, depth+1);
+    }
   }
 }
 
 template
-void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) {
+void TaxReport::printLine(TaxonomyEntry& tax, unsigned depth) {
 
-  long long unique_kmers_for_clade = ( tax.readCounts.kmers.cardinality() + tax.readCountsOfChildren.kmers.cardinality());
+  long long unique_kmers_for_clade = _readCountsIncludingChildren[tax.taxonomyID].kmers.cardinality();
   double genome_size = double(tax.genomeSize+tax.genomeSizeOfChildren);
 
-  for (auto& col : _report_cols) {
+  for (size_t i = 0; i< _report_cols.size(); ++i) {
+    auto& col = _report_cols[i];
     switch (col) {
     case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break;
     case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break;
     case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << setprecision(4) << 100.0*(reads(tax.readCounts) + reads(tax.readCountsOfChildren))/_total_n_reads; break; + case REPORTCOLS::PERCENTAGE: _reportOfb << setprecision(4) << 100.0*(reads(_readCountsIncludingChildren[tax.taxonomyID]))/_total_n_reads; break; //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; - case REPORTCOLS::NUM_READS: _reportOfb << reads(tax.readCounts); break; - case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(tax.readCounts) + reads(tax.readCountsOfChildren)); break; - case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << tax.readCounts.kmers.cardinality(); break; + case REPORTCOLS::NUM_READS: _reportOfb << reads(_readCounts[tax.taxonomyID]); break; + case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(_readCountsIncludingChildren[tax.taxonomyID])); break; + case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << _readCounts[tax.taxonomyID].kmers.cardinality(); break; case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << unique_kmers_for_clade; break; - case REPORTCOLS::NUM_KMERS: _reportOfb << tax.readCounts.n_kmers; break; - case REPORTCOLS::NUM_KMERS_CLADE: _reportOfb << tax.readCounts.n_kmers + tax.readCountsOfChildren.n_kmers; break; + case REPORTCOLS::NUM_KMERS: _reportOfb << _readCounts[tax.taxonomyID].n_kmers; break; + case REPORTCOLS::NUM_KMERS_CLADE: _reportOfb << _readCountsIncludingChildren[tax.taxonomyID].n_kmers; break; case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break; case REPORTCOLS::CLADE_KMER_COVERAGE: if (genome_size == 0) { _reportOfb << "NA"; } else { _reportOfb << setprecision(4) << (unique_kmers_for_clade / genome_size); }; break; - case REPORTCOLS::CLADE_KMER_DUPLICITY: _reportOfb << setprecision(3) << ( double(tax.readCounts.n_kmers + tax.readCountsOfChildren.n_kmers) / unique_kmers_for_clade ); break; + case REPORTCOLS::CLADE_KMER_DUPLICITY: _reportOfb << setprecision(3) << ( double(_readCountsIncludingChildren[tax.taxonomyID].n_kmers) / unique_kmers_for_clade ); break; case REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break; //case REPORTCOLS::GENOME_SIZE: ; break; //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; diff --git a/src/uid_mapping.cpp b/src/uid_mapping.cpp index d2100d3..2914468 100644 --- a/src/uid_mapping.cpp +++ b/src/uid_mapping.cpp @@ -1,5 +1,6 @@ #include +#include #include "uid_mapping.hpp" #include "krakenutil.hpp" #include "assert_helpers.h" @@ -83,15 +84,18 @@ namespace kraken { for (auto it = uid_hit_counts.begin(); it != uid_hit_counts.end(); ++it) { uint32_t uid = it->first; double frac_count = ((double)it->second / (double)UID_to_taxids_vec[uid-1].size()); - for (auto taxid : UID_to_taxids_vec[uid-1]) { - taxid_counts[taxid] += it->second; - frac_taxid_counts[taxid] += frac_count; + //for (auto taxid : UID_to_taxids_vec[uid-1]) { + for (auto taxid_it = UID_to_taxids_vec[uid-1].begin(); taxid_it != UID_to_taxids_vec[uid-1].end(); ++taxid_it) { // supporting gcc 4.4 + taxid_counts[*taxid_it] += it->second; + frac_taxid_counts[*taxid_it] += frac_count; } } vector max_taxids; uint32_t max_count = 0; double max_frac_count = 0; - for (auto it : taxid_counts) { + // for (auto it : taxid_counts) { + for (auto itt = taxid_counts.begin(); itt != taxid_counts.end(); ++itt) { // supporting gcc 4.4 + const auto& it = *itt; 
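+        // A UID's hit count is credited to every taxid in its set, so ties on
+        // the raw count are broken by the fractional count (hits divided by the
+        // set size); taxids that tie on both are collected in max_taxids.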
      if (it.second == max_count) {
        if (frac_taxid_counts[it.first] == max_frac_count) {
          max_taxids.push_back(it.first);
@@ -131,7 +135,8 @@ namespace kraken {
       return(0);
     }
 
-    for (const auto& it : uid_hit_counts) {
+    for (auto it1=uid_hit_counts.begin(); it1 != uid_hit_counts.end(); ++it1) { // supporting gcc 4.4
+      const auto &it = *it1;
       if (it.first == 0) {
         continue;
       }
@@ -139,7 +144,8 @@
       vector<uint32_t> taxids = get_taxids_for_uid(it.first, fptr);
       double frac_count = (double)it.second / (double)taxids.size();
-      for (uint32_t taxid : taxids) {
+      for (size_t i = 0; i < taxids.size(); ++i) { // supporting gcc 4.4
+        uint32_t taxid = taxids[i];
         frac_taxid_counts[taxid] += frac_count;
         taxid_counts[taxid] += it.second;
       }
@@ -151,7 +157,8 @@
     vector<uint32_t> max_taxids;
     uint32_t max_count = 0;
     double max_frac_count = 0;
-    for (auto it : taxid_counts) {
+    for (auto it1 = taxid_counts.begin(); it1 != taxid_counts.end(); ++it1) {
+      const auto& it = *it1;
       if (it.second == max_count) {
         if (frac_taxid_counts[it.first] == max_frac_count) {
           max_taxids.push_back(it.first);
diff --git a/src/uid_mapping.hpp b/src/uid_mapping.hpp
index 1f84c40..93d1680 100644
--- a/src/uid_mapping.hpp
+++ b/src/uid_mapping.hpp
@@ -20,9 +20,12 @@ using namespace std;
 //  - write the mapping to UID_map_file
 //
 
-using TaxidSet = vector<uint32_t>;
+//using TaxidSet = typename std::vector<uint32_t>;
+typedef std::vector<uint32_t> TaxidSet;
 
 namespace kraken {
+
+
   uint32_t uid_mapping(
       map< TaxidSet, uint32_t>& Taxids_to_UID_map,
       vector< const TaxidSet* >& UID_to_taxids_vec,

From d5b8dc2a3756a1c1518ac82a19529b27a832aff4 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Sun, 5 Nov 2017 12:52:00 -0500
Subject: [PATCH 090/105] Make sure TaxDB is not copied

---
 src/build_taxdb.cpp          |  6 +--
 src/grade_classification.cpp |  4 +-
 src/set_lcas.cpp             |  3 +-
 src/taxdb.h                  | 91 ++++++++++++++++++++++-----------
 tests/build-dbs.sh           | 99 ++++++++++++++++++++++++++++--------
 5 files changed, 145 insertions(+), 58 deletions(-)

diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp
index fc81cec..263de5c 100644
--- a/src/build_taxdb.cpp
+++ b/src/build_taxdb.cpp
@@ -35,12 +35,12 @@ int main(int argc, char **argv) {
       << "build_taxdb taxDB\n";
     return 1;
   }
-
+
   TaxonomyDB<uint32_t> taxdb;
   if (argc == 2) {
-    taxdb = TaxonomyDB<uint32_t> ((string)argv[1]);
+    taxdb = TaxonomyDB<uint32_t> ((string)argv[1]);
   } else {
-    taxdb = TaxonomyDB<uint32_t> ((string)argv[1], (string)argv[2]);
+    taxdb = TaxonomyDB<uint32_t> ((string)argv[1], (string)argv[2]);
   }
   if (argc == 4) {
     ifstream ifs(argv[3]);
diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp
index 5ea7922..148c7e9 100644
--- a/src/grade_classification.cpp
+++ b/src/grade_classification.cpp
@@ -40,7 +40,7 @@ int main(int argc, char **argv) {
     std::cerr << "Usage: grade_classification taxDB seqid2taxid.map classification_file result_file\n";
     return 1;
   }
-  TaxonomyDB<uint32_t> taxdb = TaxonomyDB<uint32_t>(argv[1], false);
+  TaxonomyDB<uint32_t> taxdb (argv[1], false);
 
   unordered_map<string, uint32_t> seqid_map = read_seqid_mapping(argv[2]);
   ofstream out_file(argv[4]);
@@ -123,7 +123,7 @@ int main(int argc, char **argv) {
       // getLowestCommonAncestor returns lca taxon as well as distance between the taxa pair
       lca_taxid_dist = taxdb.getLowestCommonAncestor(seq_taxid, identified_taxid);
       string lca_rank_string = taxdb.getNextProperRank(lca_taxid_dist.first);
-      TaxRank::RANK lca_rank = TaxRank::toRank(lca_rank_string);
+      // TaxRank::RANK lca_rank = TaxRank::toRank(lca_rank_string);
       TaxRank::RANK identified_rank = TaxRank::toRank(taxdb.getRank(identified_taxid));
 
       for (size_t i=0; i < ranks_of_interest.size(); ++i)
      {

diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp
index 1b32721..f4e28f6 100644
--- a/src/set_lcas.cpp
+++ b/src/set_lcas.cpp
@@ -180,11 +180,12 @@ uint32_t get_new_taxid(
   if (it == name_to_taxid_map.end()) {
     uint32_t new_taxid = ++New_taxid_start;
     bool insert_res = taxdb.insert(new_taxid, parent_taxid, rank_name, name);
+    cerr << "Adding assembly: " << name << " with taxid " << new_taxid;
     if (!insert_res) {
       return 0;
     }
+    cerr << "Oida " << (insert_res? "success" : "naaa") << endl;
     // insert_res shows if insert failed, but we don't care
-    // cerr << "Adding assembly: " << name << " with taxid " << new_taxid << endl;
     Parent_map[new_taxid] = parent_taxid;
     name_to_taxid_map[name] = new_taxid;
     return new_taxid;
diff --git a/src/taxdb.h b/src/taxdb.h
index 608bd33..91e60ec 100644
--- a/src/taxdb.h
+++ b/src/taxdb.h
@@ -173,7 +173,7 @@ template<typename TAXID>
 class TaxonomyEntry {
  public:
   TAXID taxonomyID;
-  TaxonomyEntry<TAXID>* parent;
+  TaxonomyEntry<TAXID>* parent;
   std::vector<TaxonomyEntry<TAXID>*> children;
 
   string rank;
@@ -183,7 +183,7 @@
   TaxonomyEntry() : taxonomyID(0), parent(NULL), genomeSize(0), genomeSizeOfChildren(0) {}
 
-  TaxonomyEntry(TAXID taxonomyID_, TaxonomyEntry<TAXID>* parent_, std::string rank_, std::string scientificName_, uint64_t genomeSize_ = 0, uint64_t genomeSizeOfChildren_ = 0) :
+  TaxonomyEntry(TAXID taxonomyID_, TaxonomyEntry<TAXID>* parent_, std::string rank_, std::string scientificName_, uint64_t genomeSize_ = 0, uint64_t genomeSizeOfChildren_ = 0) :
     taxonomyID(taxonomyID_), parent(parent_), rank(rank_), scientificName(scientificName_),
     genomeSize(genomeSize_), genomeSizeOfChildren(genomeSizeOfChildren_) {
@@ -195,8 +195,17 @@
   inline bool operator==(const TaxonomyEntry& other) const;
 
+  friend std::ostream &operator<<(std::ostream &os, const TaxonomyEntry &m) {
+    TAXID parentTaxonomyID = (m.parent == NULL)? m.taxonomyID : m.parent->taxonomyID;
+    os << '[' << m.taxonomyID << ";parent="<< parentTaxonomyID << ";name=" << m.scientificName << ";rank=" << m.rank << ']';
+    return os;
+}
+
 };
 
 
 
 //template<>
 //TaxonomyEntry::TaxonomyEntry () {
 //  readCounts = 0;
@@ -217,6 +226,16 @@ class TaxonomyDB {
   TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName);
   TaxonomyDB(const std::string inFileName, bool hasGenomeSizes = false);
   TaxonomyDB();
+
+  TaxonomyDB(TaxonomyDB&& rhs) : entries(std::move(rhs.entries)) {
+  }
+
+  TaxonomyDB& operator=(TaxonomyDB&& rhs) {
+    entries = std::move(rhs.entries);
+    return *this;
+  }
+
+
   void writeTaxonomyIndex(std::ostream & outs) const;
   void readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes);
@@ -257,12 +276,6 @@
   std::unordered_map<TAXID, TaxonomyEntry<TAXID> > readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes);
 
-  void parseNamesDump(const std::string namesDumpFileName);
-  std::unordered_map<TAXID, TAXID> parseNodesDump(const std::string nodesDumpFileName);
-  void createPointers(
-      std::unordered_map<TAXID, TaxonomyEntry<TAXID> >& entries,
-      const std::unordered_map<TAXID, TAXID>& parentMap
-  );
 };
 
@@ -443,19 +456,24 @@ TaxonomyEntry<TAXID> TaxonomyDB<TAXID>::getEntry(TAXID taxID) const {
 }
 
 template<typename TAXID>
-void TaxonomyDB<TAXID>::createPointers(
+void createPointers(
     std::unordered_map<TAXID, TaxonomyEntry<TAXID> >& entries,
     const std::unordered_map<TAXID, TAXID>& parentMap) {
-  for (auto it = entries.begin(); it != entries.end(); ++it) {
-    TAXID taxonomyID = it->first;
-    TAXID parentTaxonomyID = parentMap.at(taxonomyID);
-    if (taxonomyID != parentTaxonomyID) {
-      auto parent_ptr = entries.find(parentTaxonomyID);
-      if (parent_ptr != entries.end()) {
-        it->second.parent = &parent_ptr->second;
-        parent_ptr->second.children.push_back(&it->second);
-      } else {
-        cerr << "Could not find parent with taxonomy ID " << parentTaxonomyID << " for taxonomy ID " << taxonomyID << endl;
+  for (auto entry_it = entries.begin(); entry_it != entries.end(); ++entry_it) {
+    TAXID taxonomyID = entry_it->first;
+    auto parent_it = parentMap.find(taxonomyID);
+    if (parent_it == parentMap.end()) {
+      cerr << "Cannot find parent for " << taxonomyID << endl;
+    } else {
+      TAXID parentTaxonomyID = parent_it->second;
+      if (taxonomyID != parentTaxonomyID) {
+        auto parent_ptr = entries.find(parentTaxonomyID);
+        if (parent_ptr != entries.end()) {
+          entry_it->second.parent = &parent_ptr->second;
+          parent_ptr->second.children.push_back(&entry_it->second);
+        } else {
+          cerr << "Could not find parent with taxonomy ID " << parentTaxonomyID << " for taxonomy ID " << taxonomyID << endl;
+        }
      }
    }
  }
@@ -470,16 +488,23 @@
 TaxonomyDB<TAXID>::TaxonomyDB(const std::string inFileName, bool hasGenomeSizes) {
 }
 
 template<typename TAXID>
-TaxonomyDB<TAXID>::TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName) {
+unordered_map<TAXID, TaxonomyEntry<TAXID> > readDumps(const std::string namesDumpFileName, const std::string nodesDumpFileName) {
+  std::unordered_map<TAXID, TaxonomyEntry<TAXID> > entries;
   log_msg("Building taxonomy index from " + nodesDumpFileName + " and " + namesDumpFileName);
-  unordered_map<TAXID, TAXID> parentMap = parseNodesDump(nodesDumpFileName);
-  parseNamesDump(namesDumpFileName);
+  unordered_map<TAXID, TAXID> parentMap = parseNodesDump(nodesDumpFileName, entries);
   createPointers(entries, parentMap);
+  parseNamesDump(namesDumpFileName, entries);
   log_msg(". Done, got " + patch::to_string(entries.size()) + " taxa\n");
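// ---- Editor's note (not part of the patch) ---------------------------------
// This is the heart of the "Make sure TaxDB is not copied" change: readDumps
// builds the entries map locally and returns it by value, and the new move
// constructor / move assignment added above let that return value be moved
// (or elided) straight into TaxonomyDB::entries instead of deep-copying an
// unordered_map with millions of TaxonomyEntry objects.
// ---- End editor's note ------------------------------------------------------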
Done, got " + patch::to_string(entries.size()) + " taxa\n"); + return(entries); +} + +template +TaxonomyDB::TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName) : + entries(readDumps(namesDumpFileName, nodesDumpFileName)) { } template -std::unordered_map TaxonomyDB::parseNodesDump(const std::string nodesDumpFileName) { +std::unordered_map parseNodesDump(const std::string nodesDumpFileName, std::unordered_map >& entries) { std::ifstream nodesDumpFile(nodesDumpFileName); if (!nodesDumpFile.is_open()) throw std::runtime_error("unable to open nodes file"); @@ -510,7 +535,7 @@ std::unordered_map TaxonomyDB::parseNodesDump(const std::str } template -void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { +void parseNamesDump(const std::string namesDumpFileName, std::unordered_map >& entries) { std::ifstream namesDumpFile(namesDumpFileName); if (!namesDumpFile.is_open()) throw std::runtime_error("unable to open names file"); @@ -530,7 +555,8 @@ void TaxonomyDB::parseNamesDump(const std::string namesDumpFileName) { if (type == "scientific name") { auto entryIt = entries.find(taxonomyID); if (entryIt == entries.end()) { - entries[taxonomyID] = TaxonomyEntry(taxonomyID, NULL, "", scientificName); + cerr << "Entry for " << taxonomyID << " does not exist - it should!" << '\n'; + //entries[taxonomyID] = TaxonomyEntry(taxonomyID, NULL, "", scientificName); } else { entryIt->second.scientificName = scientificName; } @@ -593,7 +619,8 @@ std::unordered_map > std::unordered_map parentMap; TAXID taxonomyID, parentTaxonomyID; std::string scientificName, rank; - uint64_t genomeSize, genomeSizeOfChildren = 0; + uint64_t genomeSize = 0; + uint64_t genomeSizeOfChildren = 0; std::string line; while (!inFile.eof()) { @@ -1044,10 +1071,14 @@ void TaxReport::printLine(TaxonomyEntry& tax, unsigned case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << unique_kmers_for_clade; break; case REPORTCOLS::NUM_KMERS: _reportOfb << _readCounts[tax.taxonomyID].n_kmers; break; case REPORTCOLS::NUM_KMERS_CLADE: _reportOfb << _readCountsIncludingChildren[tax.taxonomyID].n_kmers; break; - case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break; - case REPORTCOLS::CLADE_KMER_COVERAGE: if (genome_size == 0) { _reportOfb << "NA"; } else { - _reportOfb << setprecision(4) << (unique_kmers_for_clade / genome_size); }; break; - case REPORTCOLS::CLADE_KMER_DUPLICITY: _reportOfb << setprecision(3) << ( double(_readCountsIncludingChildren[tax.taxonomyID].n_kmers) / unique_kmers_for_clade ); break; + case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break; + case REPORTCOLS::CLADE_KMER_COVERAGE: + if (genome_size == 0) { + _reportOfb << "NA"; + } else { + _reportOfb << setprecision(4) << (unique_kmers_for_clade / genome_size); + }; break; + case REPORTCOLS::CLADE_KMER_DUPLICITY: _reportOfb << setprecision(3) << ( double(_readCountsIncludingChildren[tax.taxonomyID].n_kmers) / unique_kmers_for_clade ); break; case REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break; //case REPORTCOLS::GENOME_SIZE: ; break; //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index bfbd3f8..d9284b8 100755 --- a/tests/build-dbs.sh +++ b/tests/build-dbs.sh @@ -6,50 +6,105 @@ set -eu DIR=`pwd` [[ `uname` == "Darwin" ]] && THREADS=4 || THREADS=10 - build_db() { + local PROG=$1; shift local K=$1; shift local MIN=$1; shift local NAM=$1; shift + set -eu + local DB_NAM=refseq-$NAM-k$K - 
-  DB_DIR=$DIR/dbs/$DB_NAM
-
-  mkdir -p $DB_DIR
-  CMD="krakenu-build --kmer-len $K --minimizer-len $MIN --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database"
-  for L in $@; do
-    CMD="$CMD --library-dir=$DIR/data/library/$L"
-  done
-  #if [[ ! -f "$DB_DIR/is.busy" ]]; then
+  DB_DIR=$DIR/dbs-$PROG/$DB_NAM
+
+  if [[ "$PROG" == "kraken" ]]; then
+    mkdir -p $DB_DIR
+    CMD="krakenu-build --kmer-len $K --minimizer-len $MIN --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database"
+    for L in $@; do
+      CMD="$CMD --library-dir=$DIR/data/library/$L"
+    done
+  elif [[ "$PROG" == "kallisto" ]]; then
+    CMD="kallisto index -k $K -i $DB_DIR"
+    for L in $@; do
+      CMD="$CMD $DIR/data/all-$L.fna"
+    done
+  fi
+  if [[ ! -f "$DB_DIR-is.busy" ]]; then
     echo "EXECUTING $CMD"
-    touch $DB_DIR/is.busy
-    $CMD 2>&1 | tee $DIR/dbs/$DB_NAM/build.log
-    if [[ ! -f "$DB_DIR/taxonomy/nodes.dmp" ]]; then
+    touch $DB_DIR-is.busy
+    $CMD 2>&1 | tee $DIR/dbs-$PROG/$DB_NAM-build.log
+    if [[ $PROG == "kraken" && ! -f "$DB_DIR/taxonomy/nodes.dmp" ]]; then
       mkdir -p $DB_DIR/taxonomy
       echo "EXECUTING dump_taxdb $DB_DIR/taxDB $DB_DIR/taxonomy/names.dmp $DB_DIR/nodes.dmp"
       dump_taxdb $DB_DIR/taxDB $DB_DIR/taxonomy/names.dmp $DB_DIR/nodes.dmp
     fi
-    rm $DB_DIR/is.busy
-  #else
-  #  echo "IGNORING $DB_DIR"
-  #fi
+    rm $DB_DIR-is.busy
+  else
+    echo "$DB_DIR-is.busy exists, ignoring directory."
+  fi
 }
 
-K=$1; shift;
+
+VERBOSE=false
+HELP=false
+DRY_RUN=false
+K=31
+THREADS=10
+
+USAGE="
+`basename $0` [options] {kraken,kaiju} {viral|all-viral|prok|oct2017|euk-oct2017}
+
+Options:
+  -k KMER_SIZE  default $K
+  -t THREADS    default $THREADS
+"
+
+OPTS=`getopt -o vhnk:t:p: --long verbose,dry-run,help,kmer-size:,threads:,path: -n 'parse-options' -- "$@"`
+if [ $? != 0 ] ; then echo "Failed parsing options. Usage: $USAGE" >&2 ; exit 1 ; fi
+eval set -- "$OPTS"
+
+while true; do
+  case "$1" in
+    -v | --verbose ) VERBOSE=true; shift ;;
+    -h | --help ) HELP=true; shift ;;
+    -n | --dry-run ) DRY_RUN=true; shift ;;
+    -k | --kmer-size ) K="$2"; shift; shift ;;
+    -t | --threads ) THREADS="$2"; shift; shift ;;
+    -p | --path ) PATH1="$2"; shift; shift ;;
+    -- ) shift; break ;;
+    * ) break ;;
+  esac
+done
+shift $((OPTIND -1))
+
+if [[ "$#" -le 1 ]]; then
+  echo "$USAGE"
+  exit 1
+fi
+
+[[ "$PATH1" != "" ]] && export PATH="$PATH1:$PATH"
+
+PROG=$1
+shift
 
 for VAR in $@; do
   case "$VAR" in
-    viral) build_db $K 12 viral viral ;;
-    all-viral) build_db $K 12 all-viral viral viral-neighbors ;;
-    prok) build_db $K 15 prok archaea-dusted bacteria-dusted ;;
-    oct2017) build_db $K 15 oct2017 archaea-dusted bacteria-dusted viral-dusted viral-neighbors-dusted \
-      vertebrate_mammalian contaminants ;;
-    euk-oct2017)
+    viral) build_db $PROG $K 12 viral viral ;;
+    all-viral) build_db $PROG $K 12 all-viral viral viral-neighbors ;;
+    prok) build_db $PROG $K 15 prok archaea-dusted bacteria-dusted ;;
+    oct2017) build_db $PROG $K 15 oct2017 archaea-dusted bacteria-dusted viral-dusted viral-neighbors-dusted \
+      vertebrate_mammalian contaminants ;;
+    euk-oct2017)
+      DB_DIR=$DIR/dbs/refseq-oct2017-k31
       EUKD=$DIR/dbs/refseq-euk-oct2017-k31
+      if [[ ! -f "$DB_DIR/taxDB" ]]; then
-f "$DB_DIR/taxDB" ]]; then + echo "Build oct2017 database first!"; + exit 1; + fi [[ -d $EUKD ]] || mkdir -p $EUKD [[ -f $EUKD/taxDB ]] || cp -v $DB_DIR/taxDB $EUKD build_db $K euk-oct2017 fungi protozoa ;; - *) echo "Usage: $0 K {viral|all-viral|prok|oct2017|euk-oct2017}" + *) echo "$USAGE" exit 1 ;; esac done From 16813a7e1ee2634c2fb75bb787d323f6732c8c23 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Sun, 5 Nov 2017 15:04:01 -0500 Subject: [PATCH 091/105] Fox parent map generation in taxDB --- src/krakenutil.cpp | 19 +++++++++++++++---- src/set_lcas.cpp | 9 ++++----- src/taxdb.h | 20 +++++++++++--------- tests/build-dbs.sh | 4 +++- 4 files changed, 33 insertions(+), 19 deletions(-) diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index d58cf39..46fd953 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -58,14 +58,25 @@ namespace kraken { return a ? a : b; unordered_set a_path; - while (a > 0) { + while (a > 1) { a_path.insert(a); - a = parent_map.at(a); + auto a_it = parent_map.find(a); + if (a_it == parent_map.end()) { + cerr << "No parent for " << a << "!\n"; + break; + } + a = a_it->second; } - while (b > 0) { + while (b > 1) { if (a_path.count(b) > 0) return b; - b = parent_map.at(b); + + auto b_it = parent_map.find(b); + if (b_it == parent_map.end()) { + cerr << "No parent for " << b << "!\n"; + break; + } + b = b_it->second; } return 1; } diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index f4e28f6..dc75c63 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -180,11 +180,10 @@ uint32_t get_new_taxid( if (it == name_to_taxid_map.end()) { uint32_t new_taxid = ++New_taxid_start; bool insert_res = taxdb.insert(new_taxid, parent_taxid, rank_name, name); - cerr << "Adding assembly: " << name << " with taxid " << new_taxid; + //cerr << "Adding assembly: " << name << " with taxid " << new_taxid; if (!insert_res) { return 0; } - cerr << "Oida " << (insert_res? "success" : "naaa") << endl; // insert_res shows if insert failed, but we don't care Parent_map[new_taxid] = parent_taxid; name_to_taxid_map[name] = new_taxid; @@ -214,7 +213,7 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi New_taxid_start = it->first+100; } } - cerr << "Starting new taxonomy IDs with " << (New_taxid_start+1) << endl; + cerr << "[starting new taxonomy IDs with " << (New_taxid_start+1) << ']'; } // Used when adding new taxids for assembly or sequence @@ -253,7 +252,7 @@ unordered_map read_seqid_to_taxid_map(string ID_to_taxon_map_fi if (ID_to_taxon_map.size() == 0) { cerr << "Error: No ID mappings present!!" << endl; } - cerr << " Done - read " << ID_to_taxon_map.size() << " mappings." << endl; + cerr << " got " << ID_to_taxon_map.size() << " mappings." 
   return std::move(ID_to_taxon_map);
 }
@@ -322,7 +321,7 @@ void process_single_file() {
     //}
 
     if (taxid) {
-      if (Parent_map.find(taxid) == Parent_map.end()) {
+      if (Parent_map.find(taxid) == Parent_map.end() || taxdb.entries.find(taxid) == taxdb.entries.end()) {
         cerr << "Ignoring sequence for taxID " << taxid << " - not in taxDB\n";
       } else {
         #pragma omp parallel for schedule(dynamic)
diff --git a/src/taxdb.h b/src/taxdb.h
index 91e60ec..aef1e50 100644
--- a/src/taxdb.h
+++ b/src/taxdb.h
@@ -432,14 +432,15 @@ template<typename TAXID>
 unordered_map<TAXID, TAXID> TaxonomyDB<TAXID>::getParentMap() const {
   unordered_map<TAXID, TAXID> Parent_map;
   //for (const auto & tax : entries) {
-  for (auto it = entries.begin(); it != entries.end(); ++it) {
-    const auto&tax = *it;
-    if (tax.first != 0)
+  for (auto tax_it = entries.begin(); tax_it != entries.end(); ++tax_it) {
+    if (tax_it->first == 0)
       continue;
-    if (tax.second.parent == NULL)
-      Parent_map[tax.first] = 0; // for kraken::lca
-    else
-      Parent_map[tax.first] = tax.second.parent->taxonomyID;
+    if (tax_it->second.parent == NULL) {
+      //cerr << "Parent for " << tax.first << " is 0\n";
+      Parent_map[tax_it->first] = 0; // for kraken::lca
+    } else {
+      Parent_map[tax_it->first] = tax_it->second.parent->taxonomyID;
+    }
   }
   return Parent_map;
 }
@@ -639,8 +640,9 @@
   }
   TaxonomyEntry<TAXID> newEntry(taxonomyID, NULL, rank, scientificName, genomeSize, genomeSizeOfChildren);
-  auto insert_res = entries.insert({ taxonomyID, newEntry });
-  parentMap[taxonomyID] = parentTaxonomyID;
+  //auto insert_res = entries.insert({ taxonomyID, newEntry });
+  entries.insert({ taxonomyID, newEntry });
+  parentMap[taxonomyID] = parentTaxonomyID;
   }
   entries.insert({0, {0, NULL, "no rank", "unclassified" }});
   //entries.insert({-1, {-1, 0, "no rank", "uncategorized" }});
diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh
index d9284b8..d0675f2 100755
--- a/tests/build-dbs.sh
+++ b/tests/build-dbs.sh
@@ -51,9 +51,10 @@ HELP=false
 DRY_RUN=false
 K=31
 THREADS=10
+PATH1="."
 
 USAGE="
-`basename $0` [options] {kraken,kaiju} {viral|all-viral|prok|oct2017|euk-oct2017}
+`basename $0` [options] {kraken,kaiju} {viral|all-viral|prok|oct2017|euk-oct2017|archaea}
 
 Options:
   -k KMER_SIZE  default $K
   -t THREADS    default $THREADS
 "
@@ -92,6 +93,7 @@ for VAR in $@; do
     viral) build_db $PROG $K 12 viral viral ;;
     all-viral) build_db $PROG $K 12 all-viral viral viral-neighbors ;;
     prok) build_db $PROG $K 15 prok archaea-dusted bacteria-dusted ;;
+    archaea) build_db $PROG $K 15 archaea archaea ;;
     oct2017) build_db $PROG $K 15 oct2017 archaea-dusted bacteria-dusted viral-dusted viral-neighbors-dusted \
       vertebrate_mammalian contaminants ;;
     euk-oct2017)

From 746681052fdfeaa7bc27c98548b4ddc2d1f42479 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Sun, 5 Nov 2017 17:19:27 -0500
Subject: [PATCH 092/105] Add environment CPPFLAGS and LDFLAGS

---
 src/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index e51a28f..cc84b11 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,9 +1,9 @@
 CXX = g++
 FOPENMP?=-fopenmp
-CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -g -Wfatal-errors
+CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -O2 -Wfatal-errors ${CPPFLAGS}
 #CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors
 PROGS = classify db_sort set_lcas make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping
-LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream
+LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream ${LDFLAGS}
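# ---- Editor's note (not part of the patch) ---------------------------------
# With ${CPPFLAGS} and ${LDFLAGS} referenced here, extra include and library
# search paths can be injected from the environment without editing the
# Makefile, e.g.:
#
#   CPPFLAGS="-I$HOME/include" LDFLAGS="-L$HOME/lib" make
#
# $(FOPENMP) and ${CPPFLAGS} are equivalent make syntax; the former is set in
# the Makefile (with ?= so the environment can override it), the latter is
# simply inherited from the environment.
# ---- End editor's note ------------------------------------------------------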
 .PHONY: all install clean

From dd7fc4fd6cae0db0234f70577334df7afc5c5e18 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Mon, 6 Nov 2017 13:26:47 -0500
Subject: [PATCH 093/105] Change name to KrakenHLL

---
 README.md                                        | 12 ++++++------
 install_kraken.sh                                |  2 +-
 scripts/{krakenu => krakenhll}                   |  0
 ...to_library.sh => krakenhll-add_to_library.sh} |  0
 scripts/{krakenu-build => krakenhll-build}       | 16 ++++++++--------
 ...krakenu-build_db.sh => krakenhll-build_db.sh} |  6 +++---
 ...yfish.sh => krakenhll-check_for_jellyfish.sh} |  0
 ...krakenu-clean_db.sh => krakenhll-clean_db.sh} |  0
 ...tempfile.pl => krakenhll-cp_into_tempfile.pl} |  0
 scripts/{krakenu-download => krakenhll-download} |  2 +-
 scripts/{krakenu-filter => krakenhll-filter}     |  0
 .../{krakenu-mpa-report => krakenhll-mpa-report} |  0
 ...u-read_merger.pl => krakenhll-read_merger.pl} |  0
 scripts/{krakenu-report => krakenhll-report}     |  0
 ...akenu-shrink_db.sh => krakenhll-shrink_db.sh} |  0
 ...ion.sh => krakenhll-standard_installation.sh} | 10 +++++-----
 .../{krakenu-translate => krakenhll-translate}   |  0
 ...enu-upgrade_db.sh => krakenhll-upgrade_db.sh} |  0
 ...numbers.pl => krakenhll-verify_gi_numbers.pl} |  0
 src/Makefile                                     | 12 ++++++------
 src/classify.cpp                                 |  2 +-
 src/get_kmers.cpp                                |  2 +-
 src/krakenutil.cpp                               |  2 +-
 src/krakenutil.hpp                               |  4 ++--
 src/set_lcas.cpp                                 |  2 +-
 src/uid_mapping.cpp                              |  2 +-
 tests/build-dbs.sh                               |  2 +-
 tests/init.sh                                    | 12 ++++++------
 tests/test-on-simulated-reads.sh                 | 16 ++++++++--------
 29 files changed, 52 insertions(+), 52 deletions(-)
 rename scripts/{krakenu => krakenhll} (100%)
 rename scripts/{krakenu-add_to_library.sh => krakenhll-add_to_library.sh} (100%)
 rename scripts/{krakenu-build => krakenhll-build} (96%)
 rename scripts/{krakenu-build_db.sh => krakenhll-build_db.sh} (96%)
 rename scripts/{krakenu-check_for_jellyfish.sh => krakenhll-check_for_jellyfish.sh} (100%)
 rename scripts/{krakenu-clean_db.sh => krakenhll-clean_db.sh} (100%)
 rename scripts/{krakenu-cp_into_tempfile.pl => krakenhll-cp_into_tempfile.pl} (100%)
 rename scripts/{krakenu-download => krakenhll-download} (99%)
 rename scripts/{krakenu-filter => krakenhll-filter} (100%)
 rename scripts/{krakenu-mpa-report => krakenhll-mpa-report} (100%)
 rename scripts/{krakenu-read_merger.pl => krakenhll-read_merger.pl} (100%)
 rename scripts/{krakenu-report => krakenhll-report} (100%)
 rename scripts/{krakenu-shrink_db.sh => krakenhll-shrink_db.sh} (100%)
 rename scripts/{krakenu-standard_installation.sh => krakenhll-standard_installation.sh} (77%)
 rename scripts/{krakenu-translate => krakenhll-translate} (100%)
 rename scripts/{krakenu-upgrade_db.sh => krakenhll-upgrade_db.sh} (100%)
 rename scripts/{krakenu-verify_gi_numbers.pl => krakenhll-verify_gi_numbers.pl} (100%)

diff --git a/README.md b/README.md
index 83ae11b..bf23151 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
-Kraken taxonomic sequence classification system with unique k-mer counting
+KrakenHLL taxonomic sequence classification system with unique k-mer counting
 ===============================================
 
 [Kraken](https://github.com/DerrickWood/kraken) is a fast taxonomic classifier for metagenomics data. This project, kraken-hll, adds some additional functionality - most notably a unique k-mer count using the HyperLogLog algorithm. Spurious identifications due to sequence contamination in the dataset or database often lead to many reads; however, they usually cover only a small portion of the genome.
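*Editor's note (not part of the patch):* the "unique k-mer count using the HyperLogLog algorithm" mentioned above works by hashing every k-mer assigned to a taxon, using the first p bits of the hash to pick one of m = 2^p registers, and keeping per register the maximum number of leading zero bits (plus one) of the remaining hash bits; a corrected harmonic mean of the registers then estimates how many distinct k-mers were seen. Below is a toy sketch of that idea only, not the project's `hyperloglogplus.h` (which adds a sparse representation and bias correction); `__builtin_clzll` assumes GCC/Clang:

```cpp
#include <cmath>
#include <cstdint>
#include <vector>

struct ToyHLL {
  static const int p = 10;                        // 2^10 = 1024 registers
  std::vector<uint8_t> M;
  ToyHLL() : M(1 << p, 0) {}
  void add(uint64_t hash) {
    uint32_t idx = (uint32_t)(hash >> (64 - p));  // first p bits pick register
    // rank = leading zeros of the remaining bits, plus one; the |1 guards
    // against a zero argument to __builtin_clzll
    uint8_t rank = (uint8_t)(__builtin_clzll((hash << p) | 1) + 1);
    if (rank > M[idx]) M[idx] = rank;
  }
  double estimate() const {
    double sum = 0.0, m = (double)M.size();
    for (size_t i = 0; i < M.size(); ++i)
      sum += std::ldexp(1.0, -M[i]);              // sum of 2^-register
    return 0.7213 / (1.0 + 1.079 / m) * m * m / sum; // alpha_m * m^2 / sum
  }
};
```

With m = 1024 registers this takes about 1 KB per taxon yet estimates millions of distinct k-mers with a relative error around 1.04/sqrt(m), i.e. a few percent, which is why the report can afford a unique-k-mer column for every taxon.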
-kraken-hll adds two additional columns to the Kraken report - total number of k-mers observed for taxon, and the total number of unique k-mers observed for taxon (columns 3 and 4, resp.).
+KrakenHLL adds two additional columns to the Kraken report - total number of k-mers observed for taxon, and the total number of unique k-mers observed for taxon (columns 3 and 4, resp.).
 
 Here's a small example of a classification against a viral database with k=25. There are three species identified by just one read - Enterobacteria phage BP-4795, Salmonella phage SEN22, Sulfolobus monocaudavirus SMV1. Out of those, the identification of Salmonella phage SEN22 is the strongest, as the read was matched with 116 k-mers that are unique to the sequence, while the match to Sulfolobus monocaudavirus SMV1 is only based on a single 25-mer.
@@ -33,13 +33,13 @@ Here's a small example of a classification against a viral database with k=25. T
 ## Usage
-For usage, see `krakenu --help`. Note that you can use the same database as Kraken with one difference - instead of the files `DB_DIR/taxonomy/nodes.dmp` and `DB_DIR/taxonomy/names.dmp` that kraken relies upon, `kraken-hll` needs the file `DB_DIR/taxDB`. This can be generated with the script `build_taxdb`: `KRAKEN_DIR/build_taxdb DB_DIR/taxonomy/names.dmp DB_DIR/taxonomy/nodes.dmp > DB_DIR/taxDB`. The code behind the taxDB is based on [k-SLAM](https://github.com/aindj/k-SLAM).
+For usage, see `krakenhll --help`. Note that you can use the same database as Kraken with one difference - instead of the files `DB_DIR/taxonomy/nodes.dmp` and `DB_DIR/taxonomy/names.dmp` that kraken relies upon, `kraken-hll` needs the file `DB_DIR/taxDB`. This can be generated with the script `build_taxdb`: `KRAKEN_DIR/build_taxdb DB_DIR/taxonomy/names.dmp DB_DIR/taxonomy/nodes.dmp > DB_DIR/taxDB`. The code behind the taxDB is based on [k-SLAM](https://github.com/aindj/k-SLAM).
 
 ### Differences to `kraken`
- - Use `krakenu --report-file FILENAME ...` to write the kraken report to `FILENAME`.
- - Use `krakenu --db DB1 --db DB2 --db DB3 ...` to first attempt, for each k-mer, to assign it based on DB1, then DB2, then DB3. You can use this to prefer identifications based on DB1 (e.g. human and contaminant sequences), then DB2 (e.g. completed bacterial genomes), then DB3, etc. Note that this option is incompatible with `krakenu-build --generate-taxonomy-ids-for-sequences` since the taxDB between the databases has to be absolutely the same.
+ - Use `krakenhll --report-file FILENAME ...` to write the kraken report to `FILENAME`.
+ - Use `krakenhll --db DB1 --db DB2 --db DB3 ...` to first attempt, for each k-mer, to assign it based on DB1, then DB2, then DB3. You can use this to prefer identifications based on DB1 (e.g. human and contaminant sequences), then DB2 (e.g. completed bacterial genomes), then DB3, etc. Note that this option is incompatible with `krakenhll-build --generate-taxonomy-ids-for-sequences` since the taxDB between the databases has to be absolutely the same.
 - Add a suffix `.gz` to output files to generate gzipped output files
 
 ### Differences to `kraken-build`
- - Use `krakenu-build --generate-taxonomy-ids-for-sequences ...` to add pseudo-taxonomy IDs for each sequence header. An example for the result using this is in the ouput above - one read has been assigned specifically to `KC207814.1 Human herpesvirus 4 strain Mutu, complete genome`.
+ - Use `krakenhll-build --generate-taxonomy-ids-for-sequences ...` to add pseudo-taxonomy IDs for each sequence header. 
An example for the result using this is in the ouput above - one read has been assigned specifically to `KC207814.1 Human herpesvirus 4 strain Mutu, complete genome`. - `seqid2taxid.map` mapping sequence IDs to taxonomy IDs does NOT parse or require `>gi|`, but rather the sequence ID is the header up to just before the first space diff --git a/install_kraken.sh b/install_kraken.sh index 0e662b6..a251cd3 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -87,7 +87,7 @@ echo "Kraken installation complete." echo echo "To make things easier for you, you may want to copy/symlink the following" echo "files into a directory in your PATH:" -for file in $KRAKEN_DIR/krakenu* +for file in $KRAKEN_DIR/krakenhll* do [ -x "$file" ] && echo " $file" done diff --git a/scripts/krakenu b/scripts/krakenhll similarity index 100% rename from scripts/krakenu rename to scripts/krakenhll diff --git a/scripts/krakenu-add_to_library.sh b/scripts/krakenhll-add_to_library.sh similarity index 100% rename from scripts/krakenu-add_to_library.sh rename to scripts/krakenhll-add_to_library.sh diff --git a/scripts/krakenu-build b/scripts/krakenhll-build similarity index 96% rename from scripts/krakenu-build rename to scripts/krakenhll-build index e90b353..74f0eb7 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenhll-build @@ -288,7 +288,7 @@ sub display_version { } sub download_taxonomy { - exec "krakenu-download_taxonomy.sh"; + exec "krakenhll-download_taxonomy.sh"; } sub download_library { @@ -297,12 +297,12 @@ sub download_library { warn "Unknown library type \"$type\"\n"; usage(); } - exec "krakenu-download_genomic_library.sh", $type; + exec "krakenhll-download_genomic_library.sh", $type; } sub add_to_library { my $arg = shift; - exec "krakenu-add_to_library.sh", $arg; + exec "krakenhll-add_to_library.sh", $arg; } sub shrink_db { @@ -313,11 +313,11 @@ sub shrink_db { if (! defined($new_db)) { die "Must specify new database name to perform shrink task\n"; } - exec "krakenu-shrink_db.sh", $new_count, $new_db, $shrink_block_offset; + exec "krakenhll-shrink_db.sh", $new_count, $new_db, $shrink_block_offset; } sub standard_installation { - exec "krakenu-standard_installation.sh"; + exec "krakenhll-standard_installation.sh"; } sub build_database { @@ -340,13 +340,13 @@ sub build_database { $ENV{"KRAKEN_LIBRARY_DIRS"} = "@library_dirs"; $ENV{"KRAKEN_TAXONOMY_DIR"} = $taxonomy_dir; my $opt = ($verbose? "-x" : ""); - exec "krakenu-build_db.sh"; + exec "krakenhll-build_db.sh"; } sub clean_database { - exec "krakenu-clean_db.sh"; + exec "krakenhll-clean_db.sh"; } sub upgrade_database { - exec "krakenu-upgrade_db.sh"; + exec "krakenhll-upgrade_db.sh"; } diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenhll-build_db.sh similarity index 96% rename from scripts/krakenu-build_db.sh rename to scripts/krakenhll-build_db.sh index f8c1450..959f041 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenhll-build_db.sh @@ -54,7 +54,7 @@ script_dir=`dirname $0` DATABASE_DIR="$KRAKEN_DB_NAME" FIND_OPTS=-L -JELLYFISH_BIN=`$script_dir/krakenu-check_for_jellyfish.sh` +JELLYFISH_BIN=`$script_dir/krakenhll-check_for_jellyfish.sh` NCBI_SERVER="ftp.ncbi.nih.gov" FTP_SERVER="ftp://$NCBI_SERVER" @@ -267,7 +267,7 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then REPNAME=database if [[ ! -s $REPNAME.report.tsv ]]; then echo "Creating database summary report $REPNAME.report.tsv ..." - krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --fasta-input <( cat_library ) > $REPNAME.kraken.tsv + krakenhll --db . 
--report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --fasta-input <( cat_library ) > $REPNAME.kraken.tsv fi fi @@ -300,7 +300,7 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then REPNAME=uid_database if [[ ! -s $REPNAME.report.tsv ]]; then echo "Creating UID database summary report $REPNAME.report.tsv ..." - krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv + krakenhll --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv fi fi diff --git a/scripts/krakenu-check_for_jellyfish.sh b/scripts/krakenhll-check_for_jellyfish.sh similarity index 100% rename from scripts/krakenu-check_for_jellyfish.sh rename to scripts/krakenhll-check_for_jellyfish.sh diff --git a/scripts/krakenu-clean_db.sh b/scripts/krakenhll-clean_db.sh similarity index 100% rename from scripts/krakenu-clean_db.sh rename to scripts/krakenhll-clean_db.sh diff --git a/scripts/krakenu-cp_into_tempfile.pl b/scripts/krakenhll-cp_into_tempfile.pl similarity index 100% rename from scripts/krakenu-cp_into_tempfile.pl rename to scripts/krakenhll-cp_into_tempfile.pl diff --git a/scripts/krakenu-download b/scripts/krakenhll-download similarity index 99% rename from scripts/krakenu-download rename to scripts/krakenhll-download index b70a24f..c052463 100755 --- a/scripts/krakenu-download +++ b/scripts/krakenhll-download @@ -1,7 +1,7 @@ #!/usr/bin/env perl #vim: et:ts=2:sw=2 -# krakenu-download.pl - based on centrifuge-download +# krakenhll-download.pl - based on centrifuge-download # (c) Florian Breitwieser, 2017 use strict; diff --git a/scripts/krakenu-filter b/scripts/krakenhll-filter similarity index 100% rename from scripts/krakenu-filter rename to scripts/krakenhll-filter diff --git a/scripts/krakenu-mpa-report b/scripts/krakenhll-mpa-report similarity index 100% rename from scripts/krakenu-mpa-report rename to scripts/krakenhll-mpa-report diff --git a/scripts/krakenu-read_merger.pl b/scripts/krakenhll-read_merger.pl similarity index 100% rename from scripts/krakenu-read_merger.pl rename to scripts/krakenhll-read_merger.pl diff --git a/scripts/krakenu-report b/scripts/krakenhll-report similarity index 100% rename from scripts/krakenu-report rename to scripts/krakenhll-report diff --git a/scripts/krakenu-shrink_db.sh b/scripts/krakenhll-shrink_db.sh similarity index 100% rename from scripts/krakenu-shrink_db.sh rename to scripts/krakenhll-shrink_db.sh diff --git a/scripts/krakenu-standard_installation.sh b/scripts/krakenhll-standard_installation.sh similarity index 77% rename from scripts/krakenu-standard_installation.sh rename to scripts/krakenhll-standard_installation.sh index e09de80..b34dd44 100755 --- a/scripts/krakenu-standard_installation.sh +++ b/scripts/krakenhll-standard_installation.sh @@ -30,11 +30,11 @@ then WOD_FLAG="--work-on-disk" fi -krakenu-check_for_jellyfish.sh -krakenu-download -o $KRAKEN_DB_NAME/taxonomy --download-taxonomy -krakenu-download -o $KRAKEN_DB_NAME/library -d archaea,bacteria refseq > $KRAKEN_DB_NAME/seqid2taxid.map -krakenu-download -o $KRAKEN_DB_NAME/library -d viral -a Any refseq >> $KRAKEN_DB_NAME/seqid2taxid.map -krakenu-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ +krakenhll-check_for_jellyfish.sh +krakenhll-download -o $KRAKEN_DB_NAME/taxonomy --download-taxonomy +krakenhll-download -o $KRAKEN_DB_NAME/library -d archaea,bacteria refseq > $KRAKEN_DB_NAME/seqid2taxid.map +krakenhll-download -o 
$KRAKEN_DB_NAME/library -d viral -a Any refseq >> $KRAKEN_DB_NAME/seqid2taxid.map +krakenhll-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ --jellyfish-hash-size "$KRAKEN_HASH_SIZE" \ --max-db-size "$KRAKEN_MAX_DB_SIZE" \ --minimizer-len $KRAKEN_MINIMIZER_LEN \ diff --git a/scripts/krakenu-translate b/scripts/krakenhll-translate similarity index 100% rename from scripts/krakenu-translate rename to scripts/krakenhll-translate diff --git a/scripts/krakenu-upgrade_db.sh b/scripts/krakenhll-upgrade_db.sh similarity index 100% rename from scripts/krakenu-upgrade_db.sh rename to scripts/krakenhll-upgrade_db.sh diff --git a/scripts/krakenu-verify_gi_numbers.pl b/scripts/krakenhll-verify_gi_numbers.pl similarity index 100% rename from scripts/krakenu-verify_gi_numbers.pl rename to scripts/krakenhll-verify_gi_numbers.pl diff --git a/src/Makefile b/src/Makefile index cc84b11..65415db 100644 --- a/src/Makefile +++ b/src/Makefile @@ -19,23 +19,23 @@ db_shrink: krakendb.o quickfile.o db_sort: krakendb.o quickfile.o -set_lcas: krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o +set_lcas: krakendb.o quickfile.o krakenhlltil.o seqreader.o uid_mapping.o grade_classification: taxdb.h report-cols.h read_uid_mapping: quickfile.o -classify: classify.cpp krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o hyperloglogplus.h taxdb.h report-cols.h +classify: classify.cpp krakendb.o quickfile.o krakenhlltil.o seqreader.o uid_mapping.o hyperloglogplus.h taxdb.h report-cols.h $(CXX) $(CXXFLAGS) -o classify $^ $(LIBFLAGS) build_taxdb: taxdb.h report-cols.h quickfile.o make_seqid_to_taxid_map: quickfile.o -read_uid_mapping: quickfile.o krakenutil.o uid_mapping.o +read_uid_mapping: quickfile.o krakenhlltil.o uid_mapping.o -krakenutil.o: krakenutil.cpp krakenutil.hpp taxdb.h report-cols.h - $(CXX) $(CXXFLAGS) -c krakenutil.cpp +krakenhlltil.o: krakenhlltil.cpp krakenhlltil.hpp taxdb.h report-cols.h + $(CXX) $(CXXFLAGS) -c krakenhlltil.cpp krakendb.o: krakendb.cpp krakendb.hpp quickfile.hpp $(CXX) $(CXXFLAGS) -c krakendb.cpp @@ -46,5 +46,5 @@ seqreader.o: seqreader.cpp seqreader.hpp quickfile.hpp quickfile.o: quickfile.cpp quickfile.hpp $(CXX) $(CXXFLAGS) -c quickfile.cpp -uid_mapping.o: krakenutil.hpp uid_mapping.hpp uid_mapping.cpp +uid_mapping.o: krakenhlltil.hpp uid_mapping.hpp uid_mapping.cpp $(CXX) $(CXXFLAGS) -c uid_mapping.cpp diff --git a/src/classify.cpp b/src/classify.cpp index f2ac91b..b2a5723 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -19,7 +19,7 @@ #include "kraken_headers.hpp" #include "krakendb.hpp" -#include "krakenutil.hpp" +#include "krakenhlltil.hpp" #include "quickfile.hpp" #include "seqreader.hpp" #include "readcounts.hpp" diff --git a/src/get_kmers.cpp b/src/get_kmers.cpp index 9288078..22f19a4 100644 --- a/src/get_kmers.cpp +++ b/src/get_kmers.cpp @@ -20,7 +20,7 @@ #include "kraken_headers.hpp" #include "quickfile.hpp" #include "krakendb.hpp" -#include "krakenutil.hpp" +#include "krakenhlltil.hpp" #include "seqreader.hpp" #include diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp index 46fd953..bec1d1c 100644 --- a/src/krakenutil.cpp +++ b/src/krakenutil.cpp @@ -19,7 +19,7 @@ #include "assert_helpers.h" #include "kraken_headers.hpp" -#include "krakenutil.hpp" +#include "krakenhlltil.hpp" #include #include diff --git a/src/krakenutil.hpp b/src/krakenutil.hpp index 46e8eb8..cbfd3d5 100644 --- a/src/krakenutil.hpp +++ b/src/krakenutil.hpp @@ -17,8 +17,8 @@ * along with Kraken. If not, see . 
*/ -#ifndef KRAKENUTIL_HPP -#define KRAKENUTIL_HPP +#ifndef KRAKENHLLTIL_HPP +#define KRAKENHLLTIL_HPP #include "kraken_headers.hpp" #include diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp index dc75c63..8457599 100644 --- a/src/set_lcas.cpp +++ b/src/set_lcas.cpp @@ -21,7 +21,7 @@ #include "kraken_headers.hpp" #include "quickfile.hpp" #include "krakendb.hpp" -#include "krakenutil.hpp" +#include "krakenhlltil.hpp" #include "seqreader.hpp" #include "taxdb.h" #include "readcounts.hpp" diff --git a/src/uid_mapping.cpp b/src/uid_mapping.cpp index 2914468..5b4d001 100644 --- a/src/uid_mapping.cpp +++ b/src/uid_mapping.cpp @@ -2,7 +2,7 @@ #include #include #include "uid_mapping.hpp" -#include "krakenutil.hpp" +#include "krakenhlltil.hpp" #include "assert_helpers.h" using namespace std; diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index d0675f2..b10cac8 100755 --- a/tests/build-dbs.sh +++ b/tests/build-dbs.sh @@ -19,7 +19,7 @@ build_db() { if [[ "$PROG" == "kraken" ]]; then mkdir -p $DB_DIR - CMD="krakenu-build --kmer-len $K --minimizer-len $MIN --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database" + CMD="krakenhll-build --kmer-len $K --minimizer-len $MIN --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database" for L in $@; do CMD="$CMD --library-dir=$DIR/data/library/$L" done diff --git a/tests/init.sh b/tests/init.sh index d029fb1..495ee4a 100755 --- a/tests/init.sh +++ b/tests/init.sh @@ -4,15 +4,15 @@ set -xeu [[ $# -eq 1 ]] && DIR=$1 || DIR=`pwd` -## Install KrakenU locally into install/ +## Install KrakenHLL locally into install/ #$(dirname $0)/../install_kraken.sh --install-jellyfish $DIR/install ## Download taxonomy and genomic data into data/ -time $DIR/install/krakenu-download --db $DIR/data -R taxonomy refseq/archaea refseq/bacteria -time $DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors refseq/viral/Any -time $DIR/install/krakenu-download --db $DIR/data -R refseq/fungi refseq/fungi/Chromosome refseq/protozoa refseq/protozoa/Chromosome -time $DIR/install/krakenu-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606 -time $DIR/install/krakenu-download --db $DIR/data -R contaminants +time $DIR/install/krakenhll-download --db $DIR/data -R taxonomy refseq/archaea refseq/bacteria +time $DIR/install/krakenhll-download --db $DIR/data -R --include-viral-neighbors refseq/viral/Any +time $DIR/install/krakenhll-download --db $DIR/data -R refseq/fungi refseq/fungi/Chromosome refseq/protozoa refseq/protozoa/Chromosome +time $DIR/install/krakenhll-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606 +time $DIR/install/krakenhll-download --db $DIR/data -R contaminants for i in fungi protozoa viral viral-neighbors archaea bacteria; do [[ -s "$DIR/data/all-$i.fna" ]] || find $DIR/data/library/$i -name '*.fna' -print0 | xargs -0 -n 100 cat > $DIR/data/all-$i.fna diff --git a/tests/test-on-simulated-reads.sh b/tests/test-on-simulated-reads.sh index 580f218..21b6f0d 100755 --- a/tests/test-on-simulated-reads.sh +++ b/tests/test-on-simulated-reads.sh @@ -29,10 +29,10 @@ run_kraken() { if [[ "$PROG" == "kraken" ]]; then CMD="kraken" - elif [[ "$PROG" == "krakenu" ]]; then - CMD="$DIR/install/krakenu --report-file $KFILE.report" - elif [[ "$PROG" == "krakenuid" ]]; then - CMD="$DIR/install/krakenu --report-file $KFILE.report --uid-mapping" + 
elif [[ "$PROG" == "krakenhll" ]]; then + CMD="$DIR/install/krakenhll --report-file $KFILE.report" + elif [[ "$PROG" == "krakenhllid" ]]; then + CMD="$DIR/install/krakenhll --report-file $KFILE.report --uid-mapping" else echo "Unknown $PROG" return; @@ -61,14 +61,14 @@ for i in 1; do # 2 3 FQ=$SDIR/$NAM.fq [[ -f $FQ ]] || randomreads.sh -Xmx40g ref=$DIR/data/all-$dat.fna out=$FQ reads=$AB len=$len seed=$i for K in 31; do - # run_kraken $FQ $NAM $dat viral $K krakenuid + # run_kraken $FQ $NAM $dat viral $K krakenhllid if [[ `uname` != "Darwin" ]]; then run_kraken $FQ $NAM $dat oct2017 $K kraken ALWAYS_SEQMAP - run_kraken $FQ $NAM $dat oct2017 $K krakenu ALWAYS_SEQMAP - run_kraken $FQ $NAM $dat oct2017 $K krakenuid ALWAYS_SEQMAP + run_kraken $FQ $NAM $dat oct2017 $K krakenhll ALWAYS_SEQMAP + run_kraken $FQ $NAM $dat oct2017 $K krakenhllid ALWAYS_SEQMAP else run_kraken $FQ $NAM $dat viral $K kraken - run_kraken $FQ $NAM $dat viral $K krakenu + run_kraken $FQ $NAM $dat viral $K krakenhll fi done done From 0c33f0ecd357a37384de7afb004708d7df5603b4 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Mon, 6 Nov 2017 13:26:47 -0500 Subject: [PATCH 094/105] Change name to KrakenHLL --- README.md | 12 +++++----- install_kraken.sh | 18 ++++++++------- scripts/{krakenu => krakenhll} | 4 +++- ...library.sh => krakenhll-add_to_library.sh} | 0 scripts/{krakenu-build => krakenhll-build} | 22 ++++++++++--------- ...kenu-build_db.sh => krakenhll-build_db.sh} | 6 ++--- ...sh.sh => krakenhll-check_for_jellyfish.sh} | 0 ...kenu-clean_db.sh => krakenhll-clean_db.sh} | 0 ...pfile.pl => krakenhll-cp_into_tempfile.pl} | 0 .../{krakenu-download => krakenhll-download} | 2 +- scripts/{krakenu-filter => krakenhll-filter} | 0 ...rakenu-mpa-report => krakenhll-mpa-report} | 0 ...ead_merger.pl => krakenhll-read_merger.pl} | 0 scripts/{krakenu-report => krakenhll-report} | 0 ...nu-shrink_db.sh => krakenhll-shrink_db.sh} | 0 ....sh => krakenhll-standard_installation.sh} | 10 ++++----- ...{krakenu-translate => krakenhll-translate} | 0 ...-upgrade_db.sh => krakenhll-upgrade_db.sh} | 0 ...bers.pl => krakenhll-verify_gi_numbers.pl} | 0 src/hyperloglogplus.h | 3 ++- src/krakenutil.hpp | 4 ++-- tests/build-dbs.sh | 2 +- tests/init.sh | 12 +++++----- tests/test-on-simulated-reads.sh | 16 +++++++------- 24 files changed, 59 insertions(+), 52 deletions(-) rename scripts/{krakenu => krakenhll} (99%) rename scripts/{krakenu-add_to_library.sh => krakenhll-add_to_library.sh} (100%) rename scripts/{krakenu-build => krakenhll-build} (96%) rename scripts/{krakenu-build_db.sh => krakenhll-build_db.sh} (96%) rename scripts/{krakenu-check_for_jellyfish.sh => krakenhll-check_for_jellyfish.sh} (100%) rename scripts/{krakenu-clean_db.sh => krakenhll-clean_db.sh} (100%) rename scripts/{krakenu-cp_into_tempfile.pl => krakenhll-cp_into_tempfile.pl} (100%) rename scripts/{krakenu-download => krakenhll-download} (99%) rename scripts/{krakenu-filter => krakenhll-filter} (100%) rename scripts/{krakenu-mpa-report => krakenhll-mpa-report} (100%) rename scripts/{krakenu-read_merger.pl => krakenhll-read_merger.pl} (100%) rename scripts/{krakenu-report => krakenhll-report} (100%) rename scripts/{krakenu-shrink_db.sh => krakenhll-shrink_db.sh} (100%) rename scripts/{krakenu-standard_installation.sh => krakenhll-standard_installation.sh} (77%) rename scripts/{krakenu-translate => krakenhll-translate} (100%) rename scripts/{krakenu-upgrade_db.sh => krakenhll-upgrade_db.sh} (100%) rename scripts/{krakenu-verify_gi_numbers.pl => 
krakenhll-verify_gi_numbers.pl} (100%)

diff --git a/README.md b/README.md
index 83ae11b..bf23151 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
-Kraken taxonomic sequence classification system with unique k-mer counting
+KrakenHLL taxonomic sequence classification system with unique k-mer counting
 ===============================================
 
 [Kraken](https://github.com/DerrickWood/kraken) is a fast taxonomic classifier for metagenomics data. This project, kraken-hll, adds some additional functionality - most notably a unique k-mer count using the HyperLogLog algorithm. Spurious identifications due to sequence contamination in the dataset or database often lead to many reads; however, they usually cover only a small portion of the genome.
 
-kraken-hll adds two additional columns to the Kraken report - total number of k-mers observed for taxon, and the total number of unique k-mers observed for taxon (columns 3 and 4, resp.).
+KrakenHLL adds two additional columns to the Kraken report - total number of k-mers observed for taxon, and the total number of unique k-mers observed for taxon (columns 3 and 4, resp.).
 
 Here's a small example of a classification against a viral database with k=25. There are three species identified by just one read - Enterobacteria phage BP-4795, Salmonella phage SEN22, Sulfolobus monocaudavirus SMV1. Out of those, the identification of Salmonella phage SEN22 is the strongest, as the read was matched with 116 k-mers that are unique to the sequence, while the match to Sulfolobus monocaudavirus SMV1 is only based on a single 25-mer.
@@ -33,13 +33,13 @@ Here's a small example of a classification against a viral database with k=25. T
 ## Usage
-For usage, see `krakenu --help`. Note that you can use the same database as Kraken with one difference - instead of the files `DB_DIR/taxonomy/nodes.dmp` and `DB_DIR/taxonomy/names.dmp` that kraken relies upon, `kraken-hll` needs the file `DB_DIR/taxDB`. This can be generated with the script `build_taxdb`: `KRAKEN_DIR/build_taxdb DB_DIR/taxonomy/names.dmp DB_DIR/taxonomy/nodes.dmp > DB_DIR/taxDB`. The code behind the taxDB is based on [k-SLAM](https://github.com/aindj/k-SLAM).
+For usage, see `krakenhll --help`. Note that you can use the same database as Kraken with one difference - instead of the files `DB_DIR/taxonomy/nodes.dmp` and `DB_DIR/taxonomy/names.dmp` that kraken relies upon, `kraken-hll` needs the file `DB_DIR/taxDB`. This can be generated with the script `build_taxdb`: `KRAKEN_DIR/build_taxdb DB_DIR/taxonomy/names.dmp DB_DIR/taxonomy/nodes.dmp > DB_DIR/taxDB`. The code behind the taxDB is based on [k-SLAM](https://github.com/aindj/k-SLAM).
 
 ### Differences to `kraken`
- - Use `krakenu --report-file FILENAME ...` to write the kraken report to `FILENAME`.
- - Use `krakenu --db DB1 --db DB2 --db DB3 ...` to first attempt, for each k-mer, to assign it based on DB1, then DB2, then DB3. You can use this to prefer identifications based on DB1 (e.g. human and contaminant sequences), then DB2 (e.g. completed bacterial genomes), then DB3, etc. Note that this option is incompatible with `krakenu-build --generate-taxonomy-ids-for-sequences` since the taxDB between the databases has to be absolutely the same.
+ - Use `krakenhll --report-file FILENAME ...` to write the kraken report to `FILENAME`.
+ - Use `krakenhll --db DB1 --db DB2 --db DB3 ...` to first attempt, for each k-mer, to assign it based on DB1, then DB2, then DB3. You can use this to prefer identifications based on DB1 (e.g. 
human and contaminant sequences), then DB2 (e.g. completed bacterial genomes), then DB3, etc. Note that this option is incompatible with `krakenhll-build --generate-taxonomy-ids-for-sequences` since the taxDB between the databases has to be absolutely the same. - Add a suffix `.gz` to output files to generate gzipped output files ### Differences to `kraken-build` - - Use `krakenu-build --generate-taxonomy-ids-for-sequences ...` to add pseudo-taxonomy IDs for each sequence header. An example for the result using this is in the ouput above - one read has been assigned specifically to `KC207814.1 Human herpesvirus 4 strain Mutu, complete genome`. + - Use `krakenhll-build --generate-taxonomy-ids-for-sequences ...` to add pseudo-taxonomy IDs for each sequence header. An example for the result using this is in the ouput above - one read has been assigned specifically to `KC207814.1 Human herpesvirus 4 strain Mutu, complete genome`. - `seqid2taxid.map` mapping sequence IDs to taxonomy IDs does NOT parse or require `>gi|`, but rather the sequence ID is the header up to just before the first space diff --git a/install_kraken.sh b/install_kraken.sh index 0e662b6..3b12552 100755 --- a/install_kraken.sh +++ b/install_kraken.sh @@ -82,14 +82,16 @@ do fi done -echo -echo "Kraken installation complete." -echo -echo "To make things easier for you, you may want to copy/symlink the following" -echo "files into a directory in your PATH:" -for file in $KRAKEN_DIR/krakenu* +echo -n " +Kraken installation complete. + +To make things easier for you, you may want to copy/symlink the following +files into a directory in your PATH: + +ln -s" +for file in $KRAKEN_DIR/krakenhll* do - [ -x "$file" ] && echo " $file" + [ -x "$file" ] && echo -n " $file" done - +echo " DEST_DIR" exit 0 diff --git a/scripts/krakenu b/scripts/krakenhll similarity index 99% rename from scripts/krakenu rename to scripts/krakenhll index 006a078..e6d4df6 100755 --- a/scripts/krakenu +++ b/scripts/krakenhll @@ -222,7 +222,6 @@ Usage: $PROG [options] Options: --db NAME Name for Kraken DB (default: $default_db) --report-file FILENAME Write Kraken report to FILENAME - --uid-mapping Map using UID database --threads NUM Number of threads (default: $def_thread_ct) --fasta-input Input is FASTA format --fastq-input Input is FASTQ format @@ -246,6 +245,9 @@ Options: --help Print this message --version Print version information +Experimental: + --uid-mapping Map using UID database + If none of the *-input or *-compressed flags are specified, and the file is a regular file, automatic format detection is attempted. 
EOF diff --git a/scripts/krakenu-add_to_library.sh b/scripts/krakenhll-add_to_library.sh similarity index 100% rename from scripts/krakenu-add_to_library.sh rename to scripts/krakenhll-add_to_library.sh diff --git a/scripts/krakenu-build b/scripts/krakenhll-build similarity index 96% rename from scripts/krakenu-build rename to scripts/krakenhll-build index e90b353..8888cd8 100755 --- a/scripts/krakenu-build +++ b/scripts/krakenhll-build @@ -86,7 +86,7 @@ $hash_size = ""; $max_db_size = ""; $add_taxonomy_ids_for_genome = 0; $add_taxonomy_ids_for_seq = 0; -$build_uid_database = 1; +$build_uid_database = 0; $build_lca_database = 1; # variables corresponding to task options @@ -260,7 +260,6 @@ Options: --shrink-block-offset NUM When shrinking, select the k-mer that is NUM positions from the end of a block of k-mers (default: 1) - --uid-database Build a UID database (default no) --lca-database Build a LCA database (default yes) --no-lca-database Do not build a LCA database --work-on-disk Perform most operations on disk rather than in @@ -273,6 +272,9 @@ Options: for one taxonomy ID. --library-dir DIR Use DIR for reference sequences instead of DBDIR/library. --taxonomy-dir DIR Use DIR for taxonomy instead of DBDIR/taxonomy. + +Experimental: + --uid-database Build a UID database (default no) EOF exit $exit_code; } @@ -288,7 +290,7 @@ sub display_version { } sub download_taxonomy { - exec "krakenu-download_taxonomy.sh"; + exec "krakenhll-download_taxonomy.sh"; } sub download_library { @@ -297,12 +299,12 @@ sub download_library { warn "Unknown library type \"$type\"\n"; usage(); } - exec "krakenu-download_genomic_library.sh", $type; + exec "krakenhll-download_genomic_library.sh", $type; } sub add_to_library { my $arg = shift; - exec "krakenu-add_to_library.sh", $arg; + exec "krakenhll-add_to_library.sh", $arg; } sub shrink_db { @@ -313,11 +315,11 @@ sub shrink_db { if (! defined($new_db)) { die "Must specify new database name to perform shrink task\n"; } - exec "krakenu-shrink_db.sh", $new_count, $new_db, $shrink_block_offset; + exec "krakenhll-shrink_db.sh", $new_count, $new_db, $shrink_block_offset; } sub standard_installation { - exec "krakenu-standard_installation.sh"; + exec "krakenhll-standard_installation.sh"; } sub build_database { @@ -340,13 +342,13 @@ sub build_database { $ENV{"KRAKEN_LIBRARY_DIRS"} = "@library_dirs"; $ENV{"KRAKEN_TAXONOMY_DIR"} = $taxonomy_dir; my $opt = ($verbose? "-x" : ""); - exec "krakenu-build_db.sh"; + exec "krakenhll-build_db.sh"; } sub clean_database { - exec "krakenu-clean_db.sh"; + exec "krakenhll-clean_db.sh"; } sub upgrade_database { - exec "krakenu-upgrade_db.sh"; + exec "krakenhll-upgrade_db.sh"; } diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenhll-build_db.sh similarity index 96% rename from scripts/krakenu-build_db.sh rename to scripts/krakenhll-build_db.sh index f8c1450..959f041 100755 --- a/scripts/krakenu-build_db.sh +++ b/scripts/krakenhll-build_db.sh @@ -54,7 +54,7 @@ script_dir=`dirname $0` DATABASE_DIR="$KRAKEN_DB_NAME" FIND_OPTS=-L -JELLYFISH_BIN=`$script_dir/krakenu-check_for_jellyfish.sh` +JELLYFISH_BIN=`$script_dir/krakenhll-check_for_jellyfish.sh` NCBI_SERVER="ftp.ncbi.nih.gov" FTP_SERVER="ftp://$NCBI_SERVER" @@ -267,7 +267,7 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then REPNAME=database if [[ ! -s $REPNAME.report.tsv ]]; then echo "Creating database summary report $REPNAME.report.tsv ..." - krakenu --db . 
--report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --fasta-input <( cat_library ) > $REPNAME.kraken.tsv + krakenhll --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --fasta-input <( cat_library ) > $REPNAME.kraken.tsv fi fi @@ -300,7 +300,7 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then REPNAME=uid_database if [[ ! -s $REPNAME.report.tsv ]]; then echo "Creating UID database summary report $REPNAME.report.tsv ..." - krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv + krakenhll --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv fi fi diff --git a/scripts/krakenu-check_for_jellyfish.sh b/scripts/krakenhll-check_for_jellyfish.sh similarity index 100% rename from scripts/krakenu-check_for_jellyfish.sh rename to scripts/krakenhll-check_for_jellyfish.sh diff --git a/scripts/krakenu-clean_db.sh b/scripts/krakenhll-clean_db.sh similarity index 100% rename from scripts/krakenu-clean_db.sh rename to scripts/krakenhll-clean_db.sh diff --git a/scripts/krakenu-cp_into_tempfile.pl b/scripts/krakenhll-cp_into_tempfile.pl similarity index 100% rename from scripts/krakenu-cp_into_tempfile.pl rename to scripts/krakenhll-cp_into_tempfile.pl diff --git a/scripts/krakenu-download b/scripts/krakenhll-download similarity index 99% rename from scripts/krakenu-download rename to scripts/krakenhll-download index b70a24f..c052463 100755 --- a/scripts/krakenu-download +++ b/scripts/krakenhll-download @@ -1,7 +1,7 @@ #!/usr/bin/env perl #vim: et:ts=2:sw=2 -# krakenu-download.pl - based on centrifuge-download +# krakenhll-download.pl - based on centrifuge-download # (c) Florian Breitwieser, 2017 use strict; diff --git a/scripts/krakenu-filter b/scripts/krakenhll-filter similarity index 100% rename from scripts/krakenu-filter rename to scripts/krakenhll-filter diff --git a/scripts/krakenu-mpa-report b/scripts/krakenhll-mpa-report similarity index 100% rename from scripts/krakenu-mpa-report rename to scripts/krakenhll-mpa-report diff --git a/scripts/krakenu-read_merger.pl b/scripts/krakenhll-read_merger.pl similarity index 100% rename from scripts/krakenu-read_merger.pl rename to scripts/krakenhll-read_merger.pl diff --git a/scripts/krakenu-report b/scripts/krakenhll-report similarity index 100% rename from scripts/krakenu-report rename to scripts/krakenhll-report diff --git a/scripts/krakenu-shrink_db.sh b/scripts/krakenhll-shrink_db.sh similarity index 100% rename from scripts/krakenu-shrink_db.sh rename to scripts/krakenhll-shrink_db.sh diff --git a/scripts/krakenu-standard_installation.sh b/scripts/krakenhll-standard_installation.sh similarity index 77% rename from scripts/krakenu-standard_installation.sh rename to scripts/krakenhll-standard_installation.sh index e09de80..b34dd44 100755 --- a/scripts/krakenu-standard_installation.sh +++ b/scripts/krakenhll-standard_installation.sh @@ -30,11 +30,11 @@ then WOD_FLAG="--work-on-disk" fi -krakenu-check_for_jellyfish.sh -krakenu-download -o $KRAKEN_DB_NAME/taxonomy --download-taxonomy -krakenu-download -o $KRAKEN_DB_NAME/library -d archaea,bacteria refseq > $KRAKEN_DB_NAME/seqid2taxid.map -krakenu-download -o $KRAKEN_DB_NAME/library -d viral -a Any refseq >> $KRAKEN_DB_NAME/seqid2taxid.map -krakenu-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ +krakenhll-check_for_jellyfish.sh +krakenhll-download -o $KRAKEN_DB_NAME/taxonomy --download-taxonomy 
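# ---- Editor's note (not part of the patch) ---------------------------------
# Reading order of this rewritten hunk: jellyfish check, taxonomy download,
# then the genomic libraries. Note the redirections on the two download lines
# below: the archaea/bacteria call creates seqid2taxid.map with ">", and the
# viral call appends to it with ">>", so a single mapping file feeds the
# subsequent krakenhll-build step.
# ---- End editor's note ------------------------------------------------------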
+krakenhll-download -o $KRAKEN_DB_NAME/library -d archaea,bacteria refseq > $KRAKEN_DB_NAME/seqid2taxid.map +krakenhll-download -o $KRAKEN_DB_NAME/library -d viral -a Any refseq >> $KRAKEN_DB_NAME/seqid2taxid.map +krakenhll-build --db $KRAKEN_DB_NAME --build --threads $KRAKEN_THREAD_CT \ --jellyfish-hash-size "$KRAKEN_HASH_SIZE" \ --max-db-size "$KRAKEN_MAX_DB_SIZE" \ --minimizer-len $KRAKEN_MINIMIZER_LEN \ diff --git a/scripts/krakenu-translate b/scripts/krakenhll-translate similarity index 100% rename from scripts/krakenu-translate rename to scripts/krakenhll-translate diff --git a/scripts/krakenu-upgrade_db.sh b/scripts/krakenhll-upgrade_db.sh similarity index 100% rename from scripts/krakenu-upgrade_db.sh rename to scripts/krakenhll-upgrade_db.sh diff --git a/scripts/krakenu-verify_gi_numbers.pl b/scripts/krakenhll-verify_gi_numbers.pl similarity index 100% rename from scripts/krakenu-verify_gi_numbers.pl rename to scripts/krakenhll-verify_gi_numbers.pl diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h index b4d9a81..10baa14 100644 --- a/src/hyperloglogplus.h +++ b/src/hyperloglogplus.h @@ -32,7 +32,7 @@ using namespace std; // experimentally determined threshold values for p - 4 static const uint32_t threshold[] = {10, 20, 40, 80, 220, 400, 900, 1800, 3100, - 6500, 11500, 20000, 50000, 120000, 350000}; + 6500, 11500, 20000, 50000, 120000, 350000}; /////////////////////// @@ -69,6 +69,7 @@ inline uint64_t ranhash (uint64_t u) { return v; } +// from https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp inline uint64_t murmurhash3_finalizer (uint64_t key) { key += 1; // murmurhash returns a hash value of 0 for the key 0 - avoid that. key ^= key >> 33; diff --git a/src/krakenutil.hpp b/src/krakenutil.hpp index 46e8eb8..cbfd3d5 100644 --- a/src/krakenutil.hpp +++ b/src/krakenutil.hpp @@ -17,8 +17,8 @@ * along with Kraken. If not, see . 
 */

-#ifndef KRAKENUTIL_HPP
-#define KRAKENUTIL_HPP
+#ifndef KRAKENHLLUTIL_HPP
+#define KRAKENHLLUTIL_HPP

 #include "kraken_headers.hpp"
 #include
diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh
index d0675f2..b92d23a 100755
--- a/tests/build-dbs.sh
+++ b/tests/build-dbs.sh
@@ -19,7 +19,7 @@ build_db() {
   if [[ "$PROG" == "kraken" ]]; then
     mkdir -p $DB_DIR
-    CMD="krakenu-build --kmer-len $K --minimizer-len $MIN --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database"
+    CMD="krakenhll-build --kmer-len $K --minimizer-len $MIN --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy"
     for L in $@; do
       CMD="$CMD --library-dir=$DIR/data/library/$L"
     done
diff --git a/tests/init.sh b/tests/init.sh
index d029fb1..495ee4a 100755
--- a/tests/init.sh
+++ b/tests/init.sh
@@ -4,15 +4,15 @@ set -xeu
 [[ $# -eq 1 ]] && DIR=$1 || DIR=`pwd`

-## Install KrakenU locally into install/
+## Install KrakenHLL locally into install/
 #$(dirname $0)/../install_kraken.sh --install-jellyfish $DIR/install

 ## Download taxonomy and genomic data into data/
-time $DIR/install/krakenu-download --db $DIR/data -R taxonomy refseq/archaea refseq/bacteria
-time $DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors refseq/viral/Any
-time $DIR/install/krakenu-download --db $DIR/data -R refseq/fungi refseq/fungi/Chromosome refseq/protozoa refseq/protozoa/Chromosome
-time $DIR/install/krakenu-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606
-time $DIR/install/krakenu-download --db $DIR/data -R contaminants
+time $DIR/install/krakenhll-download --db $DIR/data -R taxonomy refseq/archaea refseq/bacteria
+time $DIR/install/krakenhll-download --db $DIR/data -R --include-viral-neighbors refseq/viral/Any
+time $DIR/install/krakenhll-download --db $DIR/data -R refseq/fungi refseq/fungi/Chromosome refseq/protozoa refseq/protozoa/Chromosome
+time $DIR/install/krakenhll-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606
+time $DIR/install/krakenhll-download --db $DIR/data -R contaminants

 for i in fungi protozoa viral viral-neighbors archaea bacteria; do
   [[ -s "$DIR/data/all-$i.fna" ]] || find $DIR/data/library/$i -name '*.fna' -print0 | xargs -0 -n 100 cat > $DIR/data/all-$i.fna
diff --git a/tests/test-on-simulated-reads.sh b/tests/test-on-simulated-reads.sh
index 580f218..61ff561 100755
--- a/tests/test-on-simulated-reads.sh
+++ b/tests/test-on-simulated-reads.sh
@@ -29,10 +29,10 @@ run_kraken() {
   if [[ "$PROG" == "kraken" ]]; then
     CMD="kraken"
-  elif [[ "$PROG" == "krakenu" ]]; then
-    CMD="$DIR/install/krakenu --report-file $KFILE.report"
-  elif [[ "$PROG" == "krakenuid" ]]; then
-    CMD="$DIR/install/krakenu --report-file $KFILE.report --uid-mapping"
+  elif [[ "$PROG" == "krakenhll" ]]; then
+    CMD="$DIR/install/krakenhll --report-file $KFILE.report"
+  elif [[ "$PROG" == "krakenhllid" ]]; then
+    CMD="$DIR/install/krakenhll --report-file $KFILE.report --uid-mapping"
   else
     echo "Unknown $PROG"
     return;
@@ -61,14 +61,14 @@ for i in 1; do # 2 3
   FQ=$SDIR/$NAM.fq
   [[ -f $FQ ]] || randomreads.sh -Xmx40g ref=$DIR/data/all-$dat.fna out=$FQ reads=$AB len=$len seed=$i
   for K in 31; do
-    # run_kraken $FQ $NAM $dat viral $K krakenuid
+    # run_kraken $FQ $NAM $dat viral $K krakenhllid
     if [[ `uname` != "Darwin" ]]; then
       run_kraken $FQ $NAM $dat oct2017 $K kraken ALWAYS_SEQMAP
-      run_kraken $FQ $NAM $dat oct2017 $K krakenu
ALWAYS_SEQMAP - run_kraken $FQ $NAM $dat oct2017 $K krakenuid ALWAYS_SEQMAP + run_kraken $FQ $NAM $dat oct2017 $K krakenhll ALWAYS_SEQMAP + run_kraken $FQ $NAM $dat oct2017 $K krakenhllid ALWAYS_SEQMAP else run_kraken $FQ $NAM $dat viral $K kraken - run_kraken $FQ $NAM $dat viral $K krakenu + run_kraken $FQ $NAM $dat viral $K krakenhll fi done done From 274e41fb87fc1d3ccd76926682559de51ddc3b5a Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 8 Nov 2017 11:39:01 -0500 Subject: [PATCH 095/105] Update --- tests/build-dbs.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh index bfbd3f8..d087dd5 100755 --- a/tests/build-dbs.sh +++ b/tests/build-dbs.sh @@ -20,7 +20,7 @@ build_db() { for L in $@; do CMD="$CMD --library-dir=$DIR/data/library/$L" done - #if [[ ! -f "$DB_DIR/is.busy" ]]; then + if [[ ! -f "$DB_DIR/is.busy" ]]; then echo "EXECUTING $CMD" touch $DB_DIR/is.busy $CMD 2>&1 | tee $DIR/dbs/$DB_NAM/build.log @@ -30,9 +30,9 @@ build_db() { dump_taxdb $DB_DIR/taxDB $DB_DIR/taxonomy/names.dmp $DB_DIR/nodes.dmp fi rm $DB_DIR/is.busy - #else - # echo "IGNORING $DB_DIR" - #fi + else + echo "$DB_DIR/is.busy exists, ignoring directory." + fi } K=$1; shift; From 69d835234c475c866871ceb1a7cfe9bf228edcb0 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 8 Nov 2017 11:43:14 -0500 Subject: [PATCH 096/105] Create taxDB if not present --- scripts/krakenhll | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/scripts/krakenhll b/scripts/krakenhll index e6d4df6..8a4aad9 100755 --- a/scripts/krakenhll +++ b/scripts/krakenhll @@ -38,6 +38,7 @@ $ENV{"KRAKEN_DIR"} = $KRAKEN_DIR; $ENV{"PATH"} = "$KRAKEN_DIR:$ENV{PATH}"; my $CLASSIFY = "$KRAKEN_DIR/classify"; +my $CREATE_TAXDB = "$KRAKEN_DIR/build_taxdb"; my $GZIP_MAGIC = chr(hex "1f") . chr(hex "8b"); my $BZIP2_MAGIC = "BZ"; @@ -92,6 +93,12 @@ if (! @ARGV) { usage(); } +if (!defined $report_file) { + print STDERR "Need to specify a report file with --report-file! +See --help for more details.\n"; + exit 1; +} + eval { @db_prefix = map { krakenlib::find_db($_) } @db_prefix }; if ($@) { die "$PROG: $@"; @@ -160,6 +167,16 @@ if ($uid_mapping) { } +if (! -f $db_prefix[0]."/taxDB") { + print STDERR "Taxonomy database not at ".$db_prefix[0]."/taxDB - creating it ..."; + die "$db_prefix[0]/taxonomy/nodes.dmp does not exist!" unless -f $db_prefix[0]."/taxonomy/nodes.dmp"; + die "$db_prefix[0]/taxonomy/names.dmp does not exist!" unless -f $db_prefix[0]."/taxonomy/names.dmp"; + + my $cmd = "$CREATE_TAXDB $db_prefix[0]/taxonomy/names.dmp $db_prefix[0]/taxonomy/nodes.dmp > $db_prefix[0]/taxDB"; + print STDERR "$cmd\n"; + system $cmd; +} + # handle piping for decompression/merging my @pipe_argv; if ($paired) { From d83c579bb5bd5a1787f9790aedd069a64036395c Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 8 Nov 2017 11:43:44 -0500 Subject: [PATCH 097/105] Compute k-mer counts if not present --- src/classify.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/classify.cpp b/src/classify.cpp index f2ac91b..3b8a03e 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -153,7 +153,7 @@ int main(int argc, char **argv) { if (!TaxDB_file.empty()) { // TODO: Define if the taxDB has read counts or not!! - taxdb = TaxonomyDB(TaxDB_file, false); + taxdb = TaxonomyDB(TaxDB_file, false); Parent_map = taxdb.getParentMap(); } else { cerr << "TaxDB argument is required!" 
<< endl; @@ -226,12 +226,20 @@ int main(int argc, char **argv) { if (Print_kraken_report) { for (size_t i = 0; i < DB_filenames.size(); ++i) { - const auto& fname = DB_filenames[i]; - ifstream ifs(fname + ".counts"); + const auto fname = DB_filenames[i] + ".counts"; + ifstream ifs(fname); if (ifs.good()) { ifs.close(); - taxdb.readGenomeSizes(fname+".counts"); + } else { + ofstream ofs(fname); + cerr << "Writing kmer counts to " << fname << "... [only once for this database, may take a while] " << endl; + auto counts = KrakenDatabases[i]->count_taxons(); + for (auto it = counts.begin(); it != counts.end(); ++it) { + ofs << it->first << '\t' << it->second << '\n'; + } + ofs.close(); } + taxdb.readGenomeSizes(fname); } TaxReport rep = TaxReport(*Report_output, taxdb, taxon_counts, false); From 30a65384f600c2233ef7ec8b58422b34f6f0f522 Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 8 Nov 2017 11:46:26 -0500 Subject: [PATCH 098/105] Fix indent --- src/hyperloglogplus.h | 823 +++++++++++++++++++++--------------------- 1 file changed, 421 insertions(+), 402 deletions(-) diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h index b4d9a81..9274451 100644 --- a/src/hyperloglogplus.h +++ b/src/hyperloglogplus.h @@ -32,51 +32,52 @@ using namespace std; // experimentally determined threshold values for p - 4 static const uint32_t threshold[] = {10, 20, 40, 80, 220, 400, 900, 1800, 3100, - 6500, 11500, 20000, 50000, 120000, 350000}; + 6500, 11500, 20000, 50000, 120000, 350000}; /////////////////////// // /** - * gives the estimated cardinality for m bins, v of which are non-zero + * Gives the estimated cardinality for m bins, v of which are non-zero + * using linear counting of Whang et al., 1990: n_hat = -m ln(v) * @param m number of bins in the matrix * @param v number of non-zero bins * @return */ double linearCounting(uint32_t m, uint32_t v) { - if (v > m) { - throw std::invalid_argument("number of v should not be greater than m"); - } - double fm = double(m); - return fm * log(fm/double(v)); + if (v > m) { + throw std::invalid_argument("number of v should not be greater than m"); + } + return double(m) * log(double(m)/double(v)); } /** - * from Numerical Recipes, 3rd Edition, p 352 - * Returns hash of u as a 64-bit integer. - * -*/ + * from Numerical Recipes, 3rd Edition, p 352 + * Returns hash of u as a 64-bit integer. + * + */ inline uint64_t ranhash (uint64_t u) { uint64_t v = u * 3935559000370003845 + 2691343689449507681; - v ^= v >> 21; v ^= v << 37; v ^= v >> 4; - v *= 4768777513237032717; - v ^= v << 20; v ^= v >> 41; v ^= v << 5; return v; } +/** + * Avalanche mixer/finalizer from MurMurHash3 + * https://github.com/aappleby/smhasher + */ inline uint64_t murmurhash3_finalizer (uint64_t key) { - key += 1; // murmurhash returns a hash value of 0 for the key 0 - avoid that. - key ^= key >> 33; - key *= 0xff51afd7ed558ccd; - key ^= key >> 33; - key *= 0xc4ceb9fe1a85ec53; - key ^= key >> 33; - return key; + key += 1; // murmurhash returns a hash value of 0 for the key 0 - avoid that. 
+  key ^= key >> 33;
+  key *= 0xff51afd7ed558ccd;
+  key ^= key >> 33;
+  key *= 0xc4ceb9fe1a85ec53;
+  key ^= key >> 33;
+  return key;
 }

 /**
@@ -85,14 +86,14 @@ inline uint64_t murmurhash3_finalizer (uint64_t key) {
  * @return
  */
 double alpha(uint32_t m) {
-  switch (m) {
-  case 16: return 0.673;
-  case 32: return 0.697;
-  case 64: return 0.709;
-  }
-
-  // m >= 128
-  return 0.7213 / (1 + 1.079/double(m));
+  switch (m) {
+    case 16: return 0.673;
+    case 32: return 0.697;
+    case 64: return 0.709;
+  }
+
+  // m >= 128
+  return 0.7213 / (1 + 1.079/double(m));
 }

 /**
@@ -101,16 +102,16 @@ double alpha(uint32_t m) {
  * @return
  */
 double calculateEstimate(vector<uint8_t> array) {
-  double inverseSum = 0.0;
-  for (size_t i = 0; i < array.size(); ++i) {
-    // TODO: pre-calculate the power calculation
-    inverseSum += pow(2,-array[i]);
-  }
-  return alpha(array.size()) * double(array.size() * array.size()) * 1 / inverseSum;
+  double inverseSum = 0.0;
+  for (size_t i = 0; i < array.size(); ++i) {
+    // TODO: pre-calculate the power calculation
+    inverseSum += pow(2,-array[i]);
+  }
+  return alpha(array.size()) * double(array.size() * array.size()) * 1 / inverseSum;
 }

 uint32_t countZeros(vector<uint8_t> s) {
-  return (uint32_t)count(s.begin(), s.end(), 0);
+  return (uint32_t)count(s.begin(), s.end(), 0);
 }

 /**
@@ -128,9 +129,9 @@ T extractBits(T value, uint8_t hi, uint8_t lo, bool shift_left = false) {
   // ((T(1) << (hi - lo) - 1)                 1's from position 0 to position (hi-lo-1)
   // (((T(1) << (hi - lo)) - 1) << lo)        1's from position lo to position hi
-  // The T(1) is required to not cause overflow on 32bit machines
-  // TODO: consider creating a bitmask only once in the beginning
-  T bitmask = (((T(1) << (hi - lo)) - 1) << lo);
+  // The T(1) is required to not cause overflow on 32bit machines
+  // TODO: consider creating a bitmask only once in the beginning
+  T bitmask = (((T(1) << (hi - lo)) - 1) << lo);
   T result = value & bitmask;

   if (!shift_left) {
@@ -140,15 +141,31 @@ T extractBits(T value, uint8_t hi, uint8_t lo, bool shift_left = false) {
     // shift resulting bits to the left
     result = result << (sizeof(T)*8 - hi);
   }
-  return result;
+  return result;
+}
+
+inline
+void insert_hash(vector<uint32_t>& vec, uint32_t val) {
+  auto it = std::lower_bound( vec.begin(), vec.end(), val); // find insertion position in the ascending-sorted vector
+  if (it == vec.end() || *it != val) {
+    vec.insert( it, val ); // insert before iterator it, unless val is already present
+  }
+}
+
+inline
+void merge_lists(vector<uint32_t>& vec1, const vector<uint32_t>& vec2) {
+  for (const auto val : vec2) {
+    insert_hash(vec1, val); // add each value of vec2 at its sorted position in vec1
+  }
+}

 template<typename T>
 T extractBits(T bits, uint8_t hi) {
   // create a bitmask for first hi bits (LSB 0 numbering)
-  T bitmask = T(-1) << (sizeof(T)*8 - hi);
+  T bitmask = T(-1) << (sizeof(T)*8 - hi);

-  return (bits & bitmask);
+  return (bits & bitmask);
 }

 // functions for counting the number of leading 0-bits (clz)
@@ -174,7 +191,7 @@ static int clz_manual(uint64_t x)
 #endif

 inline uint32_t clz(const uint32_t x) {
-  return __builtin_clz(x);
+  return __builtin_clz(x);
 }

 inline uint32_t clz(const uint64_t x) {
@@ -189,7 +206,7 @@ inline uint32_t clz(const uint64_t x) {
 //#else
 uint32_t clz_log2(const uint64_t w) {
-  return 63 - floor(log2(w));
+  return 63 - floor(log2(w));
 }
 //#endif

@@ -198,7 +215,7 @@ uint32_t clz_log2(const uint64_t w) {
 // see Heule et al., section 5.3.2
 // Also, using sets might give a larger overhead as each insertion costs more
 // consider using vector and sort/unique when merging.
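// Editor's sketch, not part of the patch: the sorted-vector idiom used by
// insert_hash()/merge_lists() above, shown standalone. A sorted,
// duplicate-free vector<uint32_t> gives the same membership semantics as
// std::set<uint32_t> with less per-element memory overhead. All names below
// are illustrative only.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

static void insert_sorted_unique(std::vector<uint32_t>& vec, uint32_t val) {
  // lower_bound finds the first element that is >= val in the sorted vector
  auto it = std::lower_bound(vec.begin(), vec.end(), val);
  if (it == vec.end() || *it != val)  // skip val if it is already present
    vec.insert(it, val);              // inserting at `it` keeps vec sorted
}

int main() {
  const uint32_t xs[] = {9, 2, 5, 5, 7};
  std::vector<uint32_t> v;
  for (uint32_t x : xs) insert_sorted_unique(v, x);
  for (uint32_t x : v) std::cout << x << ' ';  // prints: 2 5 7 9
  std::cout << '\n';
  return 0;
}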
-typedef set SparseListType; +typedef vector SparseListType; typedef uint64_t HashSize; /** @@ -212,233 +229,236 @@ class HyperLogLogPlusMinus { private: - vector M; // registers (M) of size m - uint8_t p; // precision - uint32_t m; // number of registers - bool sparse; // sparse representation of the data? - SparseListType sparseList; // TODO: use a compressed list instead + vector M; // registers (M) of size m + uint8_t p; // precision + uint32_t m; // number of registers + bool sparse; // sparse representation of the data? + SparseListType sparseList; // TODO: use a compressed list instead - // vectors containing data for bias correction - vector > rawEstimateData; // TODO: make this static - vector > biasData; + // vectors containing data for bias correction + vector > rawEstimateData; // TODO: make this static + vector > biasData; - // sparse versions of p and m - static const uint8_t pPrime = 25; // precision when using a sparse representation - // fixed to 25, because 25 + 6 bits for rank + 1 flag bit = 32 - static const uint32_t mPrime = 1 << (pPrime -1); // 2^pPrime + // sparse versions of p and m + static const uint8_t pPrime = 25; // precision when using a sparse representation + // fixed to 25, because 25 + 6 bits for rank + 1 flag bit = 32 + static const uint32_t mPrime = 1 << (pPrime -1); // 2^pPrime public: - ~HyperLogLogPlusMinus() {}; - - /** - * Create new HyperLogLogPlusMinus counter - * @param precision - * @param sparse - */ - HyperLogLogPlusMinus(uint8_t precision=12, bool sparse=true):p(precision),sparse(sparse) { - if (precision > 18 || precision < 4) { - throw std::invalid_argument("precision (number of register = 2^precision) must be between 4 and 18"); - } - - this->m = 1 << precision; - - if (sparse) { - this->sparseList = SparseListType(); // TODO: if SparseListType is changed, initialize with appropriate size - } else { - this->M = vector(m); - } - } - - /** - * Add a new item to the counter. - * @param item - */ - void add(T_KEY item) { - add(item, sizeof(T_KEY)); - } - - /** - * Add a new item to the counter. - * @param item - * @param size size of item - */ - void add(T_KEY item, size_t size) { - - // compute hash for item - HashSize hash_value = murmurhash3_finalizer(item); + ~HyperLogLogPlusMinus() {}; + + /** + * Create new HyperLogLogPlusMinus counter + * @param precision + * @param sparse + */ + HyperLogLogPlusMinus(uint8_t precision=12, bool sparse=true):p(precision),sparse(sparse) { + if (precision > 18 || precision < 4) { + throw std::invalid_argument("precision (number of register = 2^precision) must be between 4 and 18"); + } + + this->m = 1 << precision; + + if (sparse) { + this->sparseList = SparseListType(); // TODO: if SparseListType is changed, initialize with appropriate size + } else { + this->M = vector(m); + } + } + + /** + * Add a new item to the counter. + * @param item + */ + void add(T_KEY item) { + add(item, sizeof(T_KEY)); + } + + /** + * Add a new item to the counter. 
+ * @param item + * @param size size of item + */ + void add(T_KEY item, size_t size) { + + // compute hash for item + HashSize hash_value = murmurhash3_finalizer(item); #ifdef HLL_DEBUG - cerr << "Value: " << item << "; hash(value): " << hash_value << endl; - cerr << bitset<64>(hash_value) << endl; + cerr << "Value: " << item << "; hash(value): " << hash_value << endl; + cerr << bitset<64>(hash_value) << endl; #endif - if (sparse) { - // sparse mode: put the encoded hash into sparse list - uint32_t encoded_hash_value = encodeHashIn32Bit(hash_value); - this->sparseList.insert(encoded_hash_value); + if (sparse) { + // sparse mode: put the encoded hash into sparse list + uint32_t encoded_hash_value = encodeHashIn32Bit(hash_value); + insert_hash(sparseList, encoded_hash_value); #ifdef HLL_DEBUG - idx_n_rank ir = getIndexAndRankFromEncodedHash(encoded_hash_value); - assert_eq(ir.idx,get_index(hash_value, p)); - assert_eq(ir.rank, get_rank(hash_value, p)); + idx_n_rank ir = getIndexAndRankFromEncodedHash(encoded_hash_value); + assert_eq(ir.idx,get_index(hash_value, p)); + assert_eq(ir.rank, get_rank(hash_value, p)); #endif - // if the sparseList is too large, switch to normal (register) representation - if (this->sparseList.size() > this->m) { // TODO: is the size of m correct? - switchToNormalRepresentation(); - } - } else { - // normal mode - // take first p bits as index {x63,...,x64-p} - uint32_t idx = get_index(hash_value, p); - // shift those p values off, and count leading zeros of the remaining string {x63-p,...,x0} - uint8_t rank = get_rank(hash_value, p); - - // update the register if current rank is bigger - if (rank > this->M[idx]) { - this->M[idx] = rank; - } - } - } - - void add(vector words) { - for(size_t i = 0; i < words.size(); ++i) { - this->add(words[i]); - } - } - - /** - * Reset to its initial state. - */ - void reset() { - this->sparse = true; - this->sparseList.clear(); // - this->M.clear(); - } - - /** - * Convert from sparse representation (using tmpSet and sparseList) to normal (using register) - */ - void switchToNormalRepresentation() { + // if the sparseList is too large, switch to normal (register) representation + if (this->sparseList.size() > this->m) { // TODO: is the size of m correct? + switchToNormalRepresentation(); + } + } else { + // normal mode + // take first p bits as index {x63,...,x64-p} + uint32_t idx = get_index(hash_value, p); + // shift those p values off, and count leading zeros of the remaining string {x63-p,...,x0} + uint8_t rank = get_rank(hash_value, p); + + // update the register if current rank is bigger + if (rank > this->M[idx]) { + this->M[idx] = rank; + } + } + } + + void add(vector words) { + for(size_t i = 0; i < words.size(); ++i) { + this->add(words[i]); + } + } + + /** + * Reset to its initial state. + */ + void reset() { + this->sparse = true; + this->sparseList.clear(); // + this->M.clear(); + } + + /** + * Convert from sparse representation (using tmpSet and sparseList) to normal (using register) + */ + void switchToNormalRepresentation() { #ifdef HLL_DEBUG - cerr << "switching to normal representation" << endl; - cerr << " est before: " << cardinality(true) << endl; + cerr << "switching to normal representation" << endl; + cerr << " est before: " << cardinality(true) << endl; #endif - this->sparse = false; - this->M = vector(this->m); - if (sparseList.size() > 0) { //TDOD: do I need to check this, here? 
- addToRegisters(this->sparseList); - this->sparseList.clear(); - } + this->sparse = false; + this->M = vector(this->m); + if (sparseList.size() > 0) { //TDOD: do I need to check this, here? + addToRegisters(this->sparseList); + this->sparseList.clear(); + } #ifdef HLL_DEBUG - cerr << " est after: " << cardinality(true) << endl; + cerr << " est after: " << cardinality(true) << endl; #endif - } - - /** - * add sparseList to the registers of M - */ - void addToRegisters(const SparseListType &sparseList) { - if (sparseList.size() == 0) { - return; - } - for (SparseListType::const_iterator encoded_hash_value_ptr = sparseList.begin(); encoded_hash_value_ptr != sparseList.end(); ++encoded_hash_value_ptr) { - - idx_n_rank ir = getIndexAndRankFromEncodedHash(*encoded_hash_value_ptr); - - assert_lt(ir.idx,M.size()); - if (ir.rank > this->M[ir.idx]) { - this->M[ir.idx] = ir.rank; - } - } - } - - /** - * Merge another HyperLogLogPlusMinus into this. Converts to normal representation - * @param other - */ - void merge(const HyperLogLogPlusMinus* other) { - if (this->p != other->p) { - throw std::invalid_argument("precisions must be equal"); - } - - if (this->sparse && other->sparse) { - if (this->sparseList.size()+other->sparseList.size() > this->m) { - switchToNormalRepresentation(); - addToRegisters(other->sparseList); - } else { - this->sparseList.insert(other->sparseList.begin(),other->sparseList.end()); - } - } else if (other->sparse) { - // other is sparse, but this is not - addToRegisters(other->sparseList); - } else { - if (this->sparse) { - switchToNormalRepresentation(); - } - - // merge registers - for (size_t i = 0; i < other->M.size(); ++i) { - if (other->M[i] > this->M[i]) { - this->M[i] = other->M[i]; - } - } - } - } - - HyperLogLogPlusMinus & operator+=(const HyperLogLogPlusMinus* other) { - merge(other); - return *this; - } - - HyperLogLogPlusMinus & operator+=(const HyperLogLogPlusMinus& other) { - merge(&other); - return *this; - } - - /** - * - * @return cardinality estimate - */ - uint64_t cardinality(bool verbose=true) { - if (sparse) { - // if we are still 'sparse', then use linear counting, which is more - // accurate for low cardinalities, and use increased precision pPrime - return uint64_t(linearCounting(mPrime, mPrime-uint32_t(sparseList.size()))); - } - - // initialize bias correction data - if (rawEstimateData.empty()) { initRawEstimateData(); } - if (biasData.empty()) { initBiasData(); } - - // calculate raw estimate on registers - //double est = alpha(m) * harmonicMean(M, m); - double est = calculateEstimate(M); - - // correct for biases if estimate is smaller than 5m - if (est <= double(m)*5.0) { - est -= getEstimateBias(est); - } - - uint32_t v = countZeros(M); - if (v > 2) { - // calculate linear counting (lc) estimate if there are more than 2 zeros in the matrix - double lc_estimate = linearCounting(m, v); - - // check if the lc estimate is below the threshold - if (lc_estimate <= double(threshold[p-4])) { - if (lc_estimate < 0) { throw; } - // return lc estimate of cardinality - return lc_estimate; - } - return lc_estimate; // always use lc_estimate when available - } - - // return bias-corrected hyperloglog estimate of cardinality - return uint64_t(est); - } + } + + /** + * add sparseList to the registers of M + */ + void addToRegisters(const SparseListType &sparseList) { + if (sparseList.size() == 0) { + return; + } + for (SparseListType::const_iterator encoded_hash_value_ptr = sparseList.begin(); encoded_hash_value_ptr != sparseList.end(); 
++encoded_hash_value_ptr) { + + idx_n_rank ir = getIndexAndRankFromEncodedHash(*encoded_hash_value_ptr); + + assert_lt(ir.idx,M.size()); + if (ir.rank > this->M[ir.idx]) { + this->M[ir.idx] = ir.rank; + } + } + } + + /** + * Merge another HyperLogLogPlusMinus into this. Converts to normal representation + * @param other + */ + void merge(const HyperLogLogPlusMinus* other) { + if (this->p != other->p) { + throw std::invalid_argument("precisions must be equal"); + } + + if (this->sparse && other->sparse) { + if (this->sparseList.size()+other->sparseList.size() > this->m) { + // TODO: this switches to normal representation too soon if there is duplication + switchToNormalRepresentation(); + addToRegisters(other->sparseList); + } else { + + for (const auto val : other->sparseList) { + insert_hash(this->sparseList, val); + } + } + } else if (other->sparse) { + // other is sparse, but this is not + addToRegisters(other->sparseList); + } else { + if (this->sparse) { + switchToNormalRepresentation(); + } + // merge registers + for (size_t i = 0; i < other->M.size(); ++i) { + if (other->M[i] > this->M[i]) { + this->M[i] = other->M[i]; + } + } + } + } + + HyperLogLogPlusMinus & operator+=(const HyperLogLogPlusMinus* other) { + merge(other); + return *this; + } + + HyperLogLogPlusMinus & operator+=(const HyperLogLogPlusMinus& other) { + merge(&other); + return *this; + } + + /** + * + * @return cardinality estimate + */ + uint64_t cardinality(bool verbose=true) { + if (sparse) { + // if we are still 'sparse', then use linear counting, which is more + // accurate for low cardinalities, and use increased precision pPrime + return uint64_t(linearCounting(mPrime, mPrime-uint32_t(sparseList.size()))); + } + + // initialize bias correction data + if (rawEstimateData.empty()) { initRawEstimateData(); } + if (biasData.empty()) { initBiasData(); } + + // calculate raw estimate on registers + //double est = alpha(m) * harmonicMean(M, m); + double est = calculateEstimate(M); + + // correct for biases if estimate is smaller than 5m + if (est <= double(m)*5.0) { + est -= getEstimateBias(est); + } + + uint32_t v = countZeros(M); + if (v > 2) { + // calculate linear counting (lc) estimate if there are more than 2 zeros in the matrix + double lc_estimate = linearCounting(m, v); + + // check if the lc estimate is below the threshold + if (lc_estimate <= double(threshold[p-4])) { + if (lc_estimate < 0) { throw; } + // return lc estimate of cardinality + return lc_estimate; + } + return lc_estimate; // always use lc_estimate when available + } + + // return bias-corrected hyperloglog estimate of cardinality + return uint64_t(est); + } private: @@ -452,10 +472,10 @@ class HyperLogLogPlusMinus { } template inline uint32_t get_index(const T hash_value, const uint8_t p, const uint8_t size) const { - // take first p bits as index {x63,...,x64-p} - assert_lt(p,size); - uint32_t idx = hash_value >> (size - p); - return idx; + // take first p bits as index {x63,...,x64-p} + assert_lt(p,size); + uint32_t idx = hash_value >> (size - p); + return idx; } inline uint32_t get_index(const uint64_t hash_value, const uint8_t p) const { @@ -463,167 +483,166 @@ class HyperLogLogPlusMinus { } inline uint32_t get_index(const uint32_t hash_value, const uint8_t p) const { - return get_index(hash_value, p, 32); + return get_index(hash_value, p, 32); } template inline - T get_trailing_ones(const uint8_t p) const { - return (T(1) << p ) - 1; + T get_trailing_ones(const uint8_t p) const { + return (T(1) << p ) - 1; } template inline uint8_t 
get_rank(const T hash_value, const uint8_t p) const { - // shift p values off, and count leading zeros of the remaining string {x63-p,...,x0} - T_KEY rank_bits = (hash_value << p | get_trailing_ones(p)); + // shift p values off, and count leading zeros of the remaining string {x63-p,...,x0} + T_KEY rank_bits = (hash_value << p | get_trailing_ones(p)); #ifdef HLL_DEBUG - cerr << "rank bits: " << bitset<32>(rank_bits) << endl; + cerr << "rank bits: " << bitset<32>(rank_bits) << endl; #endif - uint8_t rank_val = (uint8_t) (clz(rank_bits)) + 1; - assert_leq(rank_val,64-p+1); - return rank_val; + uint8_t rank_val = (uint8_t) (clz(rank_bits)) + 1; + assert_leq(rank_val,64-p+1); + return rank_val; } - void initRawEstimateData() { - rawEstimateData = vector >(); - - rawEstimateData.push_back(vector(rawEstimateData_precision4,arr_len(rawEstimateData_precision4))); - rawEstimateData.push_back(vector(rawEstimateData_precision5,arr_len(rawEstimateData_precision5))); - rawEstimateData.push_back(vector(rawEstimateData_precision6,arr_len(rawEstimateData_precision6))); - rawEstimateData.push_back(vector(rawEstimateData_precision7,arr_len(rawEstimateData_precision7))); - rawEstimateData.push_back(vector(rawEstimateData_precision8,arr_len(rawEstimateData_precision8))); - rawEstimateData.push_back(vector(rawEstimateData_precision9,arr_len(rawEstimateData_precision9))); - rawEstimateData.push_back(vector(rawEstimateData_precision10,arr_len(rawEstimateData_precision10))); - rawEstimateData.push_back(vector(rawEstimateData_precision11,arr_len(rawEstimateData_precision11))); - rawEstimateData.push_back(vector(rawEstimateData_precision12,arr_len(rawEstimateData_precision12))); - rawEstimateData.push_back(vector(rawEstimateData_precision13,arr_len(rawEstimateData_precision13))); - rawEstimateData.push_back(vector(rawEstimateData_precision14,arr_len(rawEstimateData_precision14))); - rawEstimateData.push_back(vector(rawEstimateData_precision15,arr_len(rawEstimateData_precision15))); - rawEstimateData.push_back(vector(rawEstimateData_precision16,arr_len(rawEstimateData_precision16))); - rawEstimateData.push_back(vector(rawEstimateData_precision17,arr_len(rawEstimateData_precision17))); - rawEstimateData.push_back(vector(rawEstimateData_precision18,arr_len(rawEstimateData_precision18))); - - } - - void initBiasData() { - biasData = vector >(); - - biasData.push_back(vector(biasData_precision4,arr_len(biasData_precision4))); - biasData.push_back(vector(biasData_precision5,arr_len(biasData_precision5))); - biasData.push_back(vector(biasData_precision6,arr_len(biasData_precision6))); - biasData.push_back(vector(biasData_precision7,arr_len(biasData_precision7))); - biasData.push_back(vector(biasData_precision8,arr_len(biasData_precision8))); - biasData.push_back(vector(biasData_precision9,arr_len(biasData_precision9))); - biasData.push_back(vector(biasData_precision10,arr_len(biasData_precision10))); - biasData.push_back(vector(biasData_precision11,arr_len(biasData_precision11))); - biasData.push_back(vector(biasData_precision12,arr_len(biasData_precision12))); - biasData.push_back(vector(biasData_precision13,arr_len(biasData_precision13))); - biasData.push_back(vector(biasData_precision14,arr_len(biasData_precision14))); - biasData.push_back(vector(biasData_precision15,arr_len(biasData_precision15))); - biasData.push_back(vector(biasData_precision16,arr_len(biasData_precision16))); - biasData.push_back(vector(biasData_precision17,arr_len(biasData_precision17))); - 
biasData.push_back(vector(biasData_precision18,arr_len(biasData_precision18))); - } - - /** - * Estimate the bias using empirically determined values. - * Uses weighted average of the two cells between which the estimate falls. - * TODO: Check if nearest neighbor average gives better values, as proposed in the paper - * @param est - * @return correction value for - */ - double getEstimateBias(double estimate) { - vector rawEstimateTable = rawEstimateData[p-4]; - vector biasTable = biasData[p-4]; - - // check if estimate is lower than first entry, or larger than last - if (rawEstimateTable.front() >= estimate) { return rawEstimateTable.front() - biasTable.front(); } - if (rawEstimateTable.back() <= estimate) { return rawEstimateTable.back() - biasTable.back(); } - - // get iterator to first element that is not smaller than estimate - vector::const_iterator it = lower_bound(rawEstimateTable.begin(),rawEstimateTable.end(),estimate); - size_t pos = it - rawEstimateTable.begin(); - - double e1 = rawEstimateTable[pos-1]; - double e2 = rawEstimateTable[pos]; - - double c = (estimate - e1) / (e2 - e1); - - return biasTable[pos-1]*(1-c) + biasTable[pos]*c; - } - - - /** - * Encode the 64-bit hash code x as an 32-bit integer, to be used in the sparse representation. - * - * Difference from the algorithm described in the paper: - * The index always is in the p most significant bits - * - * see section 5.3 in Heule et al. - * @param x the hash bits - * @return encoded hash value - */ - uint32_t encodeHashIn32Bit(uint64_t hash_value) { - // extract first pPrime bits, and shift them onto a 32-bit integer - uint32_t idx = (uint32_t)(extractBits(hash_value,pPrime) >> 32); + void initRawEstimateData() { + rawEstimateData = vector >(); + + rawEstimateData.push_back(vector(rawEstimateData_precision4,arr_len(rawEstimateData_precision4))); + rawEstimateData.push_back(vector(rawEstimateData_precision5,arr_len(rawEstimateData_precision5))); + rawEstimateData.push_back(vector(rawEstimateData_precision6,arr_len(rawEstimateData_precision6))); + rawEstimateData.push_back(vector(rawEstimateData_precision7,arr_len(rawEstimateData_precision7))); + rawEstimateData.push_back(vector(rawEstimateData_precision8,arr_len(rawEstimateData_precision8))); + rawEstimateData.push_back(vector(rawEstimateData_precision9,arr_len(rawEstimateData_precision9))); + rawEstimateData.push_back(vector(rawEstimateData_precision10,arr_len(rawEstimateData_precision10))); + rawEstimateData.push_back(vector(rawEstimateData_precision11,arr_len(rawEstimateData_precision11))); + rawEstimateData.push_back(vector(rawEstimateData_precision12,arr_len(rawEstimateData_precision12))); + rawEstimateData.push_back(vector(rawEstimateData_precision13,arr_len(rawEstimateData_precision13))); + rawEstimateData.push_back(vector(rawEstimateData_precision14,arr_len(rawEstimateData_precision14))); + rawEstimateData.push_back(vector(rawEstimateData_precision15,arr_len(rawEstimateData_precision15))); + rawEstimateData.push_back(vector(rawEstimateData_precision16,arr_len(rawEstimateData_precision16))); + rawEstimateData.push_back(vector(rawEstimateData_precision17,arr_len(rawEstimateData_precision17))); + rawEstimateData.push_back(vector(rawEstimateData_precision18,arr_len(rawEstimateData_precision18))); + + } + + void initBiasData() { + biasData = vector >(); + + biasData.push_back(vector(biasData_precision4,arr_len(biasData_precision4))); + biasData.push_back(vector(biasData_precision5,arr_len(biasData_precision5))); + 
biasData.push_back(vector(biasData_precision6,arr_len(biasData_precision6))); + biasData.push_back(vector(biasData_precision7,arr_len(biasData_precision7))); + biasData.push_back(vector(biasData_precision8,arr_len(biasData_precision8))); + biasData.push_back(vector(biasData_precision9,arr_len(biasData_precision9))); + biasData.push_back(vector(biasData_precision10,arr_len(biasData_precision10))); + biasData.push_back(vector(biasData_precision11,arr_len(biasData_precision11))); + biasData.push_back(vector(biasData_precision12,arr_len(biasData_precision12))); + biasData.push_back(vector(biasData_precision13,arr_len(biasData_precision13))); + biasData.push_back(vector(biasData_precision14,arr_len(biasData_precision14))); + biasData.push_back(vector(biasData_precision15,arr_len(biasData_precision15))); + biasData.push_back(vector(biasData_precision16,arr_len(biasData_precision16))); + biasData.push_back(vector(biasData_precision17,arr_len(biasData_precision17))); + biasData.push_back(vector(biasData_precision18,arr_len(biasData_precision18))); + } + + /** + * Estimate the bias using empirically determined values. + * Uses weighted average of the two cells between which the estimate falls. + * TODO: Check if nearest neighbor average gives better values, as proposed in the paper + * @param est + * @return correction value for + */ + double getEstimateBias(double estimate) { + vector rawEstimateTable = rawEstimateData[p-4]; + vector biasTable = biasData[p-4]; + + // check if estimate is lower than first entry, or larger than last + if (rawEstimateTable.front() >= estimate) { return rawEstimateTable.front() - biasTable.front(); } + if (rawEstimateTable.back() <= estimate) { return rawEstimateTable.back() - biasTable.back(); } + + // get iterator to first element that is not smaller than estimate + vector::const_iterator it = lower_bound(rawEstimateTable.begin(),rawEstimateTable.end(),estimate); + size_t pos = it - rawEstimateTable.begin(); + + double e1 = rawEstimateTable[pos-1]; + double e2 = rawEstimateTable[pos]; + + double c = (estimate - e1) / (e2 - e1); + + return biasTable[pos-1]*(1-c) + biasTable[pos]*c; + } + + + /** + * Encode the 64-bit hash code x as an 32-bit integer, to be used in the sparse representation. + * + * Difference from the algorithm described in the paper: + * The index always is in the p most significant bits + * + * see section 5.3 in Heule et al. + * @param x the hash bits + * @return encoded hash value + */ + uint32_t encodeHashIn32Bit(uint64_t hash_value) { + // extract first pPrime bits, and shift them onto a 32-bit integer + uint32_t idx = (uint32_t)(extractBits(hash_value,pPrime) >> 32); #ifdef HLL_DEBUG - cerr << "value: " << bitset<64>(hash_value) << endl; + cerr << "value: " << bitset<64>(hash_value) << endl; cerr << "index: " << std::bitset<32>(idx) << " ( bits from 64 to " << 64-pPrime << "; " << idx << ")" << endl; #endif - // are the bits {63-p, ..., 63-p'} all 0? - if (extractBits(hash_value, 64-this->p, 64-pPrime) == 0) { - // compute the additional rank (minimum rank is already p'-p) - // the maximal size will be below 2^6=64. 
We thus combine the 25 bits of the index with 6 bits for the rank, and one bit as flag - uint8_t additional_rank = get_rank(hash_value, pPrime); // this is rank - (p'-p), as we know that positions p'...p are 0 - return idx | uint32_t(additional_rank<<1) | 1; - } else { - // else, return the idx, only - it has enough length to calculate the rank (left-shifted, last bit = 0) - assert_eq((idx & 1),0); - return idx; - } - } - - - /** - * struct holding the index and rank/rho of an entry - */ - struct idx_n_rank { - uint32_t idx; - uint8_t rank; - idx_n_rank(uint32_t _idx, uint8_t _rank) : idx(_idx), rank(_rank) {} - }; - - // - // - /** - * Decode a hash from the sparse representation. - * Returns the index and number of leading zeros (nlz) with precision p stored in k - * @param k the hash bits - * @return index and rank in non-sparse format - */ - idx_n_rank getIndexAndRankFromEncodedHash(const uint32_t encoded_hash_value) const { - - // difference to paper: Index can be recovered in the same way for pPrime and normally encoded hashes - uint32_t idx = get_index(encoded_hash_value, p); - uint8_t rank_val; - - // check if the last bit is 1 - if ( (encoded_hash_value & 1) == 1) { - // if yes: the hash was stored with higher precision, bits p to pPrime were 0 - uint8_t additional_rank = pPrime - p; - rank_val = additional_rank + extractBits(encoded_hash_value, 7, 1); - } else { - rank_val = get_rank(encoded_hash_value,p); - - // clz counts 64 bit only, it seems - if (rank_val > 32) - rank_val -= 32; - } - - return(idx_n_rank(idx,rank_val)); - } + // are the bits {63-p, ..., 63-p'} all 0? + if (extractBits(hash_value, 64-this->p, 64-pPrime) == 0) { + // compute the additional rank (minimum rank is already p'-p) + // the maximal size will be below 2^6=64. We thus combine the 25 bits of the index with 6 bits for the rank, and one bit as flag + uint8_t additional_rank = get_rank(hash_value, pPrime); // this is rank - (p'-p), as we know that positions p'...p are 0 + return idx | uint32_t(additional_rank<<1) | 1; + } else { + // else, return the idx, only - it has enough length to calculate the rank (left-shifted, last bit = 0) + assert_eq((idx & 1),0); + return idx; + } + } + + + /** + * struct holding the index and rank/rho of an entry + */ + struct idx_n_rank { + uint32_t idx; + uint8_t rank; + idx_n_rank(uint32_t _idx, uint8_t _rank) : idx(_idx), rank(_rank) {} + }; + + // + // + /** + * Decode hash from sparse representation. 
+ * Returns the index and number of leading zeros (nlz) with precision p stored in k + * @return index and rank in non-sparse format + */ + idx_n_rank getIndexAndRankFromEncodedHash(const uint32_t encoded_hash_value) const { + + // difference to paper: Index can be recovered in the same way for pPrime and normally encoded hashes + uint32_t idx = get_index(encoded_hash_value, p); + uint8_t rank_val; + + // check if the last bit is 1 + if ( (encoded_hash_value & 1) == 1) { + // if yes: the hash was stored with higher precision, bits p to pPrime were 0 + uint8_t additional_rank = pPrime - p; + rank_val = additional_rank + extractBits(encoded_hash_value, 7, 1); + } else { + rank_val = get_rank(encoded_hash_value,p); + + // clz counts 64 bit only, it seems + if (rank_val > 32) + rank_val -= 32; + } + + return(idx_n_rank(idx,rank_val)); + } }; From f34f8d4722aa8729b4ba3dca21c9a9ebc05893cd Mon Sep 17 00:00:00 2001 From: Florian Breitwieser Date: Wed, 8 Nov 2017 21:34:00 -0500 Subject: [PATCH 099/105] Fix gzstream compilation, update on HLL --- src/Makefile | 10 ++- src/classify.cpp | 60 +++++++-------- src/gzstream/Makefile | 2 +- src/gzstream/index.html | 145 ------------------------------------- src/gzstream/libgzstream.a | Bin 9648 -> 0 bytes src/gzstream/logo.gif | Bin 1651 -> 0 bytes src/hyperloglogbias.h | 4 +- src/hyperloglogplus.h | 127 +++++++++++++++----------------- 8 files changed, 98 insertions(+), 250 deletions(-) delete mode 100644 src/gzstream/index.html delete mode 100644 src/gzstream/libgzstream.a delete mode 100644 src/gzstream/logo.gif diff --git a/src/Makefile b/src/Makefile index cc84b11..d236de3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,9 +1,10 @@ CXX = g++ FOPENMP?=-fopenmp -CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -O2 -Wfatal-errors ${CPPFLAGS} +CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -I./gzstream -O2 -Wfatal-errors ${CPPFLAGS} #CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors PROGS = classify db_sort set_lcas make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping -LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream ${LDFLAGS} +#LIBFLAGS = -L. -lz -lgzstream ${LDFLAGS} +LIBFLAGS = -L. 
-lz ${LDFLAGS} .PHONY: all install clean @@ -25,7 +26,7 @@ grade_classification: taxdb.h report-cols.h read_uid_mapping: quickfile.o -classify: classify.cpp krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o hyperloglogplus.h taxdb.h report-cols.h +classify: classify.cpp krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o gzstream.o hyperloglogplus.h taxdb.h report-cols.h $(CXX) $(CXXFLAGS) -o classify $^ $(LIBFLAGS) build_taxdb: taxdb.h report-cols.h quickfile.o @@ -43,6 +44,9 @@ krakendb.o: krakendb.cpp krakendb.hpp quickfile.hpp seqreader.o: seqreader.cpp seqreader.hpp quickfile.hpp $(CXX) $(CXXFLAGS) -c seqreader.cpp +gzstream.o: gzstream/gzstream.C gzstream/gzstream.h + $(CXX) $(CXXFLAGS) -c -O gzstream/gzstream.C + quickfile.o: quickfile.cpp quickfile.hpp $(CXX) $(CXXFLAGS) -c quickfile.cpp diff --git a/src/classify.cpp b/src/classify.cpp index 3b8a03e..2ceca19 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -114,19 +114,19 @@ ostream* cout_or_file(string file) { } void loadKrakenDB(KrakenDB& database, string DB_filename, string Index_filename) { - QuickFile db_file; - db_file.open_file(DB_filename); - if (Populate_memory) { - db_file.load_file(); - } - database = KrakenDB(db_file.ptr()); - QuickFile idx_file; - idx_file.open_file(Index_filename); - if (Populate_memory) - idx_file.load_file(); - - KrakenDBIndex db_index(idx_file.ptr()); - database.set_index(&db_index); + QuickFile db_file; + db_file.open_file(DB_filename); + if (Populate_memory) { + db_file.load_file(); + } + database = KrakenDB(db_file.ptr()); + QuickFile idx_file; + idx_file.open_file(Index_filename); + if (Populate_memory) + idx_file.load_file(); + + KrakenDBIndex db_index(idx_file.ptr()); + database.set_index(&db_index); } int main(int argc, char **argv) { @@ -242,18 +242,18 @@ int main(int argc, char **argv) { taxdb.readGenomeSizes(fname); } - TaxReport rep = TaxReport(*Report_output, taxdb, taxon_counts, false); - rep.setReportCols(vector { - "%", - "reads", + TaxReport rep = TaxReport(*Report_output, taxdb, taxon_counts, false); + rep.setReportCols(vector { + "%", + "reads", "taxReads", "kmers", "dup", "cov", - "taxID", - "rank", - "taxName"}); - rep.printReport("kraken","blu"); + "taxID", + "rank", + "taxName"}); + rep.printReport("kraken","blu"); } for (size_t i = 0; i < Open_fstreams.size(); ++i) { @@ -367,11 +367,11 @@ void process_file(char *filename) { inline uint32_t get_taxon_for_kmer(KrakenDB& database, uint64_t* kmer_ptr, uint64_t& current_bin_key, - int64_t& current_min_pos, int64_t& current_max_pos) { - uint32_t* val_ptr = database.kmer_query( - database.canonical_representation(*kmer_ptr), ¤t_bin_key, - ¤t_min_pos, ¤t_max_pos); - return val_ptr ? *val_ptr : 0; + int64_t& current_min_pos, int64_t& current_max_pos) { + uint32_t* val_ptr = database.kmer_query( + database.canonical_representation(*kmer_ptr), ¤t_bin_key, + ¤t_min_pos, ¤t_max_pos); + return val_ptr ? *val_ptr : 0; } @@ -512,10 +512,10 @@ bool classify_sequence(DNASequence &dna, ostringstream &koss, taxon = get_taxon_for_kmer(*KrakenDatabases[i], kmer_ptr, db_statuses[i].current_bin_key, db_statuses[i].current_min_pos, db_statuses[i].current_max_pos); - //uint32_t* val_ptr = KrakenDatabases[i]->kmer_query( - // KrakenDatabases[i]->canonical_representation(*kmer_ptr), &db_statuses[i].current_bin_key, - // &db_statuses[i].current_min_pos, &db_statuses[i].current_max_pos); - //taxon = val_ptr ? 
*val_ptr : 0; + //uint32_t* val_ptr = KrakenDatabases[i]->kmer_query( + // KrakenDatabases[i]->canonical_representation(*kmer_ptr), &db_statuses[i].current_bin_key, + // &db_statuses[i].current_min_pos, &db_statuses[i].current_max_pos); + //taxon = val_ptr ? *val_ptr : 0; if (taxon) break; } diff --git a/src/gzstream/Makefile b/src/gzstream/Makefile index 8c21da1..4c32088 100644 --- a/src/gzstream/Makefile +++ b/src/gzstream/Makefile @@ -33,7 +33,7 @@ # ---------------------------------------------------------------------------- # CXX = CC -n32 -LANG:std # for SGI Irix 6.5, MIPSpro CC version 7.30 -CXX = g++-7 # for Linux RedHat 6.1, g++ version 2.95.2 +CXX = g++ # for Linux RedHat 6.1, g++ version 2.95.2 CPPFLAGS = -I. -O -fPIC LDFLAGS = -L. -lgzstream -lz diff --git a/src/gzstream/index.html b/src/gzstream/index.html deleted file mode 100644 index 8a9ef8e..0000000 --- a/src/gzstream/index.html +++ /dev/null @@ -1,145 +0,0 @@ - -Gzstream Library Home Page - - - -

 [... 145 lines of the gzstream library homepage (HTML) deleted along with the
  file: introduction, supported systems, installation, documentation, what's
  missing, download and release notes, acknowledgements, and links ...]
diff --git a/src/gzstream/libgzstream.a b/src/gzstream/libgzstream.a
deleted file mode 100644
index e0df77266a1f739fb8331743cfa8eb3717ebbf09..0000000000000000000000000000000000000000
Binary files a/src/gzstream/libgzstream.a and /dev/null differ
diff --git a/src/gzstream/logo.gif b/src/gzstream/logo.gif
deleted file mode 100644
index e259089fbb097573bdc3bab9eccc75e548cf7251..0000000000000000000000000000000000000000
Binary files a/src/gzstream/logo.gif and /dev/null differ
diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h
-double calculateEstimate(vector<uint8_t> array) {
+double calculateRawEstimate(vector<uint8_t> array) {
   double inverseSum = 0.0;
   for (size_t i = 0; i < array.size(); ++i) {
     // TODO: pre-calculate the power calculation
@@ -238,10 +237,6 @@ class HyperLogLogPlusMinus {
   bool sparse; // sparse representation of the data?
@@ -238,10 +237,6 @@ class HyperLogLogPlusMinus {
   bool sparse;                // sparse representation of the data?
   SparseListType sparseList;  // TODO: use a compressed list instead

-  // vectors containing data for bias correction
-  vector<vector<double> > rawEstimateData;  // TODO: make this static
-  vector<vector<double> > biasData;
-
   // sparse versions of p and m
   static const uint8_t pPrime = 25;  // precision when using a sparse representation
                                      // fixed to 25, because 25 + 6 bits for rank + 1 flag bit = 32
@@ -257,12 +252,12 @@ class HyperLogLogPlusMinus {
    * @param precision
    * @param sparse
    */
-  HyperLogLogPlusMinus(uint8_t precision=12, bool sparse=true):p(precision),sparse(sparse) {
+  HyperLogLogPlusMinus(uint8_t precision=12, bool sparse=true):p(precision),m(1<<precision),sparse(sparse) {
     if (precision > 18 || precision < 4) {
       throw std::invalid_argument("precision (number of register = 2^precision) must be between 4 and 18");
     }
-    this->m = 1 << precision;
+    //this->m = 1 << precision;
     if (sparse) {
       this->sparseList = SparseListType();  // TODO: if SparseListType is changed, initialize with appropriate size
@@ -427,39 +422,30 @@
    */
   uint64_t cardinality(bool verbose=true) {
     if (sparse) {
-      // if we are still 'sparse', then use linear counting, which is more
-      // accurate for low cardinalities, and use increased precision pPrime
+      // if we are 'sparse', then use linear counting with increased precision pPrime
       return uint64_t(linearCounting(mPrime, mPrime-uint32_t(sparseList.size())));
     }

-    // initialize bias correction data
-    if (rawEstimateData.empty()) { initRawEstimateData(); }
-    if (biasData.empty())        { initBiasData(); }
+    // use linear counting (lc) estimate if there are zeros in the matrix
+    // AND the lc estimate is smaller than an empirically defined threshold
+    uint32_t v = countZeros(M);
+    if (v != 0) {
+      uint64_t lc_estimate = linearCounting(m, v);
+      // check if the lc estimate is below the threshold
+      assert(lc_estimate >= 0);
+      if (lc_estimate <= double(threshold[p-4])) {
+        return lc_estimate;
+      }
+    }

     // calculate raw estimate on registers
     //double est = alpha(m) * harmonicMean(M, m);
-    double est = calculateEstimate(M);
-
+    double est = calculateRawEstimate(M);
     // correct for biases if estimate is smaller than 5m
     if (est <= double(m)*5.0) {
       est -= getEstimateBias(est);
     }

-    uint32_t v = countZeros(M);
-    if (v > 2) {
-      // calculate linear counting (lc) estimate if there are more than 2 zeros in the matrix
-      double lc_estimate = linearCounting(m, v);
-
-      // check if the lc estimate is below the threshold
-      if (lc_estimate <= double(threshold[p-4])) {
-        if (lc_estimate < 0) { throw; }
-        // return lc estimate of cardinality
-        return lc_estimate;
-      }
-      return lc_estimate; // always use lc_estimate when available
-    }
-
-    // return bias-corrected hyperloglog estimate of cardinality
     return uint64_t(est);
   }
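Read end-to-end, the reworked cardinality() now tries the three estimators in a fixed order: linear counting at precision pPrime while sparse, linear counting at precision p when there are zero registers and the estimate falls below the empirical threshold, and the bias-corrected raw estimate otherwise. The following compilable sketch mirrors that control flow under simplified assumptions (a free-standing function, an inlined alpha constant valid for m >= 128, and a caller-supplied threshold table); it is an illustration, not the class's actual implementation:

    #include <cstdint>
    #include <cmath>
    #include <vector>

    // Standard linear-counting formula: m * ln(m / V), V = zero registers.
    static double linearCounting(uint32_t m, uint32_t v) {
      return m * std::log(double(m) / v);
    }

    // Sketch of the estimator selection after this patch. The names mirror
    // the code above but are assumptions here, not the library's API.
    uint64_t estimateCardinality(const std::vector<uint8_t> &M, uint8_t p,
                                 const std::vector<double> &threshold) {
      uint32_t v = 0;                       // number of zero-valued registers
      for (size_t i = 0; i < M.size(); ++i) v += (M[i] == 0);

      if (v != 0) {
        double lc = linearCounting(M.size(), v);
        if (lc <= threshold[p - 4])         // empirically defined cut-off
          return uint64_t(lc);
      }

      double invSum = 0.0;                  // harmonic mean of register ranks
      for (size_t i = 0; i < M.size(); ++i) invSum += std::pow(2.0, -double(M[i]));
      double est = 0.7213 / (1 + 1.079 / M.size())  // alpha(m) for m >= 128
                 * double(M.size()) * M.size() / invSum;
      // Bias correction for small estimates (est <= 5m) would subtract
      // getEstimateBias(est) here, using the empirical tables below.
      return uint64_t(est);
    }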
@@ -507,57 +493,58 @@
     return rank_val;
   }

-  void initRawEstimateData() {
-    rawEstimateData = vector<vector<double> >();
-
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision4,arr_len(rawEstimateData_precision4)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision5,arr_len(rawEstimateData_precision5)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision6,arr_len(rawEstimateData_precision6)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision7,arr_len(rawEstimateData_precision7)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision8,arr_len(rawEstimateData_precision8)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision9,arr_len(rawEstimateData_precision9)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision10,arr_len(rawEstimateData_precision10)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision11,arr_len(rawEstimateData_precision11)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision12,arr_len(rawEstimateData_precision12)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision13,arr_len(rawEstimateData_precision13)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision14,arr_len(rawEstimateData_precision14)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision15,arr_len(rawEstimateData_precision15)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision16,arr_len(rawEstimateData_precision16)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision17,arr_len(rawEstimateData_precision17)));
-    rawEstimateData.push_back(vector<double>(rawEstimateData_precision18,arr_len(rawEstimateData_precision18)));
-
+  vector<double> rawEstimateData(size_t p) {
+    switch (p) {
+      case 4:  return vector<double>(rawEstimateData_precision4,arr_len(rawEstimateData_precision4));
+      case 5:  return vector<double>(rawEstimateData_precision5,arr_len(rawEstimateData_precision5));
+      case 6:  return vector<double>(rawEstimateData_precision6,arr_len(rawEstimateData_precision6));
+      case 7:  return vector<double>(rawEstimateData_precision7,arr_len(rawEstimateData_precision7));
+      case 8:  return vector<double>(rawEstimateData_precision8,arr_len(rawEstimateData_precision8));
+      case 9:  return vector<double>(rawEstimateData_precision9,arr_len(rawEstimateData_precision9));
+      case 10: return vector<double>(rawEstimateData_precision10,arr_len(rawEstimateData_precision10));
+      case 11: return vector<double>(rawEstimateData_precision11,arr_len(rawEstimateData_precision11));
+      case 12: return vector<double>(rawEstimateData_precision12,arr_len(rawEstimateData_precision12));
+      case 13: return vector<double>(rawEstimateData_precision13,arr_len(rawEstimateData_precision13));
+      case 14: return vector<double>(rawEstimateData_precision14,arr_len(rawEstimateData_precision14));
+      case 15: return vector<double>(rawEstimateData_precision15,arr_len(rawEstimateData_precision15));
+      case 16: return vector<double>(rawEstimateData_precision16,arr_len(rawEstimateData_precision16));
+      case 17: return vector<double>(rawEstimateData_precision17,arr_len(rawEstimateData_precision17));
+      case 18: return vector<double>(rawEstimateData_precision18,arr_len(rawEstimateData_precision18));
+    }
+    return vector<double>();
   }

-  void initBiasData() {
-    biasData = vector<vector<double> >();
-
-    biasData.push_back(vector<double>(biasData_precision4,arr_len(biasData_precision4)));
-    biasData.push_back(vector<double>(biasData_precision5,arr_len(biasData_precision5)));
-    biasData.push_back(vector<double>(biasData_precision6,arr_len(biasData_precision6)));
-    biasData.push_back(vector<double>(biasData_precision7,arr_len(biasData_precision7)));
-    biasData.push_back(vector<double>(biasData_precision8,arr_len(biasData_precision8)));
-    biasData.push_back(vector<double>(biasData_precision9,arr_len(biasData_precision9)));
-    biasData.push_back(vector<double>(biasData_precision10,arr_len(biasData_precision10)));
-    biasData.push_back(vector<double>(biasData_precision11,arr_len(biasData_precision11)));
-    biasData.push_back(vector<double>(biasData_precision12,arr_len(biasData_precision12)));
-    biasData.push_back(vector<double>(biasData_precision13,arr_len(biasData_precision13)));
-    biasData.push_back(vector<double>(biasData_precision14,arr_len(biasData_precision14)));
-    biasData.push_back(vector<double>(biasData_precision15,arr_len(biasData_precision15)));
-    biasData.push_back(vector<double>(biasData_precision16,arr_len(biasData_precision16)));
-    biasData.push_back(vector<double>(biasData_precision17,arr_len(biasData_precision17)));
-    biasData.push_back(vector<double>(biasData_precision18,arr_len(biasData_precision18)));
+  vector<double> biasData(size_t p) {
+    switch(p) {
+      case 4:  return vector<double>(biasData_precision4,arr_len(biasData_precision4));
+      case 5:  return vector<double>(biasData_precision5,arr_len(biasData_precision5));
+      case 6:  return vector<double>(biasData_precision6,arr_len(biasData_precision6));
+      case 7:  return vector<double>(biasData_precision7,arr_len(biasData_precision7));
+      case 8:  return vector<double>(biasData_precision8,arr_len(biasData_precision8));
+      case 9:  return vector<double>(biasData_precision9,arr_len(biasData_precision9));
+      case 10: return vector<double>(biasData_precision10,arr_len(biasData_precision10));
+      case 11: return vector<double>(biasData_precision11,arr_len(biasData_precision11));
+      case 12: return vector<double>(biasData_precision12,arr_len(biasData_precision12));
+      case 13: return vector<double>(biasData_precision13,arr_len(biasData_precision13));
+      case 14: return vector<double>(biasData_precision14,arr_len(biasData_precision14));
+      case 15: return vector<double>(biasData_precision15,arr_len(biasData_precision15));
+      case 16: return vector<double>(biasData_precision16,arr_len(biasData_precision16));
+      case 17: return vector<double>(biasData_precision17,arr_len(biasData_precision17));
+      case 18: return vector<double>(biasData_precision18,arr_len(biasData_precision18));
+    }
+    return vector<double>();
   }

   /**
-   * Estimate the bias using empirically determined values.
+   * Estimate the bias of the raw estimate using empirically determined values.
    * Uses weighted average of the two cells between which the estimate falls.
    * TODO: Check if nearest neighbor average gives better values, as proposed in the paper
    * @param est
    * @return correction value for
    */
   double getEstimateBias(double estimate) {
-    vector<double> rawEstimateTable = rawEstimateData[p-4];
-    vector<double> biasTable = biasData[p-4];
+    vector<double> rawEstimateTable = rawEstimateData(p);
+    vector<double> biasTable = biasData(p);
     // check if estimate is lower than first entry, or larger than last
     if (rawEstimateTable.front() >= estimate) { return rawEstimateTable.front() - biasTable.front(); }

From 93c155f5fe9eebd0a16c2ac37a38d3a3fd1391bf Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Wed, 8 Nov 2017 22:12:29 -0500
Subject: [PATCH 100/105] Fix gzstream building and licensing

---
 CHANGELOG                                  |  90 --------------
 install_kraken.sh => install_krakenhll.sh  |   0
 scripts/krakenhll                          |   3 +-
 scripts/krakenhll-build                    |   3 +-
 scripts/krakenhll-build_db.sh              |   3 +-
 scripts/krakenhll-check_for_jellyfish.sh   |   4 +-
 scripts/krakenhll-download                 |   1 +
 scripts/krakenhll-report                   |   3 +-
 scripts/krakenhll-standard_installation.sh |   3 +-
 scripts/krakenhll-verify_gi_numbers.pl     |  54 ---------
 src/Makefile                               |   2 +-
 src/gzstream/.Makefile.swp                 | Bin 12288 -> 0 bytes
 src/make_seqid_to_taxid_map.cpp            | 130 ---------------------
 tests/build-dbs.sh                         |   2 +-
 14 files changed, 15 insertions(+), 283 deletions(-)
 delete mode 100644 CHANGELOG
 rename install_kraken.sh => install_krakenhll.sh (100%)
 delete mode 100755 scripts/krakenhll-verify_gi_numbers.pl
 delete mode 100644 src/gzstream/.Makefile.swp
 delete mode 100644 src/make_seqid_to_taxid_map.cpp

diff --git a/CHANGELOG b/CHANGELOG
deleted file mode 100644
index b914174..0000000
--- a/CHANGELOG
+++ /dev/null
@@ -1,90 +0,0 @@
-v0.10.6-beta:
-* fixed overflow bug in command line parsing
-* fixed GRCh38.p2 bug in human genome downloads
-
-v0.10.5-beta:
-* fix bug in GRCh38 download to handle multi-fasta files
-* add --header-line and --intermediate-ranks options to kraken-mpa-report
-* improved support for adding multi-FASTA files with --add-to-library
-* allow assigning taxon IDs in reference sequences w/o GI numbers
-  using "kraken:taxid" code
-* included full sequence descriptions when using "--[un]classified-out"
-* reduced memory usage of db_shrink (Build step 2 / kraken-build --shrink)
-* reduced memory usage of db_sort (Build step 3)
-* reduced memory usage of set_lcas (Build step 6)
-* support added for KRAKEN_NUM_THREADS, KRAKEN_DB_PATH, and KRAKEN_DEFAULT_DB
-  env. variables
-* added kraken-translate for getting taxonomic names for each sequence
-* added a --rebuild option to kraken-build
-* turned off default name checking for PE reads; added --check-names option
-* added plasmids to --download-library options
-* added HTML manual, redirecting README to that
-
-v0.10.4-beta:
-* use GRCh38 for human genome library
-* enable input via stdin (via /dev/fd/0)
-* enable compressed (gzip/bzip2) input
-* enable auto-detection of fasta/fastq/gz/bz2
-* simplified add_to_library.sh code to speed up large additions
-* use RNA genomes for viral genome library
-* scan .ffn (RNA) files for genomic data when building databases
-* handle paired-end reads with --paired option
-* provide MetaPhlAn-compatible output with kraken-mpa-report
-* added domain/kingdom codes to kraken-report
-* added kraken-filter script for simple confidence scoring
-* added support for multi-FASTA files in custom DBs
-* fixed build_kraken_db.sh bug for k-mers w/ k < 31
-* updates to README file
-
-v0.10.3-beta:
-* remove Fatal.pm use in kraken-report
-* fixed false success message on make failure in installer
-* explicitly require g++ as C++ compiler in Makefile
-* change to quickfile.cpp to do proper syncing on close
-* fixed kraken-build bug w/ --work-on-disk (cause of some major build stalls)
-* changed hash size calculation to use Perl
-* close input files explicitly in db_sort/db_shrink to reduce reported memory
-* allow db_shrink to work in RAM
-* updates to README file
-
-v0.10.2-beta:
-* fixed kraken-report bug w/ --show-zeros
-* fixed kraken-report installation bug
-* updates to README file
-
-v0.10.1-beta:
-* fixed 2nd bug in build_kraken.sh in calculating hash size (thanks T. Antao)
-* fixed bug in add_to_library.sh for some bash versions (thanks T. Antao)
-* fixed issue where search window wasn't cached until a failure (query speedup)
-* added $KRAKEN_DIR fallback for kraken/kraken-build (thanks S. Koren)
-
-v0.10.0-beta:
-* added CHANGELOG
-* fixed quick mode hit list output
-* updated README citation
-* changed minimizer sort order (query speedup), changes database structure
-* use linear search with small windows (query speedup)
-* changed query procedure (query speedup); search w/o 1st calculating minimizer
-* changed readlink in installer to perl Cwd::abs_path (portability)
-* removed MAP_POPULATE for preloading, uses read loop instead (bugfix/port.)
-* added --work-on-disk switch to kraken-build
-* added kraken-report script
-* fixed bug in build_kraken.sh in calculating hash size (thanks T. Antao)
-
-v0.9.1b:
-* fixed bug to allow kraken-build --shrink
-
-v0.9.0b:
-* full rewrite
-* minimizers used to speed queries, prefix index removed
-
-v0.3:
-* DB build parallelized, Jellyfish removed from LCA assignment
-
-v0.2:
-* full rewrite, most progs. changed to C++
-* Jellyfish removed from classification step
-* prefix index used to speed queries
-
-v0.1:
-* initial version, mostly Perl
diff --git a/install_kraken.sh b/install_krakenhll.sh
similarity index 100%
rename from install_kraken.sh
rename to install_krakenhll.sh
diff --git a/scripts/krakenhll b/scripts/krakenhll
index 8a4aad9..69e3b56 100755
--- a/scripts/krakenhll
+++ b/scripts/krakenhll
@@ -1,6 +1,7 @@
 #!/usr/bin/env perl

-# Copyright 2013-2015, Derrick Wood
+# Original file Copyright 2013-2015, Derrick Wood
+# Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
diff --git a/scripts/krakenhll-build b/scripts/krakenhll-build
index 8888cd8..1ab41c9 100755
--- a/scripts/krakenhll-build
+++ b/scripts/krakenhll-build
@@ -1,6 +1,7 @@
 #!/usr/bin/env perl

-# Copyright 2013-2015, Derrick Wood
+# Original file Copyright 2013-2015, Derrick Wood
+# Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
diff --git a/scripts/krakenhll-build_db.sh b/scripts/krakenhll-build_db.sh
index 959f041..adc4345 100755
--- a/scripts/krakenhll-build_db.sh
+++ b/scripts/krakenhll-build_db.sh
@@ -1,7 +1,8 @@
 #!/bin/bash
 #vim: noai:ts=2:sw=2

-# Copyright 2013-2015, Derrick Wood
+# Original file Copyright 2013-2015, Derrick Wood
+# Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
diff --git a/scripts/krakenhll-check_for_jellyfish.sh b/scripts/krakenhll-check_for_jellyfish.sh
index c2aa2d7..50d9867 100755
--- a/scripts/krakenhll-check_for_jellyfish.sh
+++ b/scripts/krakenhll-check_for_jellyfish.sh
@@ -1,7 +1,7 @@
 #!/bin/bash

-# Copyright 2013-2015, Derrick Wood
-# modified by Florian Breitwieser, 2017
+# Original file Copyright 2013-2015, Derrick Wood
+# Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
diff --git a/scripts/krakenhll-download b/scripts/krakenhll-download
index c052463..e4e52c3 100755
--- a/scripts/krakenhll-download
+++ b/scripts/krakenhll-download
@@ -3,6 +3,7 @@

 # krakenhll-download.pl - based on centrifuge-download
 # (c) Florian Breitwieser, 2017
+# licensed under GPL-3

 use strict;
 use warnings;
diff --git a/scripts/krakenhll-report b/scripts/krakenhll-report
index e9cdaf5..bf950d4 100755
--- a/scripts/krakenhll-report
+++ b/scripts/krakenhll-report
@@ -1,6 +1,7 @@
 #!/usr/bin/env perl

-# Copyright 2013-2015, Derrick Wood
+# Original file Copyright 2013-2015, Derrick Wood
+# Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
diff --git a/scripts/krakenhll-standard_installation.sh b/scripts/krakenhll-standard_installation.sh
index b34dd44..98353e1 100755
--- a/scripts/krakenhll-standard_installation.sh
+++ b/scripts/krakenhll-standard_installation.sh
@@ -1,6 +1,7 @@
 #!/bin/bash

-# Copyright 2013-2015, Derrick Wood
+# Original file Copyright 2013-2015, Derrick Wood
+# Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
diff --git a/scripts/krakenhll-verify_gi_numbers.pl b/scripts/krakenhll-verify_gi_numbers.pl
deleted file mode 100755
index 0bb5cdf..0000000
--- a/scripts/krakenhll-verify_gi_numbers.pl
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env perl
-
-# Copyright 2013-2015, Derrick Wood
-#
-# This file is part of the Kraken taxonomic sequence classification system.
-#
-# Kraken is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Kraken is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
-
-# Checks each sequence header to ensure it has a GI number to
-# enable taxonomic ID lookup later.  Also has some (very basic)
-# FASTA-format checking.
-
-use strict;
-use warnings;
-use File::Basename;
-
-my $PROG = basename $0;
-
-die "$PROG: must specify one filename!\n" if @ARGV != 1;
-
-my $filename = shift;
-
-open FASTA, "<", $filename
-  or die "$PROG: can't open $filename: $!\n";
-my $seq_ct = 0;
-my $errors = 0;
-while (<FASTA>) {
-  next unless /^>/;
-  $seq_ct++;
-  if (! /^>(\S+)/) {
-    $errors++;
-    warn "file $filename, line $. lacks sequence ID\n";
-  }
-  if ($1 !~ /(^|\|)(gi|kraken:taxid)\|(\d+)/) {
-    $errors++;
-    warn "file $filename, line $.: sequence ID lacks GI number\n";
-  }
-}
-close FASTA;
-
-if ($errors) {
-  exit 1;
-}
diff --git a/src/Makefile b/src/Makefile
index d236de3..cfebf25 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -14,7 +14,7 @@ install: $(PROGS)
	cp $(PROGS) $(KRAKEN_DIR)/

 clean:
-	rm -f $(PROGS) *.o
+	rm -rf $(PROGS) *.o *.dSYM

 db_shrink: krakendb.o quickfile.o
diff --git a/src/gzstream/.Makefile.swp b/src/gzstream/.Makefile.swp
deleted file mode 100644
index f5e077d91d7cee7748a0835509de1d24b28f23d8..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

[12288 bytes of base85-encoded binary patch data omitted — deletion of a stray vim swap file]
z!yU5g#MKI5G!2)qR~s+{u{cGFh?(Q_fIKivuo0(sCRJf>dI+GSSRy>qP2#f*y40 zT^*|~P~$RVZYNcRXq~tsPqBH2vJc8Fhsr@#2e>_9S;^bcCPw!dL))8CtkdwNmvFm8 zgGf0pKEjus3VPCkKB+(Q#1)F<0T&yB#F>*@wR>$Fm> z)+)7XrTQ|}Zq%!{>MO6%)!g)1O<;PqzD_k~HJ!3$b6|1RX>ph~HSz+#->k!)ttbg$ zIUFcaGxVGi-MEE!RoZjfHq<5RCP-OYU9FbrcBGA+x#JR5S8BCdxwcxpNwnW}!PSzU z2$2Fjh>*JzX-2a|ocG=}k^>4OQ=*Boj7TP116xW6>P7=I;YxtzFiJ>|q9pKX#E;PV zgeOp2K5-OAJf&!mF5BMSrxrS_P_!)&3La29>7!aeVQqQX(85C7fyR*%ZZ0~p4d)-Zks0-t*7cd7L z{lw7SX7A44evjPUL%QpBI-opUOVx-lDvm^$F0gG~!Yg>KcxbF&An4h`o$E!o^(9VJ zOVDB_3f}HCiekGZ+NMM`OLPrXcA^EoOO(KXpRDbzIJHczd1y-3t7}-AR%R62xG$qq zO({Uus%)i|V-ZYEvDG#G+R~qzuxdc1nhT6T#VcwI>( lygkv5w{6<&kh - * - * This file is part of the Kraken taxonomic sequence classification system. - * - * Kraken is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Kraken is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Kraken. If not, see . - */ - -// Produce a mapping of sequence IDs to taxon IDs - -// This program's reason for being is that the gi_taxid_nucl.dmp file -// is monstrously huge, and the only efficient way to do this task is -// to use mmap to quickly access the file. Otherwise, I'd have just -// used a little Perl script instead of all these strchr() calls. - -#include "kraken_headers.hpp" -#include "quickfile.hpp" - -using namespace std; -using namespace kraken; - -#define USER_SPECIFIED_FLAG "TAXID" - -map user_specified_taxids; -map > requests; -uint64_t request_count = 0; - -void fill_request_map(char *filename); -void report_taxo_numbers(char *filename); - -int main(int argc, char **argv) { - if (argc < 3) { - cerr << "Usage: make_seqid_to_taxid_map [ ]\n" - << " If nodes.dmp and names.dmp files are provided, then each sequence header is added with a further link\n" - << " to the taxonomy." 
diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh
index a2f5438..5abbb66 100755
--- a/tests/build-dbs.sh
+++ b/tests/build-dbs.sh
@@ -106,7 +106,7 @@ for VAR in $@; do
       [[ -d $EUKD ]] || mkdir -p $EUKD
       [[ -f $EUKD/taxDB ]] || cp -v $DB_DIR/taxDB $EUKD
       build_db $K euk-oct2017 fungi protozoa ;;
-    *) echo "$USAGE"
+    *) echo -e "Unknown database $VAR!\n$USAGE"
       exit 1 ;;
   esac
done

From c6871c11e0e4150191d14dcbe3ca69c973bd2b55 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Wed, 8 Nov 2017 22:15:07 -0500
Subject: [PATCH 101/105] Fix licensing

---
 install_kraken.sh            |  3 +-
 src/build_taxdb.cpp          |  6 +++---
 src/classify.cpp             |  3 ++-
 src/dump_taxdb.cpp           | 19 +++++++++++++++++
 src/grade_classification.cpp | 16 +++++++++++++-
 src/hyperloglogplus.h        | 41 ++++++++++++++++++++++--------------
 src/krakenutil.cpp           |  3 ++-
 src/krakenutil.hpp           |  3 ++-
 src/query_taxdb.cpp          |  8 +++----
 src/quickfile.cpp            |  3 ++-
 src/read_uid_mapping.cpp     | 18 ++++++++++++++++
 src/readcounts.hpp           | 18 ++++++++++++++++
 src/report-cols.h            |  5 ++---
 src/set_lcas.cpp             |  4 ++--
 src/taxdb.h                  |  7 +++---
 src/uid_mapping.cpp          | 18 ++++++++++++++++
 src/uid_mapping.hpp          | 18 ++++++++++++++++
 17 files changed, 155 insertions(+), 38 deletions(-)

diff --git a/install_kraken.sh b/install_kraken.sh
index 3b12552..9655715 100755
--- a/install_kraken.sh
+++ b/install_kraken.sh
@@ -1,9 +1,8 @@
 #!/bin/bash

+# Portions (c) 2017, Florian Breitwieser
 # Copyright 2013-2015, Derrick Wood
 #
-# This file is part of the Kraken taxonomic classification system.
-#
 # Kraken is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
diff --git a/src/build_taxdb.cpp b/src/build_taxdb.cpp
index 263de5c..467ca98 100644
--- a/src/build_taxdb.cpp
+++ b/src/build_taxdb.cpp
@@ -1,14 +1,14 @@
 /*
  * Copyright 2017, Florian Breitwieser
  *
- * This file is part of the Kraken taxonomic sequence classification system.
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
  *
- * Kraken is free software: you can redistribute it and/or modify
+ * KrakenHLL is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
  *
- * Kraken is distributed in the hope that it will be useful,
+ * KrakenHLL is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
diff --git a/src/classify.cpp b/src/classify.cpp
index 3b8a03e..a940f18 100644
--- a/src/classify.cpp
+++ b/src/classify.cpp
@@ -1,5 +1,6 @@
 /*
- * Copyright 2013-2015, Derrick Wood
+ * Original file Copyright 2013-2015, Derrick Wood
+ * Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
  *
  * This file is part of the Kraken taxonomic sequence classification system.
  *
diff --git a/src/dump_taxdb.cpp b/src/dump_taxdb.cpp
index 76246e4..b2e8555 100644
--- a/src/dump_taxdb.cpp
+++ b/src/dump_taxdb.cpp
@@ -1,3 +1,22 @@
+/*
+ * Copyright 2017, Florian Breitwieser
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
 #include "taxdb.h"
 #include "quickfile.hpp"
 #include <...>
diff --git a/src/grade_classification.cpp b/src/grade_classification.cpp
index 148c7e9..781b26e 100644
--- a/src/grade_classification.cpp
+++ b/src/grade_classification.cpp
@@ -1,6 +1,20 @@
 /*
  * Copyright 2017, Florian Breitwieser
- * licnsed under GPLv3
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
  */

 #include "taxdb.h"
diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h
index 5e27407..21d40eb 100644
--- a/src/hyperloglogplus.h
+++ b/src/hyperloglogplus.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright 2017, Florian Breitwieser
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
+ */
 /*
  * hyperloglogplus.h
  *
@@ -99,16 +117,14 @@ double alpha(uint32_t m) {

 /**
  * calculate the raw estimate as harmonic mean of the ranks in the register
- * @param array
- * @return
  */
-double calculateEstimate(vector<uint8_t> array) {
+double calculateRawEstimate(vector<uint8_t> M) {
   double inverseSum = 0.0;
-  for (size_t i = 0; i < array.size(); ++i) {
+  for (size_t i = 0; i < M.size(); ++i) {
     // TODO: pre-calculate the power calculation
-    inverseSum += pow(2,-array[i]);
+    inverseSum += pow(2,-M[i]);
   }
-  return alpha(array.size()) * double(array.size() * array.size()) * 1 / inverseSum;
+  return alpha(M.size()) * double(M.size() * M.size()) * 1 / inverseSum;
 }

 uint32_t countZeros(vector<uint8_t> s) {
@@ -117,10 +133,6 @@ uint32_t countZeros(vector<uint8_t> s) {

 /**
  * Extract bits (from uint32_t or uint64_t) using LSB 0 numbering from hi to lo, including lo
- * @param bits
- * @param hi
- * @param lo
- * @return
  */
 template<typename T>
 T extractBits(T value, uint8_t hi, uint8_t lo, bool shift_left = false) {
@@ -198,19 +210,16 @@ inline uint32_t clz(const uint32_t x) {
 }

 inline uint32_t clz(const uint64_t x) {
-  uint32_t u32 = (x >> 32);
+  return __builtin_clzl(x);
+/*  uint32_t u32 = (x >> 32);
   uint32_t result = u32 ? __builtin_clz(u32) : 32;
   if (result == 32) {
     u32 = x & 0xFFFFFFFFUL;
     result += (u32 ? __builtin_clz(u32) : 32);
   }
-  return result;
+  return result; */
 }
 //#else
-
-uint32_t clz_log2(const uint64_t w) {
-  return 63 - floor(log2(w));
-}
 //#endif
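The clz() rewrite above replaces the two-step 32-bit fallback with a single __builtin_clzl call. Note that __builtin_clzl counts leading zeros of an unsigned long, so this assumes a 64-bit long (on a 32-bit long platform, __builtin_clzll would be the safe spelling), and like all clz builtins it is undefined for a zero argument. A small sanity check that the retired fallback and the builtin agree on nonzero 64-bit inputs:

    #include <cassert>
    #include <cstdint>

    // The two-step version removed by the patch, kept here for comparison.
    static uint32_t clz_two_step(uint64_t x) {
      uint32_t u32 = (uint32_t)(x >> 32);
      uint32_t result = u32 ? __builtin_clz(u32) : 32;
      if (result == 32) {
        u32 = (uint32_t)(x & 0xFFFFFFFFUL);
        result += (u32 ? __builtin_clz(u32) : 32);
      }
      return result;
    }

    int main() {
      // Check every power of two; both sides are undefined for x == 0.
      for (uint64_t x = 1; x != 0; x <<= 1)
        assert(clz_two_step(x) == (uint32_t)__builtin_clzll(x));
      return 0;
    }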
diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp
index 46fd953..2e18cf0 100644
--- a/src/krakenutil.cpp
+++ b/src/krakenutil.cpp
@@ -1,5 +1,6 @@
 /*
- * Copyright 2013-2015, Derrick Wood
+ * Original file Copyright 2013-2015, Derrick Wood
+ * Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
  *
  * This file is part of the Kraken taxonomic sequence classification system.
  *
diff --git a/src/krakenutil.hpp b/src/krakenutil.hpp
index cbfd3d5..aff26bf 100644
--- a/src/krakenutil.hpp
+++ b/src/krakenutil.hpp
@@ -1,5 +1,6 @@
 /*
- * Copyright 2013-2015, Derrick Wood
+ * Original file Copyright 2013-2015, Derrick Wood
+ * Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
  *
  * This file is part of the Kraken taxonomic sequence classification system.
  *
diff --git a/src/query_taxdb.cpp b/src/query_taxdb.cpp
index 7412792..a45c117 100644
--- a/src/query_taxdb.cpp
+++ b/src/query_taxdb.cpp
@@ -1,20 +1,20 @@
 /*
  * Copyright 2017, Florian Breitwieser
  *
- * This file is part of the Kraken taxonomic sequence classification system.
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
  *
- * Kraken is free software: you can redistribute it and/or modify
+ * KrakenHLL is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
  *
- * Kraken is distributed in the hope that it will be useful,
+ * KrakenHLL is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
- * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
+ * along with KrakenHLL.  If not, see <http://www.gnu.org/licenses/>.
  */

 #include "taxdb.h"
diff --git a/src/quickfile.cpp b/src/quickfile.cpp
index c518dd9..39e257d 100644
--- a/src/quickfile.cpp
+++ b/src/quickfile.cpp
@@ -1,5 +1,6 @@
 /*
- * Copyright 2013-2015, Derrick Wood
+ * Original file Copyright 2013-2015, Derrick Wood
+ * Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
  *
  * This file is part of the Kraken taxonomic sequence classification system.
  *
diff --git a/src/read_uid_mapping.cpp b/src/read_uid_mapping.cpp
index 8f83742..3802924 100644
--- a/src/read_uid_mapping.cpp
+++ b/src/read_uid_mapping.cpp
@@ -1,3 +1,21 @@
+/*
+ * Copyright 2017, Florian Breitwieser
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
+ */
 #include "uid_mapping.hpp"
 #include "kraken_headers.hpp"
diff --git a/src/readcounts.hpp b/src/readcounts.hpp
index eddca78..afb45c7 100644
--- a/src/readcounts.hpp
+++ b/src/readcounts.hpp
@@ -1,3 +1,21 @@
+/*
+ * Copyright 2017, Florian Breitwieser
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
+ */
 #ifndef READCOUNTS_HPP
 #define READCOUNTS_HPP
diff --git a/src/report-cols.h b/src/report-cols.h
index e5fa0a5..9a37d56 100644
--- a/src/report-cols.h
+++ b/src/report-cols.h
@@ -1,8 +1,7 @@
 /*
  * report-cols.h
- * Copyright (C) 2017 fbreitwieser
- *
- * Distributed under terms of the MIT license.
+ * Copyright (C) 2017 fbreitwieser
+ * licensed under GPL3
  */

 #ifndef REPORT_COLS_H
diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp
index dc75c63..25cebf6 100644
--- a/src/set_lcas.cpp
+++ b/src/set_lcas.cpp
@@ -1,6 +1,6 @@
-// vim: noai:ts=2:sw=2:expandtab:smarttab
 /*
- * Copyright 2013-2015, Derrick Wood
+ * Original file Copyright 2013-2015, Derrick Wood
+ * Portions (c) 2017, Florian Breitwieser as part of KrakenHLL
  *
  * This file is part of the Kraken taxonomic sequence classification system.
  *
diff --git a/src/taxdb.h b/src/taxdb.h
index aef1e50..38f71a5 100644
--- a/src/taxdb.h
+++ b/src/taxdb.h
@@ -1,8 +1,9 @@
-/* Original work Copyright 2013 David Ainsworth
- * Modified work copyright 2017 Florian Breitwieser
+/*
+ * Original work Copyright 2013 David Ainsworth
+ * Modified work copyright 2017 Florian Breitwieser
  *
  * The original file is part of SLAM
- * The modified file is part of a modified Kraken version
+ * The modified file is part of KrakenHLL
  *
  * SLAM is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Affero General Public License as published by
diff --git a/src/uid_mapping.cpp b/src/uid_mapping.cpp
index 2914468..e2fc4cd 100644
--- a/src/uid_mapping.cpp
+++ b/src/uid_mapping.cpp
@@ -1,3 +1,21 @@
+/*
+ * Copyright 2017, Florian Breitwieser
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
+ */
 #include <...>
 #include <...>
diff --git a/src/uid_mapping.hpp b/src/uid_mapping.hpp
index 93d1680..64e6193 100644
--- a/src/uid_mapping.hpp
+++ b/src/uid_mapping.hpp
@@ -1,3 +1,21 @@
+/*
+ * Copyright 2017, Florian Breitwieser
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
+ */
 #ifndef UID_MAPPING_H
 #define UID_MAPPING_H

From 53314ae85b8422649e7fb6f89664fc6f5fc1e255 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Thu, 9 Nov 2017 18:06:38 -0500
Subject: [PATCH 102/105] Fix taxonomy reporting order and percentage

---
 src/Makefile       |   2 +-
 src/readcounts.hpp |  10 +
 src/taxdb.h        | 898 ++++++++++++++++++++++-----------------------
 3 files changed, 451 insertions(+), 459 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index cfebf25..37e068d 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -2,7 +2,7 @@ CXX = g++
 FOPENMP?=-fopenmp
 CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -I./gzstream -O2 -Wfatal-errors ${CPPFLAGS}
 #CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors
-PROGS = classify db_sort set_lcas make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping
+PROGS = classify db_sort set_lcas db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping

 #LIBFLAGS = -L. -lz -lgzstream ${LDFLAGS}
 LIBFLAGS = -L. -lz ${LDFLAGS}
diff --git a/src/readcounts.hpp b/src/readcounts.hpp
index afb45c7..74a52a6 100644
--- a/src/readcounts.hpp
+++ b/src/readcounts.hpp
@@ -45,6 +45,16 @@ namespace kraken {
       kmers += b.kmers;
       return *this;
     }
+
+    bool operator<(const ReadCounts& rc) {
+      if (n_reads < rc.n_reads) {
+        return true;
+      }
+      if (n_reads == rc.n_reads && n_kmers < rc.n_kmers) {
+        return true;
+      }
+      return false;
+    }
   };

   uint64_t reads(const ReadCounts& read_count) {
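The new operator< is what lets report rows with equal read counts be ordered deterministically by k-mer count. A self-contained illustration of the same ordering rule (the struct and the sort call are stand-ins for this sketch, not the patch's actual call site; note that the member operator< above would normally also be declared const):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Mirror of the comparator added above: order by read count,
    // break ties by distinct-kmer count (both ascending).
    struct RC { uint64_t n_reads, n_kmers; };
    static bool rc_less(const RC& a, const RC& b) {
      if (a.n_reads < b.n_reads) return true;
      return a.n_reads == b.n_reads && a.n_kmers < b.n_kmers;
    }

    int main() {
      std::vector<RC> rows = { {10, 500}, {10, 200}, {3, 900} };
      std::sort(rows.begin(), rows.end(), rc_less);
      // rows is now {3,900}, {10,200}, {10,500}
      return 0;
    }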
diff --git a/src/taxdb.h b/src/taxdb.h
index 38f71a5..0416119 100644
--- a/src/taxdb.h
+++ b/src/taxdb.h
@@ -42,12 +42,12 @@ using namespace std;

 namespace patch {
-	template < typename T > std::string to_string( const T& n )
-	{
-		std::ostringstream stm ;
-		stm << n ;
-		return stm.str() ;
-	}
+  template < typename T > std::string to_string( const T& n )
+  {
+    std::ostringstream stm ;
+    stm << n ;
+    return stm.str() ;
+  }
 }

@@ -73,7 +73,7 @@ struct TaxRank {
   // plus 'sequence', 'assembly', and 'root'
   //static constexpr vector<string> rank_strings = {
   //  "no rank", "sequence", "assembly",
-  //  "subspecies", "species", "subgenus", "genus", "tribe", "subfamily",
+  //  "subspecies", "species", "subgenus", "genus", "tribe", "subfamily",
   //"family", "superfamily", "parvorder", "infraorder", "suborder",
   //"order", "superorder", "parvclass", "infraclass", "subclass",
   //"class", "superclass", "subphylum", "phylum", "kingdom",
@@ -130,11 +130,11 @@ struct TaxRank {
       case RANK::superkingdom: return "superkingdom";
       case RANK::root:         return "root";
       default:
-        log_msg("Invalid rank!\n");
+        log_msg("Invalid rank!\n");
     }
     return "NA";
   }
-
+
 };

 const unordered_map<string, TaxRank::RANK> TaxRank::string_to_rank = {
@@ -172,35 +172,35 @@ const unordered_map<string, TaxRank::RANK> TaxRank::string_to_rank = {

 template<typename TAXID>
 class TaxonomyEntry {
-  public:
-  TAXID taxonomyID;
-  TaxonomyEntry* parent;
-  std::vector<TaxonomyEntry*> children;
+  public:
+    TAXID taxonomyID;
+    TaxonomyEntry* parent;
+    std::vector<TaxonomyEntry*> children;

-  string rank;
-  std::string scientificName;
-  uint64_t genomeSize;
-  uint64_t genomeSizeOfChildren;
+    string rank;
+    std::string scientificName;
+    uint64_t genomeSize;
+    uint64_t genomeSizeOfChildren;

-  TaxonomyEntry() : taxonomyID(0), parent(NULL), genomeSize(0), genomeSizeOfChildren(0) {}
+    TaxonomyEntry() : taxonomyID(0), parent(NULL), genomeSize(0), genomeSizeOfChildren(0) {}

-  TaxonomyEntry(TAXID taxonomyID_, TaxonomyEntry* parent_, std::string rank_, std::string scientificName_, uint64_t genomeSize_ = 0, uint64_t genomeSizeOfChildren_ = 0) :
-    taxonomyID(taxonomyID_), parent(parent_), rank(rank_), scientificName(scientificName_),
+    TaxonomyEntry(TAXID taxonomyID_, TaxonomyEntry* parent_, std::string rank_, std::string scientificName_, uint64_t genomeSize_ = 0, uint64_t genomeSizeOfChildren_ = 0) :
+      taxonomyID(taxonomyID_), parent(parent_), rank(rank_), scientificName(scientificName_),
       genomeSize(genomeSize_), genomeSizeOfChildren(genomeSizeOfChildren_) {

-    if (parent_ != NULL) {
-      parent->children.push_back(this);
-    }
-  }
+      if (parent_ != NULL) {
+        parent->children.push_back(this);
+      }
+    }

-  inline bool operator==(const TaxonomyEntry& other) const;
+    inline bool operator==(const TaxonomyEntry& other) const;

-  friend std::ostream &operator<<(std::ostream &os, const TaxonomyEntry &m) {
-    TAXID parentTaxonomyID = (m.parent == NULL)? m.taxonomyID : m.parent->taxonomyID;
-    os << '[' << m.taxonomyID << ";parent="<< parentTaxonomyID << ";name=" << m.scientificName << ";rank=" << m.rank << ']';
-    return os;
-}
+    friend std::ostream &operator<<(std::ostream &os, const TaxonomyEntry &m) {
+      TAXID parentTaxonomyID = (m.parent == NULL)? m.taxonomyID : m.parent->taxonomyID;
+      os << '[' << m.taxonomyID << ";parent="<< parentTaxonomyID << ";name=" << m.scientificName << ";rank=" << m.rank << ']';
+      return os;
+    }
 };

@@ -214,91 +214,91 @@ class TaxonomyEntry {
 //}

 /*
-template<typename TAXID>
-struct TaxonomyEntryPtr_comp {
-  bool operator() ( const TaxonomyEntry<TAXID>* a, const TaxonomyEntry<TAXID>* b) const;
-};
-*/
+  template<typename TAXID>
+  struct TaxonomyEntryPtr_comp {
+    bool operator() ( const TaxonomyEntry<TAXID>* a, const TaxonomyEntry<TAXID>* b) const;
+  };
+  */

 template<typename TAXID>
 class TaxonomyDB {
   public:
   TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName);
   TaxonomyDB(const std::string inFileName, bool hasGenomeSizes = false);
   TaxonomyDB();

   TaxonomyDB(TaxonomyDB&& rhs) : entries(std::move(rhs.entries)) {
   }

   TaxonomyDB& operator=(TaxonomyDB&& rhs) {
     entries = std::move(rhs.entries);
     return *this;
   }

   void writeTaxonomyIndex(std::ostream & outs) const;
   void readTaxonomyIndex(const std::string inFileName, bool hasGenomeSizes);

   TAXID getTaxIDAtRank(const TAXID taxID, const std::string& rank) const;
   std::string getScientificName(const TAXID taxID) const;
   std::string getRank(const TAXID taxID) const;
   TAXID getLowestCommonAncestor(const std::vector<TAXID>& taxIDs) const;
   pair<TAXID, int> getLowestCommonAncestor(TAXID a, TAXID b) const;
   string getNextProperRank(TAXID a) const;
   TAXID getTaxIDAtNextProperRank(TAXID a) const;

   TAXID getParentTaxID(const TAXID taxID) const;
   std::unordered_map<TAXID, TAXID> getParentMap() const;
   TAXID getByScientificName(string name) const;
   std::unordered_map<std::string, TAXID> getScientificNameMap() const;
   std::string getLineage(TAXID taxonomyID) const;
   std::string getMetaPhlAnLineage(TAXID taxonomyID) const;
   TaxonomyEntry<TAXID> getEntry(TAXID taxID) const;

   bool insert(TAXID taxonomyID_, TAXID parentTaxonomyID_, std::string rank_, std::string scientificName_);
   bool hasTaxon(TAXID taxonomyID_);

   size_t distance(TAXID taxID1, TAXID taxID2) const;

   bool isSubSpecies(TAXID taxonomyID) const;
   int isBelowInTree(TAXID upper, TAXID lower) const;

   void setGenomeSizes(const std::unordered_map<TAXID, uint64_t> & genomeSizes);
   void readGenomeSizes(string file);
   void setGenomeSize(const TAXID taxid, const uint64_t genomeSize);

   void printReport();

   std::unordered_map<TAXID, TaxonomyEntry<TAXID> > entries;
   bool genomeSizes_are_set;

   private:

   std::unordered_map<TAXID, TaxonomyEntry<TAXID> >
   readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes);
 };

 template<typename TAXID, typename READCOUNTS>
 class TaxReport {
-private:
-	std::ostream& _reportOfb;
-	TaxonomyDB<TAXID> & _taxdb;
-	std::unordered_map<TAXID, READCOUNTS> _readCounts;
-	std::unordered_map<TAXID, READCOUNTS> _readCountsIncludingChildren;
-	uint64_t _total_n_reads;
-	bool _show_zeros;
-	void printLine(TaxonomyEntry<TAXID>& tax, unsigned depth);
-
-public:
-	TaxReport(std::ostream& _reportOfb, TaxonomyDB<TAXID> & taxdb, std::unordered_map<TAXID, READCOUNTS>, bool _show_zeros);
-	void printReport(std::string format, std::string rank);
-	void printReport(TaxonomyEntry<TAXID>& tax, unsigned depth);
-	void setReportCols(std::vector<std::string> names);
-
-	std::vector<std::string> _report_col_names;
-	std::vector<REPORTCOLS> _report_cols;
+  private:
+    std::ostream& _reportOfb;
+    TaxonomyDB<TAXID> & _taxdb;
+    std::unordered_map<TAXID, READCOUNTS> _readCounts;
+    std::unordered_map<TAXID, READCOUNTS> _readCountsIncludingChildren;
+    uint64_t _total_n_reads;
+    bool _show_zeros;
+    void printLine(TaxonomyEntry<TAXID>& tax, unsigned depth);

+  public:
+    TaxReport(std::ostream& _reportOfb, TaxonomyDB<TAXID> & taxdb, std::unordered_map<TAXID, READCOUNTS>, bool _show_zeros);
+    void printReport(std::string format, std::string rank);
+    void printReport(TaxonomyEntry<TAXID>& tax, unsigned depth);
+    void setReportCols(std::vector<std::string> names);

+    std::vector<std::string> _report_col_names;
+    std::vector<REPORTCOLS> _report_cols;
 };

 template<typename K, typename V>
 V find_or_use_default(const std::unordered_map<K, V>& my_map, const K& query, const V default_value);

//////////////////////////// DEFINITIONS

 void log_msg (const std::string& s) {
   std::cerr << s;
 }

 template<typename T>
 uint64_t string_to_T(string str) {

 template<typename T>
 inline uint64_t reads(const T read_count) {
   cerr << "No reads function for type!! " << endl;
   throw ;
   return(0);
 }
 inline uint64_t reads(const uint64_t read_count) {
   return(read_count);
 }

 std::vector<std::string> in_betweens(const std::string &s, const char start_char, const char end_char, size_t start_at) {
   std::vector<std::string> tokens;
   size_t i = 0;
   size_t next_end = start_at-1;

   for (size_t next_start = s.find(start_char, next_end + 1); \
        next_start != string::npos;
        next_start = s.find(start_char, next_end + 1), ++i) {

     next_end = s.find(end_char, next_start + 1);
     if (next_end == string::npos) {
       cerr << "unmatched start and end!";
       exit(1);
     }

     tokens.push_back(s.substr(next_start+1, next_end-1));
   }

   return tokens;
 }

 std::vector<std::string> tokenise(const std::string &s, const std::string& delimiter, size_t max_fields, size_t end_chars) {
   std::vector<std::string> tokens(max_fields);
   size_t delim_length = delimiter.length();
   size_t last = 0;
   size_t i = 0;

   for (size_t next = s.find(delimiter, last);
        (max_fields > 0 && i < max_fields) && next != string::npos;
        next = s.find(delimiter, last), ++i) {
     tokens[i] = s.substr(last, next-last);
     last = next + delim_length;
   }
   if (max_fields > 0 && i < max_fields) {
     tokens[max_fields-1] = s.substr(last, s.length()-last-end_chars);
   }

   return tokens;
 }

 std::vector<std::string> get_fields(const std::string &s, const std::string& delimiter, vector<size_t> fields) {
   std::vector<std::string> tokens;
   tokens.reserve(fields.size());
   size_t delim_length = delimiter.length();
   size_t last = 0;
   size_t i = 0;
   size_t current_field = 0;

   for (size_t next = s.find(delimiter, last);
        tokens.size() < fields.size() && next != string::npos;
        next = s.find(delimiter, last), ++i) {
     if (i == fields[current_field]) {
       tokens.push_back(s.substr(last, next-last));
       ++current_field;
     }
     last = next + delim_length;
   }

   return tokens;
 }
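For orientation, tokenise() fills exactly max_fields slots, splits on a multi-character delimiter, and trims end_chars characters off the final field. A hypothetical call on a nodes.dmp-style line (the sample input is invented for illustration, and the snippet assumes taxdb.h's declarations are visible when compiled):

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      // "<taxid>\t|\t<parent>\t|\t<rank>\t|" as in NCBI nodes.dmp
      std::string line = "9606\t|\t9605\t|\tspecies\t|";
      // 3 fields, delimiter "\t|\t", trim the 2 trailing chars "\t|"
      std::vector<std::string> parts = tokenise(line, "\t|\t", 3, 2);
      std::cout << parts[0] << " " << parts[1] << " " << parts[2] << "\n";
      // prints: 9606 9605 species
      return 0;
    }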
@@ -401,49 +401,49 @@ std::vector<std::string> get_fields(const std::string &s, const std::string& del
 //  readCountsOfChildren = 0;
 //}

 /*
-template<typename TAXID>
-bool TaxonomyEntryPtr_comp<TAXID>::operator() ( const TaxonomyEntry<TAXID>* a, const TaxonomyEntry<TAXID>* b) const {
-    return (
-    (reads(a->readCounts)+reads(a->readCountsOfChildren)) > (reads(b->readCounts)+reads(b->readCountsOfChildren)));
-  }
-*/
+  template<typename TAXID>
+  bool TaxonomyEntryPtr_comp<TAXID>::operator() ( const TaxonomyEntry<TAXID>* a, const TaxonomyEntry<TAXID>* b) const {
+    return (
+        (reads(a->readCounts)+reads(a->readCountsOfChildren)) > (reads(b->readCounts)+reads(b->readCountsOfChildren)));
+  }
+  */

 /*
-template<typename TAXID>
-TAXID TaxonomyDB<TAXID>::getByScientificName(string name) const {
-  for (const auto & tax : entries) {
-    if (tax.second.scientificName == name) {
-      return tax.first;
-    }
-  }
-  return 0;
-}
-
-template<typename TAXID>
-std::unordered_map<std::string, TAXID> TaxonomyDB<TAXID>::getScientificNameMap() const {
-  std::unordered_map<std::string, TAXID> scientificNameMap;
-  for (const auto & tax : entries) {
-    scientificNameMap[tax.second.scientificName] = tax.first;
-  }
-  return scientificNameMap;
-}
-*/
+  template<typename TAXID>
+  TAXID TaxonomyDB<TAXID>::getByScientificName(string name) const {
+    for (const auto & tax : entries) {
+      if (tax.second.scientificName == name) {
+        return tax.first;
+      }
+    }
+    return 0;
+  }
+
+  template<typename TAXID>
+  std::unordered_map<std::string, TAXID> TaxonomyDB<TAXID>::getScientificNameMap() const {
+    std::unordered_map<std::string, TAXID> scientificNameMap;
+    for (const auto & tax : entries) {
+      scientificNameMap[tax.second.scientificName] = tax.first;
+    }
+    return scientificNameMap;
+  }
+  */

 template<typename TAXID>
 unordered_map<TAXID, TAXID> TaxonomyDB<TAXID>::getParentMap() const {
-  unordered_map<TAXID, TAXID> Parent_map;
-  //for (const auto & tax : entries) {
-  for (auto tax_it = entries.begin(); tax_it != entries.end(); ++tax_it) {
-    if (tax_it->first == 0)
-      continue;
-    if (tax_it->second.parent == NULL) {
-      //cerr << "Parent for " << tax.first << " is 0\n";
-      Parent_map[tax_it->first] = 0; // for kraken::lca
-    } else {
-      Parent_map[tax_it->first] = tax_it->second.parent->taxonomyID;
-    }
+  unordered_map<TAXID, TAXID> Parent_map;
+  //for (const auto & tax : entries) {
+  for (auto tax_it = entries.begin(); tax_it != entries.end(); ++tax_it) {
+    if (tax_it->first == 0)
+      continue;
+    if (tax_it->second.parent == NULL) {
+      //cerr << "Parent for " << tax.first << " is 0\n";
+      Parent_map[tax_it->first] = 0; // for kraken::lca
+    } else {
+      Parent_map[tax_it->first] = tax_it->second.parent->taxonomyID;
     }
+  }
   return Parent_map;
 }

 template<typename TAXID>
@@ -459,25 +459,25 @@ TaxonomyEntry<TAXID> TaxonomyDB<TAXID>::getEntry(TAXID taxID) const {

 template<typename TAXID>
 void createPointers(
-    std::unordered_map<TAXID, TaxonomyEntry<TAXID> >& entries,
-    const std::unordered_map<TAXID, TAXID>& parentMap) {
+    std::unordered_map<TAXID, TaxonomyEntry<TAXID> >& entries,
+    const std::unordered_map<TAXID, TAXID>& parentMap) {

   for (auto entry_it = entries.begin(); entry_it != entries.end(); ++entry_it) {
-    TAXID taxonomyID = entry_it->first;
-    auto parent_it = parentMap.find(taxonomyID);
-    if (parent_it == parentMap.end()) {
-      cerr << "Cannot find parent for " << taxonomyID << endl;
-    } else {
-      TAXID parentTaxonomyID = parent_it->second;
+    TAXID taxonomyID = entry_it->first;
+    auto parent_it = parentMap.find(taxonomyID);
+    if (parent_it == parentMap.end()) {
+      cerr << "Cannot find parent for " << taxonomyID << endl;
+    } else {
+      TAXID parentTaxonomyID = parent_it->second;
       if (taxonomyID != parentTaxonomyID) {
-        auto parent_ptr = entries.find(parentTaxonomyID);
-        if (parent_ptr != entries.end()) {
-          entry_it->second.parent = &parent_ptr->second;
-          parent_ptr->second.children.push_back(&entry_it->second);
-        } else {
-          cerr << "Could not find parent with taxonomy ID " << parentTaxonomyID << " for taxonomy ID " << taxonomyID << endl;
-        }
-      }
+        auto parent_ptr = entries.find(parentTaxonomyID);
+        if (parent_ptr != entries.end()) {
+          entry_it->second.parent = &parent_ptr->second;
+          parent_ptr->second.children.push_back(&entry_it->second);
+        } else {
+          cerr << "Could not find parent with taxonomy ID " << parentTaxonomyID << " for taxonomy ID " << taxonomyID << endl;
+        }
+      }
     }
   }
 }

@@ -487,7 +487,7 @@ TaxonomyDB<TAXID>::TaxonomyDB() : genomeSizes_are_set(false) { }

 template<typename TAXID>
 TaxonomyDB<TAXID>::TaxonomyDB(const std::string inFileName, bool hasGenomeSizes) :
   entries( readTaxonomyIndex_(inFileName, hasGenomeSizes) ), genomeSizes_are_set(hasGenomeSizes)
-  { }
+{ }

 template<typename TAXID>
 unordered_map<TAXID, TaxonomyEntry<TAXID> > readDumps(const std::string namesDumpFileName, const std::string nodesDumpFileName) {
@@ -502,8 +502,8 @@ unordered_map<TAXID, TaxonomyEntry<TAXID> > readDumps(const std::string namesDump

 template<typename TAXID>
 TaxonomyDB<TAXID>::TaxonomyDB(const std::string namesDumpFileName, const std::string nodesDumpFileName) :
-  entries(readDumps(namesDumpFileName, nodesDumpFileName)) {
-}
+  entries(readDumps(namesDumpFileName, nodesDumpFileName)) {
+  }

 template<typename TAXID>
 std::unordered_map<TAXID, TAXID> parseNodesDump(const std::string nodesDumpFileName, std::unordered_map<TAXID, TaxonomyEntry<TAXID> >& entries) {
@@ -525,9 +525,9 @@ std::unordered_map<TAXID, TAXID> parseNodesDump(const std::string nodesDumpFileNa
     auto entryIt = entries.find(taxonomyID);
     if (entryIt == entries.end()) {
       entries[taxonomyID] = TaxonomyEntry<TAXID>(taxonomyID, NULL, rank, "");
-      parentMap[taxonomyID] = parentTaxonomyID;
     } else {
-      parentMap[taxonomyID] = parentTaxonomyID;
+      parentMap[taxonomyID] = parentTaxonomyID;
       entryIt->second.rank = rank;
     }

@@ -556,11 +556,11 @@ void parseNamesDump(const std::string namesDumpFileName, std::unordered_map<TAXID, TaxonomyEntry<TAXID> >& entries) {
-    if (entryIt == entries.end()) {
-      cerr << "Entry for " << taxonomyID << " does not exist - it should!" << '\n';
-      //entries[taxonomyID] = TaxonomyEntry<TAXID>(taxonomyID, NULL, "", scientificName);
+  if (entryIt == entries.end()) {
+    cerr << "Entry for " << taxonomyID << " does not exist - it should!" << '\n';
+    //entries[taxonomyID] = TaxonomyEntry<TAXID>(taxonomyID, NULL, "", scientificName);
     } else {
-      entryIt->second.scientificName = scientificName;
+    entryIt->second.scientificName = scientificName;
     }
   }
   namesDumpFile.ignore(2560, '\n');
@@ -572,7 +572,7 @@ std::vector<KeyType> getSortedKeys(const std::unordered_map<KeyType, ValueType>&
   std::vector<KeyType> keys;
   keys.reserve (my_unordered_map.size());
   for (auto it = my_unordered_map.begin(); it != my_unordered_map.end(); ++it) {
-    keys.push_back(it->first);
+    keys.push_back(it->first);
   }
   std::sort (keys.begin(), keys.end());
   return keys;
 }
@@ -582,15 +582,15 @@
 template<typename TAXID>
 void TaxonomyDB<TAXID>::writeTaxonomyIndex(std::ostream & outs) const {
   std::vector<TAXID> sorted_keys = getSortedKeys(entries);
   for (size_t i = 0; i < sorted_keys.size(); ++i) {
-    TAXID taxonomyID = sorted_keys[i];
-    const auto& entry = entries.at(taxonomyID);
-    TAXID parentTaxonomyID = (entry.parent==NULL? taxonomyID : entry.parent->taxonomyID);
taxonomyID : entry.parent->taxonomyID); outs << taxonomyID << '\t' << parentTaxonomyID << '\t' - << entry.scientificName << '\t' << entry.rank; + << entry.scientificName << '\t' << entry.rank; if (genomeSizes_are_set) { - outs << '\t' << entry.genomeSize << '\t' << entry.genomeSizeOfChildren; - } - outs << '\n'; + outs << '\t' << entry.genomeSize << '\t' << entry.genomeSizeOfChildren; + } + outs << '\n'; } outs.flush(); } @@ -598,7 +598,7 @@ void TaxonomyDB::writeTaxonomyIndex(std::ostream & outs) const { template void TaxonomyDB::setGenomeSizes(const std::unordered_map & genomeSizes) { for (auto it = genomeSizes.begin(); it != genomeSizes.end(); ++it) { - setGenomeSize(it->first, it->second); + setGenomeSize(it->first, it->second); } genomeSizes_are_set = true; } @@ -611,7 +611,7 @@ void TaxonomyDB::readTaxonomyIndex(const std::string inFileName, bool has template std::unordered_map > - TaxonomyDB::readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes) { +TaxonomyDB::readTaxonomyIndex_(const std::string inFileName, bool hasGenomeSizes) { log_msg("Reading taxonomy index from " + inFileName); std::ifstream inFile(inFileName); if (!inFile.is_open()) @@ -634,10 +634,10 @@ std::unordered_map > inFile.get(); // read tab std::getline(inFile, scientificName, '\t'); if (hasGenomeSizes) { - std::getline(inFile, rank, '\t'); - inFile >> genomeSize >> genomeSizeOfChildren; + std::getline(inFile, rank, '\t'); + inFile >> genomeSize >> genomeSizeOfChildren; } else { - std::getline(inFile, rank, '\n'); + std::getline(inFile, rank, '\n'); } TaxonomyEntry newEntry(taxonomyID, NULL, rank, scientificName, genomeSize, genomeSizeOfChildren); @@ -679,76 +679,76 @@ TAXID TaxonomyDB::getTaxIDAtNextProperRank(TAXID a) const { template pair TaxonomyDB::getLowestCommonAncestor(TAXID a, TAXID b) const { - if (a == 0 || b == 0) { - return a ? pair(a,-1) : pair(b,-1); - } + if (a == 0 || b == 0) { + return a ? 
pair(a,-1) : pair(b,-1); + } - // create a path from a to the root - std::unordered_set a_path; - int distA = 0; - while (a > 0 && a != getParentTaxID(a)) { - if (a == b) - return pair{a, distA}; - a_path.insert(a); - a = getParentTaxID(a); - ++distA; - } + // create a path from a to the root + std::unordered_set a_path; + int distA = 0; + while (a > 0 && a != getParentTaxID(a)) { + if (a == b) + return pair{a, distA}; + a_path.insert(a); + a = getParentTaxID(a); + ++distA; + } - int distB = 0; - // search for b in the path from a to the root - while (b > 0 && b != getParentTaxID(b)) { - auto it = a_path.find(b); - if (it != a_path.end()) { - return pair(b, distB + std::distance(a_path.begin(), it)); - } - b = getParentTaxID(b); - ++distB; + int distB = 0; + // search for b in the path from a to the root + while (b > 0 && b != getParentTaxID(b)) { + auto it = a_path.find(b); + if (it != a_path.end()) { + return pair(b, distB + std::distance(a_path.begin(), it)); } - return pair(1, distA+distB); + b = getParentTaxID(b); + ++distB; + } + return pair(1, distA+distB); } /* -template -TAXID TaxonomyDB::getLowestCommonAncestor( - const std::vector& taxIDs) const { - if (taxIDs.size() == 0) { - return 0; - } - std::vector > paths; - for (auto& taxID : taxIDs) { - bool good = true; - std::vector path; - TAXID tempTaxID = taxID; - while (tempTaxID != 0) { - path.push_back(tempTaxID); - tempTaxID = getParentTaxID(tempTaxID); - } - if (good) paths.push_back(path); - } - if (paths.size() == 0) { - return 0; - } - for (auto& path : paths) - std::reverse(path.begin(), path.end()); - std::sort(paths.begin(), paths.end(), - [](std::vector i, std::vector j) { - return i.size() < j.size(); - }); - TAXID consensus = 0; - // assumes equal paths lengths?? - for (unsigned i = 0; i < paths[0].size(); i++) { - TAXID temp = 0; - for (auto& path : paths) { - if (temp == 0) - temp = path[i]; - else if (temp != path[i]) { - return consensus; - } - } - consensus = temp; - } - return consensus; + template + TAXID TaxonomyDB::getLowestCommonAncestor( + const std::vector& taxIDs) const { + if (taxIDs.size() == 0) { + return 0; + } + std::vector > paths; + for (auto& taxID : taxIDs) { + bool good = true; + std::vector path; + TAXID tempTaxID = taxID; + while (tempTaxID != 0) { + path.push_back(tempTaxID); + tempTaxID = getParentTaxID(tempTaxID); + } + if (good) paths.push_back(path); + } + if (paths.size() == 0) { + return 0; + } + for (auto& path : paths) + std::reverse(path.begin(), path.end()); + std::sort(paths.begin(), paths.end(), + [](std::vector i, std::vector j) { + return i.size() < j.size(); + }); + TAXID consensus = 0; +// assumes equal paths lengths?? 
+ for (unsigned i = 0; i < paths[0].size(); i++) {
+ TAXID temp = 0;
+ for (auto& path : paths) {
+ if (temp == 0)
+ temp = path[i];
+ else if (temp != path[i]) {
+ return consensus;
+ }
+ }
+ consensus = temp;
+ }
+ return consensus;
}
*/

@@ -837,26 +837,26 @@ std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const {
 if (taxonomyID != 131567) {
 std::string rank = getRank(taxonomyID);
 if (rank == "species") {
- lineage.insert(0, "|s__");
- lineage.insert(4, getScientificName(taxonomyID));
+ lineage.insert(0, "|s__");
+ lineage.insert(4, getScientificName(taxonomyID));
 } else if (rank == "genus") {
- lineage.insert(0, "|g__");
- lineage.insert(4, getScientificName(taxonomyID));
+ lineage.insert(0, "|g__");
+ lineage.insert(4, getScientificName(taxonomyID));
 } else if (rank == "family") {
- lineage.insert(0, "|f__");
- lineage.insert(4, getScientificName(taxonomyID));
+ lineage.insert(0, "|f__");
+ lineage.insert(4, getScientificName(taxonomyID));
 } else if (rank == "order") {
- lineage.insert(0, "|o__");
- lineage.insert(4, getScientificName(taxonomyID));
+ lineage.insert(0, "|o__");
+ lineage.insert(4, getScientificName(taxonomyID));
 } else if (rank == "class") {
- lineage.insert(0, "|c__");
- lineage.insert(4, getScientificName(taxonomyID));
+ lineage.insert(0, "|c__");
+ lineage.insert(4, getScientificName(taxonomyID));
 } else if (rank == "phylum") {
- lineage.insert(0, "|p__");
- lineage.insert(4, getScientificName(taxonomyID));
+ lineage.insert(0, "|p__");
+ lineage.insert(4, getScientificName(taxonomyID));
 } else if (rank == "superkingdom") {
- lineage.insert(0, "k__");
- lineage.insert(3, getScientificName(taxonomyID));
+ lineage.insert(0, "k__");
+ lineage.insert(3, getScientificName(taxonomyID));
 }
 }
 taxonomyID = getParentTaxID(taxonomyID);
@@ -870,42 +870,42 @@ std::string TaxonomyDB::getMetaPhlAnLineage(TAXID taxonomyID) const {
 
 template
 TAXID TaxonomyDB::getTaxIDAtRank(const TAXID taxID,
- const std::string& rank) const {
+ const std::string& rank) const {
 if (taxID == 0 || taxID == 1) return 0;
 auto entry_it = entries.find(taxID);
 // cerr << "getTaxIDAtRank(" << taxID << "," << rank << ")" << endl;
 if (entry_it != entries.end()) {
- const TaxonomyEntry* entry_ptr = &entry_it->second;
- while (entry_ptr != NULL
- && entry_ptr->parent != NULL) {
- // cerr << "Checking rank of " << entry->second.taxonomyID << ": " << entry->second.rank << endl;
- if (entry_ptr->rank == rank) {
- return entry_ptr->taxonomyID;
- } else {
- entry_ptr = entry_ptr->parent;
+ const TaxonomyEntry* entry_ptr = &entry_it->second;
+ while (entry_ptr != NULL
+ && entry_ptr->parent != NULL) {
+ // cerr << "Checking rank of " << entry->second.taxonomyID << ": " << entry->second.rank << endl;
+ if (entry_ptr->rank == rank) {
+ return entry_ptr->taxonomyID;
+ } else {
+ entry_ptr = entry_ptr->parent;
+ }
 }
 }
- }
 return 0;
 }
 
 template
 void TaxonomyDB::setGenomeSize(const TAXID taxid, const uint64_t genomeSize) {
- auto it = entries.find(taxid);
- if (it == entries.end()) {
- cerr << "No taxonomy entry for " << taxid << "!!" << endl;
- return;
- }
- TaxonomyEntry* tax = &it->second;
- tax->genomeSize += genomeSize;
-
- while (tax->parent != NULL) {
- tax = tax->parent;
- //std::cerr << "setting genomeSizeOfChildren of parent" << std::endl;
- tax->genomeSizeOfChildren += genomeSize;
- }
+ auto it = entries.find(taxid);
+ if (it == entries.end()) {
+ cerr << "No taxonomy entry for " << taxid << "!!" 
<< endl; + return; + } + TaxonomyEntry* tax = &it->second; + tax->genomeSize += genomeSize; + + while (tax->parent != NULL) { + tax = tax->parent; + //std::cerr << "setting genomeSizeOfChildren of parent" << std::endl; + tax->genomeSizeOfChildren += genomeSize; + } } template @@ -927,31 +927,32 @@ void TaxonomyDB::readGenomeSizes(string file) { } /* -template -void TaxonomyDB::setReadCounts(const unordered_map& readCounts) { - for (auto& elem : readCounts) { - addReadCount(elem.first, elem.second); - } + template + void TaxonomyDB::setReadCounts(const unordered_map& readCounts) { + for (auto& elem : readCounts) { + addReadCount(elem.first, elem.second); + } - for (auto& tax : entries) { - std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); - } -} -*/ + for (auto& tax : entries) { + std::sort(tax.second.children.begin(), tax.second.children.end(),TaxonomyEntryPtr_comp()); + } + } + */ template - TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, - std::unordered_map readCounts, +TaxReport::TaxReport(std::ostream& reportOfb, TaxonomyDB& taxdb, + std::unordered_map readCounts, bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _readCounts(readCounts), _show_zeros(show_zeros) { - for (auto it = _readCounts.begin(); it != _readCounts.end(); ++it) { - TaxonomyEntry* tax = &taxdb.entries.at(it->first); - while (tax != NULL) { - _readCountsIncludingChildren[tax->taxonomyID] += it->second; - tax = tax->parent; - } - } + for (auto it = _readCounts.begin(); it != _readCounts.end(); ++it) { + TaxonomyEntry* tax = &taxdb.entries.at(it->first); + while (tax != NULL) { + _readCountsIncludingChildren[tax->taxonomyID] += it->second; + tax = tax->parent; + } + } + _report_cols = {REPORTCOLS::PERCENTAGE, REPORTCOLS::NUM_READS_CLADE, REPORTCOLS::NUM_READS, REPORTCOLS::NUM_KMERS_CLADE, REPORTCOLS::NUM_UNIQUE_KMERS_CLADE, @@ -962,94 +963,75 @@ template template void TaxReport::setReportCols(std::vector names) { - _report_cols.clear(); - for (size_t i = 0; i< names.size(); ++i) { - auto& s = names[i]; - auto it = report_col_name_map.find(s); - if (it == report_col_name_map.end()) { - throw std::runtime_error(s + " is not a valid report column name"); - } - _report_cols.push_back(it->second); - } - _report_col_names = names; + _report_cols.clear(); + for (size_t i = 0; i< names.size(); ++i) { + auto& s = names[i]; + auto it = report_col_name_map.find(s); + if (it == report_col_name_map.end()) { + throw std::runtime_error(s + " is not a valid report column name"); + } + _report_cols.push_back(it->second); + } + _report_col_names = names; } template void TaxReport::printReport(std::string format, std::string rank) { - uint64_t _total_n_reads = reads(_readCountsIncludingChildren[0]) + reads(_readCountsIncludingChildren[1]); - if (_total_n_reads == 0) { - std::cerr << "total number of reads is zero - not creating a report!" << endl; - return; - } - if (_report_cols.size() == _report_col_names.size()) { - // print header - bool first_one = true; - for (size_t i=0; i < _report_col_names.size(); ++i) { - const std::string& s = _report_col_names[i]; - if (first_one) { - first_one = false; - } else { - _reportOfb << '\t'; - } - _reportOfb << s; - } - _reportOfb << endl; - } + _total_n_reads = reads(_readCountsIncludingChildren[0]) + reads(_readCountsIncludingChildren[1]); + if (_total_n_reads == 0) { + std::cerr << "total number of reads is zero - not creating a report!" 
<< endl; + return; + } + if (_report_cols.size() == _report_col_names.size()) { + // print header + bool first_one = true; + for (size_t i=0; i < _report_col_names.size(); ++i) { + const std::string& s = _report_col_names[i]; + if (first_one) { + first_one = false; + } else { + _reportOfb << '\t'; + } + _reportOfb << s; + } + _reportOfb << endl; + } - if (format == "kraken") { - // A: print number of unidentified reads - printReport(_taxdb.entries.at(0),0u); - // B: print normal results - printReport(_taxdb.entries.at(1),0u); - // C: Print Unclassified stuff - auto it = _taxdb.entries.find(-1); - if (it != _taxdb.entries.end()) { - printReport(it->second,0u); - } - } else { - // print stuff at a certain level .. - //_uid_abundance; - //_taxinfo + if (format == "kraken") { + // A: print number of unidentified reads + printReport(_taxdb.entries.at(0),0u); + // B: print normal results + printReport(_taxdb.entries.at(1),0u); + // C: Print Unclassified stuff + auto it = _taxdb.entries.find(-1); + if (it != _taxdb.entries.end()) { + printReport(it->second,0u); + } + } else { + // print stuff at a certain level .. + //_uid_abundance; + //_taxinfo - } + } } -template -struct CompareReadCounts : std::binary_function { - CompareReadCounts(std::vector counts_) : counts(counts_) {} - - bool operator()(size_t a, size_t b) const { - if (counts[a]->n_reads == counts[b]->n_reads) { - return counts[a]->n_kmers < counts[b]->n_kmers; - } else { - return counts[a]->n_reads < counts[b]->n_reads; - } - } - - std::vector& counts; -}; - template void TaxReport::printReport(TaxonomyEntry& tax, unsigned depth) { - if (_show_zeros || reads(_readCountsIncludingChildren[tax.taxonomyID]) > 0) { - printLine(tax, depth); - // TODO: Order children ... - - std::vector pos(tax.children.size()); - std::vector counts(tax.children.size()); - for (size_t i=0; i < tax.children.size(); ++i) { - pos[i] = i; - counts[i] = &_readCountsIncludingChildren[i]; - } - - std::sort(pos.begin(), pos.end(), CompareReadCounts(counts)); - - for (size_t i=0; i < tax.children.size(); ++i) { - auto child_it = tax.children[ pos[i] ]; - printReport(*child_it, depth+1); - } - } + if (_show_zeros || reads(_readCountsIncludingChildren[tax.taxonomyID]) > 0) { + printLine(tax, depth); + + // Sort children + std::vector pos(tax.children.size()); + for (size_t i=0; i < tax.children.size(); ++i) { pos[i] = i; } + std::sort(pos.begin(), pos.end(), + [&](size_t a, size_t b) { return _readCountsIncludingChildren[tax.children[b]->taxonomyID] < _readCountsIncludingChildren[tax.children[a]->taxonomyID] ;} ); + + for (size_t i=0; i < tax.children.size(); ++i) { + auto child_it = tax.children[ pos[i] ]; + printReport(*child_it, depth+1); + } + } } template @@ -1058,56 +1040,56 @@ void TaxReport::printLine(TaxonomyEntry& tax, unsigned long long unique_kmers_for_clade = _readCountsIncludingChildren[tax.taxonomyID].kmers.cardinality(); double genome_size = double(tax.genomeSize+tax.genomeSizeOfChildren); - for (size_t i = 0; i< _report_cols.size(); ++i) { - auto& col = _report_cols[i]; - switch (col) { - case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; - case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; - case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break; - case REPORTCOLS::DEPTH: _reportOfb << depth; break; - case REPORTCOLS::PERCENTAGE: _reportOfb << setprecision(4) << 100.0*(reads(_readCountsIncludingChildren[tax.taxonomyID]))/_total_n_reads; break; - //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break; - //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break; - case REPORTCOLS::NUM_READS: _reportOfb << reads(_readCounts[tax.taxonomyID]); break; - case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(_readCountsIncludingChildren[tax.taxonomyID])); break; - case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << _readCounts[tax.taxonomyID].kmers.cardinality(); break; - case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << unique_kmers_for_clade; break; - case REPORTCOLS::NUM_KMERS: _reportOfb << _readCounts[tax.taxonomyID].n_kmers; break; - case REPORTCOLS::NUM_KMERS_CLADE: _reportOfb << _readCountsIncludingChildren[tax.taxonomyID].n_kmers; break; - case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break; - case REPORTCOLS::CLADE_KMER_COVERAGE: - if (genome_size == 0) { - _reportOfb << "NA"; - } else { - _reportOfb << setprecision(4) << (unique_kmers_for_clade / genome_size); - }; break; - case REPORTCOLS::CLADE_KMER_DUPLICITY: _reportOfb << setprecision(3) << ( double(_readCountsIncludingChildren[tax.taxonomyID].n_kmers) / unique_kmers_for_clade ); break; - case REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break; - //case REPORTCOLS::GENOME_SIZE: ; break; - //case REPORTCOLS::NUM_WEIGHTED_READS: ; break; - //case REPORTCOLS::SUM_SCORE: ; break; - case REPORTCOLS::TAX_RANK: _reportOfb << tax.rank; break; - default: _reportOfb << "NA"; - } - if (&col == &_report_cols.back()) { - _reportOfb << '\n'; - } else { - _reportOfb << '\t'; - } - } + for (size_t i = 0; i< _report_cols.size(); ++i) { + auto& col = _report_cols[i]; + switch (col) { + case REPORTCOLS::NAME: _reportOfb << tax.scientificName ; break; + case REPORTCOLS::SPACED_NAME: _reportOfb << string(2*depth, ' ') + tax.scientificName; break; + case REPORTCOLS::TAX_ID: _reportOfb << (tax.taxonomyID == (uint32_t)-1? 
-1 : (int32_t) tax.taxonomyID); break;
+ case REPORTCOLS::DEPTH: _reportOfb << depth; break;
+ case REPORTCOLS::PERCENTAGE: _reportOfb << setprecision(4) << 100.0*(reads(_readCountsIncludingChildren[tax.taxonomyID]))/_total_n_reads; break;
+ //case REPORTCOLS::ABUNDANCE: _reportOfb << 100*counts.abundance[0]; break;
+ //case REPORTCOLS::ABUNDANCE_LEN: _reportOfb << 100*counts.abundance[1]; break;
+ case REPORTCOLS::NUM_READS: _reportOfb << reads(_readCounts[tax.taxonomyID]); break;
+ case REPORTCOLS::NUM_READS_CLADE: _reportOfb << (reads(_readCountsIncludingChildren[tax.taxonomyID])); break;
+ case REPORTCOLS::NUM_UNIQUE_KMERS: _reportOfb << _readCounts[tax.taxonomyID].kmers.cardinality(); break;
+ case REPORTCOLS::NUM_UNIQUE_KMERS_CLADE: _reportOfb << unique_kmers_for_clade; break;
+ case REPORTCOLS::NUM_KMERS: _reportOfb << _readCounts[tax.taxonomyID].n_kmers; break;
+ case REPORTCOLS::NUM_KMERS_CLADE: _reportOfb << _readCountsIncludingChildren[tax.taxonomyID].n_kmers; break;
+ case REPORTCOLS::NUM_KMERS_IN_DATABASE: _reportOfb << tax.genomeSize; break;
+ case REPORTCOLS::CLADE_KMER_COVERAGE:
+ if (genome_size == 0) {
+ _reportOfb << "NA";
+ } else {
+ _reportOfb << setprecision(4) << (unique_kmers_for_clade / genome_size);
+ } break;
+ case REPORTCOLS::CLADE_KMER_DUPLICITY: _reportOfb << setprecision(3) << ( double(_readCountsIncludingChildren[tax.taxonomyID].n_kmers) / unique_kmers_for_clade ); break;
+ case REPORTCOLS::NUM_KMERS_IN_DATABASE_CLADE: _reportOfb << tax.genomeSize + tax.genomeSizeOfChildren; break;
+ //case REPORTCOLS::GENOME_SIZE: ; break;
+ //case REPORTCOLS::NUM_WEIGHTED_READS: ; break;
+ //case REPORTCOLS::SUM_SCORE: ; break;
+ case REPORTCOLS::TAX_RANK: _reportOfb << tax.rank; break;
+ default: _reportOfb << "NA";
+ }
+ if (&col == &_report_cols.back()) {
+ _reportOfb << '\n';
+ } else {
+ _reportOfb << '\t';
+ }
+ }
 }
 
 template
 inline V find_or_use_default(const std::unordered_map& my_map, const K& query, const V default_value) {
- auto itr = my_map.find(query);
+ auto itr = my_map.find(query);
 
- if (itr == my_map.end()) {
- return default_value;
- }
+ if (itr == my_map.end()) {
+ return default_value;
+ }
 
- return itr->second;
+ return itr->second;
 }

From 10f5998a54bf4642e20b9d1f3790ff9e23afcd96 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Thu, 9 Nov 2017 23:23:35 -0500
Subject: [PATCH 103/105] Fixed HLL bug introduced by using vector for sparse representation

---
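Notes: the sparse representation stores encoded hashes in a sorted
std::vector. The old guard in insert_hash only inserted when
std::lower_bound returned end(), i.e. when the new hash was larger than
every hash already stored, so all other hashes were silently dropped and
the estimate was corrupted. A minimal standalone sketch of the corrected
insert-if-absent logic (mirroring the hyperloglogplus.h hunk below):

  #include <algorithm>
  #include <cstdint>
  #include <vector>

  // Insert val at its sorted position unless it is already present.
  inline void insert_hash(std::vector<uint32_t>& vec, uint32_t val) {
    auto it = std::lower_bound(vec.begin(), vec.end(), val);
    if (it == vec.end() || *it != val) // skip values already stored
      vec.insert(it, val);             // keeps vec sorted and duplicate-free
  }
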
 src/Makefile | 11 ++++-
 src/count_unique.cpp | 56 ++++++++++++++++++++++++++
 src/hyperloglogplus.h | 12 +++---
 src/krakendb.cpp | 1 +
 src/test_hll_on_db.cpp | 91 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 163 insertions(+), 8 deletions(-)
 create mode 100644 src/count_unique.cpp
 create mode 100644 src/test_hll_on_db.cpp

diff --git a/src/Makefile b/src/Makefile
index 37e068d..0c8f50e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -2,7 +2,8 @@ CXX = g++
 FOPENMP?=-fopenmp
 CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -I./gzstream -O2 -Wfatal-errors ${CPPFLAGS}
 #CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors
-PROGS = classify db_sort set_lcas db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping
+PROGS = classify db_sort set_lcas db_shrink build_taxdb read_uid_mapping count_unique
+TEST_PROGS = grade_classification dump_taxdb test_hll_on_db
 
 #LIBFLAGS = -L. -lz -lgzstream ${LDFLAGS}
 LIBFLAGS = -L. -lz ${LDFLAGS}
@@ -10,11 +11,13 @@ LIBFLAGS = -L. -lz ${LDFLAGS}
 
 all: $(PROGS)
 
+allall: $(PROGS) $(TEST_PROGS)
+
 install: $(PROGS)
 cp $(PROGS) $(KRAKEN_DIR)/
 
 clean:
- rm -rf $(PROGS) *.o *.dSYM
+ rm -rf $(PROGS) $(TEST_PROGS) *.o *.dSYM *.gch
 
 db_shrink: krakendb.o quickfile.o
 
@@ -26,6 +29,10 @@ grade_classification: taxdb.h report-cols.h
 
 read_uid_mapping: quickfile.o
 
+count_unique: hyperloglogplus.h
+
+test_hll_on_db: krakendb.o hyperloglogplus.h quickfile.o
+
 classify: classify.cpp krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o gzstream.o hyperloglogplus.h taxdb.h report-cols.h
 $(CXX) $(CXXFLAGS) -o classify $^ $(LIBFLAGS)
diff --git a/src/count_unique.cpp b/src/count_unique.cpp
new file mode 100644
index 0000000..3299a5a
--- /dev/null
+++ b/src/count_unique.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2017, Florian Breitwieser
+ *
+ * This file is part of the KrakenHLL taxonomic sequence classification system.
+ *
+ * KrakenHLL is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see .
+ */
+
+#include "hyperloglogplus.h"
+#include
+#include
+
+using namespace std;
+
+int main(int argc, char **argv) {
+ if (argc != 4) {
+ std::cerr << "USAGE:\n"
+ << "count_unique PRECISION SPARSE TEST_MODE\n"
+ << "\n"
+ << "Valid precision values: 10-18. SPARSE can be 0 or 1. If TEST_MODE is 1, then an HLL estimate is given with each number. 
\n" + << "Returns the cardinality of the input stream (has to be uint64_t)\n"; + return 1; + } + + size_t p = stoi(argv[1]); + bool sparse = bool(stoi(argv[2])); + bool test_mode = bool(stoi(argv[3])); + HyperLogLogPlusMinus hll(p, sparse); // unique k-mer count per taxon + uint64_t nr; + uint64_t ctr = 0; + if (test_mode) { + cout << "observed\testimated\n"; + } + while (cin >> nr) { + hll.add(nr); + if (test_mode) { + cout << ++ctr << '\t' << hll.cardinality() << '\n'; + } + } + if (!test_mode) { + cout << hll.cardinality() << endl; + } + +} + diff --git a/src/hyperloglogplus.h b/src/hyperloglogplus.h index 495ef11..21f7d19 100644 --- a/src/hyperloglogplus.h +++ b/src/hyperloglogplus.h @@ -117,7 +117,7 @@ double alpha(uint32_t m) { /** * calculate the raw estimate as harmonic mean of the ranks in the register */ -double calculateRawEstimate(vector M) { +inline double calculateRawEstimate(const vector& M) { double inverseSum = 0.0; for (size_t i = 0; i < M.size(); ++i) { // TODO: pre-calculate the power calculation @@ -159,7 +159,7 @@ T extractBits(T value, uint8_t hi, uint8_t lo, bool shift_left = false) { inline void insert_hash(vector& vec, uint32_t val) { auto it = std::lower_bound( vec.begin(), vec.end(), val); // find proper position in descending order - if (it == vec.end()) { + if (it == vec.end() || *it != val) { vec.insert( it, val ); // insert before iterator it } } @@ -429,7 +429,7 @@ class HyperLogLogPlusMinus { * * @return cardinality estimate */ - uint64_t cardinality(bool verbose=true) { + uint64_t cardinality(bool verbose=true) const { if (sparse) { // if we are 'sparse', then use linear counting with increased precision pPrime return uint64_t(linearCounting(mPrime, mPrime-uint32_t(sparseList.size()))); @@ -502,7 +502,7 @@ class HyperLogLogPlusMinus { return rank_val; } - vector rawEstimateData(size_t p) { + vector rawEstimateData(size_t p) const { switch (p) { case 4: return vector(rawEstimateData_precision4,arr_len(rawEstimateData_precision4)); case 5: return vector(rawEstimateData_precision5,arr_len(rawEstimateData_precision5)); @@ -523,7 +523,7 @@ class HyperLogLogPlusMinus { return vector(); } - vector biasData(size_t p) { + vector biasData(size_t p) const { switch(p) { case 4: return vector(biasData_precision4,arr_len(biasData_precision4)); case 5: return vector(biasData_precision5,arr_len(biasData_precision5)); @@ -551,7 +551,7 @@ class HyperLogLogPlusMinus { * @param est * @return correction value for */ - double getEstimateBias(double estimate) { + double getEstimateBias(double estimate) const { vector rawEstimateTable = rawEstimateData(p); vector biasTable = biasData(p); diff --git a/src/krakendb.cpp b/src/krakendb.cpp index cae738f..d49a66f 100644 --- a/src/krakendb.cpp +++ b/src/krakendb.cpp @@ -1,4 +1,5 @@ /* + * Portions (c) 2017, Florian Breitwieser as part of KrakenHLL * Copyright 2013-2015, Derrick Wood * * This file is part of the Kraken taxonomic sequence classification system. diff --git a/src/test_hll_on_db.cpp b/src/test_hll_on_db.cpp new file mode 100644 index 0000000..365ad5e --- /dev/null +++ b/src/test_hll_on_db.cpp @@ -0,0 +1,91 @@ +/* + * Copyright 2017, Florian Breitwieser + * + * This file is part of the KrakenHLL taxonomic sequence classification system. + * + * KrakenHLL is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ *
+ * KrakenHLL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kraken. If not, see .
+ */
+
+#include "hyperloglogplus.h"
+#include "kraken_headers.hpp"
+#include "quickfile.hpp"
+#include "krakendb.hpp"
+#include
+#include
+
+using namespace std;
+using namespace kraken;
+
+//using std::map to have the keys sorted
+void count_n_random_kmers(size_t nr, char* ptr, size_t pair_sz, size_t key_ct, size_t key_len) {
+}
+
+int main(int argc, char **argv) {
+ if (argc != 5) {
+ std::cerr << "USAGE:\n"
+ << "test_hll_on_db DATABASE PRECISION SPARSE NR_KMERS\n"
+ << "\n"
+ << "Valid precision values: 10-18. SPARSE can be 0 or 1. \n";
+ return 1;
+ }
+
+ char *db_name = argv[1];
+ QuickFile db_file;
+ db_file.open_file(db_name);
+ //db_file.load_file();
+ KrakenDB db(db_file.ptr());
+
+ size_t p = stoi(argv[2]);
+ bool sparse = bool(stoi(argv[3]));
+ size_t nr = stoi(argv[4]);
+
+ HyperLogLogPlusMinus hll(p, sparse); // unique k-mer count per taxon
+
+ char* ptr = db.get_ptr();
+ //char* pair_ptr = db.get_pair_ptr();
+ uint64_t key_len = db.get_key_len(); // how many bytes does each key occupy?
+ //uint64_t val_len = db.get_val_len(); // how many bytes does each value occupy?
+ uint64_t key_ct = db.get_key_ct(); // how many key/value pairs are there?
+ uint64_t pair_sz = db.pair_size(); // how many bytes does each pair occupy?
+
+ if (nr > key_ct) {
+ cerr << nr << " is greater than " << key_ct << "!!!" << endl;
+ exit(1);
+ }
+
+ if (ptr == NULL) {
+ std::cerr << "Kraken database pointer is NULL [pair_sz: " << pair_sz << ", key_ct: " << key_ct << "]" << std::endl;
+ exit(1);
+ }
+
+ double prob = double(nr) / double(key_ct);
+ std::random_device rd;
+ std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd()
+ std::uniform_real_distribution<> dis(0.0, 1.0);
+
+ size_t ctr = 0;
+ for (uint64_t i = 0; i < key_ct; i++) {
+ if (dis(gen) < prob) {
+ uint64_t* kmer = (uint64_t *) (ptr + pair_sz * i);
+ //uint32_t* taxon = (uint32_t *) (ptr + pair_sz * i + key_len);
+ //if (taxon == NULL) {
+ // std::cerr << "taxon is NULL (i is " << i << " and key_ct is " << key_ct << ")" << std::endl;
+ hll.add(*kmer);
+ ++ctr;
+ if (ctr < 10 || floor(log10(ctr)) == log10(ctr)) {
+ cout << ctr << '\t' << hll.cardinality() << '\n';
+ }
+ }
+ }
+}
+

From 1d37b547a179e2d7b965d0241216c235fdcf2547 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Fri, 10 Nov 2017 15:09:18 -0500
Subject: [PATCH 104/105] Fix Makefile

---
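Notes: PROGS now expands to PROGS1 plus TEST_PROGS, so a plain `make`
builds the test binaries as well, and test_hll_on_db tracks all
precisions from 10 to 18 in a single pass; its PRECISION argument is
still parsed but no longer selects the sketch. A hypothetical
invocation (the database path is only an example):

  ./test_hll_on_db DB/database.kdb 12 1 1000000 > hll_eval.tsv

This samples roughly one million k-mers from the database and, at each
checkpoint, prints one precision/true_count/estimate row per tracked
precision.
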
 src/Makefile | 3 ++-
 src/test_hll_on_db.cpp | 34 ++++++++++++++++++++++++++++++----
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 0c8f50e..b067a9c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -2,8 +2,9 @@ CXX = g++
 FOPENMP?=-fopenmp
 CXXFLAGS = -Wall -std=c++0x $(FOPENMP) -I./gzstream -O2 -Wfatal-errors ${CPPFLAGS}
 #CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors
-PROGS = classify db_sort set_lcas db_shrink build_taxdb read_uid_mapping count_unique
+PROGS1 = classify db_sort set_lcas db_shrink build_taxdb read_uid_mapping count_unique
 TEST_PROGS = grade_classification dump_taxdb test_hll_on_db
+PROGS = $(PROGS1) $(TEST_PROGS)
 
 #LIBFLAGS = -L. -lz -lgzstream ${LDFLAGS}
 LIBFLAGS = -L. -lz ${LDFLAGS}
diff --git a/src/test_hll_on_db.cpp b/src/test_hll_on_db.cpp
index 365ad5e..0ded629 100644
--- a/src/test_hll_on_db.cpp
+++ b/src/test_hll_on_db.cpp
@@ -43,14 +43,23 @@ int main(int argc, char **argv) {
 char *db_name = argv[1];
 QuickFile db_file;
 db_file.open_file(db_name);
- //db_file.load_file();
+ db_file.load_file();
+ cerr << "Fully loaded\n";
 KrakenDB db(db_file.ptr());
 
 size_t p = stoi(argv[2]);
 bool sparse = bool(stoi(argv[3]));
 size_t nr = stoi(argv[4]);
 
- HyperLogLogPlusMinus hll(p, sparse); // unique k-mer count per taxon
+ HyperLogLogPlusMinus hll10(10, sparse); // one sketch per precision 10..18, all fed the same k-mers
+ HyperLogLogPlusMinus hll11(11, sparse);
+ HyperLogLogPlusMinus hll12(12, sparse);
+ HyperLogLogPlusMinus hll13(13, sparse);
+ HyperLogLogPlusMinus hll14(14, sparse);
+ HyperLogLogPlusMinus hll15(15, sparse);
+ HyperLogLogPlusMinus hll16(16, sparse);
+ HyperLogLogPlusMinus hll17(17, sparse);
+ HyperLogLogPlusMinus hll18(18, sparse);
 
 char* ptr = db.get_ptr();
 //char* pair_ptr = db.get_pair_ptr();
@@ -73,6 +82,7 @@ int main(int argc, char **argv) {
 std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd()
 std::uniform_real_distribution<> dis(0.0, 1.0);
 
+ cout << "precision\ttrue_count\testimate\n";
 size_t ctr = 0;
 for (uint64_t i = 0; i < key_ct; i++) {
 if (dis(gen) < prob) {
@@ -80,10 +90,26 @@ int main(int argc, char **argv) {
 //uint32_t* taxon = (uint32_t *) (ptr + pair_sz * i + key_len);
 //if (taxon == NULL) {
 // std::cerr << "taxon is NULL (i is " << i << " and key_ct is " << key_ct << ")" << std::endl;
- hll.add(*kmer);
+ hll10.add(*kmer);
+ hll11.add(*kmer);
+ hll12.add(*kmer);
+ hll13.add(*kmer);
+ hll14.add(*kmer);
+ hll15.add(*kmer);
+ hll16.add(*kmer);
+ hll17.add(*kmer);
+ hll18.add(*kmer);
 ++ctr;
 if (ctr < 10 || floor(log10(ctr)) == log10(ctr)) {
- cout << ctr << '\t' << hll.cardinality() << '\n';
+ cout << 10 << '\t' << ctr << '\t' << hll10.cardinality() << '\n';
+ cout << 11 << '\t' << ctr << '\t' << hll11.cardinality() << '\n';
+ cout << 12 << '\t' << ctr << '\t' << hll12.cardinality() << '\n';
+ cout << 13 << '\t' << ctr << '\t' << hll13.cardinality() << '\n';
+ cout << 14 << '\t' << ctr << '\t' << hll14.cardinality() << '\n';
+ cout << 15 << '\t' << ctr << '\t' << hll15.cardinality() << '\n';
+ cout << 16 << '\t' << ctr << '\t' << hll16.cardinality() << '\n';
+ cout << 17 << '\t' << ctr << '\t' << hll17.cardinality() << '\n';
+ cout << 18 << '\t' << ctr << '\t' << hll18.cardinality() << '\n';
 }
 }
 }

From a95bd8554653f3a37ced6405495b32a941fe8b55 Mon Sep 17 00:00:00 2001
From: Florian Breitwieser
Date: Fri, 10 Nov 2017 21:15:00 -0500
Subject: [PATCH 105/105] Allow setting custom precision

---
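Notes: precision trades memory for accuracy. Assuming the standard
HyperLogLog error bound, the relative standard error is roughly
1.04 / sqrt(2^p), and a dense sketch holds 2^p registers, e.g.:

  p = 10: 1.04/sqrt(1024)   ~ 3.3% error,   1024 registers
  p = 12: 1.04/sqrt(4096)   ~ 1.6% error,   4096 registers (the default)
  p = 18: 1.04/sqrt(262144) ~ 0.2% error, 262144 registers

Higher precision therefore mainly costs memory per tracked taxon.
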
 scripts/krakenhll | 6 +++++-
 src/classify.cpp | 27 ++++++++++++++++-----------
 src/readcounts.hpp | 3 ++-
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/scripts/krakenhll b/scripts/krakenhll
index 69e3b56..6db00b2 100755
--- a/scripts/krakenhll
+++ b/scripts/krakenhll
@@ -61,6 +61,7 @@ my $outfile;
 my $report_file;
 my $print_sequence = 0;
 my $uid_mapping = 0;
+my $hll_precision = 12;
 
 GetOptions(
 "help" => \&display_help,
@@ -78,6 +79,7 @@ GetOptions(
 "report-file=s" => \$report_file,
 "preload" => \$preload,
 "paired" => \$paired,
+ "precision=i" => \$hll_precision,
 "check-names" => \$check_names,
 "gzip-compressed" => \$gunzip,
 "bzip2-compressed" => \$bunzip2,
@@ -94,7 +96,7 @@ if (! @ARGV) {
 usage();
 }
 
-if (!defined $report_file) {
+if (!defined $report_file && !$preload) {
 print STDERR
 "Need to specify a report file with --report-file! See --help for more details.\n";
 exit 1;
@@ -157,6 +159,7 @@ push @flags, "-M" if $preload;
 push @flags, "-r", $report_file if defined $report_file;
 push @flags, "-a", $db_prefix[0]."/taxDB";
 push @flags, "-s" if $print_sequence;
+push @flags, "-p", $hll_precision;
 if ($uid_mapping) {
 my $uid_mapping_file = "$db_prefix[0]/uid_to_taxid.map";
 if (!-f $uid_mapping_file) {
@@ -245,6 +248,7 @@ Options:
 --fastq-input Input is FASTQ format
 --gzip-compressed Input is gzip compressed
 --bzip2-compressed Input is bzip2 compressed
+ --precision INT Precision for unique k-mer counting, between 10 and 18 (default: $hll_precision)
 --quick Quick operation (use first hit or hits)
 --min-hits NUM In quick op., number of hits req'd for classification
 NOTE: this is ignored if --quick is not specified
diff --git a/src/classify.cpp b/src/classify.cpp
index 049955e..1d4e9e8 100644
--- a/src/classify.cpp
+++ b/src/classify.cpp
@@ -152,15 +152,6 @@ int main(int argc, char **argv) {
 //}
 }
 
- if (!TaxDB_file.empty()) {
- // TODO: Define if the taxDB has read counts or not!!
- taxdb = TaxonomyDB(TaxDB_file, false);
- Parent_map = taxdb.getParentMap();
- } else {
- cerr << "TaxDB argument is required!" << endl;
- return 1;
- }
-
 if (Populate_memory)
 cerr << "Loading database(s)... " << endl;
 
@@ -190,6 +181,16 @@ int main(int argc, char **argv) {
 if (Populate_memory)
 cerr << "\ncomplete." << endl;
 
+
+ if (!TaxDB_file.empty()) {
+ // TODO: Define if the taxDB has read counts or not!!
+ taxdb = TaxonomyDB(TaxDB_file, false);
+ Parent_map = taxdb.getParentMap();
+ } else {
+ cerr << "TaxDB argument is required!" << endl;
+ return 1;
+ }
+
 if (Print_classified) {
 Classified_output = cout_or_file(Classified_output_file);
 }
@@ -612,7 +613,7 @@ void parse_command_line(int argc, char **argv) {
 if (argc > 1 && strcmp(argv[1], "-h") == 0)
 usage(0);
 
- while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:Ma:r:sI:")) != -1) {
+ while ((opt = getopt(argc, argv, "d:i:t:u:n:m:o:qfcC:U:Ma:r:sI:p:")) != -1) {
 switch (opt) {
 case 'd' :
 DB_filenames.push_back(optarg);
@@ -631,6 +632,9 @@ void parse_command_line(int argc, char **argv) {
 omp_set_num_threads(Num_threads);
 #endif
 break;
+ case 'p' :
+ ReadCounts::HLL_PRECISION() = stoi(optarg);
+ break;
 case 'q' :
 Quick_mode = true;
 break;
@@ -693,7 +697,7 @@ void parse_command_line(int argc, char **argv) {
 cerr << "Missing mandatory option -i" << endl;
 usage();
 }
- if (optind == argc) {
+ if (optind == argc && !Populate_memory) {
 cerr << "No sequence data files specified" << endl;
 }
 }
@@ -708,6 +712,7 @@ void usage(int exit_code) {
 << " -r filename Output file for Kraken report output" << endl
 << " -a filename TaxDB" << endl
 << " -I filename UID to TaxId map" << endl
+ << " -p # Precision for unique k-mer counting, between 10 and 18" << endl
 << " -t # Number of threads" << endl
 << " -u # Thread work unit size (in bp)" << endl
 << " -q Quick operation" << endl
diff --git a/src/readcounts.hpp b/src/readcounts.hpp
index 74a52a6..c46d3e0 100644
--- a/src/readcounts.hpp
+++ b/src/readcounts.hpp
@@ -28,8 +28,12 @@ namespace kraken {
 uint64_t n_reads;
 uint64_t n_kmers;
 HyperLogLogPlusMinus kmers; // unique k-mer count per taxon
+ // Default 12, settable at runtime via -p. A function-local static keeps
+ // this header-only; a non-const static data member would also need an
+ // out-of-line definition in a .cpp file.
+ static size_t& HLL_PRECISION() { static size_t p = 12; return p; }
 
- ReadCounts() : n_reads(0), n_kmers(0) { }
- ReadCounts(size_t precision) : kmers(HyperLogLogPlusMinus(precision)) { }
+ ReadCounts() : n_reads(0), n_kmers(0), kmers(HyperLogLogPlusMinus(HLL_PRECISION())) { }
+ ReadCounts(size_t precision) : n_reads(0), n_kmers(0), kmers(HyperLogLogPlusMinus(precision)) { }