Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for queries with invalid characters #502

Merged
merged 11 commits into from
Oct 8, 2024
2 changes: 1 addition & 1 deletion metagraph/src/graph/representation/canonical_dbg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ ::map_to_nodes_sequentially(std::string_view sequence,
path.reserve(sequence.size() - get_k() + 1);

if (const auto sshash = std::dynamic_pointer_cast<const DBGSSHash>(graph_)) {
sshash->map_to_nodes_with_rc<>(sequence, [&](node_index node, bool orientation) {
sshash->map_to_nodes_with_rc<true>(sequence, [&](node_index node, bool orientation) {
adamant-pwn marked this conversation as resolved.
Show resolved Hide resolved
callback(node && orientation ? reverse_complement(node) : node);
}, terminate);
return;
Expand Down
46 changes: 35 additions & 11 deletions metagraph/src/graph/representation/hash/dbg_sshash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "common/seq_tools/reverse_complement.hpp"
#include "common/threads/threading.hpp"
#include "common/logger.hpp"
#include "common/algorithms.hpp"
#include "kmer/kmer_extractor.hpp"


Expand Down Expand Up @@ -99,6 +100,36 @@ void DBGSSHash::add_sequence(std::string_view sequence,
throw std::logic_error("adding sequences not supported");
}

void DBGSSHash
::map_to_nodes_with_rc_advanced(std::string_view sequence,
const std::function<void(sshash::lookup_result)>& callback,
bool with_rc,
const std::function<bool()>& terminate) const {
if (terminate() || sequence.size() < k_)
return;

std::visit([&](const auto &dict) {
using kmer_t = get_kmer_t<decltype(dict)>;

std::vector<char> seq_encoded;
seq_encoded.reserve(sequence.size());
for (size_t i = 0; i < sequence.size(); ++i) {
seq_encoded.emplace_back(!kmer_t::is_valid(sequence[i]));
}

auto invalid = utils::drag_and_mark_segments(seq_encoded, 1, k_);

kmer_t uint_kmer = sshash::util::string_to_uint_kmer<kmer_t>(sequence.data(), k_ - 1);
uint_kmer.pad_char();
for (size_t i = k_ - 1; i < sequence.size() && !terminate(); ++i) {
uint_kmer.drop_char();
uint_kmer.kth_char_or(k_ - 1, kmer_t::char_to_uint(sequence[i]));
callback(!invalid[i] ? dict.lookup_advanced_uint(uint_kmer, with_rc)
: sshash::lookup_result());
hmusta marked this conversation as resolved.
Show resolved Hide resolved
}
}, dict_);
}

template <bool with_rc>
void DBGSSHash::map_to_nodes_with_rc(std::string_view sequence,
const std::function<void(node_index, bool)>& callback,
Expand All @@ -113,18 +144,11 @@ void DBGSSHash::map_to_nodes_with_rc(std::string_view sequence,
return;
}

std::visit([&](const auto &dict) {
using kmer_t = get_kmer_t<decltype(dict)>;
kmer_t uint_kmer = sshash::util::string_to_uint_kmer<kmer_t>(sequence.data(), k_ - 1);
uint_kmer.pad_char();
for (size_t i = k_ - 1; i < sequence.size() && !terminate(); ++i) {
uint_kmer.drop_char();
uint_kmer.kth_char_or(k_ - 1, kmer_t::char_to_uint(sequence[i]));
auto res = dict.lookup_advanced_uint(uint_kmer, with_rc);
callback(sshash_to_graph_index(res.kmer_id), res.kmer_orientation);
}
}, dict_);
map_to_nodes_with_rc_advanced(sequence, [&](sshash::lookup_result res) {
callback(sshash_to_graph_index(res.kmer_id), res.kmer_orientation);
}, with_rc, terminate);
}

template
void DBGSSHash::map_to_nodes_with_rc<true>(std::string_view,
const std::function<void(node_index, bool)>&,
Expand Down
6 changes: 6 additions & 0 deletions metagraph/src/graph/representation/hash/dbg_sshash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,12 @@ class DBGSSHash : public DeBruijnGraph {
size_t num_nodes_;
Mode mode_;

void map_to_nodes_with_rc_advanced(
adamant-pwn marked this conversation as resolved.
Show resolved Hide resolved
std::string_view sequence,
const std::function<void(sshash::lookup_result)>& callback,
bool with_rc,
const std::function<bool()>& terminate = []() { return false; }) const;

size_t dict_size() const;
};

Expand Down
7 changes: 3 additions & 4 deletions metagraph/tests/annotation/test_annotated_dbg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,12 @@
#include "gtest/gtest.h"

#include "../test_helpers.hpp"
#include "../graph/all/test_dbg_helpers.hpp"

#include "common/threads/threading.hpp"
#include "common/vectors/bit_vector_dyn.hpp"
#include "common/vectors/vector_algorithm.hpp"
#include "annotation/representation/column_compressed/annotate_column_compressed.hpp"
#include "graph/representation/bitmap/dbg_bitmap.hpp"
#include "graph/representation/hash/dbg_hash_string.hpp"
#include "graph/representation/hash/dbg_hash_ordered.hpp"
#include "graph/representation/hash/dbg_hash_fast.hpp"

#define protected public
#define private public
Expand Down Expand Up @@ -987,6 +984,7 @@ typedef ::testing::Types<std::pair<DBGBitmap, annot::ColumnCompressed<>>,
std::pair<DBGHashOrdered, annot::ColumnCompressed<>>,
std::pair<DBGHashFast, annot::ColumnCompressed<>>,
std::pair<DBGSuccinct, annot::ColumnCompressed<>>,
std::pair<DBGSSHash, annot::ColumnCompressed<>>,
std::pair<DBGBitmap, annot::RowFlatAnnotator>,
std::pair<DBGHashString, annot::RowFlatAnnotator>,
std::pair<DBGHashOrdered, annot::RowFlatAnnotator>,
Expand Down Expand Up @@ -1016,6 +1014,7 @@ class AnnotatedDBGNoNTest : public ::testing::Test {};
typedef ::testing::Types<std::pair<DBGBitmap, annot::ColumnCompressed<>>,
std::pair<DBGHashOrdered, annot::ColumnCompressed<>>,
std::pair<DBGHashFast, annot::ColumnCompressed<>>,
std::pair<DBGSSHash, annot::ColumnCompressed<>>,
std::pair<DBGBitmap, annot::RowFlatAnnotator>,
std::pair<DBGHashOrdered, annot::RowFlatAnnotator>,
std::pair<DBGHashFast, annot::RowFlatAnnotator>,
Expand Down
1 change: 1 addition & 0 deletions metagraph/tests/annotation/test_annotated_dbg_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ template std::unique_ptr<AnnotatedDBG> build_anno_graph<DBGBitmap, ColumnCompres
template std::unique_ptr<AnnotatedDBG> build_anno_graph<DBGHashOrdered, ColumnCompressed<>>(uint64_t, const std::vector<std::string> &, const std::vector<std::string>&, DeBruijnGraph::Mode, bool);
template std::unique_ptr<AnnotatedDBG> build_anno_graph<DBGHashFast, ColumnCompressed<>>(uint64_t, const std::vector<std::string> &, const std::vector<std::string>&, DeBruijnGraph::Mode, bool);
template std::unique_ptr<AnnotatedDBG> build_anno_graph<DBGHashString, ColumnCompressed<>>(uint64_t, const std::vector<std::string> &, const std::vector<std::string>&, DeBruijnGraph::Mode, bool);
template std::unique_ptr<AnnotatedDBG> build_anno_graph<DBGSSHash, ColumnCompressed<>>(uint64_t, const std::vector<std::string> &, const std::vector<std::string>&, DeBruijnGraph::Mode, bool);

template std::unique_ptr<AnnotatedDBG> build_anno_graph<DBGSuccinct, RowFlatAnnotator>(uint64_t, const std::vector<std::string> &, const std::vector<std::string>&, DeBruijnGraph::Mode, bool);
template std::unique_ptr<AnnotatedDBG> build_anno_graph<DBGBitmap, RowFlatAnnotator>(uint64_t, const std::vector<std::string> &, const std::vector<std::string>&, DeBruijnGraph::Mode, bool);
Expand Down
7 changes: 6 additions & 1 deletion metagraph/tests/graph/all/test_dbg_helpers.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
#include "test_dbg_helpers.hpp"

#include "../../annotation/test_annotated_dbg_helpers.hpp"
#include "annotation/representation/column_compressed/annotate_column_compressed.hpp"

#include "gtest/gtest.h"
#include "graph/annotated_dbg.hpp"
hmusta marked this conversation as resolved.
Show resolved Hide resolved
#include "graph/representation/canonical_dbg.hpp"
#include "graph/representation/succinct/boss.hpp"
#include "graph/representation/succinct/boss_construct.hpp"
Expand Down Expand Up @@ -146,6 +150,7 @@ void writeFastaFile(const std::vector<std::string>& sequences, const std::string

fastaFile.close();
}

template <>
std::shared_ptr<DeBruijnGraph>
build_graph<DBGSSHash>(uint64_t k,
Expand All @@ -155,7 +160,7 @@ build_graph<DBGSSHash>(uint64_t k,
return std::make_shared<DBGSSHash>(k, mode);

// use DBGHashString to get contigs for SSHash
hmusta marked this conversation as resolved.
Show resolved Hide resolved
auto string_graph = build_graph<DBGHashString>(k, sequences, mode);
auto string_graph = build_graph<DBGHashFast>(k, sequences, mode);

std::vector<std::string> contigs;
size_t num_kmers = 0;
Expand Down
Loading