Skip to content

Commit

Permalink
indicate colour breakpoints to support monotig graphs with sshash
Browse files Browse the repository at this point in the history
  • Loading branch information
hmusta committed Oct 8, 2024
1 parent c842ee8 commit 8fddd96
Show file tree
Hide file tree
Showing 17 changed files with 335 additions and 41 deletions.
6 changes: 5 additions & 1 deletion metagraph/src/cli/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,12 +251,16 @@ int build_graph(Config *config) {
}

} else if (config->graph_type == Config::GraphType::SSHASH && !config->dynamic) {
graph.reset(new DBGSSHash(files.at(0), config->k, config->graph_mode, config->num_chars));
if (files.size() > 1) {
logger->error("DBGSSHash does not support multiple input files.");
exit(1);
}

graph.reset(new DBGSSHash(files.at(0),
config->k,
config->graph_mode,
config->num_chars,
config->is_monochromatic));
} else {
//slower method
switch (config->graph_type) {
Expand Down
3 changes: 3 additions & 0 deletions metagraph/src/cli/config/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ Config::Config(int argc, char *argv[]) {
dynamic = true;
} else if (!strcmp(argv[i], "--mask-dummy")) {
mark_dummy_kmers = true;
} else if (!strcmp(argv[i], "--is-monochromatic")) {
is_monochromatic = true;
} else if (!strcmp(argv[i], "--anno-filename")) {
filename_anno = true;
} else if (!strcmp(argv[i], "--anno-header")) {
Expand Down Expand Up @@ -972,6 +974,7 @@ if (advanced) {
fprintf(stderr, "\t --mode \t\tk-mer indexing mode: basic / canonical / primary [basic]\n");
#endif
fprintf(stderr, "\t --complete \t\tconstruct a complete graph (only for Bitmap graph) [off]\n");
fprintf(stderr, "\t --is-monochromatic \t\tindicate that the input sequences are monochromatic (i.e., their colouring is constant) [off]\n");
fprintf(stderr, "\t --mem-cap-gb [INT] \tpreallocated buffer size in GB [1]\n");
if (advanced) {
fprintf(stderr, "\t --dynamic \t\tuse dynamic build method [off]\n");
Expand Down
1 change: 1 addition & 0 deletions metagraph/src/cli/config/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class Config {
bool complete = false;
bool dynamic = false;
bool mark_dummy_kmers = false;
bool is_monochromatic = false;
bool filename_anno = false;
bool annotate_sequence_headers = false;
bool to_adj_list = false;
Expand Down
5 changes: 3 additions & 2 deletions metagraph/src/cli/query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -955,8 +955,9 @@ construct_query_graph(const AnnotatedDBG &anno_graph,
#pragma omp parallel for num_threads(num_threads)
for (size_t i = 0; i < contigs.size(); ++i) {
contigs[i].second.reserve(contigs[i].first.length() - graph_init->get_k() + 1);
full_dbg.map_to_nodes(contigs[i].first,
[&](node_index node) { contigs[i].second.push_back(node); });
call_annotated_nodes_offsets(full_dbg, contigs[i].first, [&](node_index node, int64_t) {
contigs[i].second.push_back(node);
});
}
logger->trace("[Query graph construction] Contigs mapped to the full graph in {} sec",
timer.elapsed());
Expand Down
1 change: 1 addition & 0 deletions metagraph/src/graph/alignment/annotation_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ void AnnotationBuffer::fetch_queued_annotations() {

for (const auto &path : queued_paths_) {
std::vector<node_index> base_path;
std::vector<int64_t> base_path_offsets;
if (base_graph->get_mode() == DeBruijnGraph::CANONICAL) {
// TODO: avoid this call of spell_path
std::string query = spell_path(graph_, path);
Expand Down
101 changes: 78 additions & 23 deletions metagraph/src/graph/annotated_dbg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "annotation/representation/row_compressed/annotate_row_compressed.hpp"
#include "annotation/int_matrix/base/int_matrix.hpp"
#include "graph/representation/canonical_dbg.hpp"
#include "graph/representation/hash/dbg_sshash.hpp"
#include "common/aligned_vector.hpp"
#include "common/vectors/vector_algorithm.hpp"
#include "common/vector_map.hpp"
Expand All @@ -23,6 +24,21 @@ using Column = mtg::annot::matrix::BinaryMatrix::Column;
typedef AnnotatedDBG::Label Label;
typedef std::pair<Label, size_t> StringCountPair;

void call_annotated_nodes_offsets(const SequenceGraph &graph,
std::string_view sequence,
const std::function<void(SequenceGraph::node_index, int64_t)> &callback) {
if (const auto *sshash = dynamic_cast<const DBGSSHash*>(&graph)) {
if (sshash->is_monochromatic()) {
sshash->map_to_contigs_with_rc(sequence, [&](SequenceGraph::node_index i, int64_t offset, bool) {
callback(i, offset);
});
return;
}
}

graph.map_to_nodes(sequence, [&](SequenceGraph::node_index i) { callback(i, 0); });
}


AnnotatedSequenceGraph
::AnnotatedSequenceGraph(std::shared_ptr<SequenceGraph> graph,
Expand All @@ -46,10 +62,9 @@ ::annotate_sequence(std::string_view sequence,

std::vector<row_index> indices;
indices.reserve(sequence.size());

graph_->map_to_nodes(sequence, [&](node_index i) {
call_annotated_nodes_offsets(*graph_, sequence, [&](node_index i, int64_t) {
if (i > 0)
indices.push_back(graph_to_anno_index(i));
indices.emplace_back(graph_to_anno_index(i));
});

if (!indices.size())
Expand Down Expand Up @@ -80,9 +95,9 @@ ::annotate_sequences(const std::vector<std::pair<std::string, std::vector<Label>
if (!indices.capacity())
indices.reserve(data[t].first.size());

graph_->map_to_nodes(data[t].first, [&](node_index i) {
call_annotated_nodes_offsets(*graph_, data[t].first, [&](node_index i, int64_t) {
if (i > 0)
indices.push_back(graph_to_anno_index(i));
indices.emplace_back(graph_to_anno_index(i));
});
}

Expand Down Expand Up @@ -117,10 +132,10 @@ void AnnotatedDBG::add_kmer_counts(std::string_view sequence,
indices.reserve(sequence.size() - dbg_.get_k() + 1);
size_t end = 0;

graph_->map_to_nodes(sequence, [&](node_index i) {
call_annotated_nodes_offsets(dbg_, sequence, [&](node_index i, int64_t) {
// only insert indexes for matched k-mers and shift counts accordingly
if (i > 0) {
indices.push_back(graph_to_anno_index(i));
indices.emplace_back(graph_to_anno_index(i));
kmer_counts[indices.size() - 1] = kmer_counts[end++];
}
});
Expand All @@ -139,15 +154,22 @@ void AnnotatedDBG::add_kmer_coord(std::string_view sequence,
if (sequence.size() < dbg_.get_k())
return;

std::vector<row_index> indices = map_to_nodes(dbg_, sequence);
std::vector<row_index> indices;
std::vector<int64_t> offsets;
call_annotated_nodes_offsets(dbg_, sequence, [&](node_index i, int64_t offset) {
indices.emplace_back(i);
offsets.emplace_back(offset);
});

std::lock_guard<std::mutex> lock(mutex_);

auto it = offsets.begin();
for (node_index i : indices) {
// only insert coordinates for matched k-mers and increment the coordinates
if (i > 0)
annotator_->add_label_coord(graph_to_anno_index(i), labels, coord);
annotator_->add_label_coord(graph_to_anno_index(i), labels, coord - *it);
coord++;
++it;
}
}

Expand All @@ -156,10 +178,17 @@ void AnnotatedDBG::add_kmer_coords(
assert(check_compatibility());

std::vector<std::vector<row_index>> ids;
std::vector<std::vector<int64_t>> offsets;
ids.reserve(data.size());
for (const auto &[sequence, labels, _] : data) {
if (sequence.size() >= dbg_.get_k())
ids.push_back(map_to_nodes(dbg_, sequence));
if (sequence.size() >= dbg_.get_k()) {
auto &id = ids.emplace_back();
auto &offset = offsets.emplace_back();
call_annotated_nodes_offsets(dbg_, sequence, [&](node_index i, int64_t o) {
id.emplace_back(i);
offset.emplace_back(o);
});
}
}

std::lock_guard<std::mutex> lock(mutex_);
Expand All @@ -168,11 +197,13 @@ void AnnotatedDBG::add_kmer_coords(
const auto &labels = std::get<1>(data[t]);
uint64_t coord = std::get<2>(data[t]);

auto it = offsets[t].begin();
for (node_index i : ids[t]) {
// only insert coordinates for matched k-mers and increment the coordinates
if (i > 0)
annotator_->add_label_coord(graph_to_anno_index(i), labels, coord);
annotator_->add_label_coord(graph_to_anno_index(i), labels, coord - *it);
coord++;
++it;
}
}
}
Expand Down Expand Up @@ -200,10 +231,10 @@ void AnnotatedDBG::annotate_kmer_coords(
coords[last].reserve(sequence.size() - dbg_.get_k() + 1);
}

graph_->map_to_nodes(sequence, [&](node_index i) {
call_annotated_nodes_offsets(*graph_, sequence, [&](node_index i, int64_t o) {
if (i > 0) {
ids[last].push_back(graph_to_anno_index(i));
coords[last].emplace_back(graph_to_anno_index(i), coord);
coords[last].emplace_back(graph_to_anno_index(i), coord - o);
}
coord++;
});
Expand Down Expand Up @@ -238,7 +269,7 @@ std::vector<Label> AnnotatedDBG::get_labels(std::string_view sequence,
size_t num_present_kmers = 0;
size_t num_missing_kmers = 0;

graph_->map_to_nodes(sequence, [&](node_index i) {
call_annotated_nodes_offsets(*graph_, sequence, [&](node_index i, int64_t) {
if (i > 0) {
index_counts[graph_to_anno_index(i)]++;
num_present_kmers++;
Expand Down Expand Up @@ -312,7 +343,7 @@ AnnotatedDBG::get_top_labels(std::string_view sequence,

size_t num_present_kmers = 0;

graph_->map_to_nodes(sequence, [&](node_index i) {
call_annotated_nodes_offsets(*graph_, sequence, [&](node_index i, int64_t) {
if (i > 0) {
index_counts[graph_to_anno_index(i)]++;
num_present_kmers++;
Expand Down Expand Up @@ -342,7 +373,13 @@ AnnotatedDBG::get_kmer_counts(std::string_view sequence,
size_t num_top_labels,
double discovery_fraction,
double presence_fraction) const {
std::vector<node_index> nodes = map_to_nodes(dbg_, sequence);
std::vector<node_index> nodes;
if (sequence.size() >= dbg_.get_k()) {
nodes.reserve(sequence.size() - dbg_.get_k() + 1);
call_annotated_nodes_offsets(*graph_, sequence, [&](node_index i, int64_t) {
nodes.emplace_back(i);
});
}
return get_kmer_counts(nodes, num_top_labels, discovery_fraction, presence_fraction);
}

Expand Down Expand Up @@ -454,19 +491,30 @@ AnnotatedDBG::get_kmer_coordinates(std::string_view sequence,
size_t num_top_labels,
double discovery_fraction,
double presence_fraction) const {
std::vector<node_index> nodes = map_to_nodes(dbg_, sequence);
return get_kmer_coordinates(nodes, num_top_labels, discovery_fraction, presence_fraction);
std::vector<node_index> nodes;
std::vector<int64_t> offsets;
if (sequence.size() >= dbg_.get_k()) {
nodes.reserve(sequence.size() - dbg_.get_k() + 1);
offsets.reserve(sequence.size() - dbg_.get_k() + 1);
call_annotated_nodes_offsets(*graph_, sequence, [&](node_index i, int64_t o) {
nodes.emplace_back(i);
offsets.emplace_back(o);
});
}
return get_kmer_coordinates(nodes, num_top_labels, discovery_fraction, presence_fraction, offsets);
}

std::vector<std::tuple<Label, size_t, std::vector<SmallVector<uint64_t>>>>
AnnotatedDBG::get_kmer_coordinates(const std::vector<node_index> &nodes,
size_t num_top_labels,
double discovery_fraction,
double presence_fraction) const {
double presence_fraction,
const std::vector<int64_t> &offsets) const {
assert(discovery_fraction >= 0.);
assert(discovery_fraction <= 1.);
assert(presence_fraction >= 0.);
assert(presence_fraction <= 1.);
assert(offsets.empty() || offsets.size() == nodes.size());
assert(check_compatibility());

if (!nodes.size())
Expand Down Expand Up @@ -528,8 +576,15 @@ AnnotatedDBG::get_kmer_coordinates(const std::vector<node_index> &nodes,
for (size_t i = 0; i < rows_tuples.size(); ++i) {
// set the non-empty tuples
for (auto &[j, tuple] : rows_tuples[i]) {
if (col_counts[j])
std::get<2>(result[col_counts[j] - 1])[kmer_positions[i]] = std::move(tuple);
if (col_counts[j]) {
auto &coords = std::get<2>(result[col_counts[j] - 1])[kmer_positions[i]];
coords = std::move(tuple);
if (offsets.size()) {
for (auto &c : coords) {
c += offsets[kmer_positions[i]];
}
}
}
}
}

Expand Down Expand Up @@ -574,7 +629,7 @@ AnnotatedDBG::get_top_label_signatures(std::string_view sequence,
kmer_positions.reserve(num_kmers);

size_t j = 0;
graph_->map_to_nodes(sequence, [&](node_index i) {
call_annotated_nodes_offsets(*graph_, sequence, [&](node_index i, int64_t o) {
if (i > 0) {
kmer_positions.push_back(j);
row_indices.push_back(graph_to_anno_index(i));
Expand Down
7 changes: 6 additions & 1 deletion metagraph/src/graph/annotated_dbg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,8 @@ class AnnotatedDBG : public AnnotatedSequenceGraph {
get_kmer_coordinates(const std::vector<node_index> &nodes,
size_t num_top_labels,
double discovery_fraction,
double presence_fraction) const;
double presence_fraction,
const std::vector<int64_t> &offsets = {}) const;

std::vector<std::pair<Label, sdsl::bit_vector>>
get_top_label_signatures(std::string_view sequence,
Expand All @@ -165,6 +166,10 @@ class AnnotatedDBG : public AnnotatedSequenceGraph {
DeBruijnGraph &dbg_;
};

void call_annotated_nodes_offsets(const SequenceGraph &graph,
std::string_view sequence,
const std::function<void(SequenceGraph::node_index, int64_t)> &callback);

} // namespace graph
} // namespace mtg

Expand Down
2 changes: 2 additions & 0 deletions metagraph/src/graph/representation/base/sequence_graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,8 @@ class DeBruijnGraph : public SequenceGraph {

// Call all nodes that have no incoming edges
virtual void call_source_nodes(const std::function<void(node_index)> &callback) const;

virtual bool is_monochromatic() const { return false; }
};


Expand Down
2 changes: 2 additions & 0 deletions metagraph/src/graph/representation/canonical_dbg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ class CanonicalDBG : public DBGWrapper<DeBruijnGraph> {

virtual bool operator==(const DeBruijnGraph &other) const override final;

virtual bool is_monochromatic() const override final { return graph_->is_monochromatic(); }

private:
const size_t cache_size_;

Expand Down
Loading

0 comments on commit 8fddd96

Please sign in to comment.