Skip to content

Commit

Permalink
sparse: accelerate the writing of index files by using the inverted index
Browse files Browse the repository at this point in the history
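Constructing the index file with the DAAT (document-at-a-time) cursor
method is too slow, because every dimension's cursor is checked for each row.
Switch to the TAAT (term-at-a-time) traversal method instead, which walks
each dimension's inverted list once and appends its entries to the matching rows.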

Signed-off-by: Shawn Wang <[email protected]>
  • Loading branch information
sparknack committed Jan 8, 2025
1 parent 8380a96 commit 37b8c47
Showing 1 changed file with 25 additions and 13 deletions.
38 changes: 25 additions & 13 deletions src/index/sparse/sparse_inverted_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,25 +181,37 @@ class InvertedIndex : public BaseInvertedIndex<DType> {
}

auto dim_map_reverse = std::unordered_map<uint32_t, table_t>();
for (auto dim_it = dim_map_.begin(); dim_it != dim_map_.end(); ++dim_it) {
dim_map_reverse[dim_it->second] = dim_it->first;
for (const auto& [dim, idx] : dim_map_) {
dim_map_reverse[idx] = dim;
}

for (table_t vec_id = 0; vec_id < n_rows_internal_; ++vec_id) {
std::vector<std::pair<table_t, DType>> vec_row;
for (size_t i = 0; i < inverted_index_ids_.size(); ++i) {
if (cursors[i].cur_vec_id_ == vec_id) {
vec_row.emplace_back(dim_map_reverse[i], cursors[i].cur_vec_val());
cursors[i].next();
}
std::vector<size_t> row_sizes(n_rows_internal_, 0);
for (size_t i = 0; i < inverted_index_ids_.size(); ++i) {
for (const auto& id : inverted_index_ids_[i]) {
row_sizes[id]++;
}
}

std::vector<std::vector<std::pair<table_t, DType>>> vec_rows(n_rows_internal_);
for (size_t i = 0; i < n_rows_internal_; ++i) {
vec_rows[i].reserve(row_sizes[i]);
}

for (size_t i = 0; i < inverted_index_ids_.size(); ++i) {
const auto& ids = inverted_index_ids_[i];
const auto& vals = inverted_index_vals_[i];
const auto dim = dim_map_reverse[i];
for (size_t j = 0; j < ids.size(); ++j) {
vec_rows[ids[j]].emplace_back(dim, vals[j]);
}
}

SparseRow<DType> raw_row(vec_row);
for (table_t vec_id = 0; vec_id < n_rows_internal_; ++vec_id) {
SparseRow<DType> raw_row(vec_rows[vec_id]);
writeBinaryPOD(writer, raw_row.size());
if (raw_row.size() == 0) {
continue;
if (raw_row.size() > 0) {
writer.write(raw_row.data(), raw_row.size() * SparseRow<DType>::element_size());
}
writer.write(raw_row.data(), raw_row.size() * SparseRow<DType>::element_size());
}

return Status::success;
Expand Down

0 comments on commit 37b8c47

Please sign in to comment.