From 8380a9695c3cdb6af79868cf2dd84a7d3c4a7b54 Mon Sep 17 00:00:00 2001 From: sparknack Date: Tue, 7 Jan 2025 10:38:49 +0800 Subject: [PATCH] memory usage optimization for sparse vector (#1011) * sparse: remove raw data cache in sparse inverted index To reduce memory usage, remove the raw data cache in sparse inverted index. Note that config param `drop_ratio_build` and GetVectorByIds() are also be removed. Signed-off-by: Shawn Wang * sparse: quantize float to uint16_t for BM25 inverted index To reduce the memory usage of the inverted index, when BM25 metric is used, quantize term frequency from float to uint16_t. All values that exceeds the maximum of uint16_t is quantized to the maximum of uint16_t. Signed-off-by: Shawn Wang --------- Signed-off-by: Shawn Wang --- CMakeLists.txt | 4 + include/knowhere/sparse_utils.h | 97 +++- python/setup.py | 1 + src/index/sparse/sparse_index_node.cc | 89 ++-- src/index/sparse/sparse_inverted_index.h | 485 +++++++++--------- .../sparse/sparse_inverted_index_config.h | 1 + tests/ut/test_sparse.cc | 178 +------ tests/ut/utils.h | 29 ++ 8 files changed, 440 insertions(+), 444 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9235e6e92..329c5c0b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,6 +89,9 @@ include(cmake/libs/libhnsw.cmake) include_directories(thirdparty/faiss) +find_package(Boost REQUIRED) +include_directories(${Boost_INCLUDE_DIRS}) + find_package(OpenMP REQUIRED) find_package(folly REQUIRED) @@ -177,6 +180,7 @@ endif() include_directories(src) include_directories(include) +list(APPEND KNOWHERE_LINKER_LIBS Boost::boost) list(APPEND KNOWHERE_LINKER_LIBS faiss) list(APPEND KNOWHERE_LINKER_LIBS glog::glog) list(APPEND KNOWHERE_LINKER_LIBS nlohmann_json::nlohmann_json) diff --git a/include/knowhere/sparse_utils.h b/include/knowhere/sparse_utils.h index ba582a5a2..be069c559 100644 --- a/include/knowhere/sparse_utils.h +++ b/include/knowhere/sparse_utils.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -59,6 +60,33 @@ GetDocValueBM25Computer(float k1, float b, float avgdl) { }; } +// A docid filter that tests whether a given id is in the list of docids, which is regarded as another form of BitSet. +// Note that all ids to be tested must be tested exactly once and in order. +class DocIdFilterByVector { + public: + DocIdFilterByVector(std::vector&& docids) : docids_(std::move(docids)) { + std::sort(docids_.begin(), docids_.end()); + } + + [[nodiscard]] bool + test(const table_t id) { + // find the first id that is greater than or equal to the specific id + while (pos_ < docids_.size() && docids_[pos_] < id) { + ++pos_; + } + return !(pos_ < docids_.size() && docids_[pos_] == id); + } + + [[nodiscard]] bool + empty() const { + return docids_.empty(); + } + + private: + std::vector docids_; + size_t pos_ = 0; +}; + template class SparseRow { static_assert(std::is_same_v, "SparseRow supports float only"); @@ -72,6 +100,15 @@ class SparseRow { SparseRow(size_t count, uint8_t* data, bool own_data) : data_(data), count_(count), own_data_(own_data) { } + SparseRow(const std::vector>& data) : count_(data.size()), own_data_(true) { + data_ = new uint8_t[count_ * element_size()]; + for (size_t i = 0; i < count_; ++i) { + auto* elem = reinterpret_cast(data_) + i; + elem->index = data[i].first; + elem->value = data[i].second; + } + } + // copy constructor and copy assignment operator perform deep copy SparseRow(const SparseRow& other) : SparseRow(other.count_) { std::memcpy(data_, other.data_, data_byte_size()); @@ -147,6 +184,9 @@ class SparseRow { void set_at(size_t i, table_t index, T value) { + if (i >= count_) { + throw std::out_of_range("set_at on a SparseRow with invalid index"); + } auto* elem = reinterpret_cast(data_) + i; elem->index = index; elem->value = value; @@ -300,12 +340,12 @@ class GrowableVectorView { mmap_element_count_ = 0; } - size_type + [[nodiscard]] size_type capacity() const { return mmap_byte_size_ / sizeof(T); } - size_type + [[nodiscard]] size_type size() const { return mmap_element_count_; } @@ -346,6 +386,59 @@ class GrowableVectorView { return reinterpret_cast(mmap_data_)[i]; } + class iterator : public boost::iterator_facade { + public: + iterator() = default; + explicit iterator(T* ptr) : ptr_(ptr) { + } + + friend class GrowableVectorView; + friend class boost::iterator_core_access; + + T& + dereference() const { + return *ptr_; + } + + void + increment() { + ++ptr_; + } + + void + decrement() { + --ptr_; + } + + void + advance(std::ptrdiff_t n) { + ptr_ += n; + } + + std::ptrdiff_t + distance_to(const iterator& other) const { + return other.ptr_ - ptr_; + } + + bool + equal(const iterator& other) const { + return ptr_ == other.ptr_; + } + + private: + T* ptr_ = nullptr; + }; + + iterator + begin() const { + return iterator(reinterpret_cast(mmap_data_)); + } + + iterator + end() const { + return iterator(reinterpret_cast(mmap_data_) + mmap_element_count_); + } + private: void* mmap_data_ = nullptr; size_type mmap_byte_size_ = 0; diff --git a/python/setup.py b/python/setup.py index 8893901fe..8a0f32140 100644 --- a/python/setup.py +++ b/python/setup.py @@ -51,6 +51,7 @@ def get_readme(): get_numpy_include(), os.path.join("..", "include"), os.path.join("..", "thirdparty"), + get_thirdparty_prefix("boost-headers") + "/include", get_thirdparty_prefix("nlohmann_json") + "/include", get_thirdparty_prefix("libglog") + "/include", get_thirdparty_prefix("gflags") + "/include" diff --git a/src/index/sparse/sparse_index_node.cc b/src/index/sparse/sparse_index_node.cc index fa5897b3d..b3bccf3a7 100644 --- a/src/index/sparse/sparse_index_node.cc +++ b/src/index/sparse/sparse_index_node.cc @@ -19,7 +19,6 @@ #include "knowhere/config.h" #include "knowhere/dataset.h" #include "knowhere/expected.h" -#include "knowhere/feature.h" #include "knowhere/index/index_factory.h" #include "knowhere/index/index_node.h" #include "knowhere/log.h" @@ -54,14 +53,12 @@ class SparseInvertedIndexNode : public IndexNode { LOG_KNOWHERE_ERROR_ << Type() << " only support metric_type IP or BM25"; return Status::invalid_metric_type; } - auto drop_ratio_build = cfg.drop_ratio_build.value_or(0.0f); auto index_or = CreateIndex(cfg); if (!index_or.has_value()) { return index_or.error(); } auto index = index_or.value(); - index->Train(static_cast*>(dataset->GetTensor()), dataset->GetRows(), - drop_ratio_build); + index->Train(static_cast*>(dataset->GetTensor()), dataset->GetRows()); if (index_ != nullptr) { LOG_KNOWHERE_WARNING_ << Type() << " has already been created, deleting old"; DeleteExistingIndex(); @@ -127,7 +124,7 @@ class SparseInvertedIndexNode : public IndexNode { public: RefineIterator(const sparse::BaseInvertedIndex* index, sparse::SparseRow&& query, std::shared_ptr precomputed_it, - const sparse::DocValueComputer& computer, bool use_knowhere_search_pool = true, + const sparse::DocValueComputer& computer, bool use_knowhere_search_pool = true, const float refine_ratio = 0.5f) : IndexIterator(true, use_knowhere_search_pool, refine_ratio), index_(index), @@ -158,7 +155,7 @@ class SparseInvertedIndexNode : public IndexNode { private: const sparse::BaseInvertedIndex* index_; sparse::SparseRow query_; - const sparse::DocValueComputer computer_; + const sparse::DocValueComputer computer_; std::shared_ptr precomputed_it_; bool first_return_ = true; }; @@ -185,7 +182,7 @@ class SparseInvertedIndexNode : public IndexNode { auto computer = computer_or.value(); auto drop_ratio_search = cfg.drop_ratio_search.value_or(0.0f); - const bool approximated = index_->IsApproximated() || drop_ratio_search > 0; + const bool approximated = drop_ratio_search > 0; auto vec = std::vector>(nq, nullptr); try { @@ -228,37 +225,17 @@ class SparseInvertedIndexNode : public IndexNode { [[nodiscard]] expected GetVectorByIds(const DataSetPtr dataset) const override { - if (!index_) { - return expected::Err(Status::empty_index, "index not loaded"); - } - - auto rows = dataset->GetRows(); - auto ids = dataset->GetIds(); - - auto data = std::make_unique[]>(rows); - int64_t dim = 0; - try { - for (int64_t i = 0; i < rows; ++i) { - auto& target = data[i]; - index_->GetVectorById(ids[i], target); - dim = std::max(dim, target.dim()); - } - } catch (std::exception& e) { - return expected::Err(Status::invalid_args, "GetVectorByIds failed"); - } - auto res = GenResultDataSet(rows, dim, data.release()); - res->SetIsSparse(true); - return res; + return expected::Err(Status::not_implemented, "GetVectorByIds not implemented"); } [[nodiscard]] bool HasRawData(const std::string& metric_type) const override { - return true; + return false; } [[nodiscard]] expected GetIndexMeta(std::unique_ptr cfg) const override { - throw std::runtime_error("GetIndexMeta not supported for current index type"); + return expected::Err(Status::not_implemented, "GetIndexMeta not supported for current index type"); } Status @@ -292,7 +269,7 @@ class SparseInvertedIndexNode : public IndexNode { return index_or.error(); } index_ = index_or.value(); - return index_->Load(reader); + return index_->Load(reader, 0, ""); } Status @@ -301,29 +278,36 @@ class SparseInvertedIndexNode : public IndexNode { LOG_KNOWHERE_WARNING_ << Type() << " has already been created, deleting old"; DeleteExistingIndex(); } + auto cfg = static_cast(*config); + auto index_or = CreateIndex(cfg); + if (!index_or.has_value()) { + return index_or.error(); + } + index_ = index_or.value(); + auto reader = knowhere::FileReader(filename); - map_size_ = reader.size(); + size_t map_size = reader.size(); int map_flags = MAP_SHARED; #ifdef MAP_POPULATE if (cfg.enable_mmap_pop.has_value() && cfg.enable_mmap_pop.value()) { map_flags |= MAP_POPULATE; } #endif - map_ = static_cast(mmap(nullptr, map_size_, PROT_READ, map_flags, reader.descriptor(), 0)); - if (map_ == MAP_FAILED) { - LOG_KNOWHERE_ERROR_ << "Failed to mmap file: " << strerror(errno); + void* mapped_memory = mmap(nullptr, map_size, PROT_READ, map_flags, reader.descriptor(), 0); + if (mapped_memory == MAP_FAILED) { + LOG_KNOWHERE_ERROR_ << "Failed to mmap file " << filename << ": " << strerror(errno); return Status::disk_file_error; } - if (madvise(map_, map_size_, MADV_RANDOM) != 0) { - LOG_KNOWHERE_WARNING_ << "Failed to madvise file: " << strerror(errno); - } - auto index_or = CreateIndex(cfg); - if (!index_or.has_value()) { - return index_or.error(); - } - index_ = index_or.value(); - MemoryIOReader map_reader((uint8_t*)map_, map_size_); + + auto cleanup_mmap = [map_size, filename](void* map_addr) { + if (munmap(map_addr, map_size) != 0) { + LOG_KNOWHERE_ERROR_ << "Failed to munmap file " << filename << ": " << strerror(errno); + } + }; + std::unique_ptr mmap_guard(mapped_memory, cleanup_mmap); + + MemoryIOReader map_reader(reinterpret_cast(mapped_memory), map_size); auto supplement_target_filename = filename + ".knowhere_sparse_index_supplement"; return index_->Load(map_reader, map_flags, supplement_target_filename); } @@ -364,7 +348,8 @@ class SparseInvertedIndexNode : public IndexNode { expected*> CreateIndex(const SparseInvertedIndexConfig& cfg) const { if (IsMetricType(cfg.metric_type.value(), metric::BM25)) { - auto idx = new sparse::InvertedIndex(); + // quantize float to uint16_t when BM25 metric type is used. + auto idx = new sparse::InvertedIndex(); if (!cfg.bm25_k1.has_value() || !cfg.bm25_b.has_value() || !cfg.bm25_avgdl.has_value()) { return expected*>::Err( Status::invalid_args, "BM25 parameters k1, b, and avgdl must be set when building/loading"); @@ -376,7 +361,7 @@ class SparseInvertedIndexNode : public IndexNode { idx->SetBM25Params(k1, b, avgdl, max_score_ratio); return idx; } else { - return new sparse::InvertedIndex(); + return new sparse::InvertedIndex(); } } @@ -386,23 +371,11 @@ class SparseInvertedIndexNode : public IndexNode { delete index_; index_ = nullptr; } - if (map_ != nullptr) { - auto res = munmap(map_, map_size_); - if (res != 0) { - LOG_KNOWHERE_ERROR_ << "Failed to munmap when trying to delete index: " << strerror(errno); - } - map_ = nullptr; - map_size_ = 0; - } } sparse::BaseInvertedIndex* index_{}; std::shared_ptr search_pool_; std::shared_ptr build_pool_; - - // if map_ is not nullptr, it means the index is mmapped from disk. - char* map_ = nullptr; - size_t map_size_ = 0; }; // class SparseInvertedIndexNode // Concurrent version of SparseInvertedIndexNode diff --git a/src/index/sparse/sparse_inverted_index.h b/src/index/sparse/sparse_inverted_index.h index 06665894a..611b0dd36 100644 --- a/src/index/sparse/sparse_inverted_index.h +++ b/src/index/sparse/sparse_inverted_index.h @@ -20,13 +20,14 @@ #include #include #include -#include +#include #include #include #include "index/sparse/sparse_inverted_index_config.h" #include "io/memory_io.h" #include "knowhere/bitsetview.h" +#include "knowhere/comp/index_param.h" #include "knowhere/expected.h" #include "knowhere/log.h" #include "knowhere/sparse_utils.h" @@ -44,10 +45,10 @@ class BaseInvertedIndex { // supplement_target_filename: when in mmap mode, we need an extra file to store the mmaped index data structure. // this file will be created during loading and deleted in the destructor. virtual Status - Load(MemoryIOReader& reader, int map_flags = MAP_SHARED, const std::string& supplement_target_filename = "") = 0; + Load(MemoryIOReader& reader, int map_flags, const std::string& supplement_target_filename) = 0; virtual Status - Train(const SparseRow* data, size_t rows, float drop_ratio_build) = 0; + Train(const SparseRow* data, size_t rows) = 0; virtual Status Add(const SparseRow* data, size_t rows, int64_t dim) = 0; @@ -61,17 +62,11 @@ class BaseInvertedIndex { const DocValueComputer& computer) const = 0; virtual float - GetRawDistance(const label_t id, const SparseRow& query, const DocValueComputer& computer) const = 0; - - virtual void - GetVectorById(const label_t id, SparseRow& output) const = 0; + GetRawDistance(const label_t vec_id, const SparseRow& query, const DocValueComputer& computer) const = 0; virtual expected> GetDocValueComputer(const SparseInvertedIndexConfig& cfg) const = 0; - virtual bool - IsApproximated() const = 0; - [[nodiscard]] virtual size_t size() const = 0; @@ -82,8 +77,8 @@ class BaseInvertedIndex { n_cols() const = 0; }; -template -class InvertedIndex : public BaseInvertedIndex { +template +class InvertedIndex : public BaseInvertedIndex { public: explicit InvertedIndex() { } @@ -106,12 +101,15 @@ class InvertedIndex : public BaseInvertedIndex { } } + template + using Vector = std::conditional_t, std::vector>; + void SetBM25Params(float k1, float b, float avgdl, float max_score_ratio) { bm25_params_ = std::make_unique(k1, b, avgdl, max_score_ratio); } - expected> + expected> GetDocValueComputer(const SparseInvertedIndexConfig& cfg) const override { // if metric_type is set in config, it must match with how the index was built. auto metric_type = cfg.metric_type; @@ -119,33 +117,34 @@ class InvertedIndex : public BaseInvertedIndex { if (metric_type.has_value() && !IsMetricType(metric_type.value(), metric::IP)) { auto msg = "metric type not match, expected: " + std::string(metric::IP) + ", got: " + metric_type.value(); - return expected>::Err(Status::invalid_metric_type, msg); + return expected>::Err(Status::invalid_metric_type, msg); } - return GetDocValueOriginalComputer(); + return GetDocValueOriginalComputer(); } if (metric_type.has_value() && !IsMetricType(metric_type.value(), metric::BM25)) { auto msg = "metric type not match, expected: " + std::string(metric::BM25) + ", got: " + metric_type.value(); - return expected>::Err(Status::invalid_metric_type, msg); + return expected>::Err(Status::invalid_metric_type, msg); } // avgdl must be supplied during search if (!cfg.bm25_avgdl.has_value()) { - return expected>::Err(Status::invalid_args, "avgdl must be supplied during searching"); + return expected>::Err(Status::invalid_args, + "avgdl must be supplied during searching"); } auto avgdl = cfg.bm25_avgdl.value(); if constexpr (use_wand) { // wand: search time k1/b must equal load time config. if ((cfg.bm25_k1.has_value() && cfg.bm25_k1.value() != bm25_params_->k1) || ((cfg.bm25_b.has_value() && cfg.bm25_b.value() != bm25_params_->b))) { - return expected>::Err( + return expected>::Err( Status::invalid_args, "search time k1/b must equal load time config for WAND index."); } - return GetDocValueBM25Computer(bm25_params_->k1, bm25_params_->b, avgdl); + return GetDocValueBM25Computer(bm25_params_->k1, bm25_params_->b, avgdl); } else { // inverted index: search time k1/b may override load time config. auto k1 = cfg.bm25_k1.has_value() ? cfg.bm25_k1.value() : bm25_params_->k1; auto b = cfg.bm25_b.has_value() ? cfg.bm25_b.value() : bm25_params_->b; - return GetDocValueBM25Computer(k1, b, avgdl); + return GetDocValueBM25Computer(k1, b, avgdl); } } @@ -156,35 +155,58 @@ class InvertedIndex : public BaseInvertedIndex { * * 1. size_t rows * 2. size_t cols - * 3. T value_threshold_ + * 3. DType value_threshold_ (deprecated) * 4. for each row: * 1. size_t len * 2. for each non-zero value: * 1. table_t idx - * 2. T val + * 2. DType val (when QType is different from DType, the QType value of val is stored as a DType with + * precision loss) * - * inverted_lut_ and max_score_in_dim_ not serialized, they will be - * constructed dynamically during deserialization. + * inverted_index_ids_, inverted_index_vals_ and max_score_in_dim_ are + * not serialized, they will be constructed dynamically during + * deserialization. * * Data are densely packed in serialized bytes and no padding is added. */ - writeBinaryPOD(writer, n_rows_internal()); - writeBinaryPOD(writer, n_cols_internal()); - writeBinaryPOD(writer, value_threshold_); - for (size_t i = 0; i < n_rows_internal(); ++i) { - auto& row = raw_data_[i]; - writeBinaryPOD(writer, row.size()); - if (row.size() == 0) { + DType deprecated_value_threshold = 0; + writeBinaryPOD(writer, n_rows_internal_); + writeBinaryPOD(writer, max_dim_); + writeBinaryPOD(writer, deprecated_value_threshold); + BitsetView bitset(nullptr, 0); + + std::vector> cursors; + for (size_t i = 0; i < inverted_index_ids_.size(); ++i) { + cursors.emplace_back(inverted_index_ids_[i], inverted_index_vals_[i], n_rows_internal_, 0, 0, bitset); + } + + auto dim_map_reverse = std::unordered_map(); + for (auto dim_it = dim_map_.begin(); dim_it != dim_map_.end(); ++dim_it) { + dim_map_reverse[dim_it->second] = dim_it->first; + } + + for (table_t vec_id = 0; vec_id < n_rows_internal_; ++vec_id) { + std::vector> vec_row; + for (size_t i = 0; i < inverted_index_ids_.size(); ++i) { + if (cursors[i].cur_vec_id_ == vec_id) { + vec_row.emplace_back(dim_map_reverse[i], cursors[i].cur_vec_val()); + cursors[i].next(); + } + } + + SparseRow raw_row(vec_row); + writeBinaryPOD(writer, raw_row.size()); + if (raw_row.size() == 0) { continue; } - writer.write(row.data(), row.size() * SparseRow::element_size()); + writer.write(raw_row.data(), raw_row.size() * SparseRow::element_size()); } + return Status::success; } - Status - Load(MemoryIOReader& reader, int map_flags = MAP_SHARED, - const std::string& supplement_target_filename = "") override { + Load(MemoryIOReader& reader, int map_flags, const std::string& supplement_target_filename) override { + DType deprecated_value_threshold; int64_t rows; readBinaryPOD(reader, rows); // previous versions used the signness of rows to indicate whether to @@ -192,15 +214,11 @@ class InvertedIndex : public BaseInvertedIndex { // take the absolute value of rows. rows = std::abs(rows); readBinaryPOD(reader, max_dim_); - readBinaryPOD(reader, value_threshold_); - if (value_threshold_ > 0) { - drop_during_build_ = true; - } + readBinaryPOD(reader, deprecated_value_threshold); if constexpr (mmapped) { RETURN_IF_ERROR(PrepareMmap(reader, rows, map_flags, supplement_target_filename)); } else { - raw_data_.reserve(rows); if constexpr (bm25) { bm25_params_->row_sums.reserve(rows); } @@ -209,17 +227,21 @@ class InvertedIndex : public BaseInvertedIndex { for (int64_t i = 0; i < rows; ++i) { size_t count; readBinaryPOD(reader, count); + SparseRow raw_row; if constexpr (mmapped) { - raw_data_.emplace_back(count, reader.data() + reader.tellg(), false); - reader.advance(count * SparseRow::element_size()); + raw_row = std::move(SparseRow(count, reader.data() + reader.tellg(), false)); + reader.advance(count * SparseRow::element_size()); } else { - raw_data_.emplace_back(count); + raw_row = std::move(SparseRow(count)); if (count > 0) { - reader.read(raw_data_[i].data(), count * SparseRow::element_size()); + reader.read(raw_row.data(), count * SparseRow::element_size()); } } - add_row_to_index(raw_data_[i], i); + add_row_to_index(raw_row, i); } + + n_rows_internal_ = rows; + return Status::success; } @@ -227,7 +249,7 @@ class InvertedIndex : public BaseInvertedIndex { Status PrepareMmap(MemoryIOReader& reader, size_t rows, int map_flags, const std::string& supplement_target_filename) { const auto initial_reader_location = reader.tellg(); - const auto nnz = (reader.remaining() - (rows * sizeof(size_t))) / SparseRow::element_size(); + const auto nnz = (reader.remaining() - (rows * sizeof(size_t))) / SparseRow::element_size(); // count raw vector idx occurrences std::unordered_map idx_counts; @@ -242,22 +264,23 @@ class InvertedIndex : public BaseInvertedIndex { readBinaryPOD(reader, idx); idx_counts[idx]++; // skip value - reader.advance(sizeof(T)); + reader.advance(sizeof(DType)); } } // reset reader to the beginning reader.seekg(initial_reader_location); - auto raw_data_byte_size = rows * sizeof(typename decltype(raw_data_)::value_type); - auto inverted_lut_byte_size = idx_counts.size() * sizeof(typename decltype(inverted_lut_)::value_type); - // actually due to drop_ratio_build, the number of non-zero values that will be added to the luts is - // less than nnz. but since the memory is mmapped, it is ok to still allocate some extra space for those - // dropped values. - auto luts_byte_size = nnz * sizeof(typename decltype(inverted_lut_)::value_type::value_type); + auto inverted_index_ids_byte_size = + idx_counts.size() * sizeof(typename decltype(inverted_index_ids_)::value_type); + auto inverted_index_vals_byte_size = + idx_counts.size() * sizeof(typename decltype(inverted_index_vals_)::value_type); + auto plists_ids_byte_size = nnz * sizeof(typename decltype(inverted_index_ids_)::value_type::value_type); + auto plists_vals_byte_size = nnz * sizeof(typename decltype(inverted_index_vals_)::value_type::value_type); auto max_score_in_dim_byte_size = idx_counts.size() * sizeof(typename decltype(max_score_in_dim_)::value_type); size_t row_sums_byte_size = 0; - map_byte_size_ = raw_data_byte_size + inverted_lut_byte_size + luts_byte_size; + map_byte_size_ = + inverted_index_ids_byte_size + inverted_index_vals_byte_size + plists_ids_byte_size + plists_vals_byte_size; if constexpr (use_wand) { map_byte_size_ += max_score_in_dim_byte_size; } @@ -302,10 +325,10 @@ class InvertedIndex : public BaseInvertedIndex { char* ptr = map_; // initialize containers memory. - raw_data_.initialize(ptr, raw_data_byte_size); - ptr += raw_data_byte_size; - inverted_lut_.initialize(ptr, inverted_lut_byte_size); - ptr += inverted_lut_byte_size; + inverted_index_ids_.initialize(ptr, inverted_index_ids_byte_size); + ptr += inverted_index_ids_byte_size; + inverted_index_vals_.initialize(ptr, inverted_index_vals_byte_size); + ptr += inverted_index_vals_byte_size; if constexpr (use_wand) { max_score_in_dim_.initialize(ptr, max_score_in_dim_byte_size); @@ -317,15 +340,23 @@ class InvertedIndex : public BaseInvertedIndex { ptr += row_sums_byte_size; } + for (const auto& [idx, count] : idx_counts) { + auto& plist_ids = inverted_index_ids_.emplace_back(); + auto plist_ids_byte_size = count * sizeof(typename decltype(inverted_index_ids_)::value_type::value_type); + plist_ids.initialize(ptr, plist_ids_byte_size); + ptr += plist_ids_byte_size; + } + for (const auto& [idx, count] : idx_counts) { + auto& plist_vals = inverted_index_vals_.emplace_back(); + auto plist_vals_byte_size = count * sizeof(typename decltype(inverted_index_vals_)::value_type::value_type); + plist_vals.initialize(ptr, plist_vals_byte_size); + ptr += plist_vals_byte_size; + } size_t dim_id = 0; for (const auto& [idx, count] : idx_counts) { dim_map_[idx] = dim_id; - auto& lut = inverted_lut_.emplace_back(); - auto lut_byte_size = count * sizeof(typename decltype(inverted_lut_)::value_type::value_type); - lut.initialize(ptr, lut_byte_size); - ptr += lut_byte_size; if constexpr (use_wand) { - max_score_in_dim_.emplace_back(0); + max_score_in_dim_.emplace_back(0.0f); } ++dim_id; } @@ -338,62 +369,39 @@ class InvertedIndex : public BaseInvertedIndex { // Non zero drop ratio is only supported for static index, i.e. data should // include all rows that'll be added to the index. Status - Train(const SparseRow* data, size_t rows, float drop_ratio_build) override { + Train(const SparseRow* data, size_t rows) override { if constexpr (mmapped) { throw std::invalid_argument("mmapped InvertedIndex does not support Train"); } else { - if (drop_ratio_build == 0.0f) { - return Status::success; - } - // TODO: maybe i += 10 to down sample to speed up. - size_t amount = 0; - for (size_t i = 0; i < rows; ++i) { - amount += data[i].size(); - } - if (amount == 0) { - return Status::success; - } - std::vector vals; - vals.reserve(amount); - for (size_t i = 0; i < rows; ++i) { - for (size_t j = 0; j < data[i].size(); ++j) { - vals.push_back(fabs(data[i][j].val)); - } - } - value_threshold_ = get_threshold(vals, drop_ratio_build); - drop_during_build_ = true; return Status::success; } } Status - Add(const SparseRow* data, size_t rows, int64_t dim) override { + Add(const SparseRow* data, size_t rows, int64_t dim) override { if constexpr (mmapped) { throw std::invalid_argument("mmapped InvertedIndex does not support Add"); } else { - auto current_rows = n_rows_internal(); - if (current_rows > 0 && drop_during_build_) { - LOG_KNOWHERE_ERROR_ << "Not allowed to add data to a built index with drop_ratio_build > 0."; - return Status::invalid_args; - } + auto current_rows = n_rows_internal_; if ((size_t)dim > max_dim_) { max_dim_ = dim; } - raw_data_.insert(raw_data_.end(), data, data + rows); if constexpr (bm25) { bm25_params_->row_sums.reserve(current_rows + rows); } for (size_t i = 0; i < rows; ++i) { add_row_to_index(data[i], current_rows + i); } + n_rows_internal_ += rows; + return Status::success; } } void - Search(const SparseRow& query, size_t k, float drop_ratio_search, float* distances, label_t* labels, - size_t refine_factor, const BitsetView& bitset, const DocValueComputer& computer) const override { + Search(const SparseRow& query, size_t k, float drop_ratio_search, float* distances, label_t* labels, + size_t refine_factor, const BitsetView& bitset, const DocValueComputer& computer) const override { // initially set result distances to NaN and labels to -1 std::fill(distances, distances + k, std::numeric_limits::quiet_NaN()); std::fill(labels, labels + k, -1); @@ -401,18 +409,17 @@ class InvertedIndex : public BaseInvertedIndex { return; } - std::vector values(query.size()); + std::vector values(query.size()); for (size_t i = 0; i < query.size(); ++i) { values[i] = std::abs(query[i].val); } auto q_threshold = get_threshold(values, drop_ratio_search); - // if no data was dropped during both build and search, no refinement is - // needed. - if (!drop_during_build_ && drop_ratio_search == 0) { + // if no data was dropped during search, no refinement is needed. + if (drop_ratio_search == 0) { refine_factor = 1; } - MaxMinHeap heap(k * refine_factor); + MaxMinHeap heap(k * refine_factor); if constexpr (!use_wand) { search_brute_force(query, q_threshold, heap, bitset, computer); } else { @@ -428,54 +435,68 @@ class InvertedIndex : public BaseInvertedIndex { // Returned distances are inaccurate based on the drop_ratio. std::vector - GetAllDistances(const SparseRow& query, float drop_ratio_search, const BitsetView& bitset, - const DocValueComputer& computer) const override { + GetAllDistances(const SparseRow& query, float drop_ratio_search, const BitsetView& bitset, + const DocValueComputer& computer) const override { if (query.size() == 0) { return {}; } - std::vector values(query.size()); + std::vector values(query.size()); for (size_t i = 0; i < query.size(); ++i) { values[i] = std::abs(query[i].val); } auto q_threshold = get_threshold(values, drop_ratio_search); auto distances = compute_all_distances(query, q_threshold, computer); - for (size_t i = 0; i < distances.size(); ++i) { - if (bitset.empty() || !bitset.test(i)) { - continue; + if (!bitset.empty()) { + for (size_t i = 0; i < distances.size(); ++i) { + if (bitset.test(i)) { + distances[i] = 0.0f; + } } - distances[i] = 0.0f; } return distances; } float - GetRawDistance(const label_t id, const SparseRow& query, const DocValueComputer& computer) const override { - T doc_sum = bm25 ? bm25_params_->row_sums.at(id) : 0; - return query.dot(raw_data_[id], computer, doc_sum); - } + GetRawDistance(const label_t vec_id, const SparseRow& query, + const DocValueComputer& computer) const override { + float distance = 0.0f; - void - GetVectorById(const label_t id, SparseRow& output) const override { - output = raw_data_[id]; + for (size_t i = 0; i < query.size(); ++i) { + auto [idx, val] = query[i]; + auto dim_id = dim_map_.find(idx); + if (dim_id == dim_map_.end()) { + continue; + } + auto& plist_ids = inverted_index_ids_[dim_id->second]; + auto it = std::lower_bound(plist_ids.begin(), plist_ids.end(), vec_id, + [](const auto& x, table_t y) { return x < y; }); + if (it != plist_ids.end() && *it == vec_id) { + distance += val * computer(inverted_index_vals_[dim_id->second][it - plist_ids.begin()], + bm25 ? bm25_params_->row_sums.at(vec_id) : 0); + } + } + + return distance; } [[nodiscard]] size_t size() const override { size_t res = sizeof(*this); - for (size_t i = 0; i < raw_data_.size(); ++i) { - res += raw_data_[i].memory_usage(); - } res += dim_map_.size() * (sizeof(typename decltype(dim_map_)::key_type) + sizeof(typename decltype(dim_map_)::mapped_type)); if constexpr (mmapped) { return res + map_byte_size_; } else { - res += sizeof(typename decltype(raw_data_)::value_type) * raw_data_.capacity(); - - res += sizeof(typename decltype(inverted_lut_)::value_type) * inverted_lut_.capacity(); - for (size_t i = 0; i < inverted_lut_.size(); ++i) { - res += sizeof(typename decltype(inverted_lut_)::value_type::value_type) * inverted_lut_[i].capacity(); + res += sizeof(typename decltype(inverted_index_ids_)::value_type) * inverted_index_ids_.capacity(); + for (size_t i = 0; i < inverted_index_ids_.size(); ++i) { + res += sizeof(typename decltype(inverted_index_ids_)::value_type::value_type) * + inverted_index_ids_[i].capacity(); + } + res += sizeof(typename decltype(inverted_index_vals_)::value_type) * inverted_index_vals_.capacity(); + for (size_t i = 0; i < inverted_index_vals_.size(); ++i) { + res += sizeof(typename decltype(inverted_index_vals_)::value_type::value_type) * + inverted_index_vals_[i].capacity(); } if constexpr (use_wand) { res += sizeof(typename decltype(max_score_in_dim_)::value_type) * max_score_in_dim_.capacity(); @@ -486,25 +507,20 @@ class InvertedIndex : public BaseInvertedIndex { [[nodiscard]] size_t n_rows() const override { - return n_rows_internal(); + return n_rows_internal_; } [[nodiscard]] size_t n_cols() const override { - return n_cols_internal(); - } - - [[nodiscard]] virtual bool - IsApproximated() const override { - return drop_during_build_; + return max_dim_; } private: // Given a vector of values, returns the threshold value. // All values strictly smaller than the threshold will be ignored. // values will be modified in this function. - inline T - get_threshold(std::vector& values, float drop_ratio) const { + inline DType + get_threshold(std::vector& values, float drop_ratio) const { // drop_ratio is in [0, 1) thus drop_count is guaranteed to be less // than values.size(). auto drop_count = static_cast(drop_ratio * values.size()); @@ -516,125 +532,117 @@ class InvertedIndex : public BaseInvertedIndex { return *pos; } - size_t - n_rows_internal() const { - return raw_data_.size(); - } - - size_t - n_cols_internal() const { - return max_dim_; - } - std::vector - compute_all_distances(const SparseRow& q_vec, T q_threshold, const DocValueComputer& computer) const { - std::vector scores(n_rows_internal(), 0.0f); + compute_all_distances(const SparseRow& q_vec, DType q_threshold, + const DocValueComputer& computer) const { + std::vector scores(n_rows_internal_, 0.0f); for (size_t idx = 0; idx < q_vec.size(); ++idx) { auto [i, v] = q_vec[idx]; - if (v < q_threshold || i >= n_cols_internal()) { + if (v < q_threshold || i >= max_dim_) { continue; } auto dim_id = dim_map_.find(i); if (dim_id == dim_map_.end()) { continue; } - auto& lut = inverted_lut_[dim_id->second]; + auto& plist_ids = inverted_index_ids_[dim_id->second]; + auto& plist_vals = inverted_index_vals_[dim_id->second]; // TODO: improve with SIMD - for (size_t j = 0; j < lut.size(); j++) { - auto [doc_id, val] = lut[j]; - T val_sum = bm25 ? bm25_params_->row_sums.at(doc_id) : 0; + for (size_t j = 0; j < plist_ids.size(); ++j) { + auto doc_id = plist_ids[j]; + auto val = plist_vals[j]; + float val_sum = bm25 ? bm25_params_->row_sums.at(doc_id) : 0; scores[doc_id] += v * computer(val, val_sum); } } return scores; } - // find the top-k candidates using brute force search, k as specified by the capacity of the heap. - // any value in q_vec that is smaller than q_threshold and any value with dimension >= n_cols() will be ignored. - // TODO: may switch to row-wise brute force if filter rate is high. Benchmark needed. - void - search_brute_force(const SparseRow& q_vec, T q_threshold, MaxMinHeap& heap, const BitsetView& bitset, - const DocValueComputer& computer) const { - auto scores = compute_all_distances(q_vec, q_threshold, computer); - for (size_t i = 0; i < n_rows_internal(); ++i) { - if ((bitset.empty() || !bitset.test(i)) && scores[i] != 0) { - heap.push(i, scores[i]); - } - } - } - - // LUT supports size() and operator[] which returns an SparseIdVal. - template + template struct Cursor { public: - Cursor(const LUT& lut, size_t num_vec, float max_score, float q_value, const BitsetView bitset) - : lut_(lut), - lut_size_(lut.size()), + Cursor(const Vector& plist_ids, const Vector& plist_vals, size_t num_vec, float max_score, + float q_value, DocIdFilter filter) + : plist_ids_(plist_ids), + plist_vals_(plist_vals), + plist_size_(plist_ids.size()), total_num_vec_(num_vec), max_score_(max_score), q_value_(q_value), - bitset_(bitset) { - while (loc_ < lut_size_ && !bitset_.empty() && bitset_.test(lut_[loc_].id)) { - loc_++; - } + filter_(filter) { + skip_filtered_ids(); update_cur_vec_id(); } Cursor(const Cursor& rhs) = delete; + Cursor(Cursor&& rhs) noexcept = default; void next() { - next_internal(); + ++loc_; + skip_filtered_ids(); update_cur_vec_id(); } - // advance loc until cur_vec_id_ >= vec_id void seek(table_t vec_id) { - while (loc_ < lut_size_ && lut_[loc_].id < vec_id) { - next_internal(); + while (loc_ < plist_size_ && plist_ids_[loc_] < vec_id) { + ++loc_; } + skip_filtered_ids(); update_cur_vec_id(); } - T + QType cur_vec_val() const { - return lut_[loc_].val; + return plist_vals_[loc_]; } - const LUT& lut_; - const size_t lut_size_; + const Vector& plist_ids_; + const Vector& plist_vals_; + const size_t plist_size_; size_t loc_ = 0; size_t total_num_vec_ = 0; float max_score_ = 0.0f; float q_value_ = 0.0f; - const BitsetView bitset_; + DocIdFilter filter_; table_t cur_vec_id_ = 0; private: inline void update_cur_vec_id() { - if (loc_ >= lut_size_) { - cur_vec_id_ = total_num_vec_; - } else { - cur_vec_id_ = lut_[loc_].id; - } + cur_vec_id_ = (loc_ >= plist_size_) ? total_num_vec_ : plist_ids_[loc_]; } inline void - next_internal() { - loc_++; - while (loc_ < lut_size_ && !bitset_.empty() && bitset_.test(lut_[loc_].id)) { - loc_++; + skip_filtered_ids() { + while (loc_ < plist_size_ && !filter_.empty() && filter_.test(plist_ids_[loc_])) { + ++loc_; } } }; // struct Cursor + // find the top-k candidates using brute force search, k as specified by the capacity of the heap. + // any value in q_vec that is smaller than q_threshold and any value with dimension >= n_cols() will be ignored. + // TODO: may switch to row-wise brute force if filter rate is high. Benchmark needed. + template + void + search_brute_force(const SparseRow& q_vec, DType q_threshold, MaxMinHeap& heap, DocIdFilter& filter, + const DocValueComputer& computer) const { + auto scores = compute_all_distances(q_vec, q_threshold, computer); + for (size_t i = 0; i < n_rows_internal_; ++i) { + if ((filter.empty() || !filter.test(i)) && scores[i] != 0) { + heap.push(i, scores[i]); + } + } + } + // any value in q_vec that is smaller than q_threshold will be ignored. + template void - search_wand(const SparseRow& q_vec, T q_threshold, MaxMinHeap& heap, const BitsetView& bitset, - const DocValueComputer& computer) const { + search_wand(const SparseRow& q_vec, DType q_threshold, MaxMinHeap& heap, DocIdFilter& filter, + const DocValueComputer& computer) const { auto q_dim = q_vec.size(); - std::vector>> cursors(q_dim); + std::vector>> cursors(q_dim); size_t valid_q_dim = 0; for (size_t i = 0; i < q_dim; ++i) { auto [idx, val] = q_vec[i]; @@ -642,9 +650,10 @@ class InvertedIndex : public BaseInvertedIndex { if (dim_id == dim_map_.end() || std::abs(val) < q_threshold) { continue; } - auto& lut = inverted_lut_[dim_id->second]; - cursors[valid_q_dim++] = std::make_shared>( - lut, n_rows_internal(), max_score_in_dim_[dim_id->second] * val, val, bitset); + auto& plist_ids = inverted_index_ids_[dim_id->second]; + auto& plist_vals = inverted_index_vals_[dim_id->second]; + cursors[valid_q_dim++] = std::make_shared>( + plist_ids, plist_vals, n_rows_internal_, max_score_in_dim_[dim_id->second] * val, val, filter); } if (valid_q_dim == 0) { return; @@ -660,7 +669,7 @@ class InvertedIndex : public BaseInvertedIndex { size_t pivot; bool found_pivot = false; for (pivot = 0; pivot < valid_q_dim; ++pivot) { - if (cursors[pivot]->loc_ >= cursors[pivot]->lut_size_) { + if (cursors[pivot]->loc_ >= cursors[pivot]->plist_size_) { break; } upper_bound += cursors[pivot]->max_score_; @@ -679,7 +688,7 @@ class InvertedIndex : public BaseInvertedIndex { if (cursor->cur_vec_id_ != pivot_id) { break; } - T cur_vec_sum = bm25 ? bm25_params_->row_sums.at(cursor->cur_vec_id_) : 0; + float cur_vec_sum = bm25 ? bm25_params_->row_sums.at(cursor->cur_vec_id_) : 0; score += cursor->q_value_ * computer(cursor->cur_vec_val(), cur_vec_sum); cursor->next(); } @@ -701,23 +710,22 @@ class InvertedIndex : public BaseInvertedIndex { } void - refine_and_collect(const SparseRow& q_vec, MaxMinHeap& inaccurate, size_t k, float* distances, - label_t* labels, const DocValueComputer& computer) const { - std::priority_queue, std::vector>, std::greater>> heap; + refine_and_collect(const SparseRow& q_vec, MaxMinHeap& inacc_heap, size_t k, float* distances, + label_t* labels, const DocValueComputer& computer) const { + std::vector docids; + MaxMinHeap heap(k); - while (!inaccurate.empty()) { - auto [u, d] = inaccurate.top(); - inaccurate.pop(); - - T u_sum = bm25 ? bm25_params_->row_sums.at(u) : 0; + docids.reserve(inacc_heap.size()); + while (!inacc_heap.empty()) { + table_t u = inacc_heap.pop(); + docids.emplace_back(u); + } - auto dist_acc = q_vec.dot(raw_data_[u], computer, u_sum); - if (heap.size() < k) { - heap.emplace(u, dist_acc); - } else if (heap.top().val < dist_acc) { - heap.pop(); - heap.emplace(u, dist_acc); - } + DocIdFilterByVector filter(std::move(docids)); + if (use_wand) { + search_wand(q_vec, 0, heap, filter, computer); + } else { + search_brute_force(q_vec, 0, heap, filter, computer); } collect_result(heap, distances, labels); } @@ -734,8 +742,8 @@ class InvertedIndex : public BaseInvertedIndex { } inline void - add_row_to_index(const SparseRow& row, table_t id) { - [[maybe_unused]] T row_sum = 0; + add_row_to_index(const SparseRow& row, table_t vec_id) { + [[maybe_unused]] float row_sum = 0; for (size_t j = 0; j < row.size(); ++j) { auto [idx, val] = row[j]; if constexpr (bm25) { @@ -743,7 +751,7 @@ class InvertedIndex : public BaseInvertedIndex { } // Skip values equals to or close enough to zero(which contributes // little to the total IP score). - if (val == 0 || (drop_during_build_ && fabs(val) < value_threshold_)) { + if (val == 0) { continue; } auto dim_it = dim_map_.find(idx); @@ -752,14 +760,16 @@ class InvertedIndex : public BaseInvertedIndex { throw std::runtime_error("unexpected vector dimension in mmaped InvertedIndex"); } dim_it = dim_map_.insert({idx, next_dim_id_++}).first; - inverted_lut_.emplace_back(); + inverted_index_ids_.emplace_back(); + inverted_index_vals_.emplace_back(); if constexpr (use_wand) { - max_score_in_dim_.emplace_back(0); + max_score_in_dim_.emplace_back(0.0f); } } - inverted_lut_[dim_it->second].emplace_back(id, val); + inverted_index_ids_[dim_it->second].emplace_back(vec_id); + inverted_index_vals_[dim_it->second].emplace_back(get_quant_val(val)); if constexpr (use_wand) { - auto score = val; + auto score = static_cast(val); if constexpr (bm25) { score = bm25_params_->max_score_ratio * bm25_params_->wand_max_score_computer(val, row_sum); } @@ -771,24 +781,31 @@ class InvertedIndex : public BaseInvertedIndex { } } + inline QType + get_quant_val(DType val) const { + if constexpr (!std::is_same_v) { + const DType max_val = static_cast(std::numeric_limits::max()); + if (val >= max_val) { + return std::numeric_limits::max(); + } else if (val <= std::numeric_limits::min()) { + return std::numeric_limits::min(); + } else { + return static_cast(val); + } + } else { + return val; + } + } + // key is raw sparse vector dim/idx, value is the mapped dim/idx id in the index. std::unordered_map dim_map_; - template - using Vector = std::conditional_t, std::vector>; - // reserve, [], size, emplace_back - Vector> raw_data_; - - Vector>> inverted_lut_; - // If we want to drop small values during build, we must first train the - // index with all the data to compute value_threshold_. - bool drop_during_build_ = false; - // when drop_during_build_ is true, any value smaller than value_threshold_ - // will not be added to inverted_lut_. value_threshold_ is set to the - // drop_ratio_build-th percentile of all absolute values in the index. - T value_threshold_ = 0.0f; - Vector max_score_in_dim_; + Vector> inverted_index_ids_; + Vector> inverted_index_vals_; + Vector max_score_in_dim_; + + size_t n_rows_internal_ = 0; size_t max_dim_ = 0; uint32_t next_dim_id_ = 0; @@ -801,17 +818,17 @@ class InvertedIndex : public BaseInvertedIndex { float b; // row_sums is used to cache the sum of values of each row, which // corresponds to the document length of each doc in the BM25 formula. - Vector row_sums; + Vector row_sums; // below are used only for WAND index. float max_score_ratio; - DocValueComputer wand_max_score_computer; + DocValueComputer wand_max_score_computer; BM25Params(float k1, float b, float avgdl, float max_score_ratio) : k1(k1), b(b), max_score_ratio(max_score_ratio), - wand_max_score_computer(GetDocValueBM25Computer(k1, b, avgdl)) { + wand_max_score_computer(GetDocValueBM25Computer(k1, b, avgdl)) { } }; // struct BM25Params diff --git a/src/index/sparse/sparse_inverted_index_config.h b/src/index/sparse/sparse_inverted_index_config.h index 2d416e43a..7c56494eb 100644 --- a/src/index/sparse/sparse_inverted_index_config.h +++ b/src/index/sparse/sparse_inverted_index_config.h @@ -24,6 +24,7 @@ class SparseInvertedIndexConfig : public BaseConfig { CFG_INT refine_factor; CFG_FLOAT wand_bm25_max_score_ratio; KNOHWERE_DECLARE_CONFIG(SparseInvertedIndexConfig) { + // NOTE: drop_ratio_build has been deprecated, it won't change anything KNOWHERE_CONFIG_DECLARE_FIELD(drop_ratio_build) .description("drop ratio for build") .set_default(0.0f) diff --git a/tests/ut/test_sparse.cc b/tests/ut/test_sparse.cc index 85341bb05..c84886b32 100644 --- a/tests/ut/test_sparse.cc +++ b/tests/ut/test_sparse.cc @@ -45,12 +45,10 @@ TEST_CASE("Test Mem Sparse Index With Float Vector", "[float metrics]") { auto topk = 5; int64_t nq = 10; - auto [drop_ratio_build, drop_ratio_search] = GENERATE(table({ - {0.0, 0.0}, - {0.15, 0.3}, - })); - auto metric = GENERATE(knowhere::metric::IP, knowhere::metric::BM25); + + auto drop_ratio_search = metric == knowhere::metric::BM25 ? GENERATE(0.0, 0.1) : GENERATE(0.0, 0.3); + auto version = GenTestVersionList(); auto base_gen = [=, dim = dim]() { @@ -64,17 +62,22 @@ TEST_CASE("Test Mem Sparse Index With Float Vector", "[float metrics]") { return json; }; - auto sparse_inverted_index_gen = [base_gen, drop_ratio_build = drop_ratio_build, - drop_ratio_search = drop_ratio_search]() { + auto sparse_inverted_index_gen = [base_gen, drop_ratio_search = drop_ratio_search]() { knowhere::Json json = base_gen(); - json[knowhere::indexparam::DROP_RATIO_BUILD] = drop_ratio_build; json[knowhere::indexparam::DROP_RATIO_SEARCH] = drop_ratio_search; return json; }; - const auto train_ds = GenSparseDataSet(nb, dim, doc_sparsity); - // it is possible the query has more dims than the train dataset. - const auto query_ds = GenSparseDataSet(nq, dim + 20, query_sparsity); + auto sparse_dataset_gen = [&](int nr, int dim, float sparsity) -> knowhere::DataSetPtr { + if (metric == knowhere::metric::BM25) { + return GenSparseDataSetWithMaxVal(nr, dim, sparsity, 256, true); + } else { + return GenSparseDataSet(nr, dim, sparsity); + } + }; + + auto train_ds = sparse_dataset_gen(nb, dim, doc_sparsity); + auto query_ds = sparse_dataset_gen(nq, dim + 20, query_sparsity); const knowhere::Json conf = { {knowhere::meta::METRIC_TYPE, metric}, {knowhere::meta::TOPK, topk}, {knowhere::meta::BM25_K1, 1.2}, @@ -144,9 +147,8 @@ TEST_CASE("Test Mem Sparse Index With Float Vector", "[float metrics]") { REQUIRE(results.has_value()); float recall = GetKNNRecall(*gt.value(), *results.value()); check_distance_decreasing(*results.value()); - auto drop_ratio_build = json[knowhere::indexparam::DROP_RATIO_BUILD].get(); auto drop_ratio_search = json[knowhere::indexparam::DROP_RATIO_SEARCH].get(); - if (drop_ratio_build == 0 && drop_ratio_search == 0) { + if (drop_ratio_search == 0) { REQUIRE(recall == 1); } else { // most test cases are above 0.95, only a few between 0.9 and 0.95 @@ -189,9 +191,8 @@ TEST_CASE("Test Mem Sparse Index With Float Vector", "[float metrics]") { float recall = GetKNNRecall(*filter_gt.value(), *results.value()); check_distance_decreasing(*results.value()); - auto drop_ratio_build = json[knowhere::indexparam::DROP_RATIO_BUILD].get(); auto drop_ratio_search = json[knowhere::indexparam::DROP_RATIO_SEARCH].get(); - if (drop_ratio_build == 0 && drop_ratio_search == 0) { + if (drop_ratio_search == 0) { REQUIRE(recall == 1); } else { REQUIRE(recall >= 0.8); @@ -258,10 +259,15 @@ TEST_CASE("Test Mem Sparse Index With Float Vector", "[float metrics]") { REQUIRE(idx.Size() > 0); REQUIRE(idx.Count() == nb); - auto [radius, range_filter] = GENERATE(table({ - {0.5, 1}, - {1, 1.5}, - })); + auto [radius, range_filter] = metric == knowhere::metric::BM25 ? GENERATE(table({ + {80.0, 100.0}, + {100.0, 200.0}, + })) + : GENERATE(table({ + {0.5, 1}, + {1, 1.5}, + })); + json[knowhere::meta::RADIUS] = radius; json[knowhere::meta::RANGE_FILTER] = range_filter; @@ -311,108 +317,6 @@ TEST_CASE("Test Mem Sparse Index With Float Vector", "[float metrics]") { } } -TEST_CASE("Test Mem Sparse Index GetVectorByIds", "[float metrics]") { - auto [nb, dim, doc_sparsity, query_sparsity] = GENERATE(table({ - // 300 dim, avg doc nnz 12, avg query nnz 9 - {2000, 300, 0.95, 0.97}, - // 300 dim, avg doc nnz 9, avg query nnz 3 - {2000, 300, 0.97, 0.99}, - // 3000 dim, avg doc nnz 90, avg query nnz 30 - {20000, 3000, 0.97, 0.99}, - })); - int64_t nq = GENERATE(10, 100); - - auto [drop_ratio_build, drop_ratio_search] = GENERATE(table({ - {0.0, 0.0}, - {0.32, 0.0}, - })); - - auto metric = knowhere::metric::IP; - auto version = GenTestVersionList(); - - auto base_gen = [=, dim = dim]() { - knowhere::Json json; - json[knowhere::meta::DIM] = dim; - json[knowhere::meta::METRIC_TYPE] = metric; - json[knowhere::meta::TOPK] = 1; - return json; - }; - - auto sparse_inverted_index_gen = [base_gen, drop_ratio_build = drop_ratio_build, - drop_ratio_search = drop_ratio_search]() { - knowhere::Json json = base_gen(); - json[knowhere::indexparam::DROP_RATIO_BUILD] = drop_ratio_build; - json[knowhere::indexparam::DROP_RATIO_SEARCH] = drop_ratio_search; - return json; - }; - - const auto train_ds = GenSparseDataSet(nb, dim, doc_sparsity); - - SECTION("Test GetVectorByIds") { - using std::make_tuple; - auto [name, gen] = GENERATE_REF(table>({ - make_tuple(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX, sparse_inverted_index_gen), - make_tuple(knowhere::IndexEnum::INDEX_SPARSE_WAND, sparse_inverted_index_gen), - })); - auto use_mmap = GENERATE(true, false); - auto tmp_file = "/tmp/knowhere_sparse_inverted_index_test"; - { - auto idx = knowhere::IndexFactory::Instance().Create(name, version).value(); - auto cfg_json = gen().dump(); - CAPTURE(name, cfg_json); - knowhere::Json json = knowhere::Json::parse(cfg_json); - - auto ids_ds = GenIdsDataSet(nb, nq); - REQUIRE(idx.Type() == name); - auto res = idx.Build(train_ds, json); - REQUIRE(idx.HasRawData(metric) == - knowhere::IndexStaticFaced::HasRawData(name, version, json)); - if (!idx.HasRawData(metric)) { - return; - } - REQUIRE(res == knowhere::Status::success); - knowhere::BinarySet bs; - idx.Serialize(bs); - - auto idx_new = knowhere::IndexFactory::Instance().Create(name, version).value(); - if (use_mmap) { - WriteBinaryToFile(tmp_file, bs.GetByName(idx.Type())); - REQUIRE(idx_new.DeserializeFromFile(tmp_file, json) == knowhere::Status::success); - } else { - REQUIRE(idx_new.Deserialize(bs, json) == knowhere::Status::success); - } - - auto retrieve_task = [&]() { - auto results = idx_new.GetVectorByIds(ids_ds); - REQUIRE(results.has_value()); - auto xb = (knowhere::sparse::SparseRow*)train_ds->GetTensor(); - auto res_data = (knowhere::sparse::SparseRow*)results.value()->GetTensor(); - for (int i = 0; i < nq; ++i) { - const auto id = ids_ds->GetIds()[i]; - const auto& truth_row = xb[id]; - const auto& res_row = res_data[i]; - REQUIRE(truth_row.size() == res_row.size()); - for (size_t j = 0; j < truth_row.size(); ++j) { - REQUIRE(truth_row[j] == res_row[j]); - } - } - }; - - std::vector> retrieve_task_list; - for (int i = 0; i < 20; i++) { - retrieve_task_list.push_back(std::async(std::launch::async, [&] { return retrieve_task(); })); - } - for (auto& task : retrieve_task_list) { - task.wait(); - } - // idx/idx_new to destroy and munmap - } - if (use_mmap) { - REQUIRE(std::remove(tmp_file) == 0); - } - } -} - TEST_CASE("Test Mem Sparse Index Handle Empty Vector", "[float metrics]") { auto [base_data, has_first_result] = GENERATE(table>, bool>( {{std::vector>{ @@ -431,14 +335,9 @@ TEST_CASE("Test Mem Sparse Index Handle Empty Vector", "[float metrics]") { auto metric = GENERATE(knowhere::metric::IP, knowhere::metric::BM25); auto version = GenTestVersionList(); - auto [drop_ratio_build, drop_ratio_search] = GENERATE(table({ - {0.0, 0.0}, - {0.32, 0.0}, - {0.32, 0.6}, - {0.0, 0.6}, - })); + auto drop_ratio_search = GENERATE(0.0, 0.6); - auto base_gen = [=, dim = dim, drop_ratio_build = drop_ratio_build, drop_ratio_search = drop_ratio_search]() { + auto base_gen = [=, dim = dim, drop_ratio_search = drop_ratio_search]() { knowhere::Json json; json[knowhere::meta::DIM] = dim; json[knowhere::meta::METRIC_TYPE] = metric; @@ -446,7 +345,6 @@ TEST_CASE("Test Mem Sparse Index Handle Empty Vector", "[float metrics]") { json[knowhere::meta::BM25_K1] = 1.2; json[knowhere::meta::BM25_B] = 0.75; json[knowhere::meta::BM25_AVGDL] = 100; - json[knowhere::indexparam::DROP_RATIO_BUILD] = drop_ratio_build; json[knowhere::indexparam::DROP_RATIO_SEARCH] = drop_ratio_search; return json; }; @@ -537,22 +435,6 @@ TEST_CASE("Test Mem Sparse Index Handle Empty Vector", "[float metrics]") { REQUIRE(results.has_value()); check_result(*results.value()); } - - SECTION("Test GetVectorByIds") { - std::vector ids = {0, 1, 2}; - auto results = idx.GetVectorByIds(GenIdsDataSet(3, ids)); - REQUIRE(results.has_value()); - auto xb = (knowhere::sparse::SparseRow*)train_ds->GetTensor(); - auto res_data = (knowhere::sparse::SparseRow*)results.value()->GetTensor(); - for (int i = 0; i < 3; ++i) { - const auto& truth_row = xb[i]; - const auto& res_row = res_data[i]; - REQUIRE(truth_row.size() == res_row.size()); - for (size_t j = 0; j < truth_row.size(); ++j) { - REQUIRE(truth_row[j] == res_row[j]); - } - } - } } TEST_CASE("Test Mem Sparse Index CC", "[float metrics]") { @@ -578,8 +460,6 @@ TEST_CASE("Test Mem Sparse Index CC", "[float metrics]") { auto query_ds = doc_vector_gen(nq, dim); - // drop ratio build is not supported in CC index - auto drop_ratio_build = 0.0; auto drop_ratio_search = GENERATE(0.0, 0.3); auto metric = GENERATE(knowhere::metric::IP); @@ -596,10 +476,8 @@ TEST_CASE("Test Mem Sparse Index CC", "[float metrics]") { return json; }; - auto sparse_inverted_index_gen = [base_gen, drop_ratio_build = drop_ratio_build, - drop_ratio_search = drop_ratio_search]() { + auto sparse_inverted_index_gen = [base_gen, drop_ratio_search = drop_ratio_search]() { knowhere::Json json = base_gen(); - json[knowhere::indexparam::DROP_RATIO_BUILD] = drop_ratio_build; json[knowhere::indexparam::DROP_RATIO_SEARCH] = drop_ratio_search; return json; }; diff --git a/tests/ut/utils.h b/tests/ut/utils.h index 3bbc3d794..3b62ba7ef 100644 --- a/tests/ut/utils.h +++ b/tests/ut/utils.h @@ -380,6 +380,35 @@ GenSparseDataSet(int32_t rows, int32_t cols, float sparsity, int seed = 42) { return GenSparseDataSet(data, cols); } +// Generate a sparse dataset with given sparsity and max value. +inline knowhere::DataSetPtr +GenSparseDataSetWithMaxVal(int32_t rows, int32_t cols, float sparsity, float max_val, bool use_bm25 = false, + int seed = 42) { + int32_t num_elements = static_cast(rows * cols * (1.0f - sparsity)); + + std::mt19937 rng(seed); + auto real_distrib = std::uniform_real_distribution(0, max_val); + auto row_distrib = std::uniform_int_distribution(0, rows - 1); + auto col_distrib = std::uniform_int_distribution(0, cols - 1); + + std::vector> data(rows); + + for (int32_t i = 0; i < num_elements; ++i) { + auto row = row_distrib(rng); + while (data[row].size() == (size_t)cols) { + row = row_distrib(rng); + } + auto col = col_distrib(rng); + while (data[row].find(col) != data[row].end()) { + col = col_distrib(rng); + } + auto val = use_bm25 ? static_cast(static_cast(real_distrib(rng))) : real_distrib(rng); + data[row][col] = val; + } + + return GenSparseDataSet(data, cols); +} + // a timer struct StopWatch { using timepoint_t = std::chrono::time_point;