From 04efb4343f5034fa7c6f43841bd6760dd621601a Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Wed, 13 Nov 2024 13:24:44 +0100 Subject: [PATCH 1/3] Separate bit vector lookup from bit vector read. Reduce proxying from DiskIndex to FieldIndex. --- .../diskindex/bitvector/bitvector_test.cpp | 20 ++++++---- .../diskindex/diskindex/diskindex_test.cpp | 17 +++++--- .../tests/diskindex/fusion/fusion_test.cpp | 30 ++++++++------ .../diskindex/bitvectordictionary.cpp | 23 +++++++---- .../searchlib/diskindex/bitvectordictionary.h | 23 ++++++++--- .../vespa/searchlib/diskindex/diskindex.cpp | 27 +------------ .../src/vespa/searchlib/diskindex/diskindex.h | 23 +---------- .../searchlib/diskindex/disktermblueprint.cpp | 40 ++++++++++--------- .../searchlib/diskindex/disktermblueprint.h | 13 +++--- .../vespa/searchlib/diskindex/field_index.cpp | 23 ++++++++--- .../vespa/searchlib/diskindex/field_index.h | 8 +++- .../bitvector_dictionary_lookup_result.h | 29 ++++++++++++++ 12 files changed, 159 insertions(+), 117 deletions(-) create mode 100644 searchlib/src/vespa/searchlib/index/bitvector_dictionary_lookup_result.h diff --git a/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp b/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp index 356acb2206f5..9b36194ec0ba 100644 --- a/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp +++ b/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp @@ -118,8 +118,8 @@ TEST_P(BitVectorTest, require_that_dictionary_handles_no_entries) EXPECT_TRUE(dict.open("dump/1/", tuneFileRead, bvScope)); EXPECT_EQ(5u, dict.getDocIdLimit()); EXPECT_EQ(0u, dict.getEntries().size()); - EXPECT_FALSE(dict.lookup(1)); - EXPECT_FALSE(dict.lookup(2)); + EXPECT_FALSE(dict.lookup(1).valid()); + EXPECT_FALSE(dict.lookup(2).valid()); } TEST_P(BitVectorTest, require_that_dictionary_handles_multiple_entries) @@ -176,16 +176,20 @@ TEST_P(BitVectorTest, require_that_dictionary_handles_multiple_entries) EXPECT_EQ(5u, e._wordNum); EXPECT_EQ(23u, 
e._numDocs); - EXPECT_FALSE(dict.lookup(2)); - EXPECT_FALSE(dict.lookup(3)); - EXPECT_FALSE(dict.lookup(4)); - EXPECT_FALSE(dict.lookup(6)); + EXPECT_FALSE(dict.lookup(2).valid()); + EXPECT_FALSE(dict.lookup(3).valid()); + EXPECT_FALSE(dict.lookup(4).valid()); + EXPECT_FALSE(dict.lookup(6).valid()); - BitVector::UP bv1act = dict.lookup(1); + auto bv1lr = dict.lookup(1); + EXPECT_TRUE(bv1lr.valid()); + auto bv1act = dict.read_bitvector(bv1lr); EXPECT_TRUE(bv1act); EXPECT_TRUE(*bv1exp == *bv1act); - BitVector::UP bv5act = dict.lookup(5); + auto bv5lr = dict.lookup(5); + EXPECT_TRUE(bv5lr.valid()); + auto bv5act = dict.read_bitvector(bv5lr); EXPECT_TRUE(bv5act); EXPECT_TRUE(*bv5exp == *bv5act); } diff --git a/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp b/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp index 7c08746702e2..ee8bbae184ed 100644 --- a/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp +++ b/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp @@ -249,8 +249,9 @@ DiskIndexTest::requireThatWeCanReadPostingList() TermFieldMatchDataArray mda; { // field 'f1' auto r = _index->lookup(0, "w1"); - auto h = _index->readPostingList(r); - auto sb = _index->create_iterator(r, h, mda); + auto& field_index = _index->get_field_index(0); + auto h = field_index.read_posting_list(r); + auto sb = field_index.create_iterator(r, h, mda); EXPECT_EQ(SimpleResult({1,3}), SimpleResult().search(*sb)); } } @@ -274,16 +275,22 @@ DiskIndexTest::requireThatWeCanReadBitVector() { { // word 'w1' auto r = _index->lookup(1, "w1"); + auto& field_index = _index->get_field_index(1); // not bit vector for 'w1' - EXPECT_TRUE(_index->readBitVector(r).get() == NULL); + auto blr = field_index.lookup_bit_vector(r); + EXPECT_FALSE(blr.valid()); + EXPECT_TRUE(field_index.read_bit_vector(blr).get() == nullptr); } { // word 'w2' BitVector::UP exp(BitVector::create(32)); for (uint32_t docId = 1; docId < 18; ++docId) exp->setBit(docId); { // field 'f2' auto r = 
_index->lookup(1, "w2"); - BitVector::UP bv = _index->readBitVector(r); - EXPECT_TRUE(bv.get() != NULL); + auto& field_index = _index->get_field_index(1); + auto blr = field_index.lookup_bit_vector(r); + EXPECT_TRUE(blr.valid()); + BitVector::UP bv = field_index.read_bit_vector(blr); + EXPECT_TRUE(bv.get() != nullptr); EXPECT_TRUE(*bv == *exp); } } diff --git a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp index 9f8c40685b8b..69e3f0a5e5ec 100644 --- a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp +++ b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp @@ -159,12 +159,13 @@ assert_interleaved_features(DiskIndex &d, const std::string &field, const std::s { const Schema &schema = d.getSchema(); uint32_t field_id(schema.getIndexFieldId(field)); + auto& field_index = d.get_field_index(field_id); auto lookup_result(d.lookup(field_id, term)); - auto handle(d.readPostingList(lookup_result)); + auto handle(field_index.read_posting_list(lookup_result)); TermFieldMatchData tfmd; TermFieldMatchDataArray tfmda; tfmda.add(&tfmd); - auto sbap(d.create_iterator(lookup_result, handle, tfmda)); + auto sbap(field_index.create_iterator(lookup_result, handle, tfmda)); sbap->initFullRange(); EXPECT_TRUE(sbap->seek(doc_id)); sbap->unpack(doc_id); @@ -179,12 +180,13 @@ validateDiskIndex(DiskIndex &dw, bool f2HasElements, bool f3HasWeights) { uint32_t id1(schema.getIndexFieldId("f0")); + auto& field_index = dw.get_field_index(id1); auto lr1(dw.lookup(id1, "c")); - auto wh1(dw.readPostingList(lr1)); + auto wh1(field_index.read_posting_list(lr1)); TermFieldMatchData f0; TermFieldMatchDataArray a; a.add(&f0); - auto sbap(dw.create_iterator(lr1, wh1, a)); + auto sbap(field_index.create_iterator(lr1, wh1, a)); sbap->initFullRange(); EXPECT_EQ(std::string("{1000000:}"), toString(f0.getIterator())); EXPECT_TRUE(sbap->seek(10)); @@ -193,12 +195,13 @@ validateDiskIndex(DiskIndex &dw, bool f2HasElements, bool f3HasWeights) } { 
uint32_t id1(schema.getIndexFieldId("f2")); + auto& field_index = dw.get_field_index(id1); auto lr1(dw.lookup(id1, "ax")); - auto wh1(dw.readPostingList(lr1)); + auto wh1(field_index.read_posting_list(lr1)); TermFieldMatchData f2; TermFieldMatchDataArray a; a.add(&f2); - auto sbap(dw.create_iterator(lr1, wh1, a)); + auto sbap(field_index.create_iterator(lr1, wh1, a)); sbap->initFullRange(); EXPECT_EQ(std::string("{1000000:}"), toString(f2.getIterator())); EXPECT_TRUE(sbap->seek(10)); @@ -213,12 +216,13 @@ validateDiskIndex(DiskIndex &dw, bool f2HasElements, bool f3HasWeights) } { uint32_t id1(schema.getIndexFieldId("f3")); + auto& field_index = dw.get_field_index(id1); auto lr1(dw.lookup(id1, "wx")); - auto wh1(dw.readPostingList(lr1)); + auto wh1(field_index.read_posting_list(lr1)); TermFieldMatchData f3; TermFieldMatchDataArray a; a.add(&f3); - auto sbap(dw.create_iterator(lr1, wh1, a)); + auto sbap(field_index.create_iterator(lr1, wh1, a)); sbap->initFullRange(); EXPECT_EQ(std::string("{1000000:}"), toString(f3.getIterator())); EXPECT_TRUE(sbap->seek(10)); @@ -233,12 +237,13 @@ validateDiskIndex(DiskIndex &dw, bool f2HasElements, bool f3HasWeights) } { uint32_t id1(schema.getIndexFieldId("f3"));; + auto& field_index = dw.get_field_index(id1); auto lr1(dw.lookup(id1, "zz")); - auto wh1(dw.readPostingList(lr1)); + auto wh1(field_index.read_posting_list(lr1)); TermFieldMatchData f3; TermFieldMatchDataArray a; a.add(&f3); - auto sbap(dw.create_iterator(lr1, wh1, a)); + auto sbap(field_index.create_iterator(lr1, wh1, a)); sbap->initFullRange(); EXPECT_EQ(std::string("{1000000:}"), toString(f3.getIterator())); EXPECT_TRUE(sbap->seek(11)); @@ -253,12 +258,13 @@ validateDiskIndex(DiskIndex &dw, bool f2HasElements, bool f3HasWeights) } { uint32_t id1(schema.getIndexFieldId("f3"));; + auto& field_index = dw.get_field_index(id1); auto lr1(dw.lookup(id1, "zz0")); - auto wh1(dw.readPostingList(lr1)); + auto wh1(field_index.read_posting_list(lr1)); TermFieldMatchData f3; 
TermFieldMatchDataArray a; a.add(&f3); - auto sbap(dw.create_iterator(lr1, wh1, a)); + auto sbap(field_index.create_iterator(lr1, wh1, a)); sbap->initFullRange(); EXPECT_EQ(std::string("{1000000:}"), toString(f3.getIterator())); EXPECT_TRUE(sbap->seek(12)); diff --git a/searchlib/src/vespa/searchlib/diskindex/bitvectordictionary.cpp b/searchlib/src/vespa/searchlib/diskindex/bitvectordictionary.cpp index a4ffc56ccc3d..8ea41148be9b 100644 --- a/searchlib/src/vespa/searchlib/diskindex/bitvectordictionary.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/bitvectordictionary.cpp @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "bitvectordictionary.h" +#include #include #include #include @@ -9,6 +10,8 @@ #include LOG_SETUP(".diskindex.bitvectordictionary"); +using search::index::BitVectorDictionaryLookupResult; + namespace search::diskindex { using namespace tags; @@ -83,19 +86,25 @@ BitVectorDictionary::open(const std::string &pathPrefix, return true; } - -BitVector::UP -BitVectorDictionary::lookup(uint64_t wordNum) -{ +BitVectorDictionaryLookupResult +BitVectorDictionary::lookup(uint64_t wordNum) { WordSingleKey key; key._wordNum = wordNum; auto itr = std::lower_bound(_entries.begin(), _entries.end(), key); if (itr == _entries.end() || key < *itr) { return {}; } - int64_t pos = &*itr - &_entries[0]; - int64_t offset = ((int64_t) _vectorSize) * pos + _datHeaderLen; - return BitVector::create(_docIdLimit, *_datFile, offset, itr->_numDocs); + return BitVectorDictionaryLookupResult(itr - _entries.begin()); +} + +std::unique_ptr +BitVectorDictionary::read_bitvector(BitVectorDictionaryLookupResult lookup_result) +{ + if (!lookup_result.valid()) { + return {}; + } + int64_t offset = ((int64_t) _vectorSize) * lookup_result.idx + _datHeaderLen; + return BitVector::create(_docIdLimit, *_datFile, offset, _entries[lookup_result.idx]._numDocs); } } diff --git 
a/searchlib/src/vespa/searchlib/diskindex/bitvectordictionary.h b/searchlib/src/vespa/searchlib/diskindex/bitvectordictionary.h index 9e4abdb7a4df..2564853f48d0 100644 --- a/searchlib/src/vespa/searchlib/diskindex/bitvectordictionary.h +++ b/searchlib/src/vespa/searchlib/diskindex/bitvectordictionary.h @@ -2,12 +2,17 @@ #pragma once #include "bitvectorkeyscope.h" -#include +//#include +#include #include #include #include #include +class FastOS_FileInterface; + +namespace search { class BitVector; } + namespace search::diskindex { /** @@ -48,13 +53,19 @@ class BitVectorDictionary BitVectorKeyScope scope); /** - * Lookup the given word number and load and return the associated - * bit vector if found. + * Lookup the given word number. + * + * @param word_num the word number to lookup a bit vector for. + * @return a bitvector dictionary lookup result that can be passed to read_bitvector member function. + **/ + index::BitVectorDictionaryLookupResult lookup(uint64_t word_num); + /** + * load and return the associated bit vector if lookup result is valid. * - * @param wordNum the word number to lookup a bit vector for. - * @return the loaded bit vector or nullptr if not found. + * @param lookup_result the result returned from lookup. + * @return the loaded bit vector or empty if lookup result was invalid. 
**/ - BitVector::UP lookup(uint64_t wordNum); + std::unique_ptr read_bitvector(index::BitVectorDictionaryLookupResult lookup_result); uint32_t getDocIdLimit() const { return _docIdLimit; } diff --git a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp index 77759cd60e72..f579aecceed6 100644 --- a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp @@ -81,7 +81,7 @@ DiskIndex::openDictionaries(const TuneFileSearch &tuneFileSearch) { for (SchemaUtil::IndexIterator itr(_schema); itr.isValid(); ++itr) { std::string field_dir = _indexDir + "/" + itr.getName(); - _field_indexes.emplace_back(_posting_list_cache); + _field_indexes.emplace_back(itr.getIndex(), _posting_list_cache); if (!_field_indexes.back().open_dictionary(field_dir, tuneFileSearch)) { _field_indexes.clear(); return false; @@ -234,29 +234,6 @@ DiskIndex::read(const Key & key, LookupResultVector & result) return true; } -index::PostingListHandle -DiskIndex::readPostingList(const LookupResult &lookupRes) const -{ - auto& field_index = _field_indexes[lookupRes.indexId]; - return field_index.read_posting_list(lookupRes); -} - -BitVector::UP -DiskIndex::readBitVector(const LookupResult &lookupRes) const -{ - auto& field_index = _field_indexes[lookupRes.indexId]; - return field_index.read_bit_vector(lookupRes); -} - -std::unique_ptr -DiskIndex::create_iterator(const LookupResult& lookup_result, - const index::PostingListHandle& handle, - const search::fef::TermFieldMatchDataArray& tfmda) const -{ - auto& field_index = _field_indexes[lookup_result.indexId]; - return field_index.create_iterator(lookup_result, handle, tfmda); -} - namespace { const std::vector nonfield_file_names{ @@ -334,7 +311,7 @@ class CreateBlueprintVisitor : public CreateBlueprintVisitorHelper { const DiskIndex::LookupResult & lookupRes = _cache.lookup(termStr, _fieldId); if (lookupRes.valid()) { bool useBitVector = 
_field.isFilter(); - setResult(std::make_unique(_field, _diskIndex, termStr, lookupRes, useBitVector)); + setResult(std::make_unique(_field, _diskIndex.get_field_index(_fieldId), termStr, lookupRes, useBitVector)); } else { setResult(std::make_unique(_field)); } diff --git a/searchlib/src/vespa/searchlib/diskindex/diskindex.h b/searchlib/src/vespa/searchlib/diskindex/diskindex.h index d692e1a16a7d..13eec5cece96 100644 --- a/searchlib/src/vespa/searchlib/diskindex/diskindex.h +++ b/searchlib/src/vespa/searchlib/diskindex/diskindex.h @@ -99,28 +99,6 @@ class DiskIndex : public queryeval::Searchable { LookupResultVector lookup(const std::vector & indexes, std::string_view word); - /** - * Read the posting list corresponding to the given lookup result. - * - * @param lookupRes the result of the previous dictionary lookup. - * @return a handle for the posting list in memory. - */ - index::PostingListHandle readPostingList(const LookupResult &lookupRes) const; - - std::unique_ptr - create_iterator(const LookupResult& lookup_result, - const index::PostingListHandle& handle, - const search::fef::TermFieldMatchDataArray& tfmda) const; - - /** - * Read the bit vector corresponding to the given lookup result. - * - * @param lookupRes the result of the previous dictionary lookup. - * @return the bit vector or nullptr if no bit vector exists for the - * word in the lookup result. 
- */ - BitVector::UP readBitVector(const LookupResult &lookupRes) const; - std::unique_ptr createBlueprint(const queryeval::IRequestContext & requestContext, const queryeval::FieldSpec &field, const query::Node &term) override; @@ -143,6 +121,7 @@ class DiskIndex : public queryeval::Searchable { index::FieldLengthInfo get_field_length_info(const std::string& field_name) const; const std::shared_ptr& get_posting_list_cache() const noexcept { return _posting_list_cache; } + const FieldIndex& get_field_index(uint32_t field_id) const noexcept { return _field_indexes[field_id]; } }; void swap(DiskIndex::LookupResult & a, DiskIndex::LookupResult & b); diff --git a/searchlib/src/vespa/searchlib/diskindex/disktermblueprint.cpp b/searchlib/src/vespa/searchlib/diskindex/disktermblueprint.cpp index 8a66ec5dd8ee..08578fce9f08 100644 --- a/searchlib/src/vespa/searchlib/diskindex/disktermblueprint.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/disktermblueprint.cpp @@ -14,6 +14,7 @@ LOG_SETUP(".diskindex.disktermblueprint"); using search::BitVectorIterator; using search::fef::TermFieldMatchDataArray; +using search::index::DictionaryLookupResult; using search::index::Schema; using search::queryeval::Blueprint; using search::queryeval::BooleanMatchIteratorWrapper; @@ -37,19 +38,20 @@ getName(uint32_t indexId) } DiskTermBlueprint::DiskTermBlueprint(const FieldSpec & field, - const DiskIndex & diskIndex, + const FieldIndex& field_index, const std::string& query_term, - DiskIndex::LookupResult lookupRes, - bool useBitVector) : - SimpleLeafBlueprint(field), - _field(field), - _diskIndex(diskIndex), - _query_term(query_term), - _lookupRes(std::move(lookupRes)), - _useBitVector(useBitVector), - _fetchPostingsDone(false), - _postingHandle(), - _bitVector() + DictionaryLookupResult lookupRes, + bool useBitVector) + : SimpleLeafBlueprint(field), + _field(field), + _field_index(field_index), + _query_term(query_term), + _lookupRes(std::move(lookupRes)), + 
_bitvector_lookup_result(_field_index.lookup_bit_vector(_lookupRes)), + _useBitVector(useBitVector), + _fetchPostingsDone(false), + _postingHandle(), + _bitVector() { setEstimate(HitEstimate(_lookupRes.counts._numDocs, _lookupRes.counts._numDocs == 0)); @@ -60,9 +62,9 @@ DiskTermBlueprint::fetchPostings(const queryeval::ExecuteInfo &execInfo) { (void) execInfo; if (!_fetchPostingsDone) { - _bitVector = _diskIndex.readBitVector(_lookupRes); + _bitVector = _field_index.read_bit_vector(_bitvector_lookup_result); if (!_useBitVector || !_bitVector) { - _postingHandle = _diskIndex.readPostingList(_lookupRes); + _postingHandle = _field_index.read_posting_list(_lookupRes); } } _fetchPostingsDone = true; @@ -80,17 +82,17 @@ DiskTermBlueprint::createLeafSearch(const TermFieldMatchDataArray & tfmda) const { if (_bitVector && (_useBitVector || tfmda[0]->isNotNeeded())) { LOG(debug, "Return BitVectorIterator: %s, wordNum(%" PRIu64 "), docCount(%" PRIu64 ")", - getName(_lookupRes.indexId).c_str(), _lookupRes.wordNum, _lookupRes.counts._numDocs); + getName(_field_index.get_field_id()).c_str(), _lookupRes.wordNum, _lookupRes.counts._numDocs); return BitVectorIterator::create(_bitVector.get(), *tfmda[0], strict()); } - auto search(_diskIndex.create_iterator(_lookupRes, _postingHandle, tfmda)); + auto search(_field_index.create_iterator(_lookupRes, _postingHandle, tfmda)); if (_useBitVector) { LOG(debug, "Return BooleanMatchIteratorWrapper: %s, wordNum(%" PRIu64 "), docCount(%" PRIu64 ")", - getName(_lookupRes.indexId).c_str(), _lookupRes.wordNum, _lookupRes.counts._numDocs); + getName(_field_index.get_field_id()).c_str(), _lookupRes.wordNum, _lookupRes.counts._numDocs); return std::make_unique(std::move(search), tfmda); } LOG(debug, "Return posting list iterator: %s, wordNum(%" PRIu64 "), docCount(%" PRIu64 ")", - getName(_lookupRes.indexId).c_str(), _lookupRes.wordNum, _lookupRes.counts._numDocs); + getName(_field_index.get_field_id()).c_str(), _lookupRes.wordNum, 
_lookupRes.counts._numDocs); return search; } @@ -102,7 +104,7 @@ DiskTermBlueprint::createFilterSearch(FilterConstraint) const if (_bitVector) { wrapper->wrap(BitVectorIterator::create(_bitVector.get(), *tfmda[0], strict())); } else { - wrapper->wrap(_diskIndex.create_iterator(_lookupRes, _postingHandle, tfmda)); + wrapper->wrap(_field_index.create_iterator(_lookupRes, _postingHandle, tfmda)); } return wrapper; } diff --git a/searchlib/src/vespa/searchlib/diskindex/disktermblueprint.h b/searchlib/src/vespa/searchlib/diskindex/disktermblueprint.h index 3709eba0dde4..1b0196914f4f 100644 --- a/searchlib/src/vespa/searchlib/diskindex/disktermblueprint.h +++ b/searchlib/src/vespa/searchlib/diskindex/disktermblueprint.h @@ -2,7 +2,7 @@ #pragma once -#include "diskindex.h" +#include "field_index.h" #include namespace search::diskindex { @@ -14,9 +14,10 @@ class DiskTermBlueprint : public queryeval::SimpleLeafBlueprint { private: queryeval::FieldSpec _field; - const DiskIndex & _diskIndex; + const FieldIndex& _field_index; std::string _query_term; - DiskIndex::LookupResult _lookupRes; + index::DictionaryLookupResult _lookupRes; + index::BitVectorDictionaryLookupResult _bitvector_lookup_result; bool _useBitVector; bool _fetchPostingsDone; index::PostingListHandle _postingHandle; @@ -27,14 +28,14 @@ class DiskTermBlueprint : public queryeval::SimpleLeafBlueprint * Create a new blueprint. * * @param field the field to search in. - * @param diskIndex the disk index used to read the bit vector or posting list. + * @param field_index the field index used to read the bit vector or posting list. * @param lookupRes the result after disk dictionary lookup. * @param useBitVector whether or not we should use bit vector. 
**/ DiskTermBlueprint(const queryeval::FieldSpec & field, - const DiskIndex & diskIndex, + const FieldIndex& field_index, const std::string& query_term, - DiskIndex::LookupResult lookupRes, + index::DictionaryLookupResult lookupRes, bool useBitVector); queryeval::FlowStats calculate_flow_stats(uint32_t docid_limit) const override; diff --git a/searchlib/src/vespa/searchlib/diskindex/field_index.cpp b/searchlib/src/vespa/searchlib/diskindex/field_index.cpp index c1ea0c1f7c65..cc318a68445a 100644 --- a/searchlib/src/vespa/searchlib/diskindex/field_index.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/field_index.cpp @@ -3,6 +3,7 @@ #include "field_index.h" #include "fileheader.h" #include "pagedict4randread.h" +#include #include #include #include @@ -10,6 +11,7 @@ #include LOG_SETUP(".diskindex.field_index"); +using search::index::BitVectorDictionaryLookupResult; using search::index::DictionaryLookupResult; using search::index::PostingListHandle; @@ -45,13 +47,15 @@ FieldIndex::FieldIndex() _file_id(0), _size_on_disk(0), _cache_disk_io_stats(std::make_shared()), - _posting_list_cache() + _posting_list_cache(), + _field_id(0) { } -FieldIndex::FieldIndex(std::shared_ptr posting_list_cache) +FieldIndex::FieldIndex(uint32_t field_id, std::shared_ptr posting_list_cache) : FieldIndex() { + _field_id = field_id; _posting_list_cache = std::move(posting_list_cache); } @@ -203,8 +207,8 @@ FieldIndex::read_posting_list(const DictionaryLookupResult& lookup_result) const return result; } -std::unique_ptr -FieldIndex::read_bit_vector(const DictionaryLookupResult& lookup_result) const +BitVectorDictionaryLookupResult +FieldIndex::lookup_bit_vector(const DictionaryLookupResult& lookup_result) const { if (!_bit_vector_dict) { return {}; @@ -212,8 +216,17 @@ FieldIndex::read_bit_vector(const DictionaryLookupResult& lookup_result) const return _bit_vector_dict->lookup(lookup_result.wordNum); } +std::unique_ptr +FieldIndex::read_bit_vector(BitVectorDictionaryLookupResult 
lookup_result) const +{ + if (!_bit_vector_dict) { + return {}; + } + return _bit_vector_dict->read_bitvector(lookup_result); +} + std::unique_ptr -FieldIndex::create_iterator(const search::index::DictionaryLookupResult& lookup_result, +FieldIndex::create_iterator(const DictionaryLookupResult& lookup_result, const index::PostingListHandle& handle, const search::fef::TermFieldMatchDataArray& tfmda) const { diff --git a/searchlib/src/vespa/searchlib/diskindex/field_index.h b/searchlib/src/vespa/searchlib/diskindex/field_index.h index bbe835e6f037..3fe002ab6775 100644 --- a/searchlib/src/vespa/searchlib/diskindex/field_index.h +++ b/searchlib/src/vespa/searchlib/diskindex/field_index.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -55,11 +56,12 @@ class FieldIndex : public IPostingListCache::IPostingListFileBacking { std::shared_ptr _cache_disk_io_stats; std::shared_ptr _posting_list_cache; static std::atomic _file_id_source; + uint32_t _field_id; static uint64_t get_next_file_id() noexcept { return _file_id_source.fetch_add(1) + 1; } public: FieldIndex(); - FieldIndex(std::shared_ptr posting_list_cache); + FieldIndex(uint32_t field_id, std::shared_ptr posting_list_cache); FieldIndex(const FieldIndex& rhs) = delete; FieldIndex(FieldIndex&& rhs); ~FieldIndex(); @@ -71,7 +73,8 @@ class FieldIndex : public IPostingListCache::IPostingListFileBacking { index::PostingListHandle read_uncached_posting_list(const search::index::DictionaryLookupResult& lookup_result) const; index::PostingListHandle read(const IPostingListCache::Key& key) const override; index::PostingListHandle read_posting_list(const search::index::DictionaryLookupResult& lookup_result) const; - std::unique_ptr read_bit_vector(const search::index::DictionaryLookupResult& lookup_result) const; + index::BitVectorDictionaryLookupResult lookup_bit_vector(const search::index::DictionaryLookupResult& lookup_result) const; + std::unique_ptr 
read_bit_vector(index::BitVectorDictionaryLookupResult lookup_result) const; std::unique_ptr create_iterator(const search::index::DictionaryLookupResult& lookup_result, const index::PostingListHandle& handle, const search::fef::TermFieldMatchDataArray& tfmda) const; @@ -79,6 +82,7 @@ class FieldIndex : public IPostingListCache::IPostingListFileBacking { index::DictionaryFileRandRead* get_dictionary() noexcept { return _dict.get(); } FieldIndexStats get_stats() const; + uint32_t get_field_id() const noexcept { return _field_id; } }; } diff --git a/searchlib/src/vespa/searchlib/index/bitvector_dictionary_lookup_result.h b/searchlib/src/vespa/searchlib/index/bitvector_dictionary_lookup_result.h new file mode 100644 index 000000000000..a81ca98127e3 --- /dev/null +++ b/searchlib/src/vespa/searchlib/index/bitvector_dictionary_lookup_result.h @@ -0,0 +1,29 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <cstdint> +#include <limits> + +namespace search::index { + +/** + * The result after performing a disk bitvector dictionary lookup. + **/ +class BitVectorDictionaryLookupResult { +public: + static constexpr uint32_t invalid = std::numeric_limits<uint32_t>::max(); + uint64_t idx; + + explicit BitVectorDictionaryLookupResult(uint32_t idx_in) noexcept + : idx(idx_in) + { + } + BitVectorDictionaryLookupResult() noexcept + : BitVectorDictionaryLookupResult(invalid) + { + } + bool valid() const noexcept { return idx != invalid; } +}; + +} From c7bd8730f2b615141b932e501debecc6bccb159d Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Wed, 13 Nov 2024 13:53:16 +0100 Subject: [PATCH 2/3] Skip bitvector dictionary lookup for words with no posting list.
--- searchlib/src/vespa/searchlib/diskindex/field_index.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searchlib/src/vespa/searchlib/diskindex/field_index.cpp b/searchlib/src/vespa/searchlib/diskindex/field_index.cpp index cc318a68445a..dac7b1cf7a5f 100644 --- a/searchlib/src/vespa/searchlib/diskindex/field_index.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/field_index.cpp @@ -210,7 +210,7 @@ FieldIndex::read_posting_list(const DictionaryLookupResult& lookup_result) const BitVectorDictionaryLookupResult FieldIndex::lookup_bit_vector(const DictionaryLookupResult& lookup_result) const { - if (!_bit_vector_dict) { + if (!_bit_vector_dict || !lookup_result.valid()) { return {}; } return _bit_vector_dict->lookup(lookup_result.wordNum); From be58cca5f70af4c50d9e9b3c48a8e7b5605187c8 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Wed, 13 Nov 2024 16:45:06 +0100 Subject: [PATCH 3/3] Remove unused include and adjust member function comment. --- searchlib/src/vespa/searchlib/diskindex/bitvectordictionary.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/searchlib/src/vespa/searchlib/diskindex/bitvectordictionary.h b/searchlib/src/vespa/searchlib/diskindex/bitvectordictionary.h index 2564853f48d0..b3ce7a183468 100644 --- a/searchlib/src/vespa/searchlib/diskindex/bitvectordictionary.h +++ b/searchlib/src/vespa/searchlib/diskindex/bitvectordictionary.h @@ -2,7 +2,6 @@ #pragma once #include "bitvectorkeyscope.h" -//#include #include #include #include @@ -60,7 +59,7 @@ class BitVectorDictionary **/ index::BitVectorDictionaryLookupResult lookup(uint64_t word_num); /** - * load and return the associated bit vector if lookup result is valid. + * Load and return the associated bit vector if lookup result is valid. * * @param lookup_result the result returned from lookup. * @return the loaded bit vector or empty if lookup result was invalid.