Skip to content

Commit

Permalink
Merge pull request #32848 from vespa-engine/toregge/separate-bit-vector-lookup-from-bit-vector-read
Browse files Browse the repository at this point in the history

Separate bit vector lookup from bit vector read.
  • Loading branch information
geirst authored Nov 13, 2024
2 parents ce4e230 + be58cca commit 4b781c2
Show file tree
Hide file tree
Showing 12 changed files with 158 additions and 117 deletions.
20 changes: 12 additions & 8 deletions searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,8 @@ TEST_P(BitVectorTest, require_that_dictionary_handles_no_entries)
EXPECT_TRUE(dict.open("dump/1/", tuneFileRead, bvScope));
EXPECT_EQ(5u, dict.getDocIdLimit());
EXPECT_EQ(0u, dict.getEntries().size());
EXPECT_FALSE(dict.lookup(1));
EXPECT_FALSE(dict.lookup(2));
EXPECT_FALSE(dict.lookup(1).valid());
EXPECT_FALSE(dict.lookup(2).valid());
}

TEST_P(BitVectorTest, require_that_dictionary_handles_multiple_entries)
Expand Down Expand Up @@ -176,16 +176,20 @@ TEST_P(BitVectorTest, require_that_dictionary_handles_multiple_entries)
EXPECT_EQ(5u, e._wordNum);
EXPECT_EQ(23u, e._numDocs);

EXPECT_FALSE(dict.lookup(2));
EXPECT_FALSE(dict.lookup(3));
EXPECT_FALSE(dict.lookup(4));
EXPECT_FALSE(dict.lookup(6));
EXPECT_FALSE(dict.lookup(2).valid());
EXPECT_FALSE(dict.lookup(3).valid());
EXPECT_FALSE(dict.lookup(4).valid());
EXPECT_FALSE(dict.lookup(6).valid());

BitVector::UP bv1act = dict.lookup(1);
auto bv1lr = dict.lookup(1);
EXPECT_TRUE(bv1lr.valid());
auto bv1act = dict.read_bitvector(bv1lr);
EXPECT_TRUE(bv1act);
EXPECT_TRUE(*bv1exp == *bv1act);

BitVector::UP bv5act = dict.lookup(5);
auto bv5lr = dict.lookup(5);
EXPECT_TRUE(bv5lr.valid());
auto bv5act = dict.read_bitvector(bv5lr);
EXPECT_TRUE(bv5act);
EXPECT_TRUE(*bv5exp == *bv5act);
}
Expand Down
17 changes: 12 additions & 5 deletions searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -249,8 +249,9 @@ DiskIndexTest::requireThatWeCanReadPostingList()
TermFieldMatchDataArray mda;
{ // field 'f1'
auto r = _index->lookup(0, "w1");
auto h = _index->readPostingList(r);
auto sb = _index->create_iterator(r, h, mda);
auto& field_index = _index->get_field_index(0);
auto h = field_index.read_posting_list(r);
auto sb = field_index.create_iterator(r, h, mda);
EXPECT_EQ(SimpleResult({1,3}), SimpleResult().search(*sb));
}
}
Expand All @@ -274,16 +275,22 @@ DiskIndexTest::requireThatWeCanReadBitVector()
{
{ // word 'w1'
auto r = _index->lookup(1, "w1");
auto& field_index = _index->get_field_index(1);
// no bit vector for 'w1'
EXPECT_TRUE(_index->readBitVector(r).get() == NULL);
auto blr = field_index.lookup_bit_vector(r);
EXPECT_FALSE(blr.valid());
EXPECT_TRUE(field_index.read_bit_vector(blr).get() == nullptr);
}
{ // word 'w2'
BitVector::UP exp(BitVector::create(32));
for (uint32_t docId = 1; docId < 18; ++docId) exp->setBit(docId);
{ // field 'f2'
auto r = _index->lookup(1, "w2");
BitVector::UP bv = _index->readBitVector(r);
EXPECT_TRUE(bv.get() != NULL);
auto& field_index = _index->get_field_index(1);
auto blr = field_index.lookup_bit_vector(r);
EXPECT_TRUE(blr.valid());
BitVector::UP bv = field_index.read_bit_vector(blr);
EXPECT_TRUE(bv.get() != nullptr);
EXPECT_TRUE(*bv == *exp);
}
}
Expand Down
30 changes: 18 additions & 12 deletions searchlib/src/tests/diskindex/fusion/fusion_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,12 +159,13 @@ assert_interleaved_features(DiskIndex &d, const std::string &field, const std::s
{
const Schema &schema = d.getSchema();
uint32_t field_id(schema.getIndexFieldId(field));
auto& field_index = d.get_field_index(field_id);
auto lookup_result(d.lookup(field_id, term));
auto handle(d.readPostingList(lookup_result));
auto handle(field_index.read_posting_list(lookup_result));
TermFieldMatchData tfmd;
TermFieldMatchDataArray tfmda;
tfmda.add(&tfmd);
auto sbap(d.create_iterator(lookup_result, handle, tfmda));
auto sbap(field_index.create_iterator(lookup_result, handle, tfmda));
sbap->initFullRange();
EXPECT_TRUE(sbap->seek(doc_id));
sbap->unpack(doc_id);
Expand All @@ -179,12 +180,13 @@ validateDiskIndex(DiskIndex &dw, bool f2HasElements, bool f3HasWeights)

{
uint32_t id1(schema.getIndexFieldId("f0"));
auto& field_index = dw.get_field_index(id1);
auto lr1(dw.lookup(id1, "c"));
auto wh1(dw.readPostingList(lr1));
auto wh1(field_index.read_posting_list(lr1));
TermFieldMatchData f0;
TermFieldMatchDataArray a;
a.add(&f0);
auto sbap(dw.create_iterator(lr1, wh1, a));
auto sbap(field_index.create_iterator(lr1, wh1, a));
sbap->initFullRange();
EXPECT_EQ(std::string("{1000000:}"), toString(f0.getIterator()));
EXPECT_TRUE(sbap->seek(10));
Expand All @@ -193,12 +195,13 @@ validateDiskIndex(DiskIndex &dw, bool f2HasElements, bool f3HasWeights)
}
{
uint32_t id1(schema.getIndexFieldId("f2"));
auto& field_index = dw.get_field_index(id1);
auto lr1(dw.lookup(id1, "ax"));
auto wh1(dw.readPostingList(lr1));
auto wh1(field_index.read_posting_list(lr1));
TermFieldMatchData f2;
TermFieldMatchDataArray a;
a.add(&f2);
auto sbap(dw.create_iterator(lr1, wh1, a));
auto sbap(field_index.create_iterator(lr1, wh1, a));
sbap->initFullRange();
EXPECT_EQ(std::string("{1000000:}"), toString(f2.getIterator()));
EXPECT_TRUE(sbap->seek(10));
Expand All @@ -213,12 +216,13 @@ validateDiskIndex(DiskIndex &dw, bool f2HasElements, bool f3HasWeights)
}
{
uint32_t id1(schema.getIndexFieldId("f3"));
auto& field_index = dw.get_field_index(id1);
auto lr1(dw.lookup(id1, "wx"));
auto wh1(dw.readPostingList(lr1));
auto wh1(field_index.read_posting_list(lr1));
TermFieldMatchData f3;
TermFieldMatchDataArray a;
a.add(&f3);
auto sbap(dw.create_iterator(lr1, wh1, a));
auto sbap(field_index.create_iterator(lr1, wh1, a));
sbap->initFullRange();
EXPECT_EQ(std::string("{1000000:}"), toString(f3.getIterator()));
EXPECT_TRUE(sbap->seek(10));
Expand All @@ -233,12 +237,13 @@ validateDiskIndex(DiskIndex &dw, bool f2HasElements, bool f3HasWeights)
}
{
uint32_t id1(schema.getIndexFieldId("f3"));;
auto& field_index = dw.get_field_index(id1);
auto lr1(dw.lookup(id1, "zz"));
auto wh1(dw.readPostingList(lr1));
auto wh1(field_index.read_posting_list(lr1));
TermFieldMatchData f3;
TermFieldMatchDataArray a;
a.add(&f3);
auto sbap(dw.create_iterator(lr1, wh1, a));
auto sbap(field_index.create_iterator(lr1, wh1, a));
sbap->initFullRange();
EXPECT_EQ(std::string("{1000000:}"), toString(f3.getIterator()));
EXPECT_TRUE(sbap->seek(11));
Expand All @@ -253,12 +258,13 @@ validateDiskIndex(DiskIndex &dw, bool f2HasElements, bool f3HasWeights)
}
{
uint32_t id1(schema.getIndexFieldId("f3"));;
auto& field_index = dw.get_field_index(id1);
auto lr1(dw.lookup(id1, "zz0"));
auto wh1(dw.readPostingList(lr1));
auto wh1(field_index.read_posting_list(lr1));
TermFieldMatchData f3;
TermFieldMatchDataArray a;
a.add(&f3);
auto sbap(dw.create_iterator(lr1, wh1, a));
auto sbap(field_index.create_iterator(lr1, wh1, a));
sbap->initFullRange();
EXPECT_EQ(std::string("{1000000:}"), toString(f3.getIterator()));
EXPECT_TRUE(sbap->seek(12));
Expand Down
23 changes: 16 additions & 7 deletions searchlib/src/vespa/searchlib/diskindex/bitvectordictionary.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#include "bitvectordictionary.h"
#include <vespa/searchlib/common/bitvector.h>
#include <vespa/searchlib/common/fileheadertags.h>
#include <vespa/vespalib/data/fileheader.h>
#include <vespa/fastos/file.h>
Expand All @@ -9,6 +10,8 @@
#include <vespa/log/log.h>
LOG_SETUP(".diskindex.bitvectordictionary");

using search::index::BitVectorDictionaryLookupResult;

namespace search::diskindex {

using namespace tags;
Expand Down Expand Up @@ -83,19 +86,25 @@ BitVectorDictionary::open(const std::string &pathPrefix,
return true;
}


BitVector::UP
BitVectorDictionary::lookup(uint64_t wordNum)
{
BitVectorDictionaryLookupResult
BitVectorDictionary::lookup(uint64_t wordNum) {
WordSingleKey key;
key._wordNum = wordNum;
auto itr = std::lower_bound(_entries.begin(), _entries.end(), key);
if (itr == _entries.end() || key < *itr) {
return {};
}
int64_t pos = &*itr - &_entries[0];
int64_t offset = ((int64_t) _vectorSize) * pos + _datHeaderLen;
return BitVector::create(_docIdLimit, *_datFile, offset, itr->_numDocs);
return BitVectorDictionaryLookupResult(itr - _entries.begin());
}

/**
 * Load the bit vector identified by a previous dictionary lookup.
 *
 * @param lookup_result the result returned from lookup().
 * @return the loaded bit vector, or an empty pointer when the lookup
 *         result is not valid.
 */
std::unique_ptr<BitVector>
BitVectorDictionary::read_bitvector(BitVectorDictionaryLookupResult lookup_result)
{
    if (lookup_result.valid()) {
        const auto entry_idx = lookup_result.idx;
        // Position in the data file: header length plus entry_idx vectors of _vectorSize each.
        // The vector size is widened to 64 bits before the multiplication.
        int64_t file_offset = static_cast<int64_t>(_vectorSize) * entry_idx + _datHeaderLen;
        return BitVector::create(_docIdLimit, *_datFile, file_offset, _entries[entry_idx]._numDocs);
    }
    return {};
}

}
22 changes: 16 additions & 6 deletions searchlib/src/vespa/searchlib/diskindex/bitvectordictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@
#pragma once

#include "bitvectorkeyscope.h"
#include <vespa/searchlib/common/bitvector.h>
#include <vespa/searchlib/index/bitvector_dictionary_lookup_result.h>
#include <vespa/searchlib/index/bitvectorkeys.h>
#include <vespa/searchlib/common/tunefileinfo.h>
#include <string>
#include <vector>

class FastOS_FileInterface;

namespace search { class BitVector; }

namespace search::diskindex {

/**
Expand Down Expand Up @@ -48,13 +52,19 @@ class BitVectorDictionary
BitVectorKeyScope scope);

/**
* Lookup the given word number and load and return the associated
* bit vector if found.
* Lookup the given word number.
*
* @param word_num the word number to lookup a bit vector for.
* @return a bitvector dictionary lookup result that can be passed to read_bitvector member function.
**/
index::BitVectorDictionaryLookupResult lookup(uint64_t word_num);
/**
* Load and return the associated bit vector if lookup result is valid.
*
* @param wordNum the word number to lookup a bit vector for.
* @return the loaded bit vector or nullptr if not found.
* @param lookup_result the result returned from lookup.
* @return the loaded bit vector or empty if lookup result was invalid.
**/
BitVector::UP lookup(uint64_t wordNum);
std::unique_ptr<BitVector> read_bitvector(index::BitVectorDictionaryLookupResult lookup_result);

uint32_t getDocIdLimit() const { return _docIdLimit; }

Expand Down
27 changes: 2 additions & 25 deletions searchlib/src/vespa/searchlib/diskindex/diskindex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ DiskIndex::openDictionaries(const TuneFileSearch &tuneFileSearch)
{
for (SchemaUtil::IndexIterator itr(_schema); itr.isValid(); ++itr) {
std::string field_dir = _indexDir + "/" + itr.getName();
_field_indexes.emplace_back(_posting_list_cache);
_field_indexes.emplace_back(itr.getIndex(), _posting_list_cache);
if (!_field_indexes.back().open_dictionary(field_dir, tuneFileSearch)) {
_field_indexes.clear();
return false;
Expand Down Expand Up @@ -234,29 +234,6 @@ DiskIndex::read(const Key & key, LookupResultVector & result)
return true;
}

/**
 * Read the posting list corresponding to the given lookup result.
 *
 * The work is forwarded to the field index selected by lookupRes.indexId.
 */
index::PostingListHandle
DiskIndex::readPostingList(const LookupResult &lookupRes) const
{
    return _field_indexes[lookupRes.indexId].read_posting_list(lookupRes);
}

/**
 * Read the bit vector corresponding to the given lookup result.
 *
 * The work is forwarded to the field index selected by lookupRes.indexId.
 */
BitVector::UP
DiskIndex::readBitVector(const LookupResult &lookupRes) const
{
    return _field_indexes[lookupRes.indexId].read_bit_vector(lookupRes);
}

/**
 * Create a search iterator for the posting list referenced by handle.
 *
 * The work is forwarded to the field index selected by lookup_result.indexId;
 * all three arguments are passed through unchanged.
 */
std::unique_ptr<search::queryeval::SearchIterator>
DiskIndex::create_iterator(const LookupResult& lookup_result,
                           const index::PostingListHandle& handle,
                           const search::fef::TermFieldMatchDataArray& tfmda) const
{
    return _field_indexes[lookup_result.indexId].create_iterator(lookup_result, handle, tfmda);
}

namespace {

const std::vector<std::string> nonfield_file_names{
Expand Down Expand Up @@ -334,7 +311,7 @@ class CreateBlueprintVisitor : public CreateBlueprintVisitorHelper {
const DiskIndex::LookupResult & lookupRes = _cache.lookup(termStr, _fieldId);
if (lookupRes.valid()) {
bool useBitVector = _field.isFilter();
setResult(std::make_unique<DiskTermBlueprint>(_field, _diskIndex, termStr, lookupRes, useBitVector));
setResult(std::make_unique<DiskTermBlueprint>(_field, _diskIndex.get_field_index(_fieldId), termStr, lookupRes, useBitVector));
} else {
setResult(std::make_unique<EmptyBlueprint>(_field));
}
Expand Down
23 changes: 1 addition & 22 deletions searchlib/src/vespa/searchlib/diskindex/diskindex.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,28 +99,6 @@ class DiskIndex : public queryeval::Searchable {

LookupResultVector lookup(const std::vector<uint32_t> & indexes, std::string_view word);

/**
* Read the posting list corresponding to the given lookup result.
*
* @param lookupRes the result of the previous dictionary lookup.
* @return a handle for the posting list in memory.
*/
index::PostingListHandle readPostingList(const LookupResult &lookupRes) const;

std::unique_ptr<search::queryeval::SearchIterator>
create_iterator(const LookupResult& lookup_result,
const index::PostingListHandle& handle,
const search::fef::TermFieldMatchDataArray& tfmda) const;

/**
* Read the bit vector corresponding to the given lookup result.
*
* @param lookupRes the result of the previous dictionary lookup.
* @return the bit vector or nullptr if no bit vector exists for the
* word in the lookup result.
*/
BitVector::UP readBitVector(const LookupResult &lookupRes) const;

std::unique_ptr<queryeval::Blueprint> createBlueprint(const queryeval::IRequestContext & requestContext,
const queryeval::FieldSpec &field,
const query::Node &term) override;
Expand All @@ -143,6 +121,7 @@ class DiskIndex : public queryeval::Searchable {

index::FieldLengthInfo get_field_length_info(const std::string& field_name) const;
/** Get the posting list cache; returned by const reference, so the shared pointer is not copied here. */
const std::shared_ptr<IPostingListCache>& get_posting_list_cache() const noexcept {
    return _posting_list_cache;
}
/**
 * Get the field index for the given field id.
 *
 * The id is used directly as a position in _field_indexes; no range check is done here.
 */
const FieldIndex& get_field_index(uint32_t field_id) const noexcept {
    const auto& field_index = _field_indexes[field_id];
    return field_index;
}
};

void swap(DiskIndex::LookupResult & a, DiskIndex::LookupResult & b);
Expand Down
Loading

0 comments on commit 4b781c2

Please sign in to comment.