Skip to content

Commit

Permalink
Make it tuneable when to use bitvector when searching a disk index.
Browse files Browse the repository at this point in the history
The bitvector hit estimate limit can be used to tune performance at the cost of quality,
by not reading the large posocc posting lists for common words from disk.
The posocc posting lists for the most common words are typically more than 100x larger than the corresponding bitvectors.
  • Loading branch information
geirst committed Nov 27, 2024
1 parent 3e66324 commit 0f7b79a
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 58 deletions.
95 changes: 55 additions & 40 deletions searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ class DiskIndexTest : public ::testing::Test, public TestDiskIndex {
void test_io_settings(const IOSettings& io_settings);
SimpleResult search(const FieldIndex& field_index, const DictionaryLookupResult& lookup_result,
const PostingListHandle& handle);
Blueprint::UP create_blueprint(const FieldSpec& field, const search::query::Node& term, uint32_t docid_limit=1000);
};

DiskIndexTest::DiskIndexTest() = default;
Expand Down Expand Up @@ -256,6 +257,14 @@ DiskIndexTest::search(const FieldIndex& field_index, const DictionaryLookupResul
return SimpleResult().search(*sb);
}

// Test helper: creates a blueprint for 'term' in 'field' from the disk index,
// then runs basic_plan() with the given docid limit and fetchPostings() with
// ExecuteInfo::FULL before handing the blueprint back to the caller.
Blueprint::UP
DiskIndexTest::create_blueprint(const FieldSpec& field, const search::query::Node& term, uint32_t docid_limit)
{
    auto blueprint = _index->createBlueprint(_requestContext, field, term);
    blueprint->basic_plan(true, docid_limit);
    blueprint->fetchPostings(search::queryeval::ExecuteInfo::FULL);
    return blueprint;
}

void
DiskIndexTest::requireThatWeCanReadPostingList(const IOSettings& io_settings)
Expand Down Expand Up @@ -327,28 +336,23 @@ void
DiskIndexTest::requireThatBlueprintIsCreated()
{
{ // unknown field
Blueprint::UP b =
_index->createBlueprint(_requestContext, FieldSpec("none", 0, 0), makeTerm("w1"));
EXPECT_TRUE(dynamic_cast<EmptyBlueprint *>(b.get()) != NULL);
auto b = _index->createBlueprint(_requestContext, FieldSpec("none", 0, 0), makeTerm("w1"));
EXPECT_TRUE(dynamic_cast<EmptyBlueprint *>(b.get()) != nullptr);
}
{ // unknown word
Blueprint::UP b =
_index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0), makeTerm("none"));
EXPECT_TRUE(dynamic_cast<EmptyBlueprint *>(b.get()) != NULL);
auto b = _index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0), makeTerm("none"));
EXPECT_TRUE(dynamic_cast<EmptyBlueprint *>(b.get()) != nullptr);
}
{ // known field & word with hits
Blueprint::UP b =
_index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0), makeTerm("w1"));
EXPECT_TRUE(dynamic_cast<DiskTermBlueprint *>(b.get()) != NULL);
auto b = _index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0), makeTerm("w1"));
EXPECT_TRUE(dynamic_cast<DiskTermBlueprint *>(b.get()) != nullptr);
EXPECT_EQ(2u, b->getState().estimate().estHits);
EXPECT_TRUE(!b->getState().estimate().empty);
}
{ // known field & word without hits
Blueprint::UP b =
_index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0), makeTerm("w2"));
// std::cerr << "BP = " << typeid(*b).name() << std::endl;
EXPECT_TRUE((dynamic_cast<DiskTermBlueprint *>(b.get()) != NULL) ||
(dynamic_cast<EmptyBlueprint *>(b.get()) != NULL));
auto b = _index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0), makeTerm("w2"));
EXPECT_TRUE((dynamic_cast<DiskTermBlueprint *>(b.get()) != nullptr) ||
(dynamic_cast<EmptyBlueprint *>(b.get()) != nullptr));
EXPECT_EQ(0u, b->getState().estimate().estHits);
EXPECT_TRUE(b->getState().estimate().empty);
}
Expand All @@ -366,53 +370,64 @@ DiskIndexTest::requireThatBlueprintCanCreateSearchIterators()
SimpleResult result_f1_w2;
SimpleResult result_f2_w2({1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17});
auto upper_bound = Blueprint::FilterConstraint::UPPER_BOUND;
{ // bit vector due to isFilter
b = _index->createBlueprint(_requestContext, FieldSpec("f2", 0, 0, true), makeTerm("w2"));
b->basic_plan(true, 1000);
b->fetchPostings(search::queryeval::ExecuteInfo::FULL);
{ // bitvector due to is_filter_field=true
b = create_blueprint(FieldSpec("f2", 0, 0, true), makeTerm("w2"));
auto& leaf_b = dynamic_cast<LeafBlueprint&>(*b);
s = leaf_b.createLeafSearch(mda);
EXPECT_TRUE(dynamic_cast<BitVectorIterator *>(s.get()) != NULL);
EXPECT_TRUE(dynamic_cast<BitVectorIterator *>(s.get()) != nullptr);
EXPECT_EQ(result_f2_w2, SimpleResult().search(*s));
EXPECT_EQ(result_f2_w2, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
}
{ // bit vector due to no ranking needed
b = _index->createBlueprint(_requestContext, FieldSpec("f2", 0, 0, false), makeTerm("w2"));
b->basic_plan(true, 1000);
b->fetchPostings(ExecuteInfo::FULL);
{ // bitvector due to no ranking needed
b = create_blueprint(FieldSpec("f2", 0, 0, false), makeTerm("w2"));
auto& leaf_b = dynamic_cast<LeafBlueprint&>(*b);
s = leaf_b.createLeafSearch(mda);
EXPECT_FALSE(dynamic_cast<BitVectorIterator *>(s.get()) != NULL);
EXPECT_FALSE(dynamic_cast<BitVectorIterator *>(s.get()) != nullptr);
TermFieldMatchData md2;
md2.tagAsNotNeeded();
TermFieldMatchDataArray mda2;
mda2.add(&md2);
EXPECT_TRUE(mda2[0]->isNotNeeded());
s = (dynamic_cast<LeafBlueprint *>(b.get()))->createLeafSearch(mda2);
EXPECT_TRUE(dynamic_cast<BitVectorIterator *>(s.get()) != NULL);
EXPECT_TRUE(dynamic_cast<BitVectorIterator *>(s.get()) != nullptr);
EXPECT_EQ(result_f2_w2, SimpleResult().search(*s));
EXPECT_EQ(result_f2_w2, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
}
{ // fake bit vector
b = _index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0, true), makeTerm("w2"));
// std::cerr << "BP = " << typeid(*b).name() << std::endl;
b->basic_plan(true, 1000);
b->fetchPostings(ExecuteInfo::FULL);
{ // fake bitvector (wrapping posocc iterator)
b = create_blueprint(FieldSpec("f1", 0, 0, true), makeTerm("w1"));
auto& leaf_b = dynamic_cast<LeafBlueprint&>(*b);
s = leaf_b.createLeafSearch(mda);
// std::cerr << "SI = " << typeid(*s).name() << std::endl;
EXPECT_TRUE((dynamic_cast<BooleanMatchIteratorWrapper *>(s.get()) != NULL) ||
dynamic_cast<EmptySearch *>(s.get()));
EXPECT_EQ(result_f1_w2, SimpleResult().search(*s));
EXPECT_EQ(result_f1_w2, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
EXPECT_TRUE(dynamic_cast<BooleanMatchIteratorWrapper *>(s.get()) != nullptr);
EXPECT_EQ(result_f1_w1, SimpleResult().search(*s));
EXPECT_EQ(result_f1_w1, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
}
{ // posting list iterator
b = _index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0), makeTerm("w1"));
b->basic_plan(true, 1000);
b->fetchPostings(ExecuteInfo::FULL);
b = create_blueprint(FieldSpec("f1", 0, 0), makeTerm("w1"));
auto& leaf_b = dynamic_cast<LeafBlueprint&>(*b);
s = leaf_b.createLeafSearch(mda);
ASSERT_TRUE((dynamic_cast<ZcRareWordPosOccIterator<true, false> *>(s.get()) != nullptr));
EXPECT_EQ(result_f1_w1, SimpleResult().search(*s));
EXPECT_EQ(result_f1_w1, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
}
{ // bitvector used due to bitvector_limit set.
// The term 'w2' hits 17 docs in field 'f2' (bitvector for term exists).
double bitvector_limit = 16.0 / 100.0;
_requestContext.get_create_blueprint_params().disk_index_bitvector_limit = bitvector_limit;
b = create_blueprint(FieldSpec("f2", 0, 0, false), makeTerm("w2"), 100);
auto& leaf_b = dynamic_cast<LeafBlueprint&>(*b);
s = leaf_b.createLeafSearch(mda);
EXPECT_TRUE(dynamic_cast<BitVectorIterator *>(s.get()) != nullptr);
EXPECT_EQ(result_f2_w2, SimpleResult().search(*s));
EXPECT_EQ(result_f2_w2, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
}
{ // fake bitvector (wrapping posocc iterator) used due to bitvector_limit set.
// The term 'w1' hits 2 docs in field 'f1' (bitvector for term doesn't exist).
double bitvector_limit = 1.0 / 100.0;
_requestContext.get_create_blueprint_params().disk_index_bitvector_limit = bitvector_limit;
b = create_blueprint(FieldSpec("f1", 0, 0, false), makeTerm("w1"), 100);
auto& leaf_b = dynamic_cast<LeafBlueprint&>(*b);
s = leaf_b.createLeafSearch(mda);
ASSERT_TRUE((dynamic_cast<ZcRareWordPosOccIterator<true, false> *>(s.get()) != NULL));
EXPECT_TRUE((dynamic_cast<BooleanMatchIteratorWrapper *>(s.get()) != nullptr));
EXPECT_EQ(result_f1_w1, SimpleResult().search(*s));
EXPECT_EQ(result_f1_w1, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
}
Expand Down Expand Up @@ -490,7 +505,7 @@ DiskIndexTest::test_io_settings(const IOSettings& io_settings)
ASSERT_TRUE(posting_list_cache);
auto stats = posting_list_cache->get_stats();
EXPECT_EQ(2, stats.misses);
EXPECT_EQ(1, stats.hits);
EXPECT_EQ(3, stats.hits);
} else {
ASSERT_FALSE(posting_list_cache);
}
Expand Down
13 changes: 8 additions & 5 deletions searchlib/src/vespa/searchlib/diskindex/diskindex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@
#include "fileheader.h"
#include "pagedict4randread.h"
#include <vespa/searchlib/index/schemautil.h>
#include <vespa/searchlib/queryeval/create_blueprint_params.h>
#include <vespa/searchlib/queryeval/create_blueprint_visitor_helper.h>
#include <vespa/searchlib/queryeval/leaf_blueprints.h>
#include <vespa/searchlib/queryeval/intermediate_blueprints.h>
#include <vespa/searchlib/queryeval/irequestcontext.h>
#include <vespa/searchlib/queryeval/leaf_blueprints.h>
#include <vespa/searchlib/util/dirtraverse.h>
#include <vespa/searchlib/util/disk_space_calculator.h>
#include <vespa/vespalib/stllike/hash_set.h>
#include <vespa/vespalib/stllike/hash_map.hpp>
#include <vespa/vespalib/stllike/cache.hpp>
#include <vespa/vespalib/stllike/hash_map.hpp>
#include <vespa/vespalib/stllike/hash_set.h>
#include <filesystem>

#include <vespa/log/log.h>
Expand Down Expand Up @@ -310,8 +312,9 @@ class CreateBlueprintVisitor : public CreateBlueprintVisitorHelper {
const std::string termStr = termAsString(n);
const DiskIndex::LookupResult & lookupRes = _cache.lookup(termStr, _fieldId);
if (lookupRes.valid()) {
bool useBitVector = _field.isFilter();
setResult(std::make_unique<DiskTermBlueprint>(_field, _diskIndex.get_field_index(_fieldId), termStr, lookupRes, useBitVector));
double bitvector_limit = getRequestContext().get_create_blueprint_params().disk_index_bitvector_limit;
setResult(std::make_unique<DiskTermBlueprint>
(_field, _diskIndex.get_field_index(_fieldId), termStr, lookupRes, _field.isFilter(), bitvector_limit));
} else {
setResult(std::make_unique<EmptyBlueprint>(_field));
}
Expand Down
19 changes: 14 additions & 5 deletions searchlib/src/vespa/searchlib/diskindex/disktermblueprint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,16 @@ DiskTermBlueprint::DiskTermBlueprint(const FieldSpec & field,
const FieldIndex& field_index,
const std::string& query_term,
DictionaryLookupResult lookupRes,
bool useBitVector)
bool is_filter_field,
double bitvector_limit)
: SimpleLeafBlueprint(field),
_field(field),
_field_index(field_index),
_query_term(query_term),
_lookupRes(std::move(lookupRes)),
_bitvector_lookup_result(_field_index.lookup_bit_vector(_lookupRes)),
_useBitVector(useBitVector),
_is_filter_field(is_filter_field),
_bitvector_limit(bitvector_limit),
_fetchPostingsDone(false),
_postingHandle(),
_bitVector(),
Expand Down Expand Up @@ -90,7 +92,7 @@ DiskTermBlueprint::fetchPostings(const queryeval::ExecuteInfo &execInfo)
{
(void) execInfo;
if (!_fetchPostingsDone) {
if (_useBitVector && _bitvector_lookup_result.valid()) {
if (use_bitvector() && _bitvector_lookup_result.valid()) {
if (LOG_WOULD_LOG(debug)) [[unlikely]] {
log_bitvector_read();
}
Expand All @@ -113,6 +115,13 @@ DiskTermBlueprint::calculate_flow_stats(uint32_t docid_limit) const
return {rel_est, disk_index_cost(rel_est), disk_index_strict_cost(rel_est)};
}

bool
DiskTermBlueprint::use_bitvector() const
{
return _is_filter_field ||
((get_docid_limit() > 0) && ((double)_lookupRes.counts._numDocs / (double)get_docid_limit()) > _bitvector_limit);
}

const BitVector *
DiskTermBlueprint::get_bitvector() const
{
Expand All @@ -133,13 +142,13 @@ DiskTermBlueprint::get_bitvector() const
SearchIterator::UP
DiskTermBlueprint::createLeafSearch(const TermFieldMatchDataArray & tfmda) const
{
if (_bitvector_lookup_result.valid() && (_useBitVector || tfmda[0]->isNotNeeded())) {
if (_bitvector_lookup_result.valid() && (_bitVector || tfmda[0]->isNotNeeded())) {
LOG(debug, "Return BitVectorIterator: %s, wordNum(%" PRIu64 "), docCount(%" PRIu64 ")",
getName(_field_index.get_field_id()).c_str(), _lookupRes.wordNum, _lookupRes.counts._numDocs);
return BitVectorIterator::create(get_bitvector(), *tfmda[0], strict());
}
auto search(_field_index.create_iterator(_lookupRes, _postingHandle, tfmda));
if (_useBitVector) {
if (use_bitvector()) {
LOG(debug, "Return BooleanMatchIteratorWrapper: %s, wordNum(%" PRIu64 "), docCount(%" PRIu64 ")",
getName(_field_index.get_field_id()).c_str(), _lookupRes.wordNum, _lookupRes.counts._numDocs);
return std::make_unique<BooleanMatchIteratorWrapper>(std::move(search), tfmda);
Expand Down
22 changes: 14 additions & 8 deletions searchlib/src/vespa/searchlib/diskindex/disktermblueprint.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,33 +15,39 @@ class DiskTermBlueprint : public queryeval::SimpleLeafBlueprint
private:
queryeval::FieldSpec _field;
const FieldIndex& _field_index;
std::string _query_term;
index::DictionaryLookupResult _lookupRes;
std::string _query_term;
index::DictionaryLookupResult _lookupRes;
index::BitVectorDictionaryLookupResult _bitvector_lookup_result;
bool _useBitVector;
bool _is_filter_field;
double _bitvector_limit;
bool _fetchPostingsDone;
index::PostingListHandle _postingHandle;
std::shared_ptr<BitVector> _bitVector;
mutable std::mutex _mutex;
mutable std::shared_ptr<BitVector> _late_bitvector;

bool use_bitvector() const;
const BitVector* get_bitvector() const;
void log_bitvector_read() const __attribute__((noinline));
void log_posting_list_read() const __attribute__((noinline));
public:
/**
* Create a new blueprint.
*
* @param field the field to search in.
* @param field_index the field index used to read the bit vector or posting list.
* @param lookupRes the result after disk dictionary lookup.
* @param useBitVector whether or not we should use bit vector.
* @param field The field to search in.
* @param field_index The field index used to read the bit vector or posting list.
* @param lookupRes The result after disk dictionary lookup.
* @param is_filter_field Whether this field is a filter field, in which case use of the bitvector is forced.
* @param bitvector_limit The hit estimate limit for whether bitvector should be used for searching this term.
This can be used to tune performance at the cost of quality.
If no bitvector exists for the term, a fake bitvector wrapping the posocc iterator is used.
**/
DiskTermBlueprint(const queryeval::FieldSpec & field,
const FieldIndex& field_index,
const std::string& query_term,
index::DictionaryLookupResult lookupRes,
bool useBitVector);
bool is_filter_field,
double bitvector_limit);

queryeval::FlowStats calculate_flow_stats(uint32_t docid_limit) const override;

Expand Down
2 changes: 2 additions & 0 deletions searchlib/src/vespa/searchlib/queryeval/fake_requestcontext.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ class FakeRequestContext : public IRequestContext

const CreateBlueprintParams& get_create_blueprint_params() const override;
const MetaStoreReadGuardSP * getMetaStoreReadGuard() const override { return nullptr; }

// Non-const overload: hands out a mutable reference to the stored blueprint
// creation parameters so callers can adjust them in place.
CreateBlueprintParams& get_create_blueprint_params() {
    return _create_blueprint_params;
}
private:
std::unique_ptr<vespalib::TestClock> _clock;
const vespalib::Doom _doom;
Expand Down

0 comments on commit 0f7b79a

Please sign in to comment.