Skip to content

Commit

Permalink
Merge pull request #32958 from vespa-engine/geirst/bitvector-limit-fo…
Browse files Browse the repository at this point in the history
…r-disk-index-search

Make it tuneable when to use bitvector when searching a disk index
  • Loading branch information
geirst authored Nov 27, 2024
2 parents 6534434 + 0f7b79a commit 7289777
Show file tree
Hide file tree
Showing 13 changed files with 146 additions and 75 deletions.
31 changes: 18 additions & 13 deletions searchcore/src/tests/proton/matching/matching_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1262,13 +1262,13 @@ TEST_F(MatchingTest, require_that_docsum_matcher_can_extract_matching_elements_f

using FMA = vespalib::FuzzyMatchingAlgorithm;

struct AttributeBlueprintParamsFixture {
struct CreateBlueprintParamsFixture {
BlueprintFactory factory;
search::fef::test::IndexEnvironment index_env;
RankSetup rank_setup;
Properties rank_properties;
AttributeBlueprintParamsFixture(double lower_limit, double upper_limit, double target_hits_max_adjustment_factor,
FMA fuzzy_matching_algorithm)
CreateBlueprintParamsFixture(double lower_limit, double upper_limit, double target_hits_max_adjustment_factor,
FMA fuzzy_matching_algorithm)
: factory(),
index_env(),
rank_setup(factory, index_env),
Expand All @@ -1281,52 +1281,57 @@ struct AttributeBlueprintParamsFixture {
}
void set_query_properties(std::string_view lower_limit, std::string_view upper_limit,
std::string_view target_hits_max_adjustment_factor,
const std::string & fuzzy_matching_algorithm) {
std::string_view fuzzy_matching_algorithm,
std::string_view disk_index_bitvector_limit) {
rank_properties.add(GlobalFilterLowerLimit::NAME, lower_limit);
rank_properties.add(GlobalFilterUpperLimit::NAME, upper_limit);
rank_properties.add(TargetHitsMaxAdjustmentFactor::NAME, target_hits_max_adjustment_factor);
rank_properties.add(FuzzyAlgorithm::NAME, fuzzy_matching_algorithm);
rank_properties.add(DiskIndexBitvectorLimit::NAME, disk_index_bitvector_limit);
}
~AttributeBlueprintParamsFixture();
~CreateBlueprintParamsFixture();
CreateBlueprintParams extract(uint32_t active_docids = 9, uint32_t docid_limit = 10) const {
return MatchToolsFactory::extract_create_blueprint_params(rank_setup, rank_properties, active_docids, docid_limit);
}
};

AttributeBlueprintParamsFixture::~AttributeBlueprintParamsFixture() = default;
CreateBlueprintParamsFixture::~CreateBlueprintParamsFixture() = default;

TEST_F(MatchingTest, attribute_blueprint_params_are_extracted_from_rank_profile)
TEST_F(MatchingTest, create_blueprint_params_are_extracted_from_rank_profile)
{
AttributeBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
CreateBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
f.rank_setup.set_disk_index_bitvector_limit(0.04);
auto params = f.extract();
EXPECT_EQ(0.2, params.global_filter_lower_limit);
EXPECT_EQ(0.8, params.global_filter_upper_limit);
EXPECT_EQ(5.0, params.target_hits_max_adjustment_factor);
EXPECT_EQ(FMA::DfaTable, params.fuzzy_matching_algorithm);
EXPECT_EQ(0.04, params.disk_index_bitvector_limit);
}

TEST_F(MatchingTest, attribute_blueprint_params_are_extracted_from_query)
TEST_F(MatchingTest, create_blueprint_params_are_extracted_from_query)
{
AttributeBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
f.set_query_properties("0.15", "0.75", "3.0", "dfa_explicit");
CreateBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
f.set_query_properties("0.15", "0.75", "3.0", "dfa_explicit", "0.02");
auto params = f.extract();
EXPECT_EQ(0.15, params.global_filter_lower_limit);
EXPECT_EQ(0.75, params.global_filter_upper_limit);
EXPECT_EQ(3.0, params.target_hits_max_adjustment_factor);
EXPECT_EQ(FMA::DfaExplicit, params.fuzzy_matching_algorithm);
EXPECT_EQ(0.02, params.disk_index_bitvector_limit);
}

TEST_F(MatchingTest, global_filter_params_are_scaled_with_active_hit_ratio)
{
AttributeBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
CreateBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
auto params = f.extract(5, 10);
EXPECT_EQ(0.12, params.global_filter_lower_limit);
EXPECT_EQ(0.48, params.global_filter_upper_limit);
}

TEST_F(MatchingTest, weak_and_stop_word_strategy_is_resolved_correctly)
{
AttributeBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
CreateBlueprintParamsFixture f(0.2, 0.8, 5.0, FMA::DfaTable);
EXPECT_EQ(WeakAndStopWordAdjustLimit::DEFAULT_VALUE, 1.0);
EXPECT_EQ(WeakAndStopWordDropLimit::DEFAULT_VALUE, 1.0);
EXPECT_EQ(f.rank_setup.get_weakand_stop_word_adjust_limit(), 1.0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ MatchToolsFactory::extract_create_blueprint_params(const RankSetup& rank_setup,
double weakand_range = temporary::WeakAndRange::lookup(rank_properties, rank_setup.get_weakand_range());
double weakand_stop_word_adjust_limit = WeakAndStopWordAdjustLimit::lookup(rank_properties, rank_setup.get_weakand_stop_word_adjust_limit());
double weakand_stop_word_drop_limit = WeakAndStopWordDropLimit::lookup(rank_properties, rank_setup.get_weakand_stop_word_drop_limit());
double disk_index_bitvector_limit = DiskIndexBitvectorLimit::lookup(rank_properties, rank_setup.get_disk_index_bitvector_limit());

// Note that we count the reserved docid 0 as active.
// This ensures that when searchable-copies=1, the ratio is 1.0.
Expand All @@ -367,7 +368,8 @@ MatchToolsFactory::extract_create_blueprint_params(const RankSetup& rank_setup,
fuzzy_matching_algorithm,
weakand_range,
StopWordStrategy(weakand_stop_word_adjust_limit,
weakand_stop_word_drop_limit, docid_limit)};
weakand_stop_word_drop_limit, docid_limit),
disk_index_bitvector_limit};
}

AttributeOperationTask::AttributeOperationTask(const RequestContext & requestContext,
Expand Down
95 changes: 55 additions & 40 deletions searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ class DiskIndexTest : public ::testing::Test, public TestDiskIndex {
void test_io_settings(const IOSettings& io_settings);
SimpleResult search(const FieldIndex& field_index, const DictionaryLookupResult& lookup_result,
const PostingListHandle& handle);
Blueprint::UP create_blueprint(const FieldSpec& field, const search::query::Node& term, uint32_t docid_limit=1000);
};

DiskIndexTest::DiskIndexTest() = default;
Expand Down Expand Up @@ -256,6 +257,14 @@ DiskIndexTest::search(const FieldIndex& field_index, const DictionaryLookupResul
return SimpleResult().search(*sb);
}

// Test helper: asks the disk index for a blueprint matching 'term' in 'field',
// then plans it with the given docid limit and fetches its postings, so that
// callers can go straight to creating search iterators from the result.
Blueprint::UP
DiskIndexTest::create_blueprint(const FieldSpec& field, const search::query::Node& term, uint32_t docid_limit)
{
    using search::queryeval::ExecuteInfo;
    Blueprint::UP blueprint = _index->createBlueprint(_requestContext, field, term);
    blueprint->basic_plan(true, docid_limit);
    blueprint->fetchPostings(ExecuteInfo::FULL);
    return blueprint;
}

void
DiskIndexTest::requireThatWeCanReadPostingList(const IOSettings& io_settings)
Expand Down Expand Up @@ -327,28 +336,23 @@ void
DiskIndexTest::requireThatBlueprintIsCreated()
{
{ // unknown field
Blueprint::UP b =
_index->createBlueprint(_requestContext, FieldSpec("none", 0, 0), makeTerm("w1"));
EXPECT_TRUE(dynamic_cast<EmptyBlueprint *>(b.get()) != NULL);
auto b = _index->createBlueprint(_requestContext, FieldSpec("none", 0, 0), makeTerm("w1"));
EXPECT_TRUE(dynamic_cast<EmptyBlueprint *>(b.get()) != nullptr);
}
{ // unknown word
Blueprint::UP b =
_index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0), makeTerm("none"));
EXPECT_TRUE(dynamic_cast<EmptyBlueprint *>(b.get()) != NULL);
auto b = _index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0), makeTerm("none"));
EXPECT_TRUE(dynamic_cast<EmptyBlueprint *>(b.get()) != nullptr);
}
{ // known field & word with hits
Blueprint::UP b =
_index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0), makeTerm("w1"));
EXPECT_TRUE(dynamic_cast<DiskTermBlueprint *>(b.get()) != NULL);
auto b = _index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0), makeTerm("w1"));
EXPECT_TRUE(dynamic_cast<DiskTermBlueprint *>(b.get()) != nullptr);
EXPECT_EQ(2u, b->getState().estimate().estHits);
EXPECT_TRUE(!b->getState().estimate().empty);
}
{ // known field & word without hits
Blueprint::UP b =
_index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0), makeTerm("w2"));
// std::cerr << "BP = " << typeid(*b).name() << std::endl;
EXPECT_TRUE((dynamic_cast<DiskTermBlueprint *>(b.get()) != NULL) ||
(dynamic_cast<EmptyBlueprint *>(b.get()) != NULL));
auto b = _index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0), makeTerm("w2"));
EXPECT_TRUE((dynamic_cast<DiskTermBlueprint *>(b.get()) != nullptr) ||
(dynamic_cast<EmptyBlueprint *>(b.get()) != nullptr));
EXPECT_EQ(0u, b->getState().estimate().estHits);
EXPECT_TRUE(b->getState().estimate().empty);
}
Expand All @@ -366,53 +370,64 @@ DiskIndexTest::requireThatBlueprintCanCreateSearchIterators()
SimpleResult result_f1_w2;
SimpleResult result_f2_w2({1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17});
auto upper_bound = Blueprint::FilterConstraint::UPPER_BOUND;
{ // bit vector due to isFilter
b = _index->createBlueprint(_requestContext, FieldSpec("f2", 0, 0, true), makeTerm("w2"));
b->basic_plan(true, 1000);
b->fetchPostings(search::queryeval::ExecuteInfo::FULL);
{ // bitvector due to is_filter_field=true
b = create_blueprint(FieldSpec("f2", 0, 0, true), makeTerm("w2"));
auto& leaf_b = dynamic_cast<LeafBlueprint&>(*b);
s = leaf_b.createLeafSearch(mda);
EXPECT_TRUE(dynamic_cast<BitVectorIterator *>(s.get()) != NULL);
EXPECT_TRUE(dynamic_cast<BitVectorIterator *>(s.get()) != nullptr);
EXPECT_EQ(result_f2_w2, SimpleResult().search(*s));
EXPECT_EQ(result_f2_w2, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
}
{ // bit vector due to no ranking needed
b = _index->createBlueprint(_requestContext, FieldSpec("f2", 0, 0, false), makeTerm("w2"));
b->basic_plan(true, 1000);
b->fetchPostings(ExecuteInfo::FULL);
{ // bitvector due to no ranking needed
b = create_blueprint(FieldSpec("f2", 0, 0, false), makeTerm("w2"));
auto& leaf_b = dynamic_cast<LeafBlueprint&>(*b);
s = leaf_b.createLeafSearch(mda);
EXPECT_FALSE(dynamic_cast<BitVectorIterator *>(s.get()) != NULL);
EXPECT_FALSE(dynamic_cast<BitVectorIterator *>(s.get()) != nullptr);
TermFieldMatchData md2;
md2.tagAsNotNeeded();
TermFieldMatchDataArray mda2;
mda2.add(&md2);
EXPECT_TRUE(mda2[0]->isNotNeeded());
s = (dynamic_cast<LeafBlueprint *>(b.get()))->createLeafSearch(mda2);
EXPECT_TRUE(dynamic_cast<BitVectorIterator *>(s.get()) != NULL);
EXPECT_TRUE(dynamic_cast<BitVectorIterator *>(s.get()) != nullptr);
EXPECT_EQ(result_f2_w2, SimpleResult().search(*s));
EXPECT_EQ(result_f2_w2, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
}
{ // fake bit vector
b = _index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0, true), makeTerm("w2"));
// std::cerr << "BP = " << typeid(*b).name() << std::endl;
b->basic_plan(true, 1000);
b->fetchPostings(ExecuteInfo::FULL);
{ // fake bitvector (wrapping posocc iterator)
b = create_blueprint(FieldSpec("f1", 0, 0, true), makeTerm("w1"));
auto& leaf_b = dynamic_cast<LeafBlueprint&>(*b);
s = leaf_b.createLeafSearch(mda);
// std::cerr << "SI = " << typeid(*s).name() << std::endl;
EXPECT_TRUE((dynamic_cast<BooleanMatchIteratorWrapper *>(s.get()) != NULL) ||
dynamic_cast<EmptySearch *>(s.get()));
EXPECT_EQ(result_f1_w2, SimpleResult().search(*s));
EXPECT_EQ(result_f1_w2, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
EXPECT_TRUE(dynamic_cast<BooleanMatchIteratorWrapper *>(s.get()) != nullptr);
EXPECT_EQ(result_f1_w1, SimpleResult().search(*s));
EXPECT_EQ(result_f1_w1, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
}
{ // posting list iterator
b = _index->createBlueprint(_requestContext, FieldSpec("f1", 0, 0), makeTerm("w1"));
b->basic_plan(true, 1000);
b->fetchPostings(ExecuteInfo::FULL);
b = create_blueprint(FieldSpec("f1", 0, 0), makeTerm("w1"));
auto& leaf_b = dynamic_cast<LeafBlueprint&>(*b);
s = leaf_b.createLeafSearch(mda);
ASSERT_TRUE((dynamic_cast<ZcRareWordPosOccIterator<true, false> *>(s.get()) != nullptr));
EXPECT_EQ(result_f1_w1, SimpleResult().search(*s));
EXPECT_EQ(result_f1_w1, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
}
{ // bitvector used due to bitvector_limit set.
// The term 'w2' hits 17 docs in field 'f2' (bitvector for term exists).
double bitvector_limit = 16.0 / 100.0;
_requestContext.get_create_blueprint_params().disk_index_bitvector_limit = bitvector_limit;
b = create_blueprint(FieldSpec("f2", 0, 0, false), makeTerm("w2"), 100);
auto& leaf_b = dynamic_cast<LeafBlueprint&>(*b);
s = leaf_b.createLeafSearch(mda);
EXPECT_TRUE(dynamic_cast<BitVectorIterator *>(s.get()) != nullptr);
EXPECT_EQ(result_f2_w2, SimpleResult().search(*s));
EXPECT_EQ(result_f2_w2, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
}
{ // fake bitvector (wrapping posocc iterator) used due to bitvector_limit set.
// The term 'w1' hits 2 docs in field 'f1' (bitvector for term doesn't exist).
double bitvector_limit = 1.0 / 100.0;
_requestContext.get_create_blueprint_params().disk_index_bitvector_limit = bitvector_limit;
b = create_blueprint(FieldSpec("f1", 0, 0, false), makeTerm("w1"), 100);
auto& leaf_b = dynamic_cast<LeafBlueprint&>(*b);
s = leaf_b.createLeafSearch(mda);
ASSERT_TRUE((dynamic_cast<ZcRareWordPosOccIterator<true, false> *>(s.get()) != NULL));
EXPECT_TRUE((dynamic_cast<BooleanMatchIteratorWrapper *>(s.get()) != nullptr));
EXPECT_EQ(result_f1_w1, SimpleResult().search(*s));
EXPECT_EQ(result_f1_w1, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
}
Expand Down Expand Up @@ -490,7 +505,7 @@ DiskIndexTest::test_io_settings(const IOSettings& io_settings)
ASSERT_TRUE(posting_list_cache);
auto stats = posting_list_cache->get_stats();
EXPECT_EQ(2, stats.misses);
EXPECT_EQ(1, stats.hits);
EXPECT_EQ(3, stats.hits);
} else {
ASSERT_FALSE(posting_list_cache);
}
Expand Down
2 changes: 2 additions & 0 deletions searchlib/src/tests/ranksetup/ranksetup_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,7 @@ TEST_F(RankSetupTest, rank_setup)
env.getProperties().add(matching::FuzzyAlgorithm::NAME, "dfa_implicit");
env.getProperties().add(matching::WeakAndStopWordAdjustLimit::NAME, "0.05");
env.getProperties().add(matching::WeakAndStopWordDropLimit::NAME, "0.5");
env.getProperties().add(matching::DiskIndexBitvectorLimit::NAME, "0.04");

RankSetup rs(_factory, env);
EXPECT_FALSE(rs.has_match_features());
Expand Down Expand Up @@ -608,6 +609,7 @@ TEST_F(RankSetupTest, rank_setup)
EXPECT_EQ(rs.get_fuzzy_matching_algorithm(), vespalib::FuzzyMatchingAlgorithm::DfaImplicit);
EXPECT_EQ(rs.get_weakand_stop_word_adjust_limit(), 0.05);
EXPECT_EQ(rs.get_weakand_stop_word_drop_limit(), 0.5);
EXPECT_EQ(rs.get_disk_index_bitvector_limit(), 0.04);
}

bool
Expand Down
13 changes: 8 additions & 5 deletions searchlib/src/vespa/searchlib/diskindex/diskindex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@
#include "fileheader.h"
#include "pagedict4randread.h"
#include <vespa/searchlib/index/schemautil.h>
#include <vespa/searchlib/queryeval/create_blueprint_params.h>
#include <vespa/searchlib/queryeval/create_blueprint_visitor_helper.h>
#include <vespa/searchlib/queryeval/leaf_blueprints.h>
#include <vespa/searchlib/queryeval/intermediate_blueprints.h>
#include <vespa/searchlib/queryeval/irequestcontext.h>
#include <vespa/searchlib/queryeval/leaf_blueprints.h>
#include <vespa/searchlib/util/dirtraverse.h>
#include <vespa/searchlib/util/disk_space_calculator.h>
#include <vespa/vespalib/stllike/hash_set.h>
#include <vespa/vespalib/stllike/hash_map.hpp>
#include <vespa/vespalib/stllike/cache.hpp>
#include <vespa/vespalib/stllike/hash_map.hpp>
#include <vespa/vespalib/stllike/hash_set.h>
#include <filesystem>

#include <vespa/log/log.h>
Expand Down Expand Up @@ -310,8 +312,9 @@ class CreateBlueprintVisitor : public CreateBlueprintVisitorHelper {
const std::string termStr = termAsString(n);
const DiskIndex::LookupResult & lookupRes = _cache.lookup(termStr, _fieldId);
if (lookupRes.valid()) {
bool useBitVector = _field.isFilter();
setResult(std::make_unique<DiskTermBlueprint>(_field, _diskIndex.get_field_index(_fieldId), termStr, lookupRes, useBitVector));
double bitvector_limit = getRequestContext().get_create_blueprint_params().disk_index_bitvector_limit;
setResult(std::make_unique<DiskTermBlueprint>
(_field, _diskIndex.get_field_index(_fieldId), termStr, lookupRes, _field.isFilter(), bitvector_limit));
} else {
setResult(std::make_unique<EmptyBlueprint>(_field));
}
Expand Down
19 changes: 14 additions & 5 deletions searchlib/src/vespa/searchlib/diskindex/disktermblueprint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,16 @@ DiskTermBlueprint::DiskTermBlueprint(const FieldSpec & field,
const FieldIndex& field_index,
const std::string& query_term,
DictionaryLookupResult lookupRes,
bool useBitVector)
bool is_filter_field,
double bitvector_limit)
: SimpleLeafBlueprint(field),
_field(field),
_field_index(field_index),
_query_term(query_term),
_lookupRes(std::move(lookupRes)),
_bitvector_lookup_result(_field_index.lookup_bit_vector(_lookupRes)),
_useBitVector(useBitVector),
_is_filter_field(is_filter_field),
_bitvector_limit(bitvector_limit),
_fetchPostingsDone(false),
_postingHandle(),
_bitVector(),
Expand Down Expand Up @@ -90,7 +92,7 @@ DiskTermBlueprint::fetchPostings(const queryeval::ExecuteInfo &execInfo)
{
(void) execInfo;
if (!_fetchPostingsDone) {
if (_useBitVector && _bitvector_lookup_result.valid()) {
if (use_bitvector() && _bitvector_lookup_result.valid()) {
if (LOG_WOULD_LOG(debug)) [[unlikely]] {
log_bitvector_read();
}
Expand All @@ -113,6 +115,13 @@ DiskTermBlueprint::calculate_flow_stats(uint32_t docid_limit) const
return {rel_est, disk_index_cost(rel_est), disk_index_strict_cost(rel_est)};
}

bool
DiskTermBlueprint::use_bitvector() const
{
return _is_filter_field ||
((get_docid_limit() > 0) && ((double)_lookupRes.counts._numDocs / (double)get_docid_limit()) > _bitvector_limit);
}

const BitVector *
DiskTermBlueprint::get_bitvector() const
{
Expand All @@ -133,13 +142,13 @@ DiskTermBlueprint::get_bitvector() const
SearchIterator::UP
DiskTermBlueprint::createLeafSearch(const TermFieldMatchDataArray & tfmda) const
{
if (_bitvector_lookup_result.valid() && (_useBitVector || tfmda[0]->isNotNeeded())) {
if (_bitvector_lookup_result.valid() && (_bitVector || tfmda[0]->isNotNeeded())) {
LOG(debug, "Return BitVectorIterator: %s, wordNum(%" PRIu64 "), docCount(%" PRIu64 ")",
getName(_field_index.get_field_id()).c_str(), _lookupRes.wordNum, _lookupRes.counts._numDocs);
return BitVectorIterator::create(get_bitvector(), *tfmda[0], strict());
}
auto search(_field_index.create_iterator(_lookupRes, _postingHandle, tfmda));
if (_useBitVector) {
if (use_bitvector()) {
LOG(debug, "Return BooleanMatchIteratorWrapper: %s, wordNum(%" PRIu64 "), docCount(%" PRIu64 ")",
getName(_field_index.get_field_id()).c_str(), _lookupRes.wordNum, _lookupRes.counts._numDocs);
return std::make_unique<BooleanMatchIteratorWrapper>(std::move(search), tfmda);
Expand Down
Loading

0 comments on commit 7289777

Please sign in to comment.