Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove idf range from weak and blueprint. #32973

Merged
1 commit merged on Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ class BlueprintBuilderVisitor :

void buildWeakAnd(ProtonWeakAnd &n) {
auto *wand = new WeakAndBlueprint(n.getTargetNumHits(),
1.0 /* weakand_range */,
_requestContext.get_create_blueprint_params().weakand_stop_word_strategy,
is_search_multi_threaded());
Blueprint::UP result(wand);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -796,10 +796,10 @@ struct make {
static make ONEAR(uint32_t window) { return make(std::make_unique<ONearBlueprint>(window)); }
static make WEAKAND(uint32_t n) { return make(std::make_unique<WeakAndBlueprint>(n)); }
static make WEAKAND_ADJUST(double limit) {
return make(std::make_unique<WeakAndBlueprint>(100, 0.0, wand::StopWordStrategy(-limit, 1.0, 0), true));
return make(std::make_unique<WeakAndBlueprint>(100, wand::StopWordStrategy(-limit, 1.0, 0), true));
}
static make WEAKAND_DROP(double limit) {
return make(std::make_unique<WeakAndBlueprint>(100, 0.0, wand::StopWordStrategy(1.0, -limit, 0), true));
return make(std::make_unique<WeakAndBlueprint>(100, wand::StopWordStrategy(1.0, -limit, 0), true));
}
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ TEST("require that DotProductScorer calculates term score")

TEST("test bm25 idf scorer for wand")
{
wand::Bm25TermFrequencyScorer scorer(1000000, 1.0);
wand::Bm25TermFrequencyScorer scorer(1000000);
EXPECT_EQUAL(13410046, scorer.calculateMaxScore(1, 1));
EXPECT_EQUAL(11464136, scorer.calculateMaxScore(10, 1));
EXPECT_EQUAL(6907256, scorer.calculateMaxScore(1000, 1));
Expand All @@ -76,14 +76,4 @@ TEST("test bm25 idf scorer for wand")
EXPECT_EQUAL(10050, scorer.calculateMaxScore(990000, 1));
}

TEST("test limited range of bm25 idf scorer for wand")
{
wand::Bm25TermFrequencyScorer scorer08(1000000, 0.8);
wand::Bm25TermFrequencyScorer scorer10(1000000, 1.0);
EXPECT_EQUAL(8207814, scorer08.calculateMaxScore(1000, 1));
EXPECT_EQUAL(2690049, scorer08.calculateMaxScore(990000, 1));
EXPECT_EQUAL(6907256, scorer10.calculateMaxScore(1000, 1));
EXPECT_EQUAL(10050, scorer10.calculateMaxScore(990000, 1));
}

TEST_MAIN() { TEST_RUN_ALL(); }
Original file line number Diff line number Diff line change
Expand Up @@ -419,10 +419,9 @@ WeakAndBlueprint::my_flow(InFlow in_flow) const
return AnyFlow::create<OrFlow>(in_flow);
}

WeakAndBlueprint::WeakAndBlueprint(uint32_t n, float idf_range, wand::StopWordStrategy stop_word_strategy, bool thread_safe)
WeakAndBlueprint::WeakAndBlueprint(uint32_t n, wand::StopWordStrategy stop_word_strategy, bool thread_safe)
: _scores(WeakAndPriorityQueue::createHeap(n, thread_safe)),
_n(n),
_idf_range(idf_range),
_stop_word_strategy(stop_word_strategy),
_weights(),
_matching_phase(MatchingPhase::FIRST_PHASE)
Expand Down Expand Up @@ -520,11 +519,8 @@ WeakAndBlueprint::createIntermediateSearch(MultiSearch::Children sub_searches,
}
bool readonly_scores_heap = (_matching_phase != MatchingPhase::FIRST_PHASE);
wand::MatchParams innerParams{*_scores, _stop_word_strategy, wand::DEFAULT_PARALLEL_WAND_SCORES_ADJUST_FREQUENCY, get_docid_limit()};
return (_idf_range == 0.0)
? WeakAndSearch::create(terms, innerParams, wand::TermFrequencyScorer(), _n, strict(),
readonly_scores_heap)
: WeakAndSearch::create(terms, innerParams, wand::Bm25TermFrequencyScorer(get_docid_limit(), _idf_range), _n, strict(),
readonly_scores_heap);
return WeakAndSearch::create(terms, innerParams, wand::Bm25TermFrequencyScorer(get_docid_limit()), _n, strict(),
readonly_scores_heap);
}

SearchIterator::UP
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ class WeakAndBlueprint : public IntermediateBlueprint
private:
std::unique_ptr<WeakAndPriorityQueue> _scores;
uint32_t _n;
float _idf_range;
wand::StopWordStrategy _stop_word_strategy;
std::vector<uint32_t> _weights;
MatchingPhase _matching_phase;
Expand All @@ -112,8 +111,8 @@ class WeakAndBlueprint : public IntermediateBlueprint
fef::MatchData &md) const override;
SearchIterator::UP createFilterSearch(FilterConstraint constraint) const override;

explicit WeakAndBlueprint(uint32_t n) : WeakAndBlueprint(n, 0.0, wand::StopWordStrategy::none(), true) {}
WeakAndBlueprint(uint32_t n, float idf_range, wand::StopWordStrategy stop_word_strategy, bool thread_safe);
explicit WeakAndBlueprint(uint32_t n) : WeakAndBlueprint(n, wand::StopWordStrategy::none(), true) {}
WeakAndBlueprint(uint32_t n, wand::StopWordStrategy stop_word_strategy, bool thread_safe);
~WeakAndBlueprint() override;
void addTerm(Blueprint::UP bp, uint32_t weight) {
addChild(std::move(bp));
Expand Down
13 changes: 3 additions & 10 deletions searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h
Original file line number Diff line number Diff line change
Expand Up @@ -480,18 +480,13 @@ class Bm25TermFrequencyScorer
{
public:
using Bm25Executor = features::Bm25Executor;
Bm25TermFrequencyScorer(uint32_t num_docs, float range) noexcept
: _num_docs(num_docs),
_range(range),
_max_idf(Bm25Executor::calculate_inverse_document_frequency({1, _num_docs}))
Bm25TermFrequencyScorer(uint32_t num_docs) noexcept
: _num_docs(num_docs)
{ }
double apply_range(double idf) const noexcept {
return (1.0 - _range)*_max_idf + _range * idf;
}
// weight * bm25_idf, scaled to fixedpoint
score_t calculateMaxScore(double estHits, double weight) const noexcept {
return score_t(TermFrequencyScorer_TERM_SCORE_FACTOR * weight *
apply_range(Bm25Executor::calculate_inverse_document_frequency({static_cast<uint64_t>(estHits), _num_docs})));
Bm25Executor::calculate_inverse_document_frequency({static_cast<uint64_t>(estHits), _num_docs}));
}

score_t calculateMaxScore(const Term &term) const noexcept {
Expand All @@ -504,8 +499,6 @@ class Bm25TermFrequencyScorer
}
private:
uint32_t _num_docs;
float _range;
double _max_idf;
};

//-----------------------------------------------------------------------------
Expand Down