Skip to content

Commit

Permalink
Merge pull request #32977 from vespa-engine/toregge/remove-legacy-wand-term-freq-scorer
Browse files Browse the repository at this point in the history

Remove legacy wand term freq scorer.
  • Loading branch information
geirst authored Dec 1, 2024
2 parents 28f529f + 3751c88 commit d667134
Show file tree
Hide file tree
Showing 10 changed files with 55 additions and 65 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ SearchIterator::UP create_weak_and() {
terms.emplace_back(T({1,2,3}).release(), 100, 3);
terms.emplace_back(T({5,6}).release(), 200, 2);
terms.emplace_back(T({8}).release(), 300, 1);
return WeakAndSearch::create(terms, wand::MatchParams(dummy_heap), wand::TermFrequencyScorer(), 100, true, true);
return WeakAndSearch::create(terms, wand::MatchParams(dummy_heap), wand::Bm25TermFrequencyScorer(num_docs), 100, true, true);
}

void collect(std::map<std::string,size_t> &counts, const auto &node) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ struct RiseWandFactory : SparseVectorFactory {
for (size_t i = 0; i < childCnt; ++i) {
terms.emplace_back(childFactory.createChild(i, limit), default_weight, limit / (i + 1));
}
return std::make_unique<rise::TermFrequencyRiseWand>(terms, n);
return std::make_unique<rise::TermFrequencyRiseWand>(terms, n, rise::TermFreqScorer(limit));
}
};

Expand Down
10 changes: 5 additions & 5 deletions searchlib/src/tests/queryeval/weak_and/rise_wand.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@
#include <vespa/vespalib/util/priority_queue.h>
#include <functional>

using search::queryeval::wand::Bm25TermFrequencyScorer;
using search::queryeval::wand::DotProductScorer;
using search::queryeval::wand::TermFrequencyScorer;
using namespace search::queryeval;

namespace rise {

struct TermFreqScorer
{
[[no_unique_address]] TermFrequencyScorer _termFrequencyScorer;
TermFreqScorer() noexcept
: _termFrequencyScorer()
Bm25TermFrequencyScorer _termFrequencyScorer;
TermFreqScorer(uint32_t num_docs) noexcept
: _termFrequencyScorer(num_docs)
{ }
int64_t calculateMaxScore(const wand::Term &term) const noexcept {
return _termFrequencyScorer.calculateMaxScore(term);
Expand Down Expand Up @@ -127,7 +127,7 @@ class RiseWand : public search::queryeval::SearchIterator
void _sortMerge(uint32_t numStreamsToSort);

public:
RiseWand(const Terms &terms, uint32_t n);
RiseWand(const Terms &terms, uint32_t n, Scorer scorer);
~RiseWand() override;
void next();
void doSeek(uint32_t docid) override;
Expand Down
6 changes: 2 additions & 4 deletions searchlib/src/tests/queryeval/weak_and/rise_wand.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,18 @@
#include <vespa/searchlib/queryeval/wand/wand_parts.h>
#include <cmath>

using search::queryeval::wand::TermFrequencyScorer;

namespace rise {

template <typename Scorer, typename Cmp>
RiseWand<Scorer, Cmp>::RiseWand(const Terms &terms, uint32_t n)
RiseWand<Scorer, Cmp>::RiseWand(const Terms &terms, uint32_t n, Scorer scorer)
: _numStreams(0),
_streams(),
_lastPivotIdx(0),
_streamDocIds(new docid_t[terms.size()]),
_streamIndices(new uint16_t[terms.size()]),
_streamIndicesAux(new uint16_t[terms.size()]),
_streamComparator(_streamDocIds),
_scorer(),
_scorer(std::move(scorer)),
_n(n),
_limit(1),
_streamScores(new score_t[terms.size()]),
Expand Down
27 changes: 18 additions & 9 deletions searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,14 +119,16 @@ VespaWandFactory::~VespaWandFactory() = default;
struct VespaArrayWandFactory : WandFactory {
mutable SharedWeakAndPriorityQueue _scores;
uint32_t n;
explicit VespaArrayWandFactory(uint32_t n_in)
uint32_t docid_limit;
explicit VespaArrayWandFactory(uint32_t n_in, uint32_t docid_limit_in)
: _scores(n_in),
n(n_in)
n(n_in),
docid_limit(docid_limit_in)
{}
~VespaArrayWandFactory() override;
std::string name() const override { return make_string("VESPA ARRAY WAND (n=%u)", n); }
SearchIterator::UP create(const wand::Terms &terms) override {
return WeakAndSearch::createArrayWand(terms, wand::MatchParams(_scores, wand::StopWordStrategy::none(), 1, 0), wand::TermFrequencyScorer(), n, true, false);
return WeakAndSearch::createArrayWand(terms, wand::MatchParams(_scores, wand::StopWordStrategy::none(), 1, 0), wand::Bm25TermFrequencyScorer(docid_limit), n, true, false);
}
};

Expand All @@ -135,14 +137,16 @@ VespaArrayWandFactory::~VespaArrayWandFactory() = default;
struct VespaHeapWandFactory : WandFactory {
mutable SharedWeakAndPriorityQueue _scores;
uint32_t n;
explicit VespaHeapWandFactory(uint32_t n_in)
uint32_t docid_limit;
explicit VespaHeapWandFactory(uint32_t n_in, uint32_t docid_limit_in)
: _scores(n_in),
n(n_in)
n(n_in),
docid_limit(docid_limit_in)
{}
~VespaHeapWandFactory() override;
std::string name() const override { return make_string("VESPA HEAP WAND (n=%u)", n); }
SearchIterator::UP create(const wand::Terms &terms) override {
return WeakAndSearch::createHeapWand(terms, wand::MatchParams(_scores, wand::StopWordStrategy::none(), 1, 0), wand::TermFrequencyScorer(), n, true, false);
return WeakAndSearch::createHeapWand(terms, wand::MatchParams(_scores, wand::StopWordStrategy::none(), 1, 0), wand::Bm25TermFrequencyScorer(docid_limit), n, true, false);
}
};

Expand Down Expand Up @@ -191,11 +195,16 @@ VespaParallelHeapWandFactory::~VespaParallelHeapWandFactory() = default;

struct TermFrequencyRiseWandFactory : WandFactory {
uint32_t n;
explicit TermFrequencyRiseWandFactory(uint32_t n_in) noexcept : n(n_in) {}
uint32_t docid_limit;
explicit TermFrequencyRiseWandFactory(uint32_t n_in, uint32_t docid_limit_in) noexcept
: n(n_in),
docid_limit(docid_limit_in)
{
}
~TermFrequencyRiseWandFactory() override;
std::string name() const override { return make_string("RISE WAND TF (n=%u)", n); }
SearchIterator::UP create(const wand::Terms &terms) override {
return std::make_unique<rise::TermFrequencyRiseWand>(terms, n);
return std::make_unique<rise::TermFrequencyRiseWand>(terms, n, rise::TermFreqScorer(docid_limit));
}
};

Expand All @@ -207,7 +216,7 @@ struct DotProductRiseWandFactory : WandFactory {
~DotProductRiseWandFactory() override;
std::string name() const override { return make_string("RISE WAND DP (n=%u)", n); }
SearchIterator::UP create(const wand::Terms &terms) override {
return std::make_unique<rise::DotProductRiseWand>(terms, n);
return std::make_unique<rise::DotProductRiseWand>(terms, n, DotProductScorer());
}
};

Expand Down
26 changes: 14 additions & 12 deletions searchlib/src/tests/queryeval/weak_and/weak_and_bench.cpp
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "wand_bench_setup.hpp"

TEST_FF("benchmark", VespaWandFactory(1000), WandSetup(f1, 10, 10000000)) { f2.benchmark(); }
TEST_FF("benchmark", TermFrequencyRiseWandFactory(1000), WandSetup(f1, 10, 10000000)) { f2.benchmark(); }
TEST_FF("benchmark", VespaWandFactory(1000), WandSetup(f1, 100, 10000000)) { f2.benchmark(); }
TEST_FF("benchmark", TermFrequencyRiseWandFactory(1000), WandSetup(f1, 100, 10000000)) { f2.benchmark(); }
TEST_FF("benchmark", VespaWandFactory(1000), WandSetup(f1, 1000, 10000000)) { f2.benchmark(); }
TEST_FF("benchmark", TermFrequencyRiseWandFactory(1000), WandSetup(f1, 1000, 10000000)) { f2.benchmark(); }
constexpr uint32_t docid_limit = 10000000;

TEST_FFF("benchmark", VespaWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 10, 10000000)) { f3.benchmark(); }
TEST_FFF("benchmark", TermFrequencyRiseWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 10, 10000000)) { f3.benchmark(); }
TEST_FFF("benchmark", VespaWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 100, 10000000)) { f3.benchmark(); }
TEST_FFF("benchmark", TermFrequencyRiseWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 100, 10000000)) { f3.benchmark(); }
TEST_FFF("benchmark", VespaWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 1000, 10000000)) { f3.benchmark(); }
TEST_FFF("benchmark", TermFrequencyRiseWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 1000, 10000000)) { f3.benchmark(); }
TEST_FF("benchmark", VespaWandFactory(1000), WandSetup(f1, 10, docid_limit)) { f2.benchmark(); }
TEST_FF("benchmark", TermFrequencyRiseWandFactory(1000, docid_limit), WandSetup(f1, 10, docid_limit)) { f2.benchmark(); }
TEST_FF("benchmark", VespaWandFactory(1000), WandSetup(f1, 100, docid_limit)) { f2.benchmark(); }
TEST_FF("benchmark", TermFrequencyRiseWandFactory(1000, docid_limit), WandSetup(f1, 100, docid_limit)) { f2.benchmark(); }
TEST_FF("benchmark", VespaWandFactory(1000), WandSetup(f1, 1000, docid_limit)) { f2.benchmark(); }
TEST_FF("benchmark", TermFrequencyRiseWandFactory(1000, docid_limit), WandSetup(f1, 1000, docid_limit)) { f2.benchmark(); }

TEST_FFF("benchmark", VespaWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 10, docid_limit)) { f3.benchmark(); }
TEST_FFF("benchmark", TermFrequencyRiseWandFactory(1000, docid_limit), FilterFactory(f1, 2), WandSetup(f2, 10, docid_limit)) { f3.benchmark(); }
TEST_FFF("benchmark", VespaWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 100, docid_limit)) { f3.benchmark(); }
TEST_FFF("benchmark", TermFrequencyRiseWandFactory(1000, docid_limit), FilterFactory(f1, 2), WandSetup(f2, 100, docid_limit)) { f3.benchmark(); }
TEST_FFF("benchmark", VespaWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 1000, docid_limit)) { f3.benchmark(); }
TEST_FFF("benchmark", TermFrequencyRiseWandFactory(1000, docid_limit), FilterFactory(f1, 2), WandSetup(f2, 1000, docid_limit)) { f3.benchmark(); }

TEST_MAIN() { TEST_RUN_ALL(); }
11 changes: 7 additions & 4 deletions searchlib/src/tests/queryeval/weak_and/weak_and_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ using History = SearchHistory;

namespace {

constexpr uint32_t docid_limit = 116;

struct MyWandSpec : public WandSpec
{
SharedWeakAndPriorityQueue scores;
Expand All @@ -30,7 +32,7 @@ struct MyWandSpec : public WandSpec
scores(n_in),
n(n_in),
matching_phase(MatchingPhase::FIRST_PHASE),
my_params(scores, wand::StopWordStrategy::none(), 1, 0)
my_params(scores, wand::StopWordStrategy::none(), 1, docid_limit)
{}
SearchIterator *create() {
bool readonly_scores_heap = (matching_phase != MatchingPhase::FIRST_PHASE);
Expand All @@ -39,7 +41,7 @@ struct MyWandSpec : public WandSpec
}
void set_second_phase() { matching_phase = MatchingPhase::SECOND_PHASE; }
void set_abs_stop_word_adjust_limit(double limit) {
my_params.stop_words = wand::StopWordStrategy(-limit, 1.0, 0);
my_params.stop_words = wand::StopWordStrategy(-limit, 1.0, docid_limit);
}
SimpleResult search() {
SearchIterator::UP search(create());
Expand Down Expand Up @@ -142,7 +144,8 @@ TEST(WeakAndTest, require_that_initial_docid_for_subsearches_are_taken_into_acco
terms.push_back(wand::Term(new TrackedSearch("foo", history, new EagerChild(search::endDocId)), 100, 1));
terms.push_back(wand::Term(new TrackedSearch("bar", history, new EagerChild(10)), 100, 2));
SharedWeakAndPriorityQueue scores(2);
auto search = std::make_unique<TrackedSearch>("WAND", history, WeakAndSearch::create(terms, wand::MatchParams(scores), 2, true, false));
wand::MatchParams match_params(scores, wand::StopWordStrategy::none(), wand::DEFAULT_PARALLEL_WAND_SCORES_ADJUST_FREQUENCY, docid_limit);
auto search = std::make_unique<TrackedSearch>("WAND", history, WeakAndSearch::create(terms, match_params, 2, true, false));
SimpleResult hits;
hits.search(*search);
EXPECT_EQ(SimpleResult().addHit(10), hits);
Expand Down Expand Up @@ -191,7 +194,7 @@ class IteratorChildrenVerifier : public search::test::IteratorChildrenVerifier {
}
static constexpr size_t LARGE_ENOUGH_HEAP_FOR_ALL = 10000;
_scores.push_back(std::make_unique<SharedWeakAndPriorityQueue>(LARGE_ENOUGH_HEAP_FOR_ALL));
return WeakAndSearch::create(terms, wand::MatchParams(*_scores.back(), wand::StopWordStrategy::none(), 1, 0), -1, strict, false);
return WeakAndSearch::create(terms, wand::MatchParams(*_scores.back(), wand::StopWordStrategy::none(), 1, docid_limit), -1, strict, false);
}
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,25 +51,25 @@ TEST("require that mod search works") {
//---- WeakAndSearch ------------------------------------------------------------------------------

TEST_FF("require that (array) WAND and RISE WAND gives the same hits",
VespaArrayWandFactory(NUM_CHILDREN), TermFrequencyRiseWandFactory(NUM_CHILDREN))
VespaArrayWandFactory(NUM_CHILDREN, LIMIT), TermFrequencyRiseWandFactory(NUM_CHILDREN, LIMIT))
{
checkWandHits<WeakAndSearch, TermFrequencyRiseWand>(f1, f2, 1, 0);
}

TEST_FF("require that (heap) WAND and RISE WAND gives the same hits",
VespaHeapWandFactory(NUM_CHILDREN), TermFrequencyRiseWandFactory(NUM_CHILDREN))
VespaHeapWandFactory(NUM_CHILDREN, LIMIT), TermFrequencyRiseWandFactory(NUM_CHILDREN, LIMIT))
{
checkWandHits<WeakAndSearch, TermFrequencyRiseWand>(f1, f2, 1, 0);
}

TEST_FF("require that (array) WAND and RISE WAND gives the same hits with filtering and skipping",
VespaArrayWandFactory(NUM_CHILDREN), TermFrequencyRiseWandFactory(NUM_CHILDREN))
VespaArrayWandFactory(NUM_CHILDREN, LIMIT), TermFrequencyRiseWandFactory(NUM_CHILDREN, LIMIT))
{
checkWandHits<WeakAndSearch, TermFrequencyRiseWand>(f1, f2, 123, 5);
}

TEST_FF("require that (heap) WAND and RISE WAND gives the same hits with filtering and skipping",
VespaHeapWandFactory(NUM_CHILDREN), TermFrequencyRiseWandFactory(NUM_CHILDREN))
VespaHeapWandFactory(NUM_CHILDREN, LIMIT), TermFrequencyRiseWandFactory(NUM_CHILDREN, LIMIT))
{
checkWandHits<WeakAndSearch, TermFrequencyRiseWand>(f1, f2, 123, 5);
}
Expand Down
21 changes: 0 additions & 21 deletions searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h
Original file line number Diff line number Diff line change
Expand Up @@ -455,27 +455,6 @@ DualHeap<FutureHeap, PastHeap>::stringify() const {

constexpr double TermFrequencyScorer_TERM_SCORE_FACTOR = 1000000.0;

/**
 * Pseudo term-frequency scorer for use with WeakAndAlgorithm.
 *
 * The value it computes serves as both the max score and the regular
 * score of a term.
 */
struct TermFrequencyScorer
{
    /**
     * Score from a hit estimate and a weight, scaled to fixed point:
     *   TERM_SCORE_FACTOR * weight / (1 + log(1 + estHits / 1000))
     * converted to score_t.
     */
    score_t calculateMaxScore(double estHits, double weight) const noexcept {
        const double scaled_weight = TermFrequencyScorer_TERM_SCORE_FACTOR * weight;
        const double damping = 1.0 + log(1.0 + (estHits / 1000.0));
        return static_cast<score_t>(scaled_weight / damping);
    }

    /** Max score for a Term: the two-argument result for its estHits/weight, plus one. */
    score_t calculateMaxScore(const Term &term) const noexcept {
        const score_t base = calculateMaxScore(term.estHits, term.weight);
        return base + 1;
    }

    /**
     * Max score for the entry identified by ref in input; same computation
     * as for a Term, with the values read through get_est_hits()/get_weight().
     */
    template <typename Input>
    score_t calculate_max_score(const Input &input, ref_t ref) const noexcept {
        const score_t base = calculateMaxScore(input.get_est_hits(ref), input.get_weight(ref));
        return base + 1;
    }
};

class Bm25TermFrequencyScorer
{
public:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,14 +173,13 @@ WeakAndSearch::create(const Terms &terms, const MatchParams & params, const Scor
SearchIterator::UP
WeakAndSearch::create(const Terms &terms, const MatchParams & params, uint32_t n, bool strict, bool readonly_scores_heap)
{
return create(terms, params, wand::TermFrequencyScorer(), n, strict, readonly_scores_heap);
return create(terms, params, wand::Bm25TermFrequencyScorer(params.docid_limit), n, strict, readonly_scores_heap);
}

//-----------------------------------------------------------------------------

template SearchIterator::UP WeakAndSearch::create<wand::TermFrequencyScorer>(const Terms &terms, const MatchParams & params, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict, bool readonly_scores_heap);
template SearchIterator::UP WeakAndSearch::create<wand::Bm25TermFrequencyScorer>(const Terms &terms, const MatchParams & params, const wand::Bm25TermFrequencyScorer & scorer, uint32_t n, bool strict, bool readonly_scores_heap);
template SearchIterator::UP WeakAndSearch::createArrayWand<wand::TermFrequencyScorer>(const Terms &terms, const MatchParams & params, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict, bool readonly_scores_heap);
template SearchIterator::UP WeakAndSearch::createHeapWand<wand::TermFrequencyScorer>(const Terms &terms, const MatchParams & params, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict, bool readonly_scores_heap);
template SearchIterator::UP WeakAndSearch::createArrayWand<wand::Bm25TermFrequencyScorer>(const Terms &terms, const MatchParams & params, const wand::Bm25TermFrequencyScorer & scorer, uint32_t n, bool strict, bool readonly_scores_heap);
template SearchIterator::UP WeakAndSearch::createHeapWand<wand::Bm25TermFrequencyScorer>(const Terms &terms, const MatchParams & params, const wand::Bm25TermFrequencyScorer & scorer, uint32_t n, bool strict, bool readonly_scores_heap);

}

0 comments on commit d667134

Please sign in to comment.