Skip to content

Commit

Permalink
Merge pull request #32959 from vespa-engine/toregge/handle-missing-interleaved-features-in-bm25-feature
Browse files Browse the repository at this point in the history

Handle missing interleaved features in bm25 feature.
  • Loading branch information
geirst authored Nov 29, 2024
2 parents e6a50fd + 0e4c3bc commit ee52fe5
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 8 deletions.
7 changes: 7 additions & 0 deletions searchlib/src/tests/features/bm25/bm25_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,4 +274,11 @@ TEST_F(Bm25ExecutorTest, inverse_document_frequency_can_be_overriden_with_signif
EXPECT_TRUE(execute(score(3.0, 20, 0.35)));
}

// Checks the score produced when a term is prepared with all-zero match data.
// NOTE(review): the last two zeros passed to prepare_term() are presumably the
// occurrence count and field length (i.e. no interleaved features) - confirm
// against the fixture's prepare_term() signature.
TEST_F(Bm25ExecutorTest, missing_interleaved_features_are_handled)
{
    setup();
    prepare_term(0, 0, 0, 0);
    // Expected value is what the fixture computes for score(1.0, 10, idf(25)).
    const auto expected_score = score(1.0, 10, idf(25));
    EXPECT_TRUE(execute(expected_score));
}

GTEST_MAIN_RUN_ALL_TESTS()
19 changes: 12 additions & 7 deletions searchlib/src/vespa/searchlib/features/bm25_feature.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,18 @@ Bm25Executor::execute(uint32_t doc_id)
feature_t score = 0;
for (const auto& term : _terms) {
if (term.tfmd->getDocId() == doc_id) {
feature_t num_occs = term.tfmd->getNumOccs();
feature_t norm_field_length = ((feature_t)term.tfmd->getFieldLength()) / _avg_field_length;

feature_t numerator = num_occs * term.idf_mul_k1_plus_one;
feature_t denominator = num_occs + (_k1_mul_one_minus_b + _k1_mul_b * norm_field_length);

score += numerator / denominator;
auto raw_num_occs = term.tfmd->getNumOccs();
if (raw_num_occs == 0) {
// Interleaved features are missing. Assume 1 occurrence and average field length.
score += term.degraded_score;
} else {
feature_t num_occs = raw_num_occs;
feature_t norm_field_length = ((feature_t) term.tfmd->getFieldLength()) / _avg_field_length;
feature_t numerator = num_occs * term.idf_mul_k1_plus_one;
feature_t denominator = num_occs + (_k1_mul_one_minus_b + _k1_mul_b * norm_field_length);

score += numerator / denominator;
}
}
}
outputs().set_number(0, score);
Expand Down
4 changes: 3 additions & 1 deletion searchlib/src/vespa/searchlib/features/bm25_feature.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@ class Bm25Executor : public fef::FeatureExecutor {
fef::TermFieldHandle handle;
const fef::TermFieldMatchData* tfmd;
double idf_mul_k1_plus_one;
double degraded_score;
QueryTerm(fef::TermFieldHandle handle_, double inverse_doc_freq, double k1_param) noexcept
: handle(handle_),
tfmd(nullptr),
idf_mul_k1_plus_one(inverse_doc_freq * (k1_param + 1))
idf_mul_k1_plus_one(inverse_doc_freq * (k1_param + 1)),
degraded_score(inverse_doc_freq)
{}
};

Expand Down

0 comments on commit ee52fe5

Please sign in to comment.