From 9bc92284e77994e891261ba2476b17b466561bad Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Wed, 21 Aug 2024 13:49:31 +0200 Subject: [PATCH] Check fields instead of indexes when filling matching elements. --- .../searchlib/query/streaming/queryterm.h | 1 + .../matching_elements_filler_test.cpp | 97 +++++++++++++++++- .../matching_elements_filler.cpp | 99 +++++++++++++------ .../searchvisitor/matching_elements_filler.h | 5 +- .../src/vespa/searchvisitor/querytermdata.h | 1 + .../src/vespa/searchvisitor/searchvisitor.cpp | 3 +- 6 files changed, 172 insertions(+), 34 deletions(-) diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h index 38d13cc8b799..aec3387b37e1 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h @@ -95,6 +95,7 @@ class QueryTerm : public QueryTermUCS4, public QueryNode FieldInfo & getFieldInfo(size_t fid) { return _fieldInfo[fid]; } size_t getFieldInfoSize() const { return _fieldInfo.size(); } QueryNodeResultBase & getQueryItem() { return *_result; } + const QueryNodeResultBase & getQueryItem() const noexcept { return *_result; } const HitList & getHitList() const { return _hitList; } void visitMembers(vespalib::ObjectVisitor &visitor) const override; void setIndex(std::string index_) override { _index = std::move(index_); } diff --git a/streamingvisitors/src/tests/matching_elements_filler/matching_elements_filler_test.cpp b/streamingvisitors/src/tests/matching_elements_filler/matching_elements_filler_test.cpp index f7e886a7624c..ed4e35e0eacd 100644 --- a/streamingvisitors/src/tests/matching_elements_filler/matching_elements_filler_test.cpp +++ b/streamingvisitors/src/tests/matching_elements_filler/matching_elements_filler_test.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -44,7 +45,12 @@ using search::fef::MatchData; using search::query::StackDumpCreator; using search::query::Weight; using search::streaming::Query; +using search::streaming::QueryConnector; +using search::streaming::QueryNode; using search::streaming::QueryNodeResultFactory; +using search::streaming::QueryTerm; +using streaming::QueryTermData; +using streaming::QueryTermDataFactory; using streaming::HitCollector; using streaming::MatchingElementsFiller; using vdslib::SearchResult; @@ -86,8 +92,8 @@ struct BoundTerm { Query make_query(std::unique_ptr root) { std::string stack_dump = StackDumpCreator::create(*root); - QueryNodeResultFactory empty; - Query query(empty, stack_dump); + QueryTermDataFactory factory(nullptr); + Query query(factory, stack_dump); return query; } @@ -128,9 +134,12 @@ struct MyDocType { const ArrayDataType _elem_array_type; const MapDataType _elem_map_type; const MapDataType _str_int_map_type; + const ArrayDataType _string_array_type; const Field _elem_array_field; const Field _elem_map_field; const Field _str_int_map_field; + const Field _apples_field; + const Field _oranges_field; DocumentType _document_type; MyDocType(); @@ -139,6 +148,7 @@ struct MyDocType { std::unique_ptr make_elem_array(std::vector> values) const; std::unique_ptr make_elem_map(std::map> values) const; std::unique_ptr make_str_int_map(std::map values) const; + std::unique_ptr make_str_array(std::vector values) const; FieldPath make_field_path(std::string path) const; std::unique_ptr make_test_doc() const; }; @@ -150,14 +160,19 @@ MyDocType::MyDocType() _elem_array_type(_elem_type), _elem_map_type(*DataType::STRING, _elem_type), _str_int_map_type(*DataType::STRING, *DataType::INT), + _string_array_type(*DataType::STRING), _elem_array_field("elem_array", 3, _elem_array_type), _elem_map_field("elem_map", 4, _elem_map_type), _str_int_map_field("str_int_map", _str_int_map_type), + _apples_field("apples", _string_array_type), + _oranges_field("oranges", _string_array_type), _document_type("test") { _document_type.addField(_elem_array_field); _document_type.addField(_elem_map_field); _document_type.addField(_str_int_map_field); + _document_type.addField(_apples_field); + _document_type.addField(_oranges_field); } MyDocType::~MyDocType() = default; @@ -201,6 +216,16 @@ MyDocType::make_str_int_map(std::map values) const return ret; } +std::unique_ptr +MyDocType::make_str_array(std::vector values) const +{ + auto ret = std::make_unique(_string_array_type); + for (auto& v : values) { + ret->add(StringFieldValue(v)); + } + return ret; +} + FieldPath MyDocType::make_field_path(std::string path) const { @@ -217,6 +242,8 @@ MyDocType::make_test_doc() const // the elements in maps are ordered on the key doc->setValue("elem_map", *make_elem_map({{"@foo", {"foo", 10}}, {"@bar", {"bar", 20}},{"@baz", {"baz", 30}},{"@foo@", {"foo", 40}},{"@zap", {"zap", 20}}, {"@zap@", {"zap", 20}}})); doc->setValue("str_int_map", *make_str_int_map({{"@foo", 10}, {"@bar", 20}, {"@baz", 30}, {"@foo@", 40}, {"@zap", 20}, {"@zap@", 20}})); + doc->setValue("apples", make_str_array({"one", "two", "three"})); + doc->setValue("oranges", *make_str_array({"three", "two", "one"})); return doc; } @@ -229,9 +256,29 @@ vsm::SharedFieldPathMap make_field_path_map(const MyDocType& doc_type) { ret->emplace_back(doc_type.make_field_path("elem_map.value.weight")); ret->emplace_back(doc_type.make_field_path("str_int_map.key")); ret->emplace_back(doc_type.make_field_path("str_int_map.value")); + ret->emplace_back(doc_type.make_field_path("apples")); + ret->emplace_back(doc_type.make_field_path("oranges")); return ret; } +void +setup_index_env(streaming::IndexEnvironment& index_env) +{ + using DataType = search::fef::FieldInfo::DataType; + // Populate mapping from field id to field name, cf. IndexEnvPrototype::detectFields + index_env.addField("elem_array.name", true, DataType::STRING); + index_env.addField("elem_array.weight", true, DataType::INT32); + index_env.addField("elem_map.key", true, DataType::STRING); + index_env.addField("elem_map.value.name", true, DataType::STRING); + index_env.addField("elem_map.value.weight", true, DataType::INT32); + index_env.addField("str_int_map.key", true, DataType::STRING); + index_env.addField("str_int_map.value", true, DataType::INT32); + index_env.addField("apples", true, DataType::STRING); + index_env.addField("oranges", true, DataType::STRING); + // Allocate field ids for virtual fields elem_array, elem_map, elem_map.value and str_int_map + index_env.add_virtual_fields(); +} + vsm::FieldIdTSearcherMap make_field_searcher_map() { vsm::FieldIdTSearcherMap ret; ret.emplace_back(std::make_unique(0)); @@ -241,6 +288,8 @@ vsm::FieldIdTSearcherMap make_field_searcher_map() { ret.emplace_back(std::make_unique(4)); ret.emplace_back(std::make_unique(5)); ret.emplace_back(std::make_unique(6)); + ret.emplace_back(std::make_unique(7)); + ret.emplace_back(std::make_unique(8)); return ret; } @@ -254,6 +303,9 @@ vsm::DocumentTypeIndexFieldMapT make_index_to_field_ids() { index_map["elem_map.value.weight"] = FieldIdTList{4}; index_map["str_int_map.key"] = FieldIdTList{5}; index_map["str_int_map.value"] = FieldIdTList{6}; + index_map["apples"] = FieldIdTList{7}; + index_map["oranges"] = FieldIdTList{8}; + index_map["fruit"] = FieldIdTList{7, 8}; return ret; } @@ -266,6 +318,8 @@ MatchingElementsFields make_matching_elements_fields() { fields.add_mapping("elem_map", "elem_map.value.weight"); fields.add_mapping("str_int_map", "str_int_map.key"); fields.add_mapping("str_int_map", "str_int_map.value"); + fields.add_field("apples"); + fields.add_field("oranges"); return fields; } @@ -284,7 +338,10 @@ class MatchingElementsFillerTest : public ::testing::Test { std::unique_ptr _matching_elements; public: MatchingElementsFillerTest(); - ~MatchingElementsFillerTest(); + ~MatchingElementsFillerTest() override; + template + static T* as(QueryNode& query_node) { return dynamic_cast(&query_node); } + void populate_term_data(QueryNode& query_node); void fill_matching_elements(Query &&query); void assert_elements(uint32_t doc_lid, const std::string& field, const ElementVector& exp_elements); void assert_same_element(const std::string& field, const std::string& term1, const std::string& term2, const ElementVector& exp_elements); @@ -305,6 +362,7 @@ MatchingElementsFillerTest::MatchingElementsFillerTest() _matching_elements() { _env.field_paths = make_field_path_map(_doc_type); + setup_index_env(_env.index_env); _search_result.addHit(1, "id::test::1", 0.0, nullptr, 0); auto sdoc = std::make_shared(_doc_type.make_test_doc(), _env.field_paths, _env.field_paths->size()); EXPECT_TRUE(sdoc->valid()); @@ -314,14 +372,35 @@ MatchingElementsFillerTest::MatchingElementsFillerTest() MatchingElementsFillerTest::~MatchingElementsFillerTest() = default; +void +MatchingElementsFillerTest::populate_term_data(QueryNode& query_node) +{ + auto& index_map = _index_to_field_ids["test"]; + if (auto query_term = as(query_node)) { + auto& qtd = dynamic_cast(query_term->getQueryItem()); + + auto itr = index_map.find(query_node.getIndex()); + if (itr != index_map.end()) { + for (auto field_id : itr->second) { + qtd.getTermData().addField(field_id); + } + } + } else if (auto intermediate = as(query_node)) { + for (size_t i = 0; i < intermediate->size(); ++i) { + populate_term_data(*(*intermediate)[i]); + } + } +} + void MatchingElementsFillerTest::fill_matching_elements(Query &&query) { _matching_elements_filler.reset(); _matching_elements.reset(); _query = std::move(query); + populate_term_data(_query.getRoot()); _env.prepare(_field_searcher_map, _index_to_field_ids, _query); - _matching_elements_filler = std::make_unique(_field_searcher_map, _query, _hit_collector, _search_result); + _matching_elements_filler = std::make_unique(_field_searcher_map, _env.index_env, _query, _hit_collector, _search_result); _matching_elements = _matching_elements_filler->fill_matching_elements(_matching_elems_fields); } @@ -407,4 +486,14 @@ TEST_F(MatchingElementsFillerTest, union_of_matching_elements) assert_elements(1, "elem_array", { 0, 1, 3, 4, 5 }); } +TEST_F(MatchingElementsFillerTest, separate_collection_for_each_field) +{ + MyQueryBuilder builder; + // fieldset fruit { fields: apples, oranges } + builder.add_term("fruit:one", 0); + fill_matching_elements(make_query(builder.build())); + assert_elements(1, "apples", { 0 }); + assert_elements(1, "oranges", { 2 }); +} + GTEST_MAIN_RUN_ALL_TESTS() diff --git a/streamingvisitors/src/vespa/searchvisitor/matching_elements_filler.cpp b/streamingvisitors/src/vespa/searchvisitor/matching_elements_filler.cpp index 8035dd62d994..25d2d5a588d6 100644 --- a/streamingvisitors/src/vespa/searchvisitor/matching_elements_filler.cpp +++ b/streamingvisitors/src/vespa/searchvisitor/matching_elements_filler.cpp @@ -1,8 +1,10 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "matching_elements_filler.h" +#include "querytermdata.h" #include #include +#include #include #include #include @@ -10,6 +12,7 @@ #include #include "hitcollector.h" #include +#include using search::MatchingElements; using search::MatchingElementsFields; @@ -23,7 +26,9 @@ using search::streaming::QueryNode; using search::streaming::QueryTerm; using search::streaming::SameElementQueryNode; using search::streaming::WeightedSetTerm; +using streaming::QueryTermData; using vdslib::SearchResult; +using vsm::FieldIdT; using vsm::FieldIdTSearcherMap; using vsm::StorageDocument; @@ -34,15 +39,18 @@ namespace { struct SubFieldTerm { std::string _field_name; + FieldIdT _id; const QueryTerm* _term; public: - SubFieldTerm(std::string field_name, const QueryTerm* term) noexcept + SubFieldTerm(std::string field_name, FieldIdT id, const QueryTerm* term) noexcept : _field_name(std::move(field_name)), + _id(id), _term(term) { } - const std::string& get_field_name() const { return _field_name; } - const QueryTerm& get_term() const { return *_term; } + const std::string& get_field_name() const noexcept { return _field_name; } + const QueryTerm& get_term() const noexcept { return *_term; } + FieldIdT get_id() const noexcept { return _id; } }; class Matcher @@ -50,16 +58,20 @@ class Matcher std::vector _same_element_nodes; std::vector _sub_field_terms; vsm::FieldIdTSearcherMap& _field_searcher_map; + const search::fef::IIndexEnvironment& _index_env; HitList _hit_list; std::vector _elements; + const std::string* matching_elements_field(const MatchingElementsFields& fields, FieldIdT field_id); void select_multiterm_children(const MatchingElementsFields& fields, const MultiTerm& multi_term); + void select_query_term_node(const MatchingElementsFields& fields, const QueryTerm& term); void select_query_nodes(const MatchingElementsFields& fields, const QueryNode& query_node); - void add_matching_elements(const std::string& field_name, uint32_t doc_lid, const HitList& hit_list, MatchingElements& matching_elements); + void add_matching_elements(const std::string& field_name, std::optional field_id, uint32_t doc_lid, const HitList& hit_list, MatchingElements& matching_elements); void find_matching_elements(const SameElementQueryNode& same_element, uint32_t doc_lid, MatchingElements& matching_elements); void find_matching_elements(const SubFieldTerm& sub_field_term, uint32_t doc_lid, MatchingElements& matching_elements); public: - Matcher(vsm::FieldIdTSearcherMap& field_searcher_map, const MatchingElementsFields& fields, const Query& query); + Matcher(vsm::FieldIdTSearcherMap& field_searcher_map, const search::fef::IIndexEnvironment& index_env, + const MatchingElementsFields& fields, const Query& query); ~Matcher(); bool empty() const { return _same_element_nodes.empty() && _sub_field_terms.empty(); } void find_matching_elements(const vsm::StorageDocument& doc, uint32_t doc_lid, MatchingElements& matching_elements); @@ -68,10 +80,12 @@ class Matcher template const T* as(const QueryNode& query_node) { return dynamic_cast(&query_node); } -Matcher::Matcher(FieldIdTSearcherMap& field_searcher_map, const MatchingElementsFields& fields, const Query& query) +Matcher::Matcher(FieldIdTSearcherMap& field_searcher_map, const search::fef::IIndexEnvironment& index_env, + const MatchingElementsFields& fields, const Query& query) : _same_element_nodes(), _sub_field_terms(), _field_searcher_map(field_searcher_map), + _index_env(index_env), _hit_list() { select_query_nodes(fields, query.getRoot()); @@ -79,20 +93,50 @@ Matcher::Matcher(FieldIdTSearcherMap& field_searcher_map, const MatchingElements Matcher::~Matcher() = default; +const std::string* +Matcher::matching_elements_field(const MatchingElementsFields& fields, FieldIdT field_id) +{ + auto* field_info = _index_env.getField(field_id); + if (field_info != nullptr) { + auto &field_name = field_info->name(); + if (fields.has_struct_field(field_name)) { + return &fields.get_enclosing_field(field_name); + } else if (fields.has_field(field_name)) { + return &field_name; + } + } + return nullptr; +} + void Matcher::select_multiterm_children(const MatchingElementsFields& fields, const MultiTerm& multi_term) { - const std::string* field = nullptr; - auto &multi_term_index = multi_term.getIndex(); - if (fields.has_struct_field(multi_term_index)) { - field = &fields.get_enclosing_field(multi_term_index); - } else if (fields.has_field(multi_term_index)) { - field = &multi_term_index; + auto& qtd = dynamic_cast(multi_term.getQueryItem()); + auto& td = qtd.getTermData(); + auto num_fields = td.numFields(); + for (size_t i = 0; i < num_fields; ++i) { + auto field_id = td.field(i).getFieldId(); + auto* field = matching_elements_field(fields, field_id); + if (field != nullptr) { + auto& terms = multi_term.get_terms(); + for (auto &term : terms) { + _sub_field_terms.emplace_back(*field, field_id, term.get()); + } + } } - if (field != nullptr) { - auto &terms = multi_term.get_terms(); - for (auto& term : terms) { - _sub_field_terms.emplace_back(*field, term.get()); +} + +void +Matcher::select_query_term_node(const MatchingElementsFields& fields, const QueryTerm& query_term) +{ + auto& qtd = dynamic_cast(query_term.getQueryItem()); + auto& td = qtd.getTermData(); + auto num_fields = td.numFields(); + for (size_t i = 0; i < num_fields; ++i) { + auto field_id = td.field(i).getFieldId(); + auto* field = matching_elements_field(fields, field_id); + if (field != nullptr) { + _sub_field_terms.emplace_back(*field, field_id, &query_term); } } } @@ -109,12 +153,7 @@ Matcher::select_query_nodes(const MatchingElementsFields& fields, const QueryNod } else if (auto in_term = as(query_node)) { select_multiterm_children(fields, *in_term); } else if (auto query_term = as(query_node)) { - if (fields.has_struct_field(query_term->getIndex())) { - _sub_field_terms.emplace_back(fields.get_enclosing_field(query_term->getIndex()), query_term); - } - if (fields.has_field(query_term->getIndex())) { - _sub_field_terms.emplace_back(query_term->getIndex(), query_term); - } + select_query_term_node(fields, *query_term); } else if (auto and_not = as(query_node)) { select_query_nodes(fields, *(*and_not)[0]); } else if (auto intermediate = as(query_node)) { @@ -125,11 +164,13 @@ Matcher::select_query_nodes(const MatchingElementsFields& fields, const QueryNod } void -Matcher::add_matching_elements(const std::string& field_name, uint32_t doc_lid, const HitList& hit_list, MatchingElements& matching_elements) +Matcher::add_matching_elements(const std::string& field_name, std::optional field_id, uint32_t doc_lid, const HitList& hit_list, MatchingElements& matching_elements) { _elements.clear(); for (auto& hit : hit_list) { - _elements.emplace_back(hit.element_id()); + if (!field_id.has_value() || field_id.value() == hit.field_id()) { + _elements.emplace_back(hit.element_id()); + } } if (_elements.size() > 1) { std::sort(_elements.begin(), _elements.end()); @@ -144,7 +185,7 @@ Matcher::find_matching_elements(const SameElementQueryNode& same_element, uint32 { const HitList& hit_list = same_element.evaluateHits(_hit_list); if (!hit_list.empty()) { - add_matching_elements(same_element.getIndex(), doc_lid, hit_list, matching_elements); + add_matching_elements(same_element.getIndex(), std::nullopt, doc_lid, hit_list, matching_elements); } } @@ -153,7 +194,7 @@ Matcher::find_matching_elements(const SubFieldTerm& sub_field_term, uint32_t doc { const HitList& hit_list = sub_field_term.get_term().evaluateHits(_hit_list); if (!hit_list.empty()) { - add_matching_elements(sub_field_term.get_field_name(), doc_lid, hit_list, matching_elements); + add_matching_elements(sub_field_term.get_field_name(), sub_field_term.get_id(), doc_lid, hit_list, matching_elements); } } @@ -173,10 +214,12 @@ Matcher::find_matching_elements(const StorageDocument& doc, uint32_t doc_lid, Ma } -MatchingElementsFiller::MatchingElementsFiller(FieldIdTSearcherMap& field_searcher_map, Query& query, +MatchingElementsFiller::MatchingElementsFiller(FieldIdTSearcherMap& field_searcher_map, + const search::fef::IIndexEnvironment& index_env, Query& query, HitCollector& hit_collector, SearchResult& search_result) : vsm::IMatchingElementsFiller(), _field_searcher_map(field_searcher_map), + _index_env(index_env), _query(query), _hit_collector(hit_collector), _search_result(search_result) @@ -192,7 +235,7 @@ MatchingElementsFiller::fill_matching_elements(const MatchingElementsFields& fie if (fields.empty()) { return result; } - Matcher matcher(_field_searcher_map, fields, _query); + Matcher matcher(_field_searcher_map, _index_env, fields, _query); if (matcher.empty()) { return result; } diff --git a/streamingvisitors/src/vespa/searchvisitor/matching_elements_filler.h b/streamingvisitors/src/vespa/searchvisitor/matching_elements_filler.h index 987efec3d46b..d70ce19d99da 100644 --- a/streamingvisitors/src/vespa/searchvisitor/matching_elements_filler.h +++ b/streamingvisitors/src/vespa/searchvisitor/matching_elements_filler.h @@ -4,6 +4,7 @@ #include +namespace search::fef { class IIndexEnvironment; } namespace search::streaming { class Query; } namespace vdslib { class SearchResult; } namespace vsm { @@ -21,12 +22,14 @@ class HitCollector; */ class MatchingElementsFiller : public vsm::IMatchingElementsFiller { vsm::FieldIdTSearcherMap& _field_searcher_map; + const search::fef::IIndexEnvironment& _index_env; search::streaming::Query& _query; HitCollector& _hit_collector; vdslib::SearchResult& _search_result; public: - MatchingElementsFiller(vsm::FieldIdTSearcherMap& field_searcher_map, search::streaming::Query& query, + MatchingElementsFiller(vsm::FieldIdTSearcherMap& field_searcher_map, + const search::fef::IIndexEnvironment& index_env, search::streaming::Query& query, HitCollector& hit_collector, vdslib::SearchResult& search_result); virtual ~MatchingElementsFiller(); std::unique_ptr fill_matching_elements(const search::MatchingElementsFields& fields) override; diff --git a/streamingvisitors/src/vespa/searchvisitor/querytermdata.h b/streamingvisitors/src/vespa/searchvisitor/querytermdata.h index d16cd4f70d4e..ac0d26c436ae 100644 --- a/streamingvisitors/src/vespa/searchvisitor/querytermdata.h +++ b/streamingvisitors/src/vespa/searchvisitor/querytermdata.h @@ -18,6 +18,7 @@ class QueryTermData : public search::streaming::QueryNodeResultBase public: QueryTermData * clone() const override { return new QueryTermData(); } search::fef::SimpleTermData &getTermData() noexcept { return _termData; } + const search::fef::SimpleTermData &getTermData() const noexcept { return _termData; } }; class QueryTermDataFactory final : public search::streaming::QueryNodeResultFactory { diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp index 10dcf74baba5..369608a0a73e 100644 --- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp +++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp @@ -1320,7 +1320,8 @@ SearchVisitor::generateDocumentSummaries() auto& hit_collector = _rankController.getRankProcessor()->getHitCollector(); _summaryGenerator.setDocsumCache(hit_collector); vdslib::SearchResult & searchResult(_queryResult->getSearchResult()); - _summaryGenerator.getDocsumCallback().set_matching_elements_filler(std::make_unique(_fieldSearcherMap, _query, hit_collector, searchResult)); + auto& index_env = _rankController.getRankProcessor()->get_query_env().getIndexEnvironment(); + _summaryGenerator.getDocsumCallback().set_matching_elements_filler(std::make_unique(_fieldSearcherMap, index_env, _query, hit_collector, searchResult)); vdslib::DocumentSummary & documentSummary(_queryResult->getDocumentSummary()); for (size_t i(0), m(searchResult.getHitCount()); (i < m) && (i < searchResult.getWantedHitCount()); i++ ) { const char * docId(nullptr);