From 776d7f953c1f7a8c85bba42557a2f5d00c819d6e Mon Sep 17 00:00:00 2001 From: Harald Musum Date: Thu, 24 Oct 2024 11:21:06 +0200 Subject: [PATCH 1/3] Add test of diversity with different min-groups settings --- tests/search/diversity/diversity.rb | 50 +++++++++++++++++++++++++++++ tests/search/diversity/music.sd | 50 +++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 tests/search/diversity/diversity.rb create mode 100644 tests/search/diversity/music.sd diff --git a/tests/search/diversity/diversity.rb b/tests/search/diversity/diversity.rb new file mode 100644 index 000000000..2878660b1 --- /dev/null +++ b/tests/search/diversity/diversity.rb @@ -0,0 +1,50 @@ +# Copyright Vespa.ai. All rights reserved. +require 'indexed_only_search_test' +require 'doc_generator' + +class DiversityMinGroups < IndexedOnlySearchTest + + def setup + set_owner('hmusum') + @docs = 50 + @expected_relevancy = 1000.3344587750165 + end + + def test_diversity_min_groups + deploy_app(SearchApp.new.sd(selfdir+"music.sd")) + start + feed_docs + wait_for_hitcount("query=sddocname:music", @docs) + # All docs match this query, but 1 doc has lower relevancy, see feed_docs() + assert_hitcount("query=cherub+rock", @docs) + assert_hitcount("query=cherub+rock&ranking=base", @docs) + + assert_relevancy("query=cherub+rock&ranking=base", @expected_relevancy, 0) + assert_relevancy("query=cherub+rock&ranking=diversity", @expected_relevancy, 0) + # diversity.min-groups is 60 in the 'diversity_many_groups' rank profile, more than number of docs + # => not aenough docs to fulfill min-groups criteria + assert_relevancy("query=cherub+rock&ranking=diversity_many_groups", @expected_relevancy, 0) + end + + def feed_docs + @docs.times.each { |i| + doc = Document.new('music', "id:test:music::#{i}") + if i == 0 + doc.add_field('genre', 'rock') + doc.add_field('artist', 'The Clash') + doc.add_field('title', 'Rock the Casbah') + else + doc.add_field('genre', 'alternative') + 
doc.add_field('artist', 'Smashing Pumpkins') + doc.add_field('title', 'Cherub Rock') + end + vespa.document_api_v1.put(doc, :brief => true) + } + end + + def teardown + stop + end + +end + diff --git a/tests/search/diversity/music.sd b/tests/search/diversity/music.sd new file mode 100644 index 000000000..cacf351d2 --- /dev/null +++ b/tests/search/diversity/music.sd @@ -0,0 +1,50 @@ +# Copyright Vespa.ai. All rights reserved. + +schema music { + + document music { + + field title type string { + indexing: index | summary + } + + field artist type string { + indexing: index | summary + } + + field genre type string { + indexing: summary | attribute + } + + } + + fieldset default { + fields: title, artist + } + + rank-profile base inherits default { + first-phase { + expression: nativeRank(artist) + nativeRank(title) + } + + second-phase { + expression: firstPhase() + 1000 + } + } + + rank-profile diversity inherits base { + diversity { + attribute: genre + min-groups: 5 + } + } + + rank-profile diversity_many_groups inherits diversity { + diversity { + attribute: genre + min-groups: 60 + } + } + +} + From 1d0d32d6f7fa0f6271ba2afe65ae281b25b3e682 Mon Sep 17 00:00:00 2001 From: Harald Musum Date: Fri, 25 Oct 2024 14:03:28 +0200 Subject: [PATCH 2/3] Check relevancy of all hits, show that some hits are not going through second phase ranking when there are few docs, but works with more docs --- tests/search/diversity/diversity.rb | 45 ++++++++++++++++++++++------- tests/search/diversity/music.sd | 10 ++----- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/tests/search/diversity/diversity.rb b/tests/search/diversity/diversity.rb index 2878660b1..6d6845233 100644 --- a/tests/search/diversity/diversity.rb +++ b/tests/search/diversity/diversity.rb @@ -6,8 +6,7 @@ class DiversityMinGroups < IndexedOnlySearchTest def setup set_owner('hmusum') - @docs = 50 - @expected_relevancy = 1000.3344587750165 + @docs = 20 end def test_diversity_min_groups @@ -15,15 +14,19 @@ 
def test_diversity_min_groups start feed_docs wait_for_hitcount("query=sddocname:music", @docs) - # All docs match this query, but 1 doc has lower relevancy, see feed_docs() - assert_hitcount("query=cherub+rock", @docs) - assert_hitcount("query=cherub+rock&ranking=base", @docs) - - assert_relevancy("query=cherub+rock&ranking=base", @expected_relevancy, 0) - assert_relevancy("query=cherub+rock&ranking=diversity", @expected_relevancy, 0) - # diversity.min-groups is 60 in the 'diversity_many_groups' rank profile, more than number of docs - # => not aenough docs to fulfill min-groups criteria - assert_relevancy("query=cherub+rock&ranking=diversity_many_groups", @expected_relevancy, 0) + + # All docs match this query, but 1 doc has higher relevancy, see feed_docs() + puts "Query: 'rock'" + @expected_relevancy_best_doc = 1000.3818623835995 + @expected_relevancy_rest = 1000.16343879032 + @expected_relevancy_no_second_phase = 0.16343879032006287 + + assert_hitcount("query=rock", @docs) + assert_relevancy("query=rock&ranking=base", @expected_relevancy_best_doc, 0) + assert_relevancy("query=rock&ranking=diversity_min_groups_5", @expected_relevancy_best_doc, 0) + # Should get 1 hit that is doc 0, rest should have gone through second phase + # TODO: Fails with @docs = 20, works with @docs = 50 + check_relevancy("query=rock&ranking=diversity_min_groups_5", @expected_relevancy_rest, {0 => @expected_relevancy_best_doc}) end def feed_docs @@ -42,6 +45,26 @@ def feed_docs } end + def check_relevancy(query, default_relevance, hit_number_to_relevance_mapping, hits=10) + result = search(query) + assert_equal(hits, result.hit.length) + hits.times.each { |i| + puts "hit #{i} relevance = #{relevance(result, i)}" + } + puts "---\n" + hits.times.each { |i| + expected_relevance = hit_number_to_relevance_mapping[i] + expected_relevance = default_relevance unless expected_relevance + hit = result.hit[i] + relevance = relevance(result, i) + assert_approx(expected_relevance, relevance, 0.01, 
"expected: #{expected_relevance}, got #{relevance} for hit #{i}: #{hit}") + } + end + + def relevance(result, index) + result.hit[index].field['relevancy'].to_f + end + def teardown stop end diff --git a/tests/search/diversity/music.sd b/tests/search/diversity/music.sd index cacf351d2..b300291df 100644 --- a/tests/search/diversity/music.sd +++ b/tests/search/diversity/music.sd @@ -29,22 +29,16 @@ schema music { second-phase { expression: firstPhase() + 1000 + rerank-count: 100 } } - rank-profile diversity inherits base { + rank-profile diversity_min_groups_5 inherits base { diversity { attribute: genre min-groups: 5 } } - rank-profile diversity_many_groups inherits diversity { - diversity { - attribute: genre - min-groups: 60 - } - } - } From 30abc95f1273e4e45fb55700e9924107b9828b9f Mon Sep 17 00:00:00 2001 From: Harald Musum Date: Fri, 25 Oct 2024 15:38:17 +0200 Subject: [PATCH 3/3] Works on 8.431.21 and later --- tests/search/diversity/diversity.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/search/diversity/diversity.rb b/tests/search/diversity/diversity.rb index 6d6845233..7fcd9dfce 100644 --- a/tests/search/diversity/diversity.rb +++ b/tests/search/diversity/diversity.rb @@ -25,7 +25,6 @@ def test_diversity_min_groups assert_relevancy("query=rock&ranking=base", @expected_relevancy_best_doc, 0) assert_relevancy("query=rock&ranking=diversity_min_groups_5", @expected_relevancy_best_doc, 0) # Should get 1 hit that is doc 0, rest should have gone through second phase - # TODO: Fails with @docs = 20, works with @docs = 50 check_relevancy("query=rock&ranking=diversity_min_groups_5", @expected_relevancy_rest, {0 => @expected_relevancy_best_doc}) end