diff --git a/container-search/pom.xml b/container-search/pom.xml
index 5e7c60d49c33..9e990d2b41ff 100644
--- a/container-search/pom.xml
+++ b/container-search/pom.xml
@@ -183,6 +183,11 @@
junit-jupiter-engine
test
+
+ org.junit.jupiter
+ junit-jupiter-params
+ test
+
diff --git a/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java
index d0576295b365..5976502ed1c7 100644
--- a/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java
+++ b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java
@@ -114,6 +114,15 @@ private Result calculateAndSetSignificance(Query query, Execution execution) {
}
private SignificanceModel getSignificanceModelFromQueryLanguage(Query query) throws IllegalArgumentException {
+ /*
+ Implements the following model resolving logic:
+ - When language is explicitly tagged on query
+ - Use language if available from the model registry, fail otherwise.
+ - If “un” try both “un” and “en”.
+ - When language is implicitly detected
+ - Use language if available from the model registry. Fallback to “un” then “en”, fail if none are available.
+ */
+
Language explicitLanguage = query.getModel().getLanguage();
Language implicitLanguage = query.getModel().getParsingLanguage();
@@ -132,14 +141,8 @@ private SignificanceModel getSignificanceModelFromQueryLanguage(Query query) thr
return model.get();
}
- if (implicitLanguage == Language.UNKNOWN) {
- return handleFallBackToUnknownLanguage();
- }
var model = significanceModelRegistry.getModel(implicitLanguage);
- if (model.isEmpty()) {
- throw new IllegalArgumentException("No significance model available for implicit language " + implicitLanguage);
- }
- return model.get();
+ return model.orElseGet(this::handleFallBackToUnknownLanguage);
}
private SignificanceModel handleFallBackToUnknownLanguage() throws IllegalArgumentException {
@@ -158,7 +161,7 @@ private void setIDF(Item root, SignificanceModel significanceModel) {
if (root instanceof WordItem wi) {
var word = wi.getWord();
- var documentFrequency = significanceModel.documentFrequency(word);
+ var documentFrequency = significanceModel.documentFrequency(word.toLowerCase());
long N = documentFrequency.corpusSize();
long nq_i = documentFrequency.frequency();
log.log(Level.FINE, () -> "Setting document frequency for " + word + " to {frequency: " + nq_i + ", count: " + N + "}");
diff --git a/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java
index e71a31fea2c1..07756bbb83fc 100644
--- a/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java
+++ b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java
@@ -2,13 +2,11 @@
package com.yahoo.search.significance.test;
import com.yahoo.component.chain.Chain;
-import com.yahoo.config.subscription.ConfigGetter;
import com.yahoo.language.Language;
import com.yahoo.language.Linguistics;
import com.yahoo.language.detect.Detection;
import com.yahoo.language.detect.Detector;
import com.yahoo.language.detect.Hint;
-import com.yahoo.language.opennlp.OpenNlpLinguistics;
import com.yahoo.language.process.*;
import com.yahoo.language.significance.SignificanceModel;
import com.yahoo.language.significance.SignificanceModelRegistry;
@@ -24,13 +22,13 @@
import com.yahoo.search.schema.SchemaInfo;
import com.yahoo.search.searchchain.Execution;
import com.yahoo.search.significance.SignificanceSearcher;
-import com.yahoo.vespa.config.search.RankProfilesConfig;
import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
import java.nio.ByteBuffer;
import java.nio.file.Path;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
import java.util.Optional;
@@ -218,7 +216,7 @@ void testSignificanceValueOnSimpleANDQuery() {
q.getModel().getQueryTree().setRoot(root);
SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get();
- var helloDocumentFrequency = makeDocumentFrequency(model.documentFrequency("Hello"));
+ var helloDocumentFrequency = makeDocumentFrequency(model.documentFrequency("hello"));
var worldDocumentFrequency = makeDocumentFrequency(model.documentFrequency("world"));
Result r = createExecution(searcher).search(q);
@@ -256,9 +254,9 @@ void testSignificanceValueOnRecursiveQuery() {
q.getModel().getQueryTree().setRoot(root);
- SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get();
- var helloDocumentFrequency = makeDocumentFrequency(model.documentFrequency("hello"));
- var testDocumentFrequency = makeDocumentFrequency(model.documentFrequency("test"));
+ var helloDocumentFrequency = getDocumentFrequencyWithEnglish("hello");
+ var testDocumentFrequency = getDocumentFrequencyWithEnglish("test");
+
Result r = createExecution(searcher).search(q);
root = (AndItem) r.getQuery().getModel().getQueryTree().getRoot();
@@ -296,59 +294,124 @@ public void failsOnConflictingSignificanceConfiguration() {
var result = createExecution(searcher).search(query);
assertEquals(1, result.hits().getErrorHit().errors().size());
- var errorMessage = result.hits().getError();
+ var errorMessage = getErrorMessage(result);
assertEquals("Inconsistent 'significance' configuration for the rank profile 'significance-ranking' in the schemas [music, album]. " +
"Use 'restrict' to limit the query to a subset of schemas " +
"(https://docs.vespa.ai/en/schemas.html#multiple-schemas). " +
"Specify same 'significance' configuration for all selected schemas " +
"(https://docs.vespa.ai/en/reference/schema-reference.html#significance).",
- errorMessage.getDetailedMessage());
+ errorMessage);
}
- @Test
- public void testSignificanceSearcherWithExplictitAndImplictSetLanguages() {
- Query q = new Query();
- q.getModel().setLanguage(Language.UNKNOWN);
- q.getRanking().setProfile("significance-ranking");
- AndItem root = new AndItem();
- WordItem tmp;
- tmp = new WordItem("hello", true);
- root.addItem(tmp);
+ // Tests that follow verify model resolving logic in different scenarios.
+ // Test naming convention is as follows:
+ // Explicit language - language set in a query
+ // Implicit language - language detected automatically
+ // Missing language - language without a model
+ // Unknown language - Language.UNKNOWN
+ // Missing word - word not in a model for the specified language or fallback models
+ // Existing word - word in a model for the specified language or fallback models
- q.getModel().getQueryTree().setRoot(root);
+ private Result searchWord(String word, Optional explicitLanguage, Optional implicitLanguage) {
+ var query = new Query();
+ explicitLanguage.ifPresent(language -> query.getModel().setLanguage(language));
+ query.getRanking().setProfile("significance-ranking");
+ var queryRoot = new AndItem();
+ var queryWord = new WordItem(word, true);
+ queryRoot.addItem(queryWord);
+ query.getModel().getQueryTree().setRoot(queryRoot);
+
+ var context = Execution.Context.createContextStub();
+ implicitLanguage.ifPresent(language -> context.setLinguistics(new MockLinguistics(language)));
+ var execution = new Execution(new Chain<>(searcher), context);
+ return execution.search(query);
+ }
+ private Optional getDocumentFrequencyWithEnglish(String word) {
SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get();
- var helloDocumentFrequency = makeDocumentFrequency(model.documentFrequency("hello"));
- Result r = createExecution(searcher).search(q);
+ return makeDocumentFrequency(model.documentFrequency(word));
+ }
- root = (AndItem) r.getQuery().getModel().getQueryTree().getRoot();
- WordItem w0 = (WordItem) root.getItem(0);
- assertEquals(helloDocumentFrequency, w0.getDocumentFrequency());
+ private static WordItem getFirstWord(Result result) {
+ var resultRoot = (AndItem) result.getQuery().getModel().getQueryTree().getRoot();
+ return (WordItem) resultRoot.getItem(0);
+ }
- Query q2 = new Query();
- q2.getModel().setLanguage(Language.FRENCH);
- q2.getRanking().setProfile("significance-ranking");
- AndItem root2 = new AndItem();
- WordItem tmp2;
- tmp2 = new WordItem("hello", true);
- root2.addItem(tmp2);
+ private static String getErrorMessage(Result result) {
+ return result.hits().getError().getDetailedMessage();
+ }
- q2.getModel().getQueryTree().setRoot(root2);
- Result r2 = createExecution(searcher).search(q2);
+ @Test
+ public void testSignificanceSearcherWithMissingExplicitLanguageOnExistingWord() {
+ var existingWord = "hello";
+ var explicitLanguage = Language.ITALIAN;
+ var result = searchWord(existingWord, Optional.of(explicitLanguage), Optional.empty());
+
+ var resultWord = getFirstWord(result);
+ assertEquals(Optional.empty(), resultWord.getDocumentFrequency());
+
+ var errorMessage = getErrorMessage(result);
+ assertEquals("No significance model available for set language ITALIAN", errorMessage);
+ }
- assertEquals(1, r2.hits().getErrorHit().errors().size());
+ @Test
+ public void testSignificanceSearcherWithUnknownExplicitLanguageOnExistingWord() {
+ var existingWord = "hello";
+ var explicitLanguage = Language.UNKNOWN;
+ var result = searchWord(existingWord, Optional.of(explicitLanguage), Optional.empty());
+ var resultWord = getFirstWord(result);
+ var existingDocumentFrequency = getDocumentFrequencyWithEnglish(existingWord);
+ assertEquals(existingDocumentFrequency, resultWord.getDocumentFrequency());
+ }
+
+ @Test
+ public void testSignificanceSearcherWithMissingExplicitLanguageOnMissingWord() {
+ var missingWord = "ciao";
+ var explicitLanguage = Language.ITALIAN;
+ var result = searchWord(missingWord, Optional.of(explicitLanguage), Optional.empty());
+
+ var resultWord = getFirstWord(result);
+ assertEquals(Optional.empty(), resultWord.getDocumentFrequency());
+ var errorMessage = getErrorMessage(result);
+ assertEquals("No significance model available for set language ITALIAN", errorMessage);
+ }
- Query q3 = new Query();
- q3.getRanking().setProfile("significance-ranking");
- WordItem root3 = new WordItem("Я с детства хотел завести собаку, но родители мне не разрешали.", true);
+ @Test
+ public void testSignificanceSearcherWithMissingImplicitLanguageOnExistingWord() {
+ var existingWord = "hello";
+ var implicitLanguage = Language.ITALIAN;
+ var result = searchWord(existingWord, Optional.empty(), Optional.of(implicitLanguage));
+ var resultWord = getFirstWord(result);
+ var existingDocumentFrequency = getDocumentFrequencyWithEnglish(existingWord);
+ assertEquals(existingDocumentFrequency, resultWord.getDocumentFrequency());
+ }
- q3.getModel().getQueryTree().setRoot(root3);
- Execution execution = createExecution(searcher, Language.RUSSIAN);
- Result r3 = execution.search(q3);
+ @Test
+ public void testSignificanceSearcherWithMissingImplicitLanguageOnMissingWord() {
+ var implicitLanguage = Language.ITALIAN;
+ var missingWord = "ciao";
+ var result = searchWord(missingWord, Optional.empty(), Optional.of(implicitLanguage));
+ var resultWord = getFirstWord(result);
+
+ var existingWord = "hello";
+ var documentFrequency = getDocumentFrequencyWithEnglish(existingWord);
+ var count = documentFrequency.get().count();
+ var defaultDocumentFrequency = Optional.of(new DocumentFrequency(1, count));
+
+ assertEquals(defaultDocumentFrequency, resultWord.getDocumentFrequency());
+ }
- assertEquals(1, r3.hits().getErrorHit().errors().size());
+ // Tests for upper case words in a query
+ @ParameterizedTest
+ @ValueSource(strings = {"Hello", "HeLlo", "HELLO"})
+ public void testSignificanceSearcherWithUpperCaseWord(String wordWithUpperCase) {
+ var result = searchWord(wordWithUpperCase, Optional.of(Language.ENGLISH), Optional.empty());
+ var resultWord = getFirstWord(result);
+ var lowerCaseWord = "hello";
+ var documentFrequency = getDocumentFrequencyWithEnglish(lowerCaseWord);
+ assertEquals(documentFrequency, resultWord.getDocumentFrequency());
}
-}
+}
\ No newline at end of file