diff --git a/container-search/pom.xml b/container-search/pom.xml index 5e7c60d49c33..9e990d2b41ff 100644 --- a/container-search/pom.xml +++ b/container-search/pom.xml @@ -183,6 +183,11 @@ <artifactId>junit-jupiter-engine</artifactId> <scope>test</scope> </dependency> + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter-params</artifactId> + <scope>test</scope> + </dependency> diff --git a/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java index d0576295b365..5976502ed1c7 100644 --- a/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java +++ b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java @@ -114,6 +114,15 @@ private Result calculateAndSetSignificance(Query query, Execution execution) { } private SignificanceModel getSignificanceModelFromQueryLanguage(Query query) throws IllegalArgumentException { + /* + Implements the following model resolving logic: + - When language is explicitly tagged on query + - Use language if available from the model registry, fail otherwise. + - If "un", try both "un" and "en". + - When language is implicitly detected + - Use language if available from the model registry. Fall back to "un" then "en", fail if none are available. 
+ */ + Language explicitLanguage = query.getModel().getLanguage(); Language implicitLanguage = query.getModel().getParsingLanguage(); @@ -132,14 +141,8 @@ private SignificanceModel getSignificanceModelFromQueryLanguage(Query query) thr return model.get(); } - if (implicitLanguage == Language.UNKNOWN) { - return handleFallBackToUnknownLanguage(); - } var model = significanceModelRegistry.getModel(implicitLanguage); - if (model.isEmpty()) { - throw new IllegalArgumentException("No significance model available for implicit language " + implicitLanguage); - } - return model.get(); + return model.orElseGet(this::handleFallBackToUnknownLanguage); } private SignificanceModel handleFallBackToUnknownLanguage() throws IllegalArgumentException { @@ -158,7 +161,7 @@ private void setIDF(Item root, SignificanceModel significanceModel) { if (root instanceof WordItem wi) { var word = wi.getWord(); - var documentFrequency = significanceModel.documentFrequency(word); + var documentFrequency = significanceModel.documentFrequency(word.toLowerCase(java.util.Locale.ROOT)); /* Locale.ROOT: the lowercased lookup key must not vary with the JVM default locale (e.g. Turkish dotless i) */ long N = documentFrequency.corpusSize(); long nq_i = documentFrequency.frequency(); log.log(Level.FINE, () -> "Setting document frequency for " + word + " to {frequency: " + nq_i + ", count: " + N + "}"); diff --git a/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java index e71a31fea2c1..07756bbb83fc 100644 --- a/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java +++ b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java @@ -2,13 +2,11 @@ package com.yahoo.search.significance.test; import com.yahoo.component.chain.Chain; -import com.yahoo.config.subscription.ConfigGetter; import com.yahoo.language.Language; import com.yahoo.language.Linguistics; import com.yahoo.language.detect.Detection; import 
com.yahoo.language.detect.Detector; import com.yahoo.language.detect.Hint; -import com.yahoo.language.opennlp.OpenNlpLinguistics; import com.yahoo.language.process.*; import com.yahoo.language.significance.SignificanceModel; import com.yahoo.language.significance.SignificanceModelRegistry; @@ -24,13 +22,13 @@ import com.yahoo.search.schema.SchemaInfo; import com.yahoo.search.searchchain.Execution; import com.yahoo.search.significance.SignificanceSearcher; -import com.yahoo.vespa.config.search.RankProfilesConfig; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import java.nio.ByteBuffer; import java.nio.file.Path; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; import java.util.Optional; @@ -218,7 +216,7 @@ void testSignificanceValueOnSimpleANDQuery() { q.getModel().getQueryTree().setRoot(root); SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get(); - var helloDocumentFrequency = makeDocumentFrequency(model.documentFrequency("Hello")); + var helloDocumentFrequency = makeDocumentFrequency(model.documentFrequency("hello")); var worldDocumentFrequency = makeDocumentFrequency(model.documentFrequency("world")); Result r = createExecution(searcher).search(q); @@ -256,9 +254,9 @@ void testSignificanceValueOnRecursiveQuery() { q.getModel().getQueryTree().setRoot(root); - SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get(); - var helloDocumentFrequency = makeDocumentFrequency(model.documentFrequency("hello")); - var testDocumentFrequency = makeDocumentFrequency(model.documentFrequency("test")); + var helloDocumentFrequency = getDocumentFrequencyWithEnglish("hello"); + var testDocumentFrequency = getDocumentFrequencyWithEnglish("test"); + Result r = createExecution(searcher).search(q); root = (AndItem) r.getQuery().getModel().getQueryTree().getRoot(); @@ -296,59 +294,124 @@ public void 
failsOnConflictingSignificanceConfiguration() { var result = createExecution(searcher).search(query); assertEquals(1, result.hits().getErrorHit().errors().size()); - var errorMessage = result.hits().getError(); + var errorMessage = getErrorMessage(result); assertEquals("Inconsistent 'significance' configuration for the rank profile 'significance-ranking' in the schemas [music, album]. " + "Use 'restrict' to limit the query to a subset of schemas " + "(https://docs.vespa.ai/en/schemas.html#multiple-schemas). " + "Specify same 'significance' configuration for all selected schemas " + "(https://docs.vespa.ai/en/reference/schema-reference.html#significance).", - errorMessage.getDetailedMessage()); + errorMessage); } - @Test - public void testSignificanceSearcherWithExplictitAndImplictSetLanguages() { - Query q = new Query(); - q.getModel().setLanguage(Language.UNKNOWN); - q.getRanking().setProfile("significance-ranking"); - AndItem root = new AndItem(); - WordItem tmp; - tmp = new WordItem("hello", true); - root.addItem(tmp); + // Tests that follow verify model resolving logic in different scenarios. 
+ // Test naming convention is as follows: + // Explicit language - language set in a query + // Implicit language - language detected automatically + // Missing language - language without a model + // Unknown language - Language.UNKNOWN + // Missing word - word not in a model for the specified language or fallback models + // Existing word - word in a model for the specified language or fallback models - q.getModel().getQueryTree().setRoot(root); + /* Runs a single-word AND query with the "significance-ranking" profile; sets the query language when explicitLanguage is present and installs MockLinguistics(language) on the context when implicitLanguage is present. */ private Result searchWord(String word, Optional<Language> explicitLanguage, Optional<Language> implicitLanguage) { + var query = new Query(); + explicitLanguage.ifPresent(language -> query.getModel().setLanguage(language)); + query.getRanking().setProfile("significance-ranking"); + var queryRoot = new AndItem(); + var queryWord = new WordItem(word, true); + queryRoot.addItem(queryWord); + query.getModel().getQueryTree().setRoot(queryRoot); + + var context = Execution.Context.createContextStub(); + implicitLanguage.ifPresent(language -> context.setLinguistics(new MockLinguistics(language))); + var execution = new Execution(new Chain<>(searcher), context); + return execution.search(query); + } + /* Looks up the word in the English model from the registry and converts the result with makeDocumentFrequency. */ private Optional<DocumentFrequency> getDocumentFrequencyWithEnglish(String word) { SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get(); - var helloDocumentFrequency = makeDocumentFrequency(model.documentFrequency("hello")); - Result r = createExecution(searcher).search(q); + return makeDocumentFrequency(model.documentFrequency(word)); + } - root = (AndItem) r.getQuery().getModel().getQueryTree().getRoot(); - WordItem w0 = (WordItem) root.getItem(0); - assertEquals(helloDocumentFrequency, w0.getDocumentFrequency()); + private static WordItem getFirstWord(Result result) { + var resultRoot = (AndItem) result.getQuery().getModel().getQueryTree().getRoot(); + return (WordItem) resultRoot.getItem(0); + } - Query q2 = new Query(); - q2.getModel().setLanguage(Language.FRENCH); - q2.getRanking().setProfile("significance-ranking"); - AndItem 
root2 = new AndItem(); - WordItem tmp2; - tmp2 = new WordItem("hello", true); - root2.addItem(tmp2); + private static String getErrorMessage(Result result) { + return result.hits().getError().getDetailedMessage(); + } - q2.getModel().getQueryTree().setRoot(root2); - Result r2 = createExecution(searcher).search(q2); + @Test + public void testSignificanceSearcherWithMissingExplicitLanguageOnExistingWord() { + var existingWord = "hello"; + var explicitLanguage = Language.ITALIAN; + var result = searchWord(existingWord, Optional.of(explicitLanguage), Optional.empty()); + + var resultWord = getFirstWord(result); + assertEquals(Optional.empty(), resultWord.getDocumentFrequency()); + + var errorMessage = getErrorMessage(result); + assertEquals("No significance model available for set language ITALIAN", errorMessage); + } - assertEquals(1, r2.hits().getErrorHit().errors().size()); + @Test + public void testSignificanceSearcherWithUnknownExplicitLanguageOnExistingWord() { + var existingWord = "hello"; + var explicitLanguage = Language.UNKNOWN; + var result = searchWord(existingWord, Optional.of(explicitLanguage), Optional.empty()); + var resultWord = getFirstWord(result); + var existingDocumentFrequency = getDocumentFrequencyWithEnglish(existingWord); + assertEquals(existingDocumentFrequency, resultWord.getDocumentFrequency()); + } + + @Test + public void testSignificanceSearcherWithMissingExplicitLanguageOnMissingWord() { + var missingWord = "ciao"; + var explicitLanguage = Language.ITALIAN; + var result = searchWord(missingWord, Optional.of(explicitLanguage), Optional.empty()); + + var resultWord = getFirstWord(result); + assertEquals(Optional.empty(), resultWord.getDocumentFrequency()); + var errorMessage = getErrorMessage(result); + assertEquals("No significance model available for set language ITALIAN", errorMessage); + } - Query q3 = new Query(); - q3.getRanking().setProfile("significance-ranking"); - WordItem root3 = new WordItem("Я с детства хотел завести собаку, но 
родители мне не разрешали.", true); + @Test + public void testSignificanceSearcherWithMissingImplicitLanguageOnExistingWord() { + var existingWord = "hello"; + var implicitLanguage = Language.ITALIAN; + var result = searchWord(existingWord, Optional.empty(), Optional.of(implicitLanguage)); + var resultWord = getFirstWord(result); + var existingDocumentFrequency = getDocumentFrequencyWithEnglish(existingWord); + assertEquals(existingDocumentFrequency, resultWord.getDocumentFrequency()); + } - q3.getModel().getQueryTree().setRoot(root3); - Execution execution = createExecution(searcher, Language.RUSSIAN); - Result r3 = execution.search(q3); + @Test + public void testSignificanceSearcherWithMissingImplicitLanguageOnMissingWord() { + var implicitLanguage = Language.ITALIAN; + var missingWord = "ciao"; + var result = searchWord(missingWord, Optional.empty(), Optional.of(implicitLanguage)); + var resultWord = getFirstWord(result); + + var existingWord = "hello"; + var documentFrequency = getDocumentFrequencyWithEnglish(existingWord); + var count = documentFrequency.get().count(); + var defaultDocumentFrequency = Optional.of(new DocumentFrequency(1, count)); + + assertEquals(defaultDocumentFrequency, resultWord.getDocumentFrequency()); + } - assertEquals(1, r3.hits().getErrorHit().errors().size()); + // Query words containing upper-case letters must get the same document frequency as their lower-case form + @ParameterizedTest + @ValueSource(strings = {"Hello", "HeLlo", "HELLO"}) + public void testSignificanceSearcherWithUpperCaseWord(String wordWithUpperCase) { + var result = searchWord(wordWithUpperCase, Optional.of(Language.ENGLISH), Optional.empty()); + var resultWord = getFirstWord(result); + var lowerCaseWord = "hello"; + var documentFrequency = getDocumentFrequencyWithEnglish(lowerCaseWord); + assertEquals(documentFrequency, resultWord.getDocumentFrequency()); } }