Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix significance model resolver, better tests, handling of upper case in query words #32223

Merged
merged 6 commits into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions container-search/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,11 @@
<artifactId>junit-jupiter-engine</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-params</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,15 @@ private Result calculateAndSetSignificance(Query query, Execution execution) {
}

private SignificanceModel getSignificanceModelFromQueryLanguage(Query query) throws IllegalArgumentException {
/*
Implements the following model resolving logic:
- When language is explicitly tagged on query
- Use language if available from the model registry, fail otherwise.
- If “un” try both “un” and “en”.
- When language is implicitly detected
- Use language if available from the model registry. Fallback to “un” then “en”, fail if none are available.
*/

Language explicitLanguage = query.getModel().getLanguage();
Language implicitLanguage = query.getModel().getParsingLanguage();

Expand All @@ -132,14 +141,8 @@ private SignificanceModel getSignificanceModelFromQueryLanguage(Query query) thr
return model.get();
}

if (implicitLanguage == Language.UNKNOWN) {
return handleFallBackToUnknownLanguage();
}
var model = significanceModelRegistry.getModel(implicitLanguage);
if (model.isEmpty()) {
throw new IllegalArgumentException("No significance model available for implicit language " + implicitLanguage);
}
return model.get();
return model.orElseGet(this::handleFallBackToUnknownLanguage);
}

private SignificanceModel handleFallBackToUnknownLanguage() throws IllegalArgumentException {
Expand All @@ -158,7 +161,7 @@ private void setIDF(Item root, SignificanceModel significanceModel) {

if (root instanceof WordItem wi) {
var word = wi.getWord();
var documentFrequency = significanceModel.documentFrequency(word);
var documentFrequency = significanceModel.documentFrequency(word.toLowerCase());
long N = documentFrequency.corpusSize();
long nq_i = documentFrequency.frequency();
log.log(Level.FINE, () -> "Setting document frequency for " + word + " to {frequency: " + nq_i + ", count: " + N + "}");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,11 @@
package com.yahoo.search.significance.test;

import com.yahoo.component.chain.Chain;
import com.yahoo.config.subscription.ConfigGetter;
import com.yahoo.language.Language;
import com.yahoo.language.Linguistics;
import com.yahoo.language.detect.Detection;
import com.yahoo.language.detect.Detector;
import com.yahoo.language.detect.Hint;
import com.yahoo.language.opennlp.OpenNlpLinguistics;
import com.yahoo.language.process.*;
import com.yahoo.language.significance.SignificanceModel;
import com.yahoo.language.significance.SignificanceModelRegistry;
Expand All @@ -24,13 +22,13 @@
import com.yahoo.search.schema.SchemaInfo;
import com.yahoo.search.searchchain.Execution;
import com.yahoo.search.significance.SignificanceSearcher;
import com.yahoo.vespa.config.search.RankProfilesConfig;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

import java.nio.ByteBuffer;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Optional;

Expand Down Expand Up @@ -218,7 +216,7 @@ void testSignificanceValueOnSimpleANDQuery() {
q.getModel().getQueryTree().setRoot(root);

SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get();
var helloDocumentFrequency = makeDocumentFrequency(model.documentFrequency("Hello"));
var helloDocumentFrequency = makeDocumentFrequency(model.documentFrequency("hello"));
var worldDocumentFrequency = makeDocumentFrequency(model.documentFrequency("world"));

Result r = createExecution(searcher).search(q);
Expand Down Expand Up @@ -256,9 +254,9 @@ void testSignificanceValueOnRecursiveQuery() {

q.getModel().getQueryTree().setRoot(root);

SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get();
var helloDocumentFrequency = makeDocumentFrequency(model.documentFrequency("hello"));
var testDocumentFrequency = makeDocumentFrequency(model.documentFrequency("test"));
var helloDocumentFrequency = getDocumentFrequencyWithEnglish("hello");
var testDocumentFrequency = getDocumentFrequencyWithEnglish("test");

Result r = createExecution(searcher).search(q);

root = (AndItem) r.getQuery().getModel().getQueryTree().getRoot();
Expand Down Expand Up @@ -296,59 +294,124 @@ public void failsOnConflictingSignificanceConfiguration() {
var result = createExecution(searcher).search(query);
assertEquals(1, result.hits().getErrorHit().errors().size());

var errorMessage = result.hits().getError();
var errorMessage = getErrorMessage(result);
assertEquals("Inconsistent 'significance' configuration for the rank profile 'significance-ranking' in the schemas [music, album]. " +
"Use 'restrict' to limit the query to a subset of schemas " +
"(https://docs.vespa.ai/en/schemas.html#multiple-schemas). " +
"Specify same 'significance' configuration for all selected schemas " +
"(https://docs.vespa.ai/en/reference/schema-reference.html#significance).",
errorMessage.getDetailedMessage());
errorMessage);
}

@Test
public void testSignificanceSearcherWithExplictitAndImplictSetLanguages() {
Query q = new Query();
q.getModel().setLanguage(Language.UNKNOWN);
q.getRanking().setProfile("significance-ranking");
AndItem root = new AndItem();
WordItem tmp;
tmp = new WordItem("hello", true);
root.addItem(tmp);
// Tests that follow verify model resolving logic in different scenarios.
// Test naming convention is as follows:
// Explicit language - language set in a query
// Implicit language - language detected automatically
// Missing language - language without a model
// Unknown language - Language.UNKNOWN
// Missing word - word not in a model for the specified language or fallback models
// Existing word - word in a model for the specified language or fallback models

q.getModel().getQueryTree().setRoot(root);
private Result searchWord(String word, Optional<Language> explicitLanguage, Optional<Language> implicitLanguage) {
var query = new Query();
explicitLanguage.ifPresent(language -> query.getModel().setLanguage(language));
query.getRanking().setProfile("significance-ranking");
var queryRoot = new AndItem();
var queryWord = new WordItem(word, true);
queryRoot.addItem(queryWord);
query.getModel().getQueryTree().setRoot(queryRoot);

var context = Execution.Context.createContextStub();
implicitLanguage.ifPresent(language -> context.setLinguistics(new MockLinguistics(language)));
var execution = new Execution(new Chain<>(searcher), context);
return execution.search(query);
}

private Optional<DocumentFrequency> getDocumentFrequencyWithEnglish(String word) {
SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get();
var helloDocumentFrequency = makeDocumentFrequency(model.documentFrequency("hello"));
Result r = createExecution(searcher).search(q);
return makeDocumentFrequency(model.documentFrequency(word));
}

root = (AndItem) r.getQuery().getModel().getQueryTree().getRoot();
WordItem w0 = (WordItem) root.getItem(0);
assertEquals(helloDocumentFrequency, w0.getDocumentFrequency());
private static WordItem getFirstWord(Result result) {
var resultRoot = (AndItem) result.getQuery().getModel().getQueryTree().getRoot();
return (WordItem) resultRoot.getItem(0);
}

Query q2 = new Query();
q2.getModel().setLanguage(Language.FRENCH);
q2.getRanking().setProfile("significance-ranking");
AndItem root2 = new AndItem();
WordItem tmp2;
tmp2 = new WordItem("hello", true);
root2.addItem(tmp2);
private static String getErrorMessage(Result result) {
return result.hits().getError().getDetailedMessage();
}

q2.getModel().getQueryTree().setRoot(root2);
Result r2 = createExecution(searcher).search(q2);
@Test
public void testSignificanceSearcherWithMissingExplicitLanguageOnExistingWord() {
var existingWord = "hello";
var explicitLanguage = Language.ITALIAN;
var result = searchWord(existingWord, Optional.of(explicitLanguage), Optional.empty());

var resultWord = getFirstWord(result);
assertEquals(Optional.empty(), resultWord.getDocumentFrequency());

var errorMessage = getErrorMessage(result);
assertEquals("No significance model available for set language ITALIAN", errorMessage);
}

assertEquals(1, r2.hits().getErrorHit().errors().size());
@Test
public void testSignificanceSearcherWithUnknownExplicitLanguageOnExistingWord() {
var existingWord = "hello";
var explicitLanguage = Language.UNKNOWN;
var result = searchWord(existingWord, Optional.of(explicitLanguage), Optional.empty());
var resultWord = getFirstWord(result);
var existingDocumentFrequency = getDocumentFrequencyWithEnglish(existingWord);
assertEquals(existingDocumentFrequency, resultWord.getDocumentFrequency());
}

@Test
public void testSignificanceSearcherWithMissingExplicitLanguageOnMissingWord() {
var missingWord = "ciao";
var explicitLanguage = Language.ITALIAN;
var result = searchWord(missingWord, Optional.of(explicitLanguage), Optional.empty());

var resultWord = getFirstWord(result);
assertEquals(Optional.empty(), resultWord.getDocumentFrequency());

var errorMessage = getErrorMessage(result);
assertEquals("No significance model available for set language ITALIAN", errorMessage);
}

Query q3 = new Query();
q3.getRanking().setProfile("significance-ranking");
WordItem root3 = new WordItem("Я с детства хотел завести собаку, но родители мне не разрешали.", true);
@Test
public void testSignificanceSearcherWithMissingImplicitLanguageOnExistingWord() {
var existingWord = "hello";
var implicitLanguage = Language.ITALIAN;
var result = searchWord(existingWord, Optional.empty(), Optional.of(implicitLanguage));
var resultWord = getFirstWord(result);
var existingDocumentFrequency = getDocumentFrequencyWithEnglish(existingWord);
assertEquals(existingDocumentFrequency, resultWord.getDocumentFrequency());
}

q3.getModel().getQueryTree().setRoot(root3);
Execution execution = createExecution(searcher, Language.RUSSIAN);
Result r3 = execution.search(q3);
@Test
public void testSignificanceSearcherWithMissingImplicitLanguageOnMissingWord() {
var implicitLanguage = Language.ITALIAN;
var missingWord = "ciao";
var result = searchWord(missingWord, Optional.empty(), Optional.of(implicitLanguage));
var resultWord = getFirstWord(result);

var existingWord = "hello";
var documentFrequency = getDocumentFrequencyWithEnglish(existingWord);
var count = documentFrequency.get().count();
var defaultDocumentFrequency = Optional.of(new DocumentFrequency(1, count));

assertEquals(defaultDocumentFrequency, resultWord.getDocumentFrequency());
}

assertEquals(1, r3.hits().getErrorHit().errors().size());
// Tests for upper case words in a query
@ParameterizedTest
@ValueSource(strings = {"Hello", "HeLlo", "HELLO"})
public void testSignificanceSearcherWithUpperCaseWord(String wordWithUpperCase) {
var result = searchWord(wordWithUpperCase, Optional.of(Language.ENGLISH), Optional.empty());
var resultWord = getFirstWord(result);

var lowerCaseWord = "hello";
var documentFrequency = getDocumentFrequencyWithEnglish(lowerCaseWord);

assertEquals(documentFrequency, resultWord.getDocumentFrequency());
}
}
}