From 6fa0791c5bac03554f01fc5a8652741cb33921b5 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Sat, 9 Mar 2024 21:22:26 +0100 Subject: [PATCH] Stem prefix items If we are searching a stemmed index, it's probably better to stem terms also when we are searching for prefixes. --- .../com/yahoo/schema/document/Matching.java | 1 + .../validation/NoPrefixForIndexes.java | 3 +-- container-search/abi-spec.json | 25 +++++++++++++++---- .../yahoo/prelude/query/ExactStringItem.java | 8 ++++++ .../yahoo/prelude/query/MarkerWordItem.java | 10 ++++++++ .../com/yahoo/prelude/query/PrefixItem.java | 12 ++++++++- .../yahoo/prelude/query/SubstringItem.java | 8 ++++++ .../com/yahoo/prelude/query/SuffixItem.java | 8 ++++++ .../com/yahoo/prelude/query/WordItem.java | 8 ++++++ .../querytransform/StemmingSearcher.java | 15 ++++++----- 10 files changed, 84 insertions(+), 14 deletions(-) diff --git a/config-model/src/main/java/com/yahoo/schema/document/Matching.java b/config-model/src/main/java/com/yahoo/schema/document/Matching.java index 9d68553fa804..9f05045d0902 100644 --- a/config-model/src/main/java/com/yahoo/schema/document/Matching.java +++ b/config-model/src/main/java/com/yahoo/schema/document/Matching.java @@ -31,6 +31,7 @@ public class Matching implements Cloneable, Serializable { /** Maximum number of characters to consider when searching in this field. Used for limiting resources, especially in streaming search. 
*/ private Integer maxLength; + /** Maximum number of occurrences for each term */ private Integer maxTermOccurrences; diff --git a/config-model/src/main/java/com/yahoo/vespa/model/application/validation/NoPrefixForIndexes.java b/config-model/src/main/java/com/yahoo/vespa/model/application/validation/NoPrefixForIndexes.java index 15d293e4abc1..0aa0dc85ab8a 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/application/validation/NoPrefixForIndexes.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/application/validation/NoPrefixForIndexes.java @@ -23,8 +23,7 @@ public class NoPrefixForIndexes implements Validator { @Override public void validate(Context context) { for (SearchCluster cluster : context.model().getSearchClusters()) { - if (cluster instanceof IndexedSearchCluster) { - IndexedSearchCluster sc = (IndexedSearchCluster) cluster; + if (cluster instanceof IndexedSearchCluster sc) { for (DocumentDatabase docDb : sc.getDocumentDbs()) { DerivedConfiguration sdConfig = docDb.getDerivedConfiguration(); Schema schema = sdConfig.getSchema(); diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json index 73376ac4b25a..79cc578c6cdc 100644 --- a/container-search/abi-spec.json +++ b/container-search/abi-spec.json @@ -524,9 +524,12 @@ "methods" : [ "public void <init>(java.lang.String)", "public void <init>(java.lang.String, boolean)", + "public void <init>(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", + "public com.yahoo.prelude.query.ExactStringItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", "public com.yahoo.prelude.query.Item$ItemType getItemType()", "public java.lang.String getName()", - "public java.lang.String stringValue()" + "public java.lang.String stringValue()", + "public bridge synthetic com.yahoo.prelude.query.WordItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)" ], "fields" : [ ] }, @@ -914,6 +917,7
@@ "public" ], "methods" : [ + "public com.yahoo.prelude.query.MarkerWordItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", "public boolean isStartAnchor()", "public boolean isEndAnchor()", "protected java.lang.String getEncodedWord()", @@ -923,7 +927,8 @@ "public static com.yahoo.prelude.query.MarkerWordItem createStartOfHost(java.lang.String)", "public static com.yahoo.prelude.query.MarkerWordItem createStartOfHost()", "public static com.yahoo.prelude.query.MarkerWordItem createEndOfHost(java.lang.String)", - "public static com.yahoo.prelude.query.MarkerWordItem createEndOfHost()" + "public static com.yahoo.prelude.query.MarkerWordItem createEndOfHost()", + "public bridge synthetic com.yahoo.prelude.query.WordItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)" ], "fields" : [ ] }, @@ -1296,9 +1301,12 @@ "public void <init>(java.lang.String)", "public void <init>(java.lang.String, boolean)", "public void <init>(java.lang.String, java.lang.String)", + "public void <init>(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", + "public com.yahoo.prelude.query.PrefixItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", "public com.yahoo.prelude.query.Item$ItemType getItemType()", "public java.lang.String getName()", - "public java.lang.String stringValue()" + "public java.lang.String stringValue()", + "public bridge synthetic com.yahoo.prelude.query.WordItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)" ], "fields" : [ ] }, @@ -1622,9 +1630,12 @@ "methods" : [ "public void <init>(java.lang.String)", "public void <init>(java.lang.String, boolean)", + "public void <init>(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", + "public com.yahoo.prelude.query.SubstringItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", "public
com.yahoo.prelude.query.Item$ItemType getItemType()", "public java.lang.String getName()", - "public java.lang.String stringValue()" + "public java.lang.String stringValue()", + "public bridge synthetic com.yahoo.prelude.query.WordItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)" ], "fields" : [ ] }, @@ -1637,9 +1648,12 @@ "methods" : [ "public void <init>(java.lang.String)", "public void <init>(java.lang.String, boolean)", + "public void <init>(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", + "public com.yahoo.prelude.query.SuffixItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", "public com.yahoo.prelude.query.Item$ItemType getItemType()", "public java.lang.String getName()", - "public java.lang.String stringValue()" + "public java.lang.String stringValue()", + "public bridge synthetic com.yahoo.prelude.query.WordItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)" ], "fields" : [ ] }, @@ -1962,6 +1976,7 @@ "public void <init>(com.yahoo.prelude.query.parser.Token, boolean)", "public void <init>(java.lang.String, boolean, com.yahoo.prelude.query.Substring)", "public void <init>(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", + "public com.yahoo.prelude.query.WordItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", "public com.yahoo.prelude.query.Item$ItemType getItemType()", "public java.lang.String getName()", "public void setWord(java.lang.String)", diff --git a/container-search/src/main/java/com/yahoo/prelude/query/ExactStringItem.java b/container-search/src/main/java/com/yahoo/prelude/query/ExactStringItem.java index cb0752e54088..36e24fa81dbd 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/ExactStringItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/ExactStringItem.java @@ -17,6 +17,14 @@ public
ExactStringItem(String substring, boolean isFromQuery) { super(substring, isFromQuery); } + public ExactStringItem(String word, String indexName, boolean isFromQuery, Substring origin) { + super(word, indexName, isFromQuery, origin); + } + + public ExactStringItem newInstance(String word, String indexName, boolean isFromQuery, Substring origin) { + return new ExactStringItem(word, indexName, isFromQuery, origin); + } + @Override public ItemType getItemType() { return ItemType.EXACT; diff --git a/container-search/src/main/java/com/yahoo/prelude/query/MarkerWordItem.java b/container-search/src/main/java/com/yahoo/prelude/query/MarkerWordItem.java index 40ea1e37c479..48309cdd8fa4 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/MarkerWordItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/MarkerWordItem.java @@ -25,6 +25,16 @@ private MarkerWordItem(String publicSymbol, String markerWord, String indexName) this.markerWord = markerWord; } + private MarkerWordItem(String publicSymbol, String markerWord, String indexName, boolean isFromQuery, Substring origin) { + super(publicSymbol, indexName, isFromQuery, origin); + this.markerWord = markerWord; + } + + /** Returns a new MarkerWordItem initialized with the given data and the marker word of this instance.
*/ + public MarkerWordItem newInstance(String word, String indexName, boolean isFromQuery, Substring origin) { + return new MarkerWordItem(word, markerWord, indexName, isFromQuery, origin); + } + public boolean isStartAnchor() { return getWord().equals(startAnchor); } public boolean isEndAnchor() { return getWord().equals(endAnchor); } diff --git a/container-search/src/main/java/com/yahoo/prelude/query/PrefixItem.java b/container-search/src/main/java/com/yahoo/prelude/query/PrefixItem.java index 5904d805a39e..9fc087e70b49 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/PrefixItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/PrefixItem.java @@ -17,7 +17,17 @@ public PrefixItem(String prefix, boolean isFromQuery) { super(prefix, isFromQuery); } - public PrefixItem(String prefix, String indexName) { super(prefix, indexName); } + public PrefixItem(String prefix, String indexName) { + super(prefix, indexName); + } + + public PrefixItem(String prefix, String indexName, boolean isFromQuery, Substring origin) { + super(prefix, indexName, isFromQuery, origin); + } + + public PrefixItem newInstance(String word, String indexName, boolean isFromQuery, Substring origin) { + return new PrefixItem(word, indexName, isFromQuery, origin); + } @Override public ItemType getItemType() { diff --git a/container-search/src/main/java/com/yahoo/prelude/query/SubstringItem.java b/container-search/src/main/java/com/yahoo/prelude/query/SubstringItem.java index 7a05235b1998..df9de84b04d9 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/SubstringItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/SubstringItem.java @@ -16,6 +16,14 @@ public SubstringItem(String substring, boolean isFromQuery) { super(substring, isFromQuery); } + public SubstringItem(String substring, String indexName, boolean isFromQuery, Substring origin) { + super(substring, indexName, isFromQuery, origin); + } + + public SubstringItem
newInstance(String word, String indexName, boolean isFromQuery, Substring origin) { + return new SubstringItem(word, indexName, isFromQuery, origin); + } + @Override public ItemType getItemType() { return ItemType.SUBSTRING; diff --git a/container-search/src/main/java/com/yahoo/prelude/query/SuffixItem.java b/container-search/src/main/java/com/yahoo/prelude/query/SuffixItem.java index 700564853fd9..e364330a3778 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/SuffixItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/SuffixItem.java @@ -16,6 +16,14 @@ public SuffixItem(String suffix, boolean isFromQuery) { super(suffix, isFromQuery); } + public SuffixItem(String suffix, String indexName, boolean isFromQuery, Substring origin) { + super(suffix, indexName, isFromQuery, origin); + } + + public SuffixItem newInstance(String word, String indexName, boolean isFromQuery, Substring origin) { + return new SuffixItem(word, indexName, isFromQuery, origin); + } + @Override public ItemType getItemType() { return ItemType.SUFFIX; diff --git a/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java b/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java index 4f8b02a8d138..9cfa33fa07d9 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java @@ -62,6 +62,14 @@ public WordItem(String word, String indexName, boolean isFromQuery, Substring or setWord(word); } + /** + * Returns a new instance of this kind of WordItem, initialized with the given data and any other + * fields belonging to the item subclass copied from this instance.
+ */ + public WordItem newInstance(String word, String indexName, boolean isFromQuery, Substring origin) { + return new WordItem(word, indexName, isFromQuery, origin); + } + public ItemType getItemType() { return ItemType.WORD; } diff --git a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java index e8350831381f..e40f161ede2f 100644 --- a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java +++ b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java @@ -163,7 +163,7 @@ private Item scan(Item item, StemContext context) { } private Item checkBlock(BlockItem b, StemContext context) { - if (b instanceof PrefixItem || !b.isWords()) return (Item) b; + if (!b.isWords()) return (Item) b; if (b.isFromQuery() && !b.isStemmed()) { Index index = context.indexFacts.getIndex(b.getIndexName()); @@ -190,10 +190,8 @@ private Substring getOffsets(BlockItem b) { // The rewriting logic is here private Item stem(BlockItem current, StemContext context, Index index) { - Item blockAsItem = (Item)current; - CompositeItem composite; List segments = linguistics.getStemmer().stem(current.stringValue(), index.getStemMode(), context.language); - if (segments.isEmpty()) return blockAsItem; + if (segments.isEmpty()) return (Item)current; String indexName = current.getIndexName(); Substring substring = getOffsets(current); @@ -203,6 +201,7 @@ private Item stem(BlockItem current, StemContext context, Index index) { return (Item)w; } + CompositeItem composite; if (context.isCJK) composite = chooseCompositeForCJK(current, ((Item) current).getParent(), indexName); else @@ -219,7 +218,7 @@ private Item stem(BlockItem current, StemContext context, Index index) { if (composite instanceof AndSegmentItem) { andSegmentConnectivity(current, context.reverseConnectivity, composite); } - copyAttributes(blockAsItem, composite);
+ copyAttributes((Item)current, composite); composite.lock(); if (composite instanceof PhraseSegmentItem replacement) { @@ -320,7 +319,11 @@ private void setMetaData(BlockItem current, Map reverseConne private WordItem singleStemSegment(Item blockAsItem, String stem, String indexName, Substring substring) { - WordItem replacement = new WordItem(stem, indexName, true, substring); + WordItem replacement; + if (blockAsItem instanceof WordItem wordItem) // preserve the WordItem subclass type + replacement = wordItem.newInstance(stem, indexName, true, substring); + else + replacement = new WordItem(stem, indexName, true, substring); replacement.setStemmed(true); copyAttributes(blockAsItem, replacement); return replacement;