From 1a7e4248c35a0941f7d6c39d94f999d2e8562d3b Mon Sep 17 00:00:00 2001 From: MariusArhaug Date: Wed, 19 Jun 2024 11:57:22 +0200 Subject: [PATCH] Fix CR comments --- _data/sidebar.yml | 6 +- .../vespa-cmdline-tools.html | 25 +- en/reference/schema-reference.html | 2 +- en/reference/services-search.html | 28 ++- en/reference/significance-reference.html | 107 --------- en/reference/significance.html | 217 ------------------ en/significance.html | 103 +++++++++ 7 files changed, 147 insertions(+), 341 deletions(-) delete mode 100644 en/reference/significance-reference.html delete mode 100644 en/reference/significance.html create mode 100644 en/significance.html diff --git a/_data/sidebar.yml b/_data/sidebar.yml index 932b2e0c96..63128070d2 100644 --- a/_data/sidebar.yml +++ b/_data/sidebar.yml @@ -129,8 +129,8 @@ docs: url: /en/stateless-model-evaluation.html - page: Ranking With BM25 url: /en/reference/bm25.html - - page: Ranking With Significance Model - url: /en/reference/significance.html + - page: Using Significance Model + url: /en/significance.html - page: Ranking With nativeRank url: /en/nativerank.html - page: Accelerated OR search using the WAND algorithm @@ -412,8 +412,6 @@ docs: url: /en/reference/stateless-model-reference.html - page: Embedding Model Reference url: /en/reference/embedding-reference.html - - page: Significance Model Reference - url: /en/reference/significance-reference.html - title: Queries and results reference documents: diff --git a/en/operations-selfhosted/vespa-cmdline-tools.html b/en/operations-selfhosted/vespa-cmdline-tools.html index a96e4d4ed8..68609fe2ec 100644 --- a/en/operations-selfhosted/vespa-cmdline-tools.html +++ b/en/operations-selfhosted/vespa-cmdline-tools.html @@ -1910,8 +1910,8 @@

vespa-set-node-state

-

vespa-significance

-

vepsa-signficance is a tool that generates a significance model file based on this file format. Its input is a vespa-feed file. +

vespa-significance

+

vespa-significance is a tool that generates a significance model file. Its input is a vespa-feed file.

Synopsis: vespa-significance [options]

Example

@@ -1928,14 +1928,12 @@

vespa-significance

-h, --help - - Help text - + Help text - -i, --input <input file> + -i, --in <input file> - Vespa dump file to be used for generating the significance model + Vespa-feed file to be used for generating the significance model -o, --out <output file> @@ -1945,17 +1943,24 @@

vespa-significance

-f, --field <field> - Name of the text field to be used for tokenization + Name of the text field to be used for significance model -l, --language <language> - Language of the text field, must be a valid language code from the RFC5646 standard. +

+ Language of the text field, must be a valid language code from the RFC5646 standard. +
+ It is used with + OpenNLP's tokenizer to tokenize the text field based on that language's rules. +

-d, --doc-type <doc-id> - Document type identifier for the dump file +

Document type identifier for the vespa-feed file.
+ It becomes a part of the id for put operations in the vespa-feed file. { "put": "id::<doc-id>::1" } +

diff --git a/en/reference/schema-reference.html b/en/reference/schema-reference.html index eae80f23d6..41cd3aa8fd 100644 --- a/en/reference/schema-reference.html +++ b/en/reference/schema-reference.html @@ -2488,7 +2488,7 @@

onnx-model

significance

-Constrained in rank-profile. True or false. By default this is false. When enabled Vespa will use the significance calculation based on provided significance models in the service.xml for the rank-profile it is defined in. +Contained in rank-profile. True or false. By default this is false. When enabled, Vespa will use the significance calculation based on the significance models provided in services.xml for the rank-profile it is defined in.

 significance {
     use-model: true
diff --git a/en/reference/services-search.html b/en/reference/services-search.html
index 65ee2c76e7..7ca79d44c9 100644
--- a/en/reference/services-search.html
+++ b/en/reference/services-search.html
@@ -331,10 +331,10 @@ 

renderer

significance

-The significance tag can include multiple models. Their order determines the model precedence for a given language, with the last element having the highest. The models' document frequency is used to set a token's significance value based on the inverse document frequency (IDF). To enable the use of these models, the schema needs to have a rank-profile field with the significance element and the use-model flag set to true. +The significance element can include multiple models. Their order determines the model precedence for a given language, with the last element having the highest. The models' document frequency is used to set a token's significance. To enable the use of these models, the schema needs to have a rank-profile with the significance element and the use-model set to true.

-

Example of significance model with multiple models. These models are either provided by Vespa or can be generated with the vepsa-signficance cli.

+

Example with multiple model files. These models are either provided by Vespa or can be generated with the vespa-significance CLI.

 <significance>  	
     <model model-id="wikimedia"/>
@@ -345,6 +345,30 @@ 

significance

+

significance reference config

+ + + + + + + + + + + + + + + + + + + + +
NameOccurrenceDescriptionTypeDefault
modelOne To ManyUse to point to the significance model filemodel-typeN/A
+ +

chain

Specifies how a search chain should be instantiated, and how the contained searchers should be ordered. diff --git a/en/reference/significance-reference.html b/en/reference/significance-reference.html deleted file mode 100644 index 0c72a62320..0000000000 --- a/en/reference/significance-reference.html +++ /dev/null @@ -1,107 +0,0 @@ ---- -# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -title: "Significance Reference" -redirect_from: -- /documentation/reference/significance-reference.html ---- - -

Reference configuration for embedders.

- -

Model config reference

-

- Significance model uses the model type configuration. - The model type configuration accepts attributes model-id, url or path, - and multiple of these can be specified as a single config value. The model order determines the precedence for a given language - with the last element having the highest precedence. -

-
    -
  • If a model-id is specified and the application is deployed on Vespa Cloud, the model-id is used.
  • -
  • Otherwise, if a url is specified, it is used
  • -
  • Otherwise, path is used.
  • -
-

- When using path, the model files must be supplied in the - Vespa application package. -

- -

Significance

-

- A significance component is comprised of one or multiple significance models, for one or multiple languages. It uses these models' document frquencies to calculate the inverse document frequency (IDF) of terms in a query. -

-

- The significance component is configured in services.xml, with the significance tag: -

- -
{% highlight xml %}
-
-    
-        	
-         
-         
-         
-      
-    
-  
-{% endhighlight %}
- -

Significance reference config

- - - - - - - - - - - - - - - - - - - - -
NameOccurrenceDescriptionTypeDefault
modelOne To ManyUse to point to the significance model filemodel-typeN/A
- - -

Significance Model File format

- -

-The significance model file is a JSON file with the following format: -

{% highlight json %}
-{
-    "version": 1,
-    "id": "wikipedia",
-    "description": "Some optional description",
-    "languages": {
-      "en": {
-        "description": "Some optional description for English model", 
-        "document-count": 1000,
-        "document-frequencies": {
-          "and": 500,
-          "car": 100,
-          ...
-        }
-      },
-      "no": {
-        "description": "Some optional description for Norwegian model", 
-        "document-count": 800,
-        "document-frequencies": {
-          "bil": 80,
-          "og": 400,
-          ...
-        }
-      }
-    }
-  }
-{% endhighlight %}
-
-

-

-Each file contains a map of languages and their document frequencies. - -

- \ No newline at end of file diff --git a/en/reference/significance.html b/en/reference/significance.html deleted file mode 100644 index 1a63ca6a87..0000000000 --- a/en/reference/significance.html +++ /dev/null @@ -1,217 +0,0 @@ ---- -# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -title: "Significance Model Reference" -redirect_from: -- /documentation/reference/significance.html ---- -

-The -significance model feature -implements the inverse document frequency query term for tokens based on an existing or user defined significance model. A siginficance model is a mapping from query terms to a floating point value. The significance model(s) are either provided by Vespa or can be generated using the Vespa-CLI command vespa-significance. - -

Background

-The bm25 rank feature uses the inverse document frequency (IDF) of each query term searching an index field when calculating the score of a document: - -
-
- - - - - - i - n - - I - D - F - - ( - - q - i - - ) - - - - - f - ( - - q - i - - , - D - ) - - ( - - k - 1 - - + - 1 - ) - - - f - ( - - q - i - - , - D - ) - + - - k - 1 - - - ( - 1 - - - b - + - b - - - - f - i - e - l - d - _ - l - e - n - - - a - v - g - _ - f - i - e - l - d - _ - l - e - n - - - ) - - - - - -
-
- -

The IDF of query term i in field t is currently calculated per field per content node:

-
-
- - - - l - o - g - ( - 1 - + - - - N - - - n - ( - - q - i - - ) - + - 0.5 - - - n - ( - - q - i - - ) - + - 0.5 - - - ) - - - -
-
- -

- N is the total number of documents on the content node, and n(qi) is the number of documents containing the query term qi for field t. -

- -

Short commings:

-
    -
  • The IDF values will typically be different across content nodes, as they contain a different subset of the document corpus. This might lead to inconsistent ranking order between content nodes.
  • -
  • When using bm25 in streaming search no IDF values are available at all, as we don’t build an inverted index where the IDF values can be extracted from
  • -
- -

With the user or Vepsa defined significance models, the IDF calculation can be overridden

- -

Example

-

-In the following example, we show how to reference a significance model in the service.xml. -Note that the field must be enabled for usage with the bm25 feature -by setting the use-model flag in the -significance rank-profile -section of the field definition. -

- -

-

-<container version="1.0">
-    <search>
-        <significance>  	
-            <model model-id="wikimedia"/>
-            <model url="https://some/uri/bibel-multilingual.json" />
-            <model path="models/reddit-norge.no.json.zst" />
-        </significance>
-    </search>
-</container>      
-
-

- -Note that it is possible to specify multiple significance models in the service.xml file. - -

-

-schema example {
-  document example {
-    field content type string {
-      indexing: index | summary
-      index: enable-bm25
-    }
-  }
-  rank-profile default {
-    significance {
-      use-model: true
-    }
-  }
-}
-
-

- diff --git a/en/significance.html b/en/significance.html new file mode 100644 index 0000000000..c224879c38 --- /dev/null +++ b/en/significance.html @@ -0,0 +1,103 @@ +--- +# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +title: "Using Significance Model" +redirect_from: +- /documentation/reference/significance.html +--- +

+The +significance model feature +implements the inverse document frequency query term for tokens based on an existing or user defined significance model. A significance model is a mapping from query terms to a floating point value. The significance model(s) are either provided by Vespa or can be generated using the Vespa-CLI command vespa-significance. + +

Background

+

The bm25 and nativeRank rank features use the significance value of each query term searching an index field when calculating the score of a document. These ranking features have shortcomings; for example, the bm25 rank feature suffers from the following limitations: +

+ +
    +
  • The significance values will typically be different across content nodes, as they contain a different subset of the document corpus. This might lead to inconsistent ranking order between content nodes.
  • +
  • When using bm25 in streaming search, no significance values are available at all, as no inverted index is built from which the IDF values could be extracted.
  • +
+ +

By explicitly using a Vespa-provided or user-defined significance model, the significance values used by these rank features can be overridden.

+ +

Example

+

+In the following example, we show how to reference a significance model in services.xml. +Note that the field must be enabled for usage with the bm25 feature, +and that the use-model flag must be set in the +significance section +of the rank-profile. +

+

+ A significance component comprises one or more significance models, for one or more languages. It uses these models' document frequencies to calculate the inverse document frequency (IDF) of terms in a query. +

+ +

+

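{% highlight xml %}
+<container version="1.0">
+    <search>
+        <significance>
+            <model model-id="wikimedia"/>
+            <model url="https://some/uri/bibel-multilingual.json" />
+            <model path="models/reddit-norge.no.json.zst" />
+        </significance>
+    </search>
+</container>
+{% endhighlight %}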
+

+ +Note that it is possible to specify multiple significance models in the services.xml file. + +

+

+schema example {
+  document example {
+    field content type string {
+      indexing: index | summary
+      index: enable-bm25
+    }
+  }
+  rank-profile default {
+    significance {
+      use-model: true
+    }
+  }
+}
+
+

+

Significance Model File format

+ +

+ The significance model file is a JSON file with the following format: +

{% highlight json %}
+{
+  "version": 1,
+  "id": "wikipedia",
+  "description": "Some optional description",
+  "languages": {
+    "en": {
+      "description": "Some optional description for English model", 
+      "document-count": 1000,
+      "document-frequencies": {
+        "and": 500,
+        "car": 100,
+        ...
+      }
+    },
+    "no": {
+      "description": "Some optional description for Norwegian model", 
+      "document-count": 800,
+      "document-frequencies": {
+        "bil": 80,
+        "og": 400,
+        ...
+      }
+    }
+  }
+}{% endhighlight %}
+

+

+Each file contains a map of languages and their document frequencies. The document frequencies are the number of documents in the corpus that contain the term. The document count is the total number of documents in the corpus. + +

+