From 1a7e4248c35a0941f7d6c39d94f999d2e8562d3b Mon Sep 17 00:00:00 2001
From: MariusArhaug <mariusarhaug@hotmail.com>
Date: Wed, 19 Jun 2024 11:57:22 +0200
Subject: [PATCH] Fix CR comments

---
 _data/sidebar.yml                             |   6 +-
 .../vespa-cmdline-tools.html                  |  25 +-
 en/reference/schema-reference.html            |   2 +-
 en/reference/services-search.html             |  28 ++-
 en/reference/significance-reference.html      | 107 ---------
 en/reference/significance.html                | 217 ------------------
 en/significance.html                          | 103 +++++++++
 7 files changed, 147 insertions(+), 341 deletions(-)
 delete mode 100644 en/reference/significance-reference.html
 delete mode 100644 en/reference/significance.html
 create mode 100644 en/significance.html
diff --git a/_data/sidebar.yml b/_data/sidebar.yml
index 932b2e0c96..63128070d2 100644
--- a/_data/sidebar.yml
+++ b/_data/sidebar.yml
@@ -129,8 +129,8 @@ docs:
         url: /en/stateless-model-evaluation.html
       - page: Ranking With BM25
         url: /en/reference/bm25.html
-      - page: Ranking With Significance Model
-        url: /en/reference/significance.html
+      - page: Using Significance Model
+        url: /en/significance.html
       - page: Ranking With nativeRank
         url: /en/nativerank.html
       - page: Accelerated OR search using the WAND algorithm
@@ -412,8 +412,6 @@ docs:
         url: /en/reference/stateless-model-reference.html
       - page: Embedding Model Reference
         url: /en/reference/embedding-reference.html
-      - page: Significance Model Reference
-        url: /en/reference/significance-reference.html
 
   - title: Queries and results reference
     documents:
diff --git a/en/operations-selfhosted/vespa-cmdline-tools.html b/en/operations-selfhosted/vespa-cmdline-tools.html
index a96e4d4ed8..68609fe2ec 100644
--- a/en/operations-selfhosted/vespa-cmdline-tools.html
+++ b/en/operations-selfhosted/vespa-cmdline-tools.html
@@ -1910,8 +1910,8 @@ <h2 id="vespa-set-node-state">vespa-set-node-state</h2>
 
 <!--h2 id="vespa-slobrok-cmd">vespa-slobrok-cmd</h2-->
 
-<h2 id="vepsa-signficance">vespa-significance</h2>
-<p><code>vepsa-signficance</code> is a tool that generates a significance model file based on <a href="">this</a> file format. Its input is a <code>vespa-feed</code> file. 
+<h2 id="vespa-significance">vespa-significance</h2>
+<p><code>vespa-significance</code> is a tool that generates a significance model <a href="../significance.html#significance-file-format">file</a>. Its input is a <a href="../reference/document-json-format.html"><em>vespa-feed</em></a> file.
 </p>
 <p>Synopsis: <code>vespa-significance [options]</code></p>
 <p>Example</p>
@@ -1928,14 +1928,12 @@ <h2 id="vepsa-signficance">vespa-significance</h2>
   <tbody>
     <tr>
       <th>-h, --help</th>
-      <td>
-        Help text
-      </td>
+      <td>Help text</td>
     </tr>
     <tr>
-      <th>-i, --input &lt;input file&gt;</th>
+      <th>-i, --in &lt;input file&gt;</th>
       <td>
-        Vespa dump file to be used for generating the significance model
+        <a href="../reference/document-json-format.html">Vespa-feed</a> file to be used for generating the significance model
       </td>
     </tr><tr>
       <th>-o, --out &lt;output file&gt;</th>
@@ -1945,17 +1943,24 @@ <h2 id="vepsa-signficance">vespa-significance</h2>
     </tr><tr>
       <th> -f, --field &lt;field&gt;</th>
       <td>
-        Name of the text field to be used for tokenization 
+        Name of the text field to be used for the significance model
       </td>
     </tr><tr>
       <th> -l, --language &lt;language&gt;</th>
       <td>
-        Language of the text field, must be a valid language code from the <a href="https://www.rfc-editor.org/rfc/rfc5646">RFC5646</a> standard.
+        <p>
+          Language of the text field, must be a valid language code from the <a href="https://www.rfc-editor.org/rfc/rfc5646">RFC5646</a> standard. 
+        <br>
+          It is used with
+          OpenNLP's tokenizer to tokenize the text field based on that language's rules.
+        </p>
       </td>
     </tr><tr>
       <th> -d, --doc-type &lt;doc-id&gt;</th>
       <td>
-          Document type identifier for the dump file
+        <p>Document type identifier for the vespa dump file. <br>
+          It becomes a part of the id for <a href="../reference/document-json-format.html#put">put</a> operations in the vespa-feed file. <code>&#123; "put": "id::&lt;doc-id&gt;::1" &#125; </code>
+        </p>
       </td>
     </tr>
   </tbody>
diff --git a/en/reference/schema-reference.html b/en/reference/schema-reference.html
index eae80f23d6..41cd3aa8fd 100644
--- a/en/reference/schema-reference.html
+++ b/en/reference/schema-reference.html
@@ -2488,7 +2488,7 @@ <h2 id="onnx-model">onnx-model</h2>
 
 <h2 id="significance">significance</h2>
 <p>
-Constrained in <a href="#rank-profile">rank-profile</a>. True or false. By default this is false. When enabled Vespa will use the significance calculation based on provided significance models in the service.xml for the rank-profile it is defined in. 
+Contained in <a href="#rank-profile">rank-profile</a>. True or false. By default this is false. When enabled, Vespa will use the significance calculation based on the significance models provided in services.xml for the rank-profile it is defined in.
 <pre>
 significance {
     use-model: true
diff --git a/en/reference/services-search.html b/en/reference/services-search.html
index 65ee2c76e7..7ca79d44c9 100644
--- a/en/reference/services-search.html
+++ b/en/reference/services-search.html
@@ -331,10 +331,10 @@ <h2 id="renderer">renderer</h2>
 
 <h2 id="significance">significance</h2>
 <p>
-The significance tag can include multiple models. Their order determines the model precedence for a given language, with the last element having the highest. The models' document frequency is used to set a token's significance value based on the inverse document frequency (IDF). To enable the use of these models, the schema needs to have a rank-profile field with the <em>significance</em> element and the  <em>use-model</em> flag set to <em>true</em>.
+The significance element can include multiple models. Their order determines the model precedence for a given language, with the last element having the highest precedence. The models' document frequencies are used to set a token's significance. To enable the use of these models, the schema needs to have a rank-profile with the <em>significance</em> element and <em>use-model</em> set to <em>true</em>.
 </p>
 
-<p>Example of significance model with multiple models. These models are either provided by <em>Vespa</em> or can be generated with the <a href="vespa-cmdline-tools.html#vespa-significance">vepsa-signficance</a> cli. </p>
+<p>Example with multiple <a href="config-files.html#model">model</a> files. These models are either provided by <em>Vespa</em> or can be generated with the <a href="../operations-selfhosted/vespa-cmdline-tools.html#vespa-significance">vespa-significance</a> cli. </p>
 <pre data-test="file" data-path="my-app/src/main/application/services.xml">
 &lt;significance&gt;  	
     &lt;model model-id="wikimedia"/&gt;
@@ -345,6 +345,30 @@ <h2 id="significance">significance</h2>
 </p>
 
 
+<h3 id="significance-reference-config">significance reference config</h3>
+<table class="table">
+  <thead>
+    <tr>
+      <th>Name</th>
+      <th>Occurrence</th>
+      <th>Description</th>
+      <th>Type</th>
+      <th>Default</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>model</td>
+      <td>One To Many</td>
+      <td>Use to point to the significance model file</td>
+      <td><a href="config-files.html#model">model-type</a></td>
+      <td>N/A</td>
+    </tr>
+    
+  </tbody>
+</table>
+
+
 <h2 id="chain">chain</h2>
 <p>
   Specifies how a search chain should be instantiated, and how the contained searchers should be ordered.
diff --git a/en/reference/significance-reference.html b/en/reference/significance-reference.html
deleted file mode 100644
index 0c72a62320..0000000000
--- a/en/reference/significance-reference.html
+++ /dev/null
@@ -1,107 +0,0 @@
----
-# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-title: "Significance Reference"
-redirect_from:
-- /documentation/reference/significance-reference.html
----
-
-<p>Reference configuration for <a href="../embedding.html">embedders</a>.</p>
-
-<h2 id="model-config-reference">Model config reference</h2>
-<p>
-  Significance model uses the <a href="config-files.html#model">model</a> type configuration. 
-  The <em>model</em> type configuration accepts attributes <code>model-id</code>, <code>url</code> or <code>path</code>,
-  and multiple of these can be specified as a single config value. The model order determines the precedence for a given language - with the last element having the highest precedence.
-</p>
-  <ul>
-    <li>If a <code>model-id</code> is specified and the application is deployed on Vespa Cloud, the <code>model-id</code> is used.</li>
-    <li>Otherwise, if a <code>url</code> is specified, it is used</li>
-    <li>Otherwise, <code>path</code> is used.</li>
-  </ul>
-<p>
-  When using <code>path</code>, the model files must be supplied in the
-  Vespa <a href="../application-packages.html#deploying-remote-models">application package</a>.
-</p>
-
-<h2 id="significance">Significance</h2>
-<p>
-  A significance component is comprised of one or multiple significance models, for one or multiple languages. It uses these models' document frquencies to calculate the inverse document frequency (IDF) of terms in a query.
-</p>
-<p>
-  The significance component is configured in <a href="services.html">services.xml</a>, with the <code>significance</code> tag:
-</p>
-
-<pre>{% highlight xml %}
-<container version="1.0">
-    <search>
-      <significance>  	
-         <model model-id="wikimedia"/>
-         <model url="https://some/uri/bibel-multilingual.json" />
-         <model path="models/reddit-norge.no.json.zst" />
-      </significance>
-    </search>
-  </container>
-{% endhighlight %}</pre>
-
-<h3 id="significance-reference-config">Significance reference config</h3>
-<table class="table">
-  <thead>
-    <tr>
-      <th>Name</th>
-      <th>Occurrence</th>
-      <th>Description</th>
-      <th>Type</th>
-      <th>Default</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td>model</td>
-      <td>One To Many</td>
-      <td>Use to point to the significance model file</td>
-      <td><a href="#model-config-reference">model-type</a></td>
-      <td>N/A</td>
-    </tr>
-    
-  </tbody>
-</table>
-
-
-<h2>Significance Model File format</h2>
-
-<p>
-The significance model file is a JSON file with the following format:
-<pre>{% highlight json %}
-{
-    "version": 1,
-    "id": "wikipedia",
-    "description": "Some optional description",
-    "languages": {
-      "en": {
-        "description": "Some optional description for English model", 
-        "document-count": 1000,
-        "document-frequencies": {
-          "and": 500,
-          "car": 100,
-          ...
-        }
-      },
-      "no": {
-        "description": "Some optional description for Norwegian model", 
-        "document-count": 800,
-        "document-frequencies": {
-          "bil": 80,
-          "og": 400,
-          ...
-        }
-      }
-    }
-  }
-{% endhighlight %}
-</pre>
-</p>
-<p>
-Each file contains a map of languages and their document frequencies.
-
-</p>
-  
\ No newline at end of file
diff --git a/en/reference/significance.html b/en/reference/significance.html
deleted file mode 100644
index 1a63ca6a87..0000000000
--- a/en/reference/significance.html
+++ /dev/null
@@ -1,217 +0,0 @@
----
-# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-title: "Significance Model Reference"
-redirect_from:
-- /documentation/reference/significance.html
----
-<p>
-The
-<a href="rank-features.html#significance">significance model feature</a>
-implements the <a href="https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency">inverse document frequency</a> query term for tokens based on an existing or user defined significance model. A siginficance model is a mapping from query terms to a floating point value. The significance model(s) are either provided by Vespa or can be generated using the Vespa-CLI command <a href="../operations-selfhosted/vespa-cmdline-tools.html#vepsa-signficance"><em>vespa-significance</em></a>. 
-
-<h2>Background</h2>
-The <a href="bm25.html">bm25 rank feature</a> uses the inverse document frequency (IDF) of each query term searching an index field when calculating the score of a document:
-
-<figure style="font-size: 1.2rem">
-  <div>
-    <math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
-      <semantics>
-        <mrow>
-          <munderover>
-            <mo>&Sum;</mo>
-            <mi>i</mi>
-            <mi>n</mi>
-          </munderover>
-          <mi>I</mi>
-          <mi>D</mi>
-          <mi>F</mi>
-          <mrow>
-            <mo>(</mo>
-            <msub>
-              <mi>q</mi>
-              <mi>i</mi>
-            </msub>
-            <mo>)</mo>
-          </mrow>
-          <mo>&#x22C5;<!--MULTIPLICATION SIGN--></mo>
-          <mfrac>
-            <mrow>
-              <mi>f</mi>
-              <mo>(</mo>
-              <msub>
-                <mi>q</mi>
-                <mi>i</mi>
-              </msub>
-              <mo>,</mo>
-              <mi>D</mi>
-              <mo>)</mo>
-              <mo>&#x22C5;<!--MULTIPLICATION SIGN--></mo>
-              <mo>(</mo>
-              <msub>
-                <mi>k</mi>
-                <mn>1</mn>
-              </msub>
-              <mo>+</mo>
-              <mn>1</mn>
-              <mo>)</mo>
-            </mrow>
-            <mrow>
-              <mi>f</mi>
-              <mo stretchy="false">(</mo>
-              <msub>
-                <mi>q</mi>
-                <mi>i</mi>
-              </msub>
-              <mo>,</mo>
-              <mi>D</mi>
-              <mo stretchy="false">)</mo>
-              <mo>+</mo>
-              <msub>
-                <mi>k</mi>
-                <mn>1</mn>
-              </msub>
-              <mo>&#x22C5;<!--MULTIPLICATION SIGN--></mo>
-              <mo>(</mo>
-              <mn>1</mn>
-              <mo>-</mo>
-              <mi>b</mi>
-              <mo>+</mo>
-              <mi>b</mi>
-              <mo>&#x22C5;<!--MULTIPLICATION SIGN--></mo>
-              <mfrac>
-                <mrow>
-                  <mi>f</mi>
-                  <mi>i</mi>
-                  <mi>e</mi>
-                  <mi>l</mi>
-                  <mi>d</mi>
-                  <mi>_</mi>
-                  <mi>l</mi>
-                  <mi>e</mi>
-                  <mi>n</mi>
-                </mrow>
-                <mrow>
-                  <mi>a</mi>
-                  <mi>v</mi>
-                  <mi>g</mi>
-                  <mi>_</mi>
-                  <mi>f</mi>
-                  <mi>i</mi>
-                  <mi>e</mi>
-                  <mi>l</mi>
-                  <mi>d</mi>
-                  <mi>_</mi>
-                  <mi>l</mi>
-                  <mi>e</mi>
-                  <mi>n</mi>
-                </mrow>
-              </mfrac>
-              <mo>)</mo>
-            </mrow>
-          </mfrac>
-        </mrow>
-      </semantics>
-    </math>
-  </div>
-</figure>
-
-<p>The IDF of query term i in field t is currently calculated <strong>per field per content node:</strong></p>
-<figure style="font-size: 1.2rem">
-  <div>
-    <math xmlns="http://www.w3.org/1998/Math/MathML" display="block">
-      <semantics>
-        <mrow>
-          <mi>l</mi>
-          <mi>o</mi>
-          <mi>g</mi>
-          <mo>(</mo>
-          <mn>1</mn>
-          <mo>+</mo>
-          <mfrac>
-            <mrow>
-              <mi>N</mi>
-              <mo>-</mo>
-              <mi>n</mi>
-              <mo>(</mo>
-              <msub>
-                <mi>q</mi>
-                <mi>i</mi>
-              </msub>
-              <mo>)</mo>
-              <mo>+</mo>
-              <mn>0.5</mn>
-            </mrow>
-            <mrow>
-              <mi>n</mi>
-              <mo>(</mo>
-              <msub>
-                <mi>q</mi>
-                <mi>i</mi>
-              </msub>
-              <mo>)</mo>
-              <mo>+</mo>
-              <mn>0.5</mn>
-            </mrow>
-          </mfrac>
-          <mo>)</mo>
-        </mrow>
-      </semantics>
-    </math>
-  </div>
-</figure>
-
-<p>
-  <strong>N</strong> is the total number of documents on the content node, and <strong>n(q<sub>i</sub>)</strong> is the number of documents containing the query term <strong>q<sub>i</sub></strong> for field <strong>t</strong>.
-</p>
-
-<h3>Short commings:</h3>
-<ul>
-  <li>The IDF values will typically be different across content nodes, as they contain a different subset of the document corpus. This might lead to inconsistent ranking order between content nodes.</li>
-  <li>When using bm25 in <a href="../streaming-search.html" >streaming search</a> no IDF values are available at all, as we don’t build an inverted index where the IDF values can be extracted from</li>
-</ul>
-
-<p>With the user or Vepsa defined significance models, the IDF calculation can be overridden</p>
-
-<h2 id="example">Example</h2>
-<p>
-In the following example, we show how to reference a significance model in the <code>service.xml</code>. 
-Note that the field must be enabled for usage with the bm25 feature
-by setting the <em>use-model</em> flag in the
-<a href="schema-reference.html#index">significance</a> rank-profile 
-section of the field definition.
-</p>
-
-<p>
-<pre data-test="file" data-path="my-app/src/main/application/services.xml">
-&lt;container version="1.0"&gt;
-    &lt;search&gt;
-        &lt;significance&gt;  	
-            &lt;model model-id="wikimedia"/&gt;
-            &lt;model url="https://some/uri/bibel-multilingual.json" /&gt;
-            &lt;model path="models/reddit-norge.no.json.zst" /&gt;
-        &lt;/significance&gt;
-    &lt;/search&gt;
-&lt;/container&gt;      
-</pre>
-</p>
-
-Note that it is possible to specify multiple significance models in the <code>service.xml</code> file.
-
-<p>
-<pre>
-schema example {
-  document example {
-    field content type string {
-      indexing: index | summary
-      index: enable-bm25
-    }
-  }
-  rank-profile default {
-    significance {
-      use-model: true
-    }
-  }
-}
-</pre>
-</p>
-
diff --git a/en/significance.html b/en/significance.html
new file mode 100644
index 0000000000..c224879c38
--- /dev/null
+++ b/en/significance.html
@@ -0,0 +1,103 @@
+---
+# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+title: "Using Significance Model"
+redirect_from:
+- /documentation/reference/significance.html
+---
+<p>
+The
+<a href="reference/rank-features.html#significance">significance model feature</a>
+implements the <a href="https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency">inverse document frequency</a> query term for tokens based on an existing or user-defined significance model. A significance model is a mapping from query terms to a floating point value. The significance model(s) are either provided by Vespa or can be generated using the Vespa-CLI command <a href="operations-selfhosted/vespa-cmdline-tools.html#vespa-significance"><em>vespa-significance</em></a>.
+
+ <h2>Background</h2>
+<p>The <a href="reference/bm25.html">bm25</a> and <a href="nativerank.html">nativeRank</a> rank features use the <a href="nativerank.html#weight-significance-and-connectedness">significance value</a> of each query term searching an index field when calculating the score of a document. These ranking features have shortcomings; for example, the bm25 rank feature suffers from the following limitations:
+</p>
+
+<ul>
+  <li>The significance values will typically be different across content nodes, as they contain a different subset of the document corpus. This might lead to inconsistent ranking order between content nodes.</li>
+  <li>When using bm25 in <a href="streaming-search.html">streaming search</a>, no significance values are available at all, as we don’t build an inverted index from which the IDF values can be extracted.</li>
+</ul>
+
+<p>By explicitly using a Vespa-provided or user-defined significance model, these rank features' calculations can be overridden.</p>
+
+<h2 id="example">Example</h2>
+<p>
+In the following example, we show how to reference a significance model in <code>services.xml</code>.
+Note that the field must be enabled for usage with the bm25 feature
+by setting the <em>use-model</em> flag in the
+<a href="reference/schema-reference.html#significance">significance</a> rank-profile
+section of the field definition.
+</p>
+<p>
+  A significance component is comprised of one or multiple significance models, for one or multiple languages. It uses these models' document frequencies to calculate the inverse document frequency (IDF) of terms in a query.
+</p>
+
+<p>
+<pre>{% highlight xml %}
+<container version="1.0">
+    <search>
+        <significance>  	
+            <model model-id="wikimedia"/>
+            <model url="https://some/uri/bibel-multilingual.json" />
+            <model path="models/reddit-norge.no.json.zst" />
+        </significance>
+    </search>
+</container>
+{% endhighlight %}</pre>  
+</p>
+
+Note that it is possible to specify multiple significance models in the <code>services.xml</code> file.
+
+<p>
+<pre>
+schema example {
+  document example {
+    field content type string {
+      indexing: index | summary
+      index: enable-bm25
+    }
+  }
+  rank-profile default {
+    significance {
+      use-model: true
+    }
+  }
+}
+</pre>
+</p>
+<h2 id="significance-file-format">Significance Model File format</h2>
+
+<p>
+  The significance model file is a JSON file with the following format:
+<pre>{% highlight json %}
+{
+  "version": 1,
+  "id": "wikipedia",
+  "description": "Some optional description",
+  "languages": {
+    "en": {
+      "description": "Some optional description for English model", 
+      "document-count": 1000,
+      "document-frequencies": {
+        "and": 500,
+        "car": 100,
+        ...
+      }
+    },
+    "no": {
+      "description": "Some optional description for Norwegian model", 
+      "document-count": 800,
+      "document-frequencies": {
+        "bil": 80,
+        "og": 400,
+        ...
+      }
+    }
+  }
+}{% endhighlight %}</pre>
+</p>
+<p>
+Each file contains a map of languages and their document frequencies. The document frequencies are the number of documents in the corpus that contain the term. The document count is the total number of documents in the corpus.
+
+</p>
+