Add colbert sample app and re-factor simple-semantic-search (#1372)
* Add colbert sample app and re-factor simple-semantic-search

* messed up spaces and tabs

* rm comment

* space

* remove vespa version note
Jo Kristian Bergum authored Jan 11, 2024
1 parent 73fe23a commit e8f8db8
Showing 10 changed files with 252 additions and 32 deletions.
3 changes: 3 additions & 0 deletions colbert/.gitignore
@@ -0,0 +1,3 @@
application.zip
src/main/application/security/
.idea/
63 changes: 63 additions & 0 deletions colbert/README.md
@@ -0,0 +1,63 @@

<!-- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->

<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://vespa.ai/assets/vespa-ai-logo-heather.svg">
<source media="(prefers-color-scheme: light)" srcset="https://vespa.ai/assets/vespa-ai-logo-rock.svg">
<img alt="#Vespa" width="200" src="https://vespa.ai/assets/vespa-ai-logo-rock.svg" style="margin-bottom: 25px;">
</picture>

# Vespa sample applications - Simple hybrid search with ColBERT

This semantic search application uses a single-vector embedding model for retrieval and ColBERT (a multi-token vector representation)
for re-ranking. It also demonstrates reciprocal rank fusion for combining the different rankings.

- Query and document text is converted to embeddings by the application using Vespa's [embedder](https://docs.vespa.ai/en/embedding.html) functionality.
- Search by embedding or text match, and fuse the rankings each produces using [reciprocal rank fusion](https://docs.vespa.ai/en/phased-ranking.html#cross-hit-normalization-including-reciprocal-rank-fusion).

<p data-test="run-macro init-deploy colbert">
Requires at least Vespa 8.283.46
</p>
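
As a rough illustration of the fusion step (not part of the application itself), here is a minimal Python sketch of reciprocal rank fusion over two ranked result lists; the `k = 60` constant and the toy document ids are assumptions for the example:

```python
# Minimal sketch of reciprocal rank fusion (RRF): each ranking contributes
# 1 / (k + rank) per document, and the fused score is the sum over rankings.
def reciprocal_rank_fusion(rankings, k=60):
    scores = {}
    for ranking in rankings:
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

# Toy example: one ranking from the dense retriever, one from text match.
dense = ["doc::1", "doc::3", "doc::2"]
text  = ["doc::3", "doc::1", "doc::2"]
print(reciprocal_rank_fusion([dense, text]))
```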

## To try this application

Follow the [vespa quick start guide](https://docs.vespa.ai/en/vespa-quick-start.html)
through the <code>vespa deploy</code> step, cloning `colbert` instead of `album-recommendation`.

Feed documents (this includes embed inference in Vespa):
<pre data-test="exec">
vespa document ext/1.json
vespa document ext/2.json
vespa document ext/3.json
</pre>
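
If you prefer feeding from Python instead of the Vespa CLI, something along these lines should work with the pyvespa client; the client, endpoint and port are assumptions for a local quick-start deployment and are not used by this sample application:

```python
# Sketch: feed the same three documents with pyvespa instead of the Vespa CLI.
from vespa.application import Vespa

app = Vespa(url="http://localhost", port=8080)  # assumes the local quick-start container
docs = {
    "1": "To transport goods on water, use a boat",
    "2": "Human interaction is often done by talking",
    "3": "The galaxy is filled with stars",
}
for doc_id, chunk in docs.items():
    response = app.feed_data_point(schema="doc", data_id=doc_id, fields={"chunk": chunk})
    print(doc_id, response.status_code)
```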

Example queries:
<pre data-test="exec" data-test-assert-contains="id:doc:doc::1">
vespa query 'yql=select * from doc where userQuery() or ({targetHits: 100}nearestNeighbor(embedding, q))' \
'input.query(q)=embed(e5, "query: space contains many suns")' \
'input.query(qt)=embed(colbert, "space contains many suns")' \
'query=space contains many suns'
</pre>

<pre data-test="exec" data-test-assert-contains="id:doc:doc::1">
vespa query 'yql=select * from doc where userQuery() or ({targetHits: 100}nearestNeighbor(embedding, q))' \
'input.query(q)=embed(e5, "query: shipping stuff over the sea")' \
'input.query(qt)=embed(colbert, "shipping stuff over the sea")' \
'query=shipping stuff over the sea'
</pre>

<pre data-test="exec" data-test-assert-contains="id:doc:doc::1">
vespa query 'yql=select * from doc where userQuery() or ({targetHits: 100}nearestNeighbor(embedding, q))' \
'input.query(q)=embed(e5, "query: exchanging information by sound")' \
'input.query(qt)=embed(colbert, "exchanging information by sound")' \
'query=exchanging information by sound'
</pre>
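
The CLI calls above can also be expressed as plain HTTP requests against the [query API](https://docs.vespa.ai/en/query-api.html). A rough sketch, assuming the local quick-start container on port 8080:

```python
# Sketch: the same hybrid query sent as a JSON POST to the Vespa query API.
import json
import urllib.request

body = {
    "yql": "select * from doc where userQuery() or ({targetHits: 100}nearestNeighbor(embedding, q))",
    "input.query(q)": 'embed(e5, "query: space contains many suns")',
    "input.query(qt)": 'embed(colbert, "space contains many suns")',
    "query": "space contains many suns",
}
request = urllib.request.Request(
    "http://localhost:8080/search/",  # assumed local endpoint
    data=json.dumps(body).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(request) as response:
    print(json.load(response)["root"]["fields"]["totalCount"])
```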


### Terminate container

Remove the container after use:
<pre data-test="exec">
$ docker rm -f vespa
</pre>

1 change: 1 addition & 0 deletions colbert/ext/1.json
@@ -0,0 +1 @@
{ "put": "id:doc:doc::1", "fields": { "chunk": "To transport goods on water, use a boat" } }
1 change: 1 addition & 0 deletions colbert/ext/2.json
@@ -0,0 +1 @@
{ "put": "id:doc:doc::2", "fields": { "chunk": "Human interaction is often done by talking" } }
1 change: 1 addition & 0 deletions colbert/ext/3.json
@@ -0,0 +1 @@
{ "put": "id:doc:doc::3", "fields": { "chunk": "The galaxy is filled with stars" } }
98 changes: 98 additions & 0 deletions colbert/schemas/doc.sd
@@ -0,0 +1,98 @@
# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

# See https://docs.vespa.ai/en/schemas.html
schema doc {

    document doc {

        field id type string {
            indexing: summary
        }
        field title type string {
            indexing: index | summary
            index: enable-bm25
        }
        field chunk type string {
            indexing: index | summary
            index: enable-bm25
        }
    }
    fieldset default {
        fields: title, chunk
    }

    field embedding type tensor<bfloat16>(x[384]) {
        # e5 prefix instructions
        indexing: "passage: " . (input title || "") . " " . (input chunk || "") | embed e5 | attribute | index
        attribute {
            distance-metric: angular
        }
    }

    field colbert type tensor<int8>(dt{}, x[16]) {
        indexing: (input title || "") . " " . (input chunk || "") | embed colbert | attribute
    }

    # See https://docs.vespa.ai/en/ranking.html
    rank-profile default inherits default {
        inputs {
            query(qt) tensor<float>(qt{}, x[128])
            query(q) tensor<bfloat16>(x[384])
        }
        function unpack() {
            expression: unpack_bits(attribute(colbert))
        }
        function cos_sim() {
            expression: cos(distance(field, embedding))
        }
        function max_sim() {
            expression {
                sum(
                    reduce(
                        sum(
                            query(qt) * unpack(), x
                        ),
                        max, dt
                    ),
                    qt
                )
            }
        }
        first-phase {
            expression: cos_sim
        }
        second-phase {
            expression: max_sim
        }
        match-features: max_sim cos_sim bm25(title) bm25(chunk)
    }

    rank-profile bm25 inherits default {
        first-phase {
            expression: bm25(title) + bm25(chunk)
        }
        second-phase {
            expression: firstPhase
        }
    }

    rank-profile e5 inherits default {
        first-phase {
            expression: cos_sim
        }
        second-phase {
            expression: firstPhase
        }
    }

    rank-profile hybrid inherits default {
        global-phase {
            expression {
                reciprocal_rank(max_sim) +
                reciprocal_rank(cos_sim) +
                reciprocal_rank(bm25(chunk)) +
                reciprocal_rank(bm25(title))
            }
        }
    }
}
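
To make the `max_sim` expression above more concrete, here is a small numpy sketch of the same late-interaction (MaxSim) computation over float matrices. The binarized storage and `unpack_bits` step of the real schema are left out, and the matrix shapes are illustrative only:

```python
# Sketch of ColBERT MaxSim: for every query token vector, take the maximum
# dot product over all document token vectors, then sum over query tokens.
# This mirrors sum(reduce(sum(query(qt) * unpack(), x), max, dt), qt).
import numpy as np

rng = np.random.default_rng(0)
Q = rng.random((5, 128))    # query token vectors, shape (qt, x)
D = rng.random((40, 128))   # document token vectors, shape (dt, x)

similarities = Q @ D.T                     # (qt, dt) dot products over x
max_sim = similarities.max(axis=1).sum()   # max over dt, then sum over qt
print(max_sim)
```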
39 changes: 39 additions & 0 deletions colbert/services.xml
@@ -0,0 +1,39 @@
<?xml version="1.0" encoding="utf-8" ?>

<services version="1.0" xmlns:deploy="vespa" xmlns:preprocess="properties">

    <!-- See https://docs.vespa.ai/en/reference/services-container.html -->
    <container id="default" version="1.0">

        <!-- See https://docs.vespa.ai/en/embedding.html#huggingface-embedder -->
        <component id="e5" type="hugging-face-embedder">
            <transformer-model url="https://huggingface.co/intfloat/e5-small-v2/resolve/main/model.onnx"/>
            <tokenizer-model url="https://huggingface.co/intfloat/e5-small-v2/raw/main/tokenizer.json"/>
        </component>

        <!-- See https://docs.vespa.ai/en/embedding.html#colbert-embedder -->
        <component id="colbert" type="colbert-embedder">
            <transformer-model url="https://huggingface.co/colbert-ir/colbertv2.0/resolve/main/model.onnx"/>
            <tokenizer-model url="https://huggingface.co/colbert-ir/colbertv2.0/raw/main/tokenizer.json"/>
        </component>

        <document-api/>
        <search/>
        <nodes count="1">
            <resources vcpu="4" memory="16Gb" disk="125Gb">
                <gpu count="1" memory="16Gb"/>
            </resources>
        </nodes>

    </container>

    <!-- See https://docs.vespa.ai/en/reference/services-content.html -->
    <content id="text" version="1.0">
        <redundancy>2</redundancy>
        <documents>
            <document type="doc" mode="index" />
        </documents>
        <nodes count="2"/>
    </content>

</services>
55 changes: 33 additions & 22 deletions simple-semantic-search/README.md
@@ -10,34 +10,44 @@
# Vespa sample applications - Simple semantic search

A minimal semantic search application:
- Query and document text is converted to embeddings by the application.
- Search by embedding and/or text match.
- Query and document text is converted to embeddings by the application using Vespa's [embedder functionality](https://docs.vespa.ai/en/embedding.html#huggingface-embedder).
- Search by embedding or text match and use [reciprocal rank fusion](https://docs.vespa.ai/en/phased-ranking.html#cross-hit-normalization-including-reciprocal-rank-fusion) to fuse
different rankings.

<p data-test="run-macro init-deploy simple-semantic-search">
Requires at least Vespa 8.54.61.
</p>

## To try this application

## To try this
Follow the [vespa quick start guide](https://docs.vespa.ai/en/vespa-quick-start.html)
through the <code>vespa deploy</code> step, cloning `simple-semantic-search` instead of `album-recommendation`.

Follow
[vespa quick start guide](https://docs.vespa.ai/en/vespa-quick-start.html)
through the <code>vespa deploy</code> step, cloning simple-semantic-search instead of album-recommendation.
Feed documents (this includes embed inference in Vespa):

Feed documents:
<pre data-test="exec">
vespa document ext/1.json
vespa document ext/2.json
vespa document ext/3.json
</pre>

Example queries:
Example queries using the [E5-Small-V2](https://huggingface.co/intfloat/e5-small-v2)
embedding model, which maps text to a 384-dimensional vector (the `query:` prefix in the embed argument is an instruction to the embedding model).

<pre data-test="exec" data-test-assert-contains="id:doc:doc::1">
vespa query 'yql=select * from doc where userQuery() or ({targetHits: 100}nearestNeighbor(embedding, e))' \
'input.query(e)=embed(e5, "query: space contains many suns")' \
'query=space contains many suns'
</pre>

<pre data-test="exec" data-test-assert-contains="id:doc:doc::1">
vespa query 'yql=select * from doc where userQuery() or ({targetHits: 100}nearestNeighbor(embedding, e))' \
'input.query(e)=embed(e5, "query: shipping stuff over the sea")' \
'query=shipping stuff over the sea'
</pre>

<pre data-test="exec" data-test-assert-contains="id:doc:doc::1">
vespa query "yql=select * from doc where {targetHits: 100}nearestNeighbor(embedding, e)" "input.query(e)=embed(space contains many suns)"
vespa query "yql=select * from doc where {targetHits: 100}nearestNeighbor(embedding, e)" "input.query(e)=embed(shipping stuff over the sea)"
vespa query "yql=select * from doc where {targetHits: 100}nearestNeighbor(embedding, e)" "input.query(e)=embed(exchanging information by sound)"
vespa query "yql=select * from doc where text contains 'boat'"
vespa query "yql=select * from doc where {targetHits: 100}nearestNeighbor(embedding, e) AND text contains 'boat'" "input.query(e)=embed(exchanging information by sound)"
vespa query 'yql=select * from doc where userQuery() or ({targetHits: 100}nearestNeighbor(embedding, e))' \
'input.query(e)=embed(e5, "query: exchanging information by sound")' \
'query=exchanging information by sound'
</pre>
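
As an aside, the `embed(e5, "query: ...")` steps above can be approximated outside Vespa with the sentence-transformers package; this is only a sketch of what the model does, assuming the package is installed and its default mean pooling matches the model's intended usage:

```python
# Sketch: the E5 model maps "query: ..." / "passage: ..." prefixed text to
# 384-dimensional vectors; the prefix tells the model which role the text plays.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/e5-small-v2")
query_vec = model.encode("query: space contains many suns", normalize_embeddings=True)
passage_vec = model.encode("passage: The galaxy is filled with stars", normalize_embeddings=True)
print(query_vec.shape)                 # (384,)
print(float(query_vec @ passage_vec))  # cosine similarity, since both vectors are normalized
```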

Remove the container after use:
@@ -47,28 +57,29 @@ $ docker rm -f vespa

## Ready for production

The [model included in this sample application](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
The E5-small-v2 [embedding model](https://huggingface.co/intfloat/e5-small-v2) used in this sample application
is suitable for production use and will produce good results in many domains without fine-tuning,
especially when combined with text match features such as bm25.
especially when combined with text match features.

## Model exporting
Transformer based embedding models have named inputs and outputs that needs to be compatible with the input and output names used by the Bert embedder or the Huggingface embedder.
Transformer-based embedding models have named inputs and outputs that must
be compatible with the input and output names used by the Vespa Bert embedder or the Huggingface embedder.
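
One way to check that an exported model uses compatible names is to inspect the ONNX graph directly; a small sketch using the `onnx` package, where the model path is an assumption:

```python
# Sketch: list the input and output names of an exported ONNX model so they
# can be matched against the names the Vespa embedder expects.
import onnx

model = onnx.load("model/model.onnx")  # assumed path of the exported model
print("inputs: ", [tensor.name for tensor in model.graph.input])
print("outputs:", [tensor.name for tensor in model.graph.output])
```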

### Bert-embedder
See [export_model_from_hf.py](export_model_from_hf.py) for how to export a Huggingface sentence-transformer model to ONNX format compatible with default input and output names used by
See [export_model_from_hf.py](export_model_from_hf.py) for exporting a Huggingface sentence-transformer model to ONNX format compatible with default input and output names used by
the [bert-embedder](https://docs.vespa.ai/en/embedding.html#bert-embedder).

The following exports [intfloat/e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) and saves the model parameters in a ONNX file and the `vocab.txt` file
The following exports [intfloat/e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) and saves the model parameters in an ONNX file and the `vocab.txt` file
in the format expected by the Vespa bert-embedder.
<pre>
./export_model_from_hf.py --hf_model intfloat/e5-small-v2 --output_dir model
</pre>

### Huggingface-embedder
See [export_hf_model_from_hf.py](export_hf_model_from_hf.py) for how to export a Huggingface sentence-transformer model to ONNX format compatible with default input and output names used by
See [export_hf_model_from_hf.py](export_hf_model_from_hf.py) for exporting a Huggingface sentence-transformer model to ONNX format compatible with default input and output names used by
the [huggingface-embedder](https://docs.vespa.ai/en/embedding.html#huggingface-embedder).

The following exports [intfloat/e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) and saves the model parameters in a ONNX file and the `tokenizer.json` file.
The following exports [intfloat/e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) and saves the model parameters in an ONNX file together with the `tokenizer.json` file.
<pre>
./export_hf_model_from_hf.py --hf_model intfloat/e5-small-v2 --output_dir model
</pre>
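
For reference, a hedged sketch of roughly what such an export does under the hood, using Hugging Face Optimum; the repository's own export scripts may use different flags and output names:

```python
# Sketch: export a sentence-transformer model to ONNX with Hugging Face Optimum,
# producing a model.onnx plus tokenizer.json usable by the huggingface-embedder.
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer

model_id = "intfloat/e5-small-v2"
model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

model.save_pretrained("model")      # writes model.onnx (and config) to ./model
tokenizer.save_pretrained("model")  # writes tokenizer.json alongside it
```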
21 changes: 12 additions & 9 deletions simple-semantic-search/schemas/doc.sd
@@ -9,13 +9,15 @@ schema doc {
indexing: index | summary
index: enable-bm25
}

}
fieldset default {
fields: text
}

# See https://docs.vespa.ai/en/embedding.html#embedding-a-document-field
# and https://docs.vespa.ai/en/approximate-nn-hnsw.html
field embedding type tensor<float>(x[384]) {
indexing: input text | embed | attribute | index
indexing: "passage: " . input text | embed e5 | attribute | index
attribute {
distance-metric: angular
}
@@ -27,14 +29,15 @@ schema doc {
query(e) tensor<float>(x[384])
}
first-phase {
expression: bm25(text) + closeness(field, embedding)
expression: closeness(field, embedding)
}
}
# Returns cosine similarity as score instead of the angle
rank-profile cosine inherits default {
first-phase {
expression: bm25(text) + cos(distance(field, embedding))
match-features: bm25(text) closeness(field, embedding)
global-phase {
expression {
reciprocal_rank(closeness(field,embedding)) +
reciprocal_rank(bm25(text))
}
}
}

}
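
A small aside on the rank profile above: `closeness(field, embedding)` is a monotonically decreasing transform of the `angular` distance between the query and document vectors. A minimal sketch, assuming the common `closeness = 1 / (1 + distance)` convention described in the Vespa ranking documentation:

```python
# Sketch: angular distance is the angle between two vectors, and closeness
# maps that distance into (0, 1], with identical directions scoring 1.0.
import math

def angular_distance(a, b):
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return math.acos(max(-1.0, min(1.0, dot / (norm_a * norm_b))))

def closeness(a, b):
    # assumed closeness = 1 / (1 + distance) convention
    return 1.0 / (1.0 + angular_distance(a, b))

print(closeness([1.0, 0.0], [1.0, 0.0]))  # identical direction -> 1.0
print(closeness([1.0, 0.0], [0.0, 1.0]))  # orthogonal -> about 0.389
```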
2 changes: 1 addition & 1 deletion simple-semantic-search/services.xml
@@ -6,7 +6,7 @@
<container id="default" version="1.0">

<!-- See https://docs.vespa.ai/en/embedding.html#huggingface-embedder -->
<component id="e5-small-q" type="hugging-face-embedder">
<component id="e5" type="hugging-face-embedder">
<transformer-model path="model/e5-small-v2-int8.onnx"/>
<tokenizer-model path="model/tokenizer.json"/>
</component>