diff --git a/colbert/.gitignore b/colbert/.gitignore new file mode 100644 index 000000000..1fab5d88e --- /dev/null +++ b/colbert/.gitignore @@ -0,0 +1,3 @@ +application.zip +src/main/application/security/ +.idea/ diff --git a/colbert/README.md b/colbert/README.md new file mode 100644 index 000000000..29de77b19 --- /dev/null +++ b/colbert/README.md @@ -0,0 +1,63 @@ + + + + + + + #Vespa + + +# Vespa sample applications - Simple hybrid search with ColBERT + +This semantic search application uses a single vector embedding model for retrieval and ColBERT (multi-token vector representation) +for re-ranking. It also features reciprocal rank fusion to fuse different rankings. + +- Query and document text is converted to embeddings by the application using Vespa's [embedder](https://docs.vespa.ai/en/embedding.html) functionality. +- Search by embedding or text match, and fuse the rankings each produces using [reciprocal rank fusion](https://docs.vespa.ai/en/phased-ranking.html#cross-hit-normalization-including-reciprocal-rank-fusion). + +

+Requires at least Vespa 8.283.46.

+ +## To try this application + +Follow the [vespa quick start guide](https://docs.vespa.ai/en/vespa-quick-start.html) +through the vespa deploy step, cloning `colbert` instead of `album-recommendation`. + +Feed documents (this includes embed inference in Vespa): +
+vespa document ext/1.json
+vespa document ext/2.json
+vespa document ext/3.json
+
+ +Example queries: +
+vespa query 'yql=select * from doc where userQuery() or ({targetHits: 100}nearestNeighbor(embedding, q))' \
+ 'input.query(q)=embed(e5, "query: space contains many suns")' \
+ 'input.query(qt)=embed(colbert, "space contains many suns")' \
+ 'query=space contains many suns'
+
+ +
+vespa query 'yql=select * from doc where userQuery() or ({targetHits: 100}nearestNeighbor(embedding, q))' \
+ 'input.query(q)=embed(e5, "query: shipping stuff over the sea")' \
+ 'input.query(qt)=embed(colbert, "shipping stuff over the sea")' \
+ 'query=shipping stuff over the sea'
+ 
+ +
+vespa query 'yql=select * from doc where userQuery() or ({targetHits: 100}nearestNeighbor(embedding, q))' \
+ 'input.query(q)=embed(e5, "query: exchanging information by sound")' \
+ 'input.query(qt)=embed(colbert, "exchanging information by sound")' \
+ 'query=exchanging information by sound'
+ 
+ + +### Terminate container + +Remove the container after use: +
+$ docker rm -f vespa
+
+ diff --git a/colbert/ext/1.json b/colbert/ext/1.json new file mode 100644 index 000000000..7f43ec61a --- /dev/null +++ b/colbert/ext/1.json @@ -0,0 +1 @@ +{ "put": "id:doc:doc::1", "fields": { "chunk": "To transport goods on water, use a boat" } } diff --git a/colbert/ext/2.json b/colbert/ext/2.json new file mode 100644 index 000000000..cea8ec0bd --- /dev/null +++ b/colbert/ext/2.json @@ -0,0 +1 @@ +{ "put": "id:doc:doc::2", "fields": { "chunk": "Human interaction is often done by talking" } } diff --git a/colbert/ext/3.json b/colbert/ext/3.json new file mode 100644 index 000000000..15808b155 --- /dev/null +++ b/colbert/ext/3.json @@ -0,0 +1 @@ +{ "put": "id:doc:doc::3", "fields": { "chunk": "The galaxy is filled with stars" } } diff --git a/colbert/schemas/doc.sd b/colbert/schemas/doc.sd new file mode 100644 index 000000000..bf00f1697 --- /dev/null +++ b/colbert/schemas/doc.sd @@ -0,0 +1,98 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +# See https://docs.vespa.ai/en/schemas.html +schema doc { + + document doc { + + field id type string { + indexing: summary + } + field title type string { + indexing: index | summary + index: enable-bm25 + } + field chunk type string { + indexing: index | summary + index: enable-bm25 + } + } + fieldset default { + fields: title, chunk + } + + field embedding type tensor(x[384]) { + # e5 prefix instructions + indexing: "passage: " . (input title || "") . " " . (input chunk || "") | embed e5 | attribute | index + attribute { + distance-metric: angular + } + } + + field colbert type tensor(dt{}, x[16]) { + indexing: (input title || "") . " " . 
(input chunk || "") | embed colbert | attribute + } + + # See https://docs.vespa.ai/en/ranking.html + rank-profile default inherits default { + inputs { + query(qt) tensor(qt{}, x[128]) + query(q) tensor(x[384]) + } + function unpack() { + expression: unpack_bits(attribute(colbert)) + } + function cos_sim() { + expression: cos(distance(field, embedding)) + } + function max_sim() { + expression { + sum( + reduce( + sum( + query(qt) * unpack() , x + ), + max, dt + ), + qt + ) + } + } + first-phase { + expression: cos_sim + } + second-phase { + expression: max_sim + } + match-features: max_sim cos_sim bm25(title) bm25(chunk) + } + + rank-profile bm25 inherits default { + first-phase { + expression: bm25(title) + bm25(chunk) + } + second-phase { + expression: firstPhase + } + } + + rank-profile e5 inherits default { + first-phase { + expression: cos_sim + } + second-phase { + expression: firstPhase + } + } + + rank-profile hybrid inherits default { + global-phase { + expression { + reciprocal_rank(max_sim) + + reciprocal_rank(cos_sim) + + reciprocal_rank(bm25(chunk)) + + reciprocal_rank(bm25(title)) + } + } + } +} diff --git a/colbert/services.xml b/colbert/services.xml new file mode 100644 index 000000000..95c81e286 --- /dev/null +++ b/colbert/services.xml @@ -0,0 +1,39 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2 + + + + + + + diff --git a/simple-semantic-search/README.md b/simple-semantic-search/README.md index 39938dd07..f95e89470 100644 --- a/simple-semantic-search/README.md +++ b/simple-semantic-search/README.md @@ -10,34 +10,44 @@ # Vespa sample applications - Simple semantic search A minimal semantic search application: -- Query and document text is converted to embeddings by the application. -- Search by embedding and/or text match. +- Query and document text is converted to embeddings by the application using Vespa's [embedder functionality](https://docs.vespa.ai/en/embedding.html#huggingface-embedder). 
+- Search by embedding or text match and use [reciprocal rank fusion](https://docs.vespa.ai/en/phased-ranking.html#cross-hit-normalization-including-reciprocal-rank-fusion) to fuse +different rankings.

-Requires at least Vespa 8.54.61. -

+## To try this application -## To try this +Follow the [vespa quick start guide](https://docs.vespa.ai/en/vespa-quick-start.html) +through the vespa deploy step, cloning `simple-semantic-search` instead of `album-recommendation`. -Follow -[vespa quick start guide](https://docs.vespa.ai/en/vespa-quick-start.html) -through the vespa deploy step, cloning simple-semantic-search instead of album-recommendation. +Feed documents (this includes embed inference in Vespa): -Feed documents:
 vespa document ext/1.json
 vespa document ext/2.json
 vespa document ext/3.json
 
-Example queries: +Example queries using [E5-Small-V2](https://huggingface.co/intfloat/e5-small-v2) +embedding model that maps text to a 384-dimensional vector (the query prefix in the embed argument is an instruction to the embedding model). + +
+vespa query 'yql=select * from doc where userQuery() or ({targetHits: 100}nearestNeighbor(embedding, e))' \
+ 'input.query(e)=embed(e5, "query: space contains many suns")' \
+ 'query=space contains many suns'
+
+ +
+vespa query 'yql=select * from doc where userQuery() or ({targetHits: 100}nearestNeighbor(embedding, e))' \
+ 'input.query(e)=embed(e5, "query: shipping stuff over the sea")' \
+ 'query=shipping stuff over the sea'
+
+
-vespa query "yql=select * from doc where {targetHits: 100}nearestNeighbor(embedding, e)" "input.query(e)=embed(space contains many suns)"
-vespa query "yql=select * from doc where {targetHits: 100}nearestNeighbor(embedding, e)" "input.query(e)=embed(shipping stuff over the sea)"
-vespa query "yql=select * from doc where {targetHits: 100}nearestNeighbor(embedding, e)" "input.query(e)=embed(exchanging information by sound)"
-vespa query "yql=select * from doc where text contains 'boat'"
-vespa query "yql=select * from doc where {targetHits: 100}nearestNeighbor(embedding, e) AND text contains 'boat'" "input.query(e)=embed(exchanging information by sound)"
+vespa query 'yql=select * from doc where userQuery() or ({targetHits: 100}nearestNeighbor(embedding, e))' \
+ 'input.query(e)=embed(e5, "query: exchanging information by sound")' \
+ 'query=exchanging information by sound' 
 
Remove the container after use: @@ -47,28 +57,29 @@ $ docker rm -f vespa ## Ready for production -The [model included in this sample application](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) +The E5-small-v2 [embedding model](https://huggingface.co/intfloat/e5-small-v2) used in this sample application is suitable for production use and will produce good results in many domains without fine-tuning, -especially when combined with text match features such as bm25. +especially when combined with text match features. ## Model exporting -Transformer based embedding models have named inputs and outputs that needs to be compatible with the input and output names used by the Bert embedder or the Huggingface embedder. +Transformer-based embedding models have named inputs and outputs that must +be compatible with the input and output names used by the Vespa Bert embedder or the Huggingface embedder. ### Bert-embedder -See [export_model_from_hf.py](export_model_from_hf.py) for how to export a Huggingface sentence-transformer model to ONNX format compatible with default input and output names used by +See [export_model_from_hf.py](export_model_from_hf.py) for exporting a Huggingface sentence-transformer model to ONNX format compatible with default input and output names used by the [bert-embedder](https://docs.vespa.ai/en/embedding.html#bert-embedder). -The following exports [intfloat/e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) and saves the model parameters in a ONNX file and the `vocab.txt` file +The following exports [intfloat/e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) and saves the model parameters in an ONNX file and the `vocab.txt` file in the format expected by the Vespa bert-embedder.
 ./export_model_from_hf.py --hf_model intfloat/e5-small-v2 --output_dir model
 
### Huggingface-embedder -See [export_hf_model_from_hf.py](export_hf_model_from_hf.py) for how to export a Huggingface sentence-transformer model to ONNX format compatible with default input and output names used by +See [export_hf_model_from_hf.py](export_hf_model_from_hf.py) for exporting a Huggingface sentence-transformer model to ONNX format compatible with default input and output names used by the [huggingface-embedder](https://docs.vespa.ai/en/embedding.html#huggingface-embedder). -The following exports [intfloat/e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) and saves the model parameters in a ONNX file and the `tokenizer.json` file. +The following exports [intfloat/e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) and saves the model parameters in an ONNX and `tokenizer.json` files.
 ./export_hf_model_from_hf.py --hf_model intfloat/e5-small-v2 --output_dir model
 
\ No newline at end of file diff --git a/simple-semantic-search/schemas/doc.sd b/simple-semantic-search/schemas/doc.sd index fa884dca7..39216d3f9 100644 --- a/simple-semantic-search/schemas/doc.sd +++ b/simple-semantic-search/schemas/doc.sd @@ -9,13 +9,15 @@ schema doc { indexing: index | summary index: enable-bm25 } - + } + fieldset default { + fields: text } # See https://docs.vespa.ai/en/embedding.html#embedding-a-document-field # and https://docs.vespa.ai/en/approximate-nn-hnsw.html field embedding type tensor(x[384]) { - indexing: input text | embed | attribute | index + indexing: "passage: " . input text | embed e5 | attribute | index attribute { distance-metric: angular } @@ -27,14 +29,15 @@ schema doc { query(e) tensor(x[384]) } first-phase { - expression: bm25(text) + closeness(field, embedding) + expression: closeness(field, embedding) } - } - # Returns cosine similarity as score instead of the angle - rank-profile cosine inherits default { - first-phase { - expression: bm25(text) + cos(distance(field, embedding)) + match-features: bm25(text) closeness(field, embedding) + global-phase { + expression { + reciprocal_rank(closeness(field,embedding)) + + reciprocal_rank(bm25(text)) + } } } - + } diff --git a/simple-semantic-search/services.xml b/simple-semantic-search/services.xml index d99300be4..fc04a6934 100644 --- a/simple-semantic-search/services.xml +++ b/simple-semantic-search/services.xml @@ -6,7 +6,7 @@ - +