From ca7ab285e46b74fd60b22a0af1985fe57c925d90 Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Mon, 18 Nov 2024 13:07:57 +0100 Subject: [PATCH 1/5] add application to git --- .../application/deployment.xml | 5 + .../application/schemas/pdf_page.sd | 410 ++++++++++++++++++ .../search/query-profiles/default.xml | 2 + .../search/query-profiles/types/root.xml | 2 + .../application/services.xml | 50 +++ 5 files changed, 469 insertions(+) create mode 100644 visual-retrieval-colpali/application/deployment.xml create mode 100644 visual-retrieval-colpali/application/schemas/pdf_page.sd create mode 100644 visual-retrieval-colpali/application/search/query-profiles/default.xml create mode 100644 visual-retrieval-colpali/application/search/query-profiles/types/root.xml create mode 100644 visual-retrieval-colpali/application/services.xml diff --git a/visual-retrieval-colpali/application/deployment.xml b/visual-retrieval-colpali/application/deployment.xml new file mode 100644 index 000000000..95e7176e6 --- /dev/null +++ b/visual-retrieval-colpali/application/deployment.xml @@ -0,0 +1,5 @@ + + + aws-us-east-1c + + \ No newline at end of file diff --git a/visual-retrieval-colpali/application/schemas/pdf_page.sd b/visual-retrieval-colpali/application/schemas/pdf_page.sd new file mode 100644 index 000000000..533bb1a0e --- /dev/null +++ b/visual-retrieval-colpali/application/schemas/pdf_page.sd @@ -0,0 +1,410 @@ +schema pdf_page { + document pdf_page { + field id type string { + indexing: summary | index + match { + word + } + } + field url type string { + indexing: summary | index + } + field year type int { + indexing: summary | attribute + } + field title type string { + indexing: summary | index + index: enable-bm25 + match { + text + } + } + field page_number type int { + indexing: summary | attribute + } + field blur_image type raw { + indexing: summary + } + field full_image type raw { + indexing: summary + } + field text type string { + indexing: summary | index + index: enable-bm25 + match { + text + } + } + field embedding type tensor(patch{}, v[16]) { + indexing: attribute | index + attribute { + distance-metric: hamming + } + index { + hnsw { + max-links-per-node: 32 + neighbors-to-explore-at-insert: 400 + } + } + } + field questions type array { + indexing: summary | attribute + summary: matched-elements-only + } + field queries type array { + indexing: summary | attribute + summary: matched-elements-only + } + } + fieldset default { + fields: title, text + } + rank-profile bm25 { + inputs { + query(qt) tensor(querytoken{}, v[128]) + + } + function similarities() { + expression { + + sum( + query(qt) * unpack_bits(attribute(embedding)), v + ) + + } + } + function normalized() { + expression { + + (similarities - reduce(similarities, min)) / (reduce((similarities - reduce(similarities, min)), max)) * 2 - 1 + + } + } + function quantized() { + expression { + + cell_cast(normalized * 127.999, int8) + + } + } + first-phase { + expression { + bm25(title) + bm25(text) + } + } + } + rank-profile bm25_sim inherits bm25 { + first-phase { + expression { + bm25(title) + bm25(text) + } + } + summary-features { + quantized + } + } + rank-profile colpali { + inputs { + query(rq0) tensor(v[16]) + query(rq1) tensor(v[16]) + query(rq2) tensor(v[16]) + query(rq3) tensor(v[16]) + query(rq4) tensor(v[16]) + query(rq5) tensor(v[16]) + query(rq6) tensor(v[16]) + query(rq7) tensor(v[16]) + query(rq8) tensor(v[16]) + query(rq9) tensor(v[16]) + query(rq10) tensor(v[16]) + query(rq11) tensor(v[16]) + query(rq12) tensor(v[16]) + query(rq13) tensor(v[16]) + query(rq14) tensor(v[16]) + query(rq15) tensor(v[16]) + query(rq16) tensor(v[16]) + query(rq17) tensor(v[16]) + query(rq18) tensor(v[16]) + query(rq19) tensor(v[16]) + query(rq20) tensor(v[16]) + query(rq21) tensor(v[16]) + query(rq22) tensor(v[16]) + query(rq23) tensor(v[16]) + query(rq24) tensor(v[16]) + query(rq25) tensor(v[16]) + query(rq26) tensor(v[16]) + query(rq27) tensor(v[16]) + query(rq28) tensor(v[16]) + query(rq29) tensor(v[16]) + query(rq30) tensor(v[16]) + query(rq31) tensor(v[16]) + query(rq32) tensor(v[16]) + query(rq33) tensor(v[16]) + query(rq34) tensor(v[16]) + query(rq35) tensor(v[16]) + query(rq36) tensor(v[16]) + query(rq37) tensor(v[16]) + query(rq38) tensor(v[16]) + query(rq39) tensor(v[16]) + query(rq40) tensor(v[16]) + query(rq41) tensor(v[16]) + query(rq42) tensor(v[16]) + query(rq43) tensor(v[16]) + query(rq44) tensor(v[16]) + query(rq45) tensor(v[16]) + query(rq46) tensor(v[16]) + query(rq47) tensor(v[16]) + query(rq48) tensor(v[16]) + query(rq49) tensor(v[16]) + query(rq50) tensor(v[16]) + query(rq51) tensor(v[16]) + query(rq52) tensor(v[16]) + query(rq53) tensor(v[16]) + query(rq54) tensor(v[16]) + query(rq55) tensor(v[16]) + query(rq56) tensor(v[16]) + query(rq57) tensor(v[16]) + query(rq58) tensor(v[16]) + query(rq59) tensor(v[16]) + query(rq60) tensor(v[16]) + query(rq61) tensor(v[16]) + query(rq62) tensor(v[16]) + query(rq63) tensor(v[16]) + query(qt) tensor(querytoken{}, v[128]) + query(qtb) tensor(querytoken{}, v[16]) + + } + function similarities() { + expression { + + sum( + query(qt) * unpack_bits(attribute(embedding)), v + ) + + } + } + function normalized() { + expression { + + (similarities - reduce(similarities, min)) / (reduce((similarities - reduce(similarities, min)), max)) * 2 - 1 + + } + } + function quantized() { + expression { + + cell_cast(normalized * 127.999, int8) + + } + } + function max_sim() { + expression { + + sum( + reduce( + sum( + query(qt) * unpack_bits(attribute(embedding)), v + ), + max, patch + ), + querytoken + ) + + } + } + function max_sim_binary() { + expression { + + sum( + reduce( + 1 / (1 + sum( + hamming(query(qtb), attribute(embedding)), v) + ), + max, patch + ), + querytoken + ) + + } + } + first-phase { + expression { + max_sim_binary + } + } + second-phase { + rerank-count: 10 + expression { + max_sim + } + } + } + rank-profile colpali_sim inherits colpali { + first-phase { + expression { + max_sim_binary + } + } + summary-features { + quantized + } + } + rank-profile hybrid { + inputs { + query(rq0) tensor(v[16]) + query(rq1) tensor(v[16]) + query(rq2) tensor(v[16]) + query(rq3) tensor(v[16]) + query(rq4) tensor(v[16]) + query(rq5) tensor(v[16]) + query(rq6) tensor(v[16]) + query(rq7) tensor(v[16]) + query(rq8) tensor(v[16]) + query(rq9) tensor(v[16]) + query(rq10) tensor(v[16]) + query(rq11) tensor(v[16]) + query(rq12) tensor(v[16]) + query(rq13) tensor(v[16]) + query(rq14) tensor(v[16]) + query(rq15) tensor(v[16]) + query(rq16) tensor(v[16]) + query(rq17) tensor(v[16]) + query(rq18) tensor(v[16]) + query(rq19) tensor(v[16]) + query(rq20) tensor(v[16]) + query(rq21) tensor(v[16]) + query(rq22) tensor(v[16]) + query(rq23) tensor(v[16]) + query(rq24) tensor(v[16]) + query(rq25) tensor(v[16]) + query(rq26) tensor(v[16]) + query(rq27) tensor(v[16]) + query(rq28) tensor(v[16]) + query(rq29) tensor(v[16]) + query(rq30) tensor(v[16]) + query(rq31) tensor(v[16]) + query(rq32) tensor(v[16]) + query(rq33) tensor(v[16]) + query(rq34) tensor(v[16]) + query(rq35) tensor(v[16]) + query(rq36) tensor(v[16]) + query(rq37) tensor(v[16]) + query(rq38) tensor(v[16]) + query(rq39) tensor(v[16]) + query(rq40) tensor(v[16]) + query(rq41) tensor(v[16]) + query(rq42) tensor(v[16]) + query(rq43) tensor(v[16]) + query(rq44) tensor(v[16]) + query(rq45) tensor(v[16]) + query(rq46) tensor(v[16]) + query(rq47) tensor(v[16]) + query(rq48) tensor(v[16]) + query(rq49) tensor(v[16]) + query(rq50) tensor(v[16]) + query(rq51) tensor(v[16]) + query(rq52) tensor(v[16]) + query(rq53) tensor(v[16]) + query(rq54) tensor(v[16]) + query(rq55) tensor(v[16]) + query(rq56) tensor(v[16]) + query(rq57) tensor(v[16]) + query(rq58) tensor(v[16]) + query(rq59) tensor(v[16]) + query(rq60) tensor(v[16]) + query(rq61) tensor(v[16]) + query(rq62) tensor(v[16]) + query(rq63) tensor(v[16]) + query(qt) tensor(querytoken{}, v[128]) + query(qtb) tensor(querytoken{}, v[16]) + + } + function similarities() { + expression { + + sum( + query(qt) * unpack_bits(attribute(embedding)), v + ) + + } + } + function normalized() { + expression { + + (similarities - reduce(similarities, min)) / (reduce((similarities - reduce(similarities, min)), max)) * 2 - 1 + + } + } + function quantized() { + expression { + + cell_cast(normalized * 127.999, int8) + + } + } + function max_sim() { + expression { + + sum( + reduce( + sum( + query(qt) * unpack_bits(attribute(embedding)), v + ), + max, patch + ), + querytoken + ) + + } + } + function max_sim_binary() { + expression { + + sum( + reduce( + 1 / (1 + sum( + hamming(query(qtb), attribute(embedding)), v) + ), + max, patch + ), + querytoken + ) + + } + } + first-phase { + expression { + max_sim_binary + } + } + second-phase { + rerank-count: 10 + expression { + max_sim + 2 * (bm25(text) + bm25(title)) + } + } + } + rank-profile hybrid_sim inherits hybrid { + first-phase { + expression { + max_sim_binary + } + } + summary-features { + quantized + } + } + document-summary default { + summary text { + bolding: on + } + summary snippet { + source: text + dynamic + } + from-disk + } + document-summary suggestions { + summary questions {} + from-disk + } +} \ No newline at end of file diff --git a/visual-retrieval-colpali/application/search/query-profiles/default.xml b/visual-retrieval-colpali/application/search/query-profiles/default.xml new file mode 100644 index 000000000..a37284661 --- /dev/null +++ b/visual-retrieval-colpali/application/search/query-profiles/default.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/visual-retrieval-colpali/application/search/query-profiles/types/root.xml b/visual-retrieval-colpali/application/search/query-profiles/types/root.xml new file mode 100644 index 000000000..282933f6d --- /dev/null +++ b/visual-retrieval-colpali/application/search/query-profiles/types/root.xml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/visual-retrieval-colpali/application/services.xml b/visual-retrieval-colpali/application/services.xml new file mode 100644 index 000000000..519eebda7 --- /dev/null +++ b/visual-retrieval-colpali/application/services.xml @@ -0,0 +1,50 @@ + + + + + + + + + + + + + + + + + + <strong> + </strong> + + ... + + + + + 1 + + + + + + + + + + + 4 + + + + + + + 2 + 1000 + 500 + 300 + + + \ No newline at end of file From ada47ec5ceb0cddee7a931e7a45b8722828856a2 Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Mon, 18 Nov 2024 13:11:05 +0100 Subject: [PATCH 2/5] 1 threadspersearch for suggestions and image --- visual-retrieval-colpali/src/backend/vespa_app.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/visual-retrieval-colpali/src/backend/vespa_app.py b/visual-retrieval-colpali/src/backend/vespa_app.py index 5b4509435..f4f7e55a4 100644 --- a/visual-retrieval-colpali/src/backend/vespa_app.py +++ b/visual-retrieval-colpali/src/backend/vespa_app.py @@ -302,6 +302,7 @@ async def get_full_image_from_vespa(self, doc_id: str) -> str: "yql": f'select full_image from {self.VESPA_SCHEMA_NAME} where id contains "{doc_id}"', "ranking": "unranked", "presentation.timing": True, + "ranking.matching.numThreadsPerSearch": 1, }, ) assert response.is_successful(), response.json @@ -340,6 +341,7 @@ async def get_suggestions(self, query: str) -> list: "ranking": "unranked", "presentation.timing": True, "presentation.summary": "suggestions", + "ranking.matching.numThreadsPerSearch": 1, }, ) assert response.is_successful(), response.json From 0204ddb35f52c826a57a15d5a980ef4a0f94a620 Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Mon, 18 Nov 2024 13:17:23 +0100 Subject: [PATCH 3/5] redundancy 2 --- visual-retrieval-colpali/application/services.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/visual-retrieval-colpali/application/services.xml b/visual-retrieval-colpali/application/services.xml index 519eebda7..6c8d60a85 100644 --- a/visual-retrieval-colpali/application/services.xml +++ b/visual-retrieval-colpali/application/services.xml @@ -23,7 +23,7 @@ - 1 + 2 From 5bea4e7bafc4706fd6ddb2e437f4479c0d431c5d Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Mon, 18 Nov 2024 13:20:27 +0100 Subject: [PATCH 4/5] change sample query --- visual-retrieval-colpali/src/frontend/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/visual-retrieval-colpali/src/frontend/app.py b/visual-retrieval-colpali/src/frontend/app.py index e6e98a208..202b69934 100644 --- a/visual-retrieval-colpali/src/frontend/app.py +++ b/visual-retrieval-colpali/src/frontend/app.py @@ -274,7 +274,7 @@ def SampleQueries(): sample_queries = [ "What percentage of the funds unlisted real estate investments were in Switzerland 2023?", "Gender balance at level 4 or above in NY office 2023?", - "Number of internship applications trend 2021-2023", + "Number of graduate applications trend 2021-2023", "Total amount of fixed salaries paid in 2023?", "Proportion of female new hires 2021-2023?", "child jumping over puddle", From 93d7cd7d198aa7fd978f8c9e53ab2b9ada175949 Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Mon, 18 Nov 2024 15:06:38 +0100 Subject: [PATCH 5/5] add resource spec --- visual-retrieval-colpali/application/services.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/visual-retrieval-colpali/application/services.xml b/visual-retrieval-colpali/application/services.xml index 6c8d60a85..4c160a085 100644 --- a/visual-retrieval-colpali/application/services.xml +++ b/visual-retrieval-colpali/application/services.xml @@ -1,6 +1,9 @@ + + + @@ -28,6 +31,7 @@ +