From 186031941a974bfbd4b953b5610a83289076a940 Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Thu, 21 Nov 2024 15:06:33 +0100 Subject: [PATCH 01/12] use amp for autocasting --- ...trieval-vision-language-models-cloud.ipynb | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/docs/sphinx/source/examples/colpali-document-retrieval-vision-language-models-cloud.ipynb b/docs/sphinx/source/examples/colpali-document-retrieval-vision-language-models-cloud.ipynb index 124450c1..273b6093 100644 --- a/docs/sphinx/source/examples/colpali-document-retrieval-vision-language-models-cloud.ipynb +++ b/docs/sphinx/source/examples/colpali-document-retrieval-vision-language-models-cloud.ipynb @@ -22,8 +22,8 @@ "\n", "[ColPali: Efficient Document Retrieval with Vision Language Models Manuel Faysse, Hugues Sibille, Tony Wu, Bilel Omrani, Gautier Viaud, Céline Hudelot, Pierre Colombo](https://arxiv.org/abs/2407.01449v2)\n", "\n", - "ColPail is a combination of [ColBERT](https://blog.vespa.ai/announcing-colbert-embedder-in-vespa/) \n", - "and [PailGemma](https://huggingface.co/blog/paligemma):\n", + "ColPali is a combination of [ColBERT](https://blog.vespa.ai/announcing-colbert-embedder-in-vespa/) \n", + "and [PaliGemma](https://huggingface.co/blog/paligemma):\n", "\n", ">ColPali is enabled by the latest advances in Vision Language Models, notably the PaliGemma model from the Google Zürich team, and leverages multi-vector retrieval through late interaction mechanisms as proposed in ColBERT by Omar Khattab.\n", "\n", @@ -61,7 +61,7 @@ "\n", "Let us get started. \n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vespa-engine/pyvespa/blob/master/docs/sphinx/source/examples/colpali-document-retrieval-vision-language-models.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vespa-engine/pyvespa/blob/master/docs/sphinx/source/examples/colpali-document-retrieval-vision-language-models-cloud.ipynb)\n", "\n", "\n", "Install dependencies: \n", @@ -100,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "qKFOvdo5nCVl" }, @@ -118,7 +118,8 @@ " process_images,\n", " process_queries,\n", ")\n", - "from colpali_engine.utils.image_utils import scale_image, get_base64_image" + "from colpali_engine.utils.image_utils import scale_image, get_base64_image\n", + "from torch.amp import autocast" ] }, { @@ -142,19 +143,19 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if torch.cuda.is_available():\n", " device = torch.device(\"cuda\")\n", - " type = torch.bfloat16\n", + " dtype = torch.bfloat16\n", "elif torch.backends.mps.is_available():\n", " device = torch.device(\"mps\")\n", - " type = torch.float32\n", + " dtype = torch.float32\n", "else:\n", " device = torch.device(\"cpu\")\n", - " type = torch.float32" + " dtype = torch.float32" ] }, { @@ -497,7 +498,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -526,9 +527,10 @@ " )\n", " for batch_doc in tqdm(dataloader):\n", " with torch.no_grad():\n", - " batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}\n", - " embeddings_doc = model(**batch_doc)\n", - " page_embeddings.extend(list(torch.unbind(embeddings_doc.to(\"cpu\"))))\n", + " with 
autocast(device_type=device.type):\n", + " batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}\n", + " embeddings_doc = model(**batch_doc)\n", + " page_embeddings.extend(list(torch.unbind(embeddings_doc.to(\"cpu\"))))\n", " pdf[\"embeddings\"] = page_embeddings" ] }, @@ -927,7 +929,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": { "id": "NxeDd3mcYDpL" }, @@ -944,7 +946,7 @@ " with torch.no_grad():\n", " batch_query = {k: v.to(model.device) for k, v in batch_query.items()}\n", " embeddings_query = model(**batch_query)\n", - " qs.extend(list(torch.unbind(embeddings_query.to(\"cpu\"))))" + " qs.extend(list(torch.unbind(embeddings_query.cpu())))" ] }, { From 6a0a19933b7a2b723a0737146f1e11adee33dd27 Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Thu, 21 Nov 2024 15:09:27 +0100 Subject: [PATCH 02/12] one more --- ...document-retrieval-vision-language-models-cloud.ipynb | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/sphinx/source/examples/colpali-document-retrieval-vision-language-models-cloud.ipynb b/docs/sphinx/source/examples/colpali-document-retrieval-vision-language-models-cloud.ipynb index 273b6093..9cc40752 100644 --- a/docs/sphinx/source/examples/colpali-document-retrieval-vision-language-models-cloud.ipynb +++ b/docs/sphinx/source/examples/colpali-document-retrieval-vision-language-models-cloud.ipynb @@ -530,7 +530,7 @@ " with autocast(device_type=device.type):\n", " batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}\n", " embeddings_doc = model(**batch_doc)\n", - " page_embeddings.extend(list(torch.unbind(embeddings_doc.to(\"cpu\"))))\n", + " page_embeddings.extend(list(torch.unbind(embeddings_doc.cpu())))\n", " pdf[\"embeddings\"] = page_embeddings" ] }, @@ -944,9 +944,10 @@ "qs = []\n", "for batch_query in dataloader:\n", " with torch.no_grad():\n", - " batch_query = {k: v.to(model.device) for k, v in batch_query.items()}\n", - " embeddings_query = model(**batch_query)\n", - " qs.extend(list(torch.unbind(embeddings_query.cpu())))" + " with autocast(device_type=device.type):\n", + " batch_query = {k: v.to(model.device) for k, v in batch_query.items()}\n", + " embeddings_query = model(**batch_query)\n", + " qs.extend(list(torch.unbind(embeddings_query.cpu())))" ] }, { From 374581080838f4c04400600a37ca181bbcd4820c Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Thu, 21 Nov 2024 15:21:58 +0100 Subject: [PATCH 03/12] use autocast in vqa notebook --- ...olpali-benchmark-vqa-vlm_Vespa-cloud.ipynb | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/docs/sphinx/source/examples/colpali-benchmark-vqa-vlm_Vespa-cloud.ipynb b/docs/sphinx/source/examples/colpali-benchmark-vqa-vlm_Vespa-cloud.ipynb index 7ac2da7a..da59d118 100644 --- a/docs/sphinx/source/examples/colpali-benchmark-vqa-vlm_Vespa-cloud.ipynb +++ b/docs/sphinx/source/examples/colpali-benchmark-vqa-vlm_Vespa-cloud.ipynb @@ -59,7 +59,8 @@ "from colpali_engine.utils.colpali_processing_utils import (\n", " process_images,\n", " process_queries,\n", - ")" + ")\n", + "from torch.amp import autocast" ] }, { @@ -88,13 +89,13 @@ "source": [ "if torch.cuda.is_available():\n", " device = torch.device(\"cuda\")\n", - " type = torch.bfloat16\n", + " dtype = torch.bfloat16\n", "elif torch.backends.mps.is_available():\n", " device = torch.device(\"mps\")\n", - " type = torch.float32\n", + " dtype = torch.float32\n", "else:\n", " device = torch.device(\"cpu\")\n", - " type = torch.float32" + " dtype = 
torch.float32" ] }, { @@ -356,9 +357,10 @@ "embeddings = []\n", "for batch_doc in tqdm(dataloader):\n", " with torch.no_grad():\n", - " batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}\n", - " embeddings_doc = model(**batch_doc)\n", - " embeddings.extend(list(torch.unbind(embeddings_doc.to(\"cpu\"))))" + " with autocast(device_type=device.type):\n", + " batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}\n", + " embeddings_doc = model(**batch_doc)\n", + " embeddings.extend(list(torch.unbind(embeddings_doc.to(\"cpu\"))))" ] }, { @@ -392,9 +394,10 @@ "query_embeddings = []\n", "for batch_query in tqdm(dataloader):\n", " with torch.no_grad():\n", - " batch_query = {k: v.to(model.device) for k, v in batch_query.items()}\n", - " embeddings_query = model(**batch_query)\n", - " query_embeddings.extend(list(torch.unbind(embeddings_query.to(\"cpu\"))))" + " with autocast(device_type=device.type):\n", + " batch_query = {k: v.to(model.device) for k, v in batch_query.items()}\n", + " embeddings_query = model(**batch_query)\n", + " query_embeddings.extend(list(torch.unbind(embeddings_query.to(\"cpu\"))))" ] }, { @@ -6238,4 +6241,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} From 8be6036aaf4ffdb645fe07214f013bed91a36874 Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Thu, 21 Nov 2024 15:22:11 +0100 Subject: [PATCH 04/12] use autocast in simplified notebook --- ...trieval-with-colpali-vlm_Vespa-cloud.ipynb | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/docs/sphinx/source/examples/simplified-retrieval-with-colpali-vlm_Vespa-cloud.ipynb b/docs/sphinx/source/examples/simplified-retrieval-with-colpali-vlm_Vespa-cloud.ipynb index 2bb9fd57..ad185d43 100644 --- a/docs/sphinx/source/examples/simplified-retrieval-with-colpali-vlm_Vespa-cloud.ipynb +++ b/docs/sphinx/source/examples/simplified-retrieval-with-colpali-vlm_Vespa-cloud.ipynb @@ -103,7 +103,8 @@ " process_images,\n", " process_queries,\n", ")\n", - "from colpali_engine.utils.image_utils import scale_image, get_base64_image" + "from colpali_engine.utils.image_utils import scale_image, get_base64_image\n", + "from torch.amp import autocast" ] }, { @@ -133,13 +134,13 @@ "source": [ "if torch.cuda.is_available():\n", " device = torch.device(\"cuda\")\n", - " type = torch.bfloat16\n", + " dtype = torch.bfloat16\n", "elif torch.backends.mps.is_available():\n", " device = torch.device(\"mps\")\n", - " type = torch.float32\n", + " dtype = torch.float32\n", "else:\n", " device = torch.device(\"cpu\")\n", - " type = torch.float32" + " dtype = torch.float32" ] }, { @@ -590,9 +591,10 @@ " )\n", " for batch_doc in tqdm(dataloader):\n", " with torch.no_grad():\n", - " batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}\n", - " embeddings_doc = model(**batch_doc)\n", - " page_embeddings.extend(list(torch.unbind(embeddings_doc.to(\"cpu\"))))\n", + " with autocast(device_type=device.type):\n", + " batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}\n", + " embeddings_doc = model(**batch_doc)\n", + " page_embeddings.extend(list(torch.unbind(embeddings_doc.to(\"cpu\"))))\n", " pdf[\"embeddings\"] = page_embeddings" ] }, @@ -974,9 +976,10 @@ "qs = []\n", "for batch_query in dataloader:\n", " with torch.no_grad():\n", - " batch_query = {k: v.to(model.device) for k, v in batch_query.items()}\n", - " embeddings_query = model(**batch_query)\n", - " qs.extend(list(torch.unbind(embeddings_query.to(\"cpu\"))))" + " with 
autocast(device_type=device.type):\n", + " batch_query = {k: v.to(model.device) for k, v in batch_query.items()}\n", + " embeddings_query = model(**batch_query)\n", + " qs.extend(list(torch.unbind(embeddings_query.to(\"cpu\"))))" ] }, { From 9d1cb6d876d79f1116fbdf0c3854d120005cc15b Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Thu, 21 Nov 2024 16:32:01 +0100 Subject: [PATCH 05/12] dtype --- .../examples/colpali-benchmark-vqa-vlm_Vespa-cloud.ipynb | 2 +- ...ali-document-retrieval-vision-language-models-cloud.ipynb | 2 +- .../simplified-retrieval-with-colpali-vlm_Vespa-cloud.ipynb | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/sphinx/source/examples/colpali-benchmark-vqa-vlm_Vespa-cloud.ipynb b/docs/sphinx/source/examples/colpali-benchmark-vqa-vlm_Vespa-cloud.ipynb index da59d118..483229b8 100644 --- a/docs/sphinx/source/examples/colpali-benchmark-vqa-vlm_Vespa-cloud.ipynb +++ b/docs/sphinx/source/examples/colpali-benchmark-vqa-vlm_Vespa-cloud.ipynb @@ -287,7 +287,7 @@ "source": [ "model_name = \"vidore/colpali-v1.2\"\n", "model = ColPali.from_pretrained(\n", - " \"vidore/colpaligemma-3b-pt-448-base\", torch_dtype=type\n", + " \"vidore/colpaligemma-3b-pt-448-base\", torch_dtype=dtype\n", ").eval()\n", "model.load_adapter(model_name)\n", "model = model.eval()\n", diff --git a/docs/sphinx/source/examples/colpali-document-retrieval-vision-language-models-cloud.ipynb b/docs/sphinx/source/examples/colpali-document-retrieval-vision-language-models-cloud.ipynb index 9cc40752..bc5b6c3b 100644 --- a/docs/sphinx/source/examples/colpali-document-retrieval-vision-language-models-cloud.ipynb +++ b/docs/sphinx/source/examples/colpali-document-retrieval-vision-language-models-cloud.ipynb @@ -340,7 +340,7 @@ "source": [ "model_name = \"vidore/colpali-v1.2\"\n", "model = ColPali.from_pretrained(\n", - " \"vidore/colpaligemma-3b-pt-448-base\", torch_dtype=type\n", + " \"vidore/colpaligemma-3b-pt-448-base\", torch_dtype=dtype\n", ").eval()\n", "model.load_adapter(model_name)\n", "model = model.eval()\n", diff --git a/docs/sphinx/source/examples/simplified-retrieval-with-colpali-vlm_Vespa-cloud.ipynb b/docs/sphinx/source/examples/simplified-retrieval-with-colpali-vlm_Vespa-cloud.ipynb index ad185d43..2867a9ef 100644 --- a/docs/sphinx/source/examples/simplified-retrieval-with-colpali-vlm_Vespa-cloud.ipynb +++ b/docs/sphinx/source/examples/simplified-retrieval-with-colpali-vlm_Vespa-cloud.ipynb @@ -332,7 +332,7 @@ "source": [ "model_name = \"vidore/colpali-v1.2\"\n", "model = ColPali.from_pretrained(\n", - " \"vidore/colpaligemma-3b-pt-448-base\", torch_dtype=type\n", + " \"vidore/colpaligemma-3b-pt-448-base\", torch_dtype=dtype\n", ").eval()\n", "model.load_adapter(model_name)\n", "model = model.eval()\n", @@ -433,7 +433,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "id": "YaDInfmT3Tbu" }, @@ -441,6 +441,7 @@ "source": [ "for pdf in sample_pdfs:\n", " page_images, page_texts = get_pdf_images(pdf[\"url\"])\n", + "\n", " pdf[\"images\"] = page_images\n", " pdf[\"texts\"] = page_texts" ] From a593a3518253c8f35d6b90f5f20277deba0a8916 Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Fri, 22 Nov 2024 09:58:26 +0100 Subject: [PATCH 06/12] upda vector integration test --- .../test_integration_vespa_cloud_vector_search.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_integration_vespa_cloud_vector_search.py b/tests/integration/test_integration_vespa_cloud_vector_search.py index 
38ee5510..045da837 100644 --- a/tests/integration/test_integration_vespa_cloud_vector_search.py +++ b/tests/integration/test_integration_vespa_cloud_vector_search.py @@ -93,7 +93,7 @@ def test_vector_indexing_and_query(self): from datasets import load_dataset - sample_size = 1000 + sample_size = 100 # streaming=True pages the data from S3. This is needed to avoid memory issues when loading the dataset. dataset = load_dataset( "KShivendu/dbpedia-entities-openai-1M", split="train", streaming=True @@ -164,9 +164,7 @@ def callback(response: VespaResponse, id: str): ok = 0 callbacks = 0 start_time = time.time() - dataset = load_dataset( - "KShivendu/dbpedia-entities-openai-1M", split="train", streaming=True - ).take(100) + feed_with_wrong_field = dataset.map( lambda x: { "id": x["_id"], @@ -186,9 +184,7 @@ def callback(response: VespaResponse, id: str): self.assertEqual(callbacks, 100) ok = 0 - dataset = load_dataset( - "KShivendu/dbpedia-entities-openai-1M", split="train", streaming=True - ).take(sample_size) + # Run update - assign all docs with a meta field updates = dataset.map(lambda x: {"id": x["_id"], "fields": {"meta": "stuff"}}) @@ -239,7 +235,7 @@ def tearDown(self) -> None: class TestProdDeploymentFromDisk(unittest.TestCase): - def setUp(self) -> None: + def test_setup(self) -> None: self.app_package = create_vector_ada_application_package() prod_region = "aws-us-east-1c" self.app_package.clusters = [ From 084f4785e146212696eb27db8a633f552c9f3d7a Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Fri, 22 Nov 2024 09:59:09 +0100 Subject: [PATCH 07/12] update workflow --- .github/workflows/integration-cloud.yml | 81 ++++++++++++++++++++----- 1 file changed, 66 insertions(+), 15 deletions(-) diff --git a/.github/workflows/integration-cloud.yml b/.github/workflows/integration-cloud.yml index 3f028465..cff2868e 100644 --- a/.github/workflows/integration-cloud.yml +++ b/.github/workflows/integration-cloud.yml @@ -5,6 +5,7 @@ on: push: branches: - master + - thomasht86/fix-integrationtest-vectorsearch schedule: - cron: "0 11 * * 0" @@ -13,20 +14,58 @@ concurrency: cancel-in-progress: false jobs: + setup-environment: + runs-on: ubuntu-latest + outputs: + python-cache-key: ${{ steps.cache-python.outputs.cache-hit }} + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + id: setup-python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Cache dependencies + id: cache-python + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/pyproject.toml') }} + + - name: Install dependencies + if: steps.cache-python.outputs.cache-hit != 'true' + run: | + pip install -e .[dev] + + - name: Upload Python environment + uses: actions/upload-artifact@v3 + with: + name: python-environment + path: ~/.cache/pip + integration-cloud: runs-on: ubuntu-latest + needs: setup-environment steps: - uses: actions/checkout@v4 + - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.9" - cache: "pip" - cache-dependency-path: | - pyproject.toml + python-version: "3.10" + + - name: Download Python environment + uses: actions/download-artifact@v3 + with: + name: python-environment + path: ~/.cache/pip + - name: Install dependencies run: | pip install -e .[dev] + - name: Run integration tests env: VESPA_TEAM_API_KEY: ${{ secrets.VESPA_TEAM_API_KEY }} @@ -35,42 +74,54 @@ jobs: integration-cloud-token: runs-on: ubuntu-latest - needs: integration-cloud + needs: setup-environment steps: - uses: actions/checkout@v4 + - name: Set up 
Python uses: actions/setup-python@v5 with: - python-version: "3.9" - cache: "pip" - cache-dependency-path: | - pyproject.toml + python-version: "3.10" + + - name: Download Python environment + uses: actions/download-artifact@v3 + with: + name: python-environment + path: ~/.cache/pip + - name: Install dependencies run: | pip install -e .[dev] + - name: Run integration tests env: VESPA_TEAM_API_KEY: ${{ secrets.VESPA_TEAM_API_KEY }} VESPA_CLOUD_SECRET_TOKEN: ${{ secrets.VESPA_CLOUD_SECRET_TOKEN }} - VESPA_CLIENT_TOKEN_ID: ${{ secrets.VESPA_CLIENT_TOKEN_ID}} + VESPA_CLIENT_TOKEN_ID: ${{ secrets.VESPA_CLIENT_TOKEN_ID }} run: | pytest tests/integration/test_integration_vespa_cloud_token.py -s -v integration-cloud-vector-search: runs-on: ubuntu-latest - needs: integration-cloud-token + needs: setup-environment steps: - uses: actions/checkout@v4 + - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.9" - cache: "pip" - cache-dependency-path: | - pyproject.toml + python-version: "3.10" + + - name: Download Python environment + uses: actions/download-artifact@v3 + with: + name: python-environment + path: ~/.cache/pip + - name: Install dependencies run: | pip install -e .[dev] + - name: Run integration tests env: VESPA_TEAM_API_KEY: ${{ secrets.VESPA_TEAM_API_KEY }} From 48c8bd472404a3050a046ea1cd3695f6829bfd4b Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Fri, 22 Nov 2024 11:10:31 +0100 Subject: [PATCH 08/12] comment, not skip --- ...t_integration_vespa_cloud_vector_search.py | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/tests/integration/test_integration_vespa_cloud_vector_search.py b/tests/integration/test_integration_vespa_cloud_vector_search.py index 045da837..9b73ac3f 100644 --- a/tests/integration/test_integration_vespa_cloud_vector_search.py +++ b/tests/integration/test_integration_vespa_cloud_vector_search.py @@ -21,7 +21,6 @@ ContainerCluster, Nodes, DeploymentConfiguration, - EmptyDeploymentConfiguration, Validation, ValidationID, ) @@ -298,32 +297,33 @@ def test_application_status(self): def test_vector_indexing_and_query(self): super().test_vector_indexing_and_query() - @unittest.skip("Do not run when not waiting for deployment.") - def tearDown(self) -> None: - self.app.delete_all_docs( - content_cluster_name="vector_content", - schema="vector", - namespace="benchmark", - ) - time.sleep(5) - with self.app.syncio() as sync_session: - response: VespaResponse = sync_session.query( - {"yql": "select id from sources * where true", "hits": 10} - ) - self.assertEqual(response.get_status_code(), 200) - self.assertEqual(len(response.hits), 0) - print(response.get_json()) + # DO NOT skip tearDown-method, as test will not exit. + # @unittest.skip("Do not run when not waiting for deployment.") + # def tearDown(self) -> None: + # self.app.delete_all_docs( + # content_cluster_name="vector_content", + # schema="vector", + # namespace="benchmark", + # ) + # time.sleep(5) + # with self.app.syncio() as sync_session: + # response: VespaResponse = sync_session.query( + # {"yql": "select id from sources * where true", "hits": 10} + # ) + # self.assertEqual(response.get_status_code(), 200) + # self.assertEqual(len(response.hits), 0) + # print(response.get_json()) - # Deployment is deleted by deploying with an empty deployment.xml file. - self.app_package.deployment_config = EmptyDeploymentConfiguration() + # # Deployment is deleted by deploying with an empty deployment.xml file. 
+ # self.app_package.deployment_config = EmptyDeploymentConfiguration() - # Vespa won't push the deleted deployment.xml file unless we add a validation override - tomorrow = datetime.now() + timedelta(days=1) - formatted_date = tomorrow.strftime("%Y-%m-%d") - self.app_package.validations = [ - Validation(ValidationID("deployment-removal"), formatted_date) - ] - self.app_package.to_files(self.application_root) - # This will delete the deployment - self.vespa_cloud._start_prod_deployment(self.application_root) - shutil.rmtree(self.application_root, ignore_errors=True) + # # Vespa won't push the deleted deployment.xml file unless we add a validation override + # tomorrow = datetime.now() + timedelta(days=1) + # formatted_date = tomorrow.strftime("%Y-%m-%d") + # self.app_package.validations = [ + # Validation(ValidationID("deployment-removal"), formatted_date) + # ] + # self.app_package.to_files(self.application_root) + # # This will delete the deployment + # self.vespa_cloud._start_prod_deployment(self.application_root) + # shutil.rmtree(self.application_root, ignore_errors=True) From 82186d2b5e079167f705f97e1997a114200fbc1e Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Fri, 22 Nov 2024 11:34:22 +0100 Subject: [PATCH 09/12] remove run on branch --- .github/workflows/integration-cloud.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/integration-cloud.yml b/.github/workflows/integration-cloud.yml index cff2868e..9061abe0 100644 --- a/.github/workflows/integration-cloud.yml +++ b/.github/workflows/integration-cloud.yml @@ -5,7 +5,6 @@ on: push: branches: - master - - thomasht86/fix-integrationtest-vectorsearch schedule: - cron: "0 11 * * 0" From bd295007d3220ee1980fff2c51a7b4e9f85f9ff7 Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Fri, 22 Nov 2024 13:09:51 +0100 Subject: [PATCH 10/12] only setup-python cache --- .github/workflows/integration-cloud.yml | 65 ++++--------------------- 1 file changed, 10 insertions(+), 55 deletions(-) diff --git a/.github/workflows/integration-cloud.yml b/.github/workflows/integration-cloud.yml index 9061abe0..139f98a8 100644 --- a/.github/workflows/integration-cloud.yml +++ b/.github/workflows/integration-cloud.yml @@ -5,6 +5,7 @@ on: push: branches: - master + - thomasht86/fix-integrationtest-vectorsearch schedule: - cron: "0 11 * * 0" @@ -13,54 +14,18 @@ concurrency: cancel-in-progress: false jobs: - setup-environment: - runs-on: ubuntu-latest - outputs: - python-cache-key: ${{ steps.cache-python.outputs.cache-hit }} - steps: - - uses: actions/checkout@v4 - - - name: Set up Python - id: setup-python - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - - name: Cache dependencies - id: cache-python - uses: actions/cache@v3 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/pyproject.toml') }} - - - name: Install dependencies - if: steps.cache-python.outputs.cache-hit != 'true' - run: | - pip install -e .[dev] - - - name: Upload Python environment - uses: actions/upload-artifact@v3 - with: - name: python-environment - path: ~/.cache/pip - integration-cloud: runs-on: ubuntu-latest - needs: setup-environment steps: - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v5 + id: setup-python with: python-version: "3.10" - - - name: Download Python environment - uses: actions/download-artifact@v3 - with: - name: python-environment - path: ~/.cache/pip - + cache: "pip" + - run: echo '${{ steps.setup-python.outputs.cache-hit }}' # true if cache-hit occurred on the primary key - name: 
Install dependencies run: | pip install -e .[dev] @@ -73,21 +38,16 @@ jobs: integration-cloud-token: runs-on: ubuntu-latest - needs: setup-environment steps: - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v5 + id: setup-python with: python-version: "3.10" - - - name: Download Python environment - uses: actions/download-artifact@v3 - with: - name: python-environment - path: ~/.cache/pip - + cache: "pip" + - run: echo '${{ steps.setup-python.outputs.cache-hit }}' # true if cache-hit occurred on the primary key - name: Install dependencies run: | pip install -e .[dev] @@ -102,21 +62,16 @@ jobs: integration-cloud-vector-search: runs-on: ubuntu-latest - needs: setup-environment steps: - uses: actions/checkout@v4 - name: Set up Python + id: setup-python uses: actions/setup-python@v5 with: python-version: "3.10" - - - name: Download Python environment - uses: actions/download-artifact@v3 - with: - name: python-environment - path: ~/.cache/pip - + cache: "pip" + - run: echo '${{ steps.setup-python.outputs.cache-hit }}' # true if cache-hit occurred on the primary key - name: Install dependencies run: | pip install -e .[dev] From d9fde8d690f86a418954b1a25512caa86e9b2034 Mon Sep 17 00:00:00 2001 From: thomasht86 Date: Fri, 22 Nov 2024 13:18:18 +0100 Subject: [PATCH 11/12] simplify test --- .../test_integration_vespa_cloud_vector_search.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_integration_vespa_cloud_vector_search.py b/tests/integration/test_integration_vespa_cloud_vector_search.py index 9b73ac3f..3797eb42 100644 --- a/tests/integration/test_integration_vespa_cloud_vector_search.py +++ b/tests/integration/test_integration_vespa_cloud_vector_search.py @@ -108,6 +108,10 @@ def test_vector_indexing_and_query(self): docs = list( pyvespa_feed_format ) # we have enough memory to page everything into memory with list() + # seems like we sometimes can get more than sample_size docs + if len(docs) > sample_size: + docs = docs[:sample_size] + self.assertEqual(len(docs), sample_size) ok = 0 callbacks = 0 start_time = time.time() @@ -126,9 +130,6 @@ def callback(response: VespaResponse, id: str): schema="vector", namespace="benchmark", callback=callback, - max_workers=48, - max_connections=48, - max_queue_size=4000, ) self.assertEqual(ok, sample_size) duration = time.time() - start @@ -171,13 +172,14 @@ def callback(response: VespaResponse, id: str): } ) faulty_docs = list(feed_with_wrong_field) + if len(faulty_docs) > sample_size: + faulty_docs = faulty_docs[:sample_size] + self.assertEqual(len(faulty_docs), sample_size) self.app.feed_iterable( iter=faulty_docs, schema="vector", namespace="benchmark", callback=callback, - max_workers=48, - max_connections=48, ) self.assertEqual(ok, 0) self.assertEqual(callbacks, 100) From 40995d303a4a3b03b4397a030dd1118c6dfc1f6f Mon Sep 17 00:00:00 2001 From: Thomas Hjelde Thoresen Date: Fri, 22 Nov 2024 15:36:53 +0100 Subject: [PATCH 12/12] Update .github/workflows/integration-cloud.yml Amazing! 
Learned a new trick today as well :D Co-authored-by: Marlon (Esolitos) Saglia --- .github/workflows/integration-cloud.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration-cloud.yml b/.github/workflows/integration-cloud.yml index 139f98a8..6e9b3cac 100644 --- a/.github/workflows/integration-cloud.yml +++ b/.github/workflows/integration-cloud.yml @@ -5,7 +5,8 @@ on: push: branches: - master - - thomasht86/fix-integrationtest-vectorsearch + pull_request: + paths: [".github/workflows/integration-cloud.yml"] schedule: - cron: "0 11 * * 0"
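
The change repeated across the notebook patches above is to pick a per-device dtype and wrap ColPali inference in torch.amp.autocast. Below is a minimal, self-contained sketch of that pattern; it uses a small torch.nn.Linear as a hypothetical stand-in for the ColPali model (the notebooks instead load vidore/colpali-v1.2 with torch_dtype=dtype) and random tensors in place of the processed image or query batches. Note that autocast over mps may require a fairly recent PyTorch release.

import torch
from torch.amp import autocast

# Device/dtype selection as in the notebooks: bfloat16 on CUDA, float32 otherwise.
if torch.cuda.is_available():
    device, dtype = torch.device("cuda"), torch.bfloat16
elif torch.backends.mps.is_available():
    device, dtype = torch.device("mps"), torch.float32
else:
    device, dtype = torch.device("cpu"), torch.float32

# Hypothetical stand-in model; the notebooks load ColPali here with torch_dtype=dtype.
model = torch.nn.Linear(128, 128).to(device=device, dtype=dtype).eval()

# Placeholder batches standing in for the processed image/query batches.
batches = [torch.randn(4, 128) for _ in range(3)]

embeddings = []
for batch in batches:
    with torch.no_grad():
        # autocast selects the backend matching device.type (cuda/cpu/mps),
        # which is why the notebooks pass device_type=device.type.
        with autocast(device_type=device.type):
            out = model(batch.to(device=device, dtype=dtype))
    # Move results off the accelerator and unbind into per-item embeddings,
    # mirroring the embeddings_doc.cpu() / torch.unbind pattern in the patches.
    embeddings.extend(list(torch.unbind(out.cpu())))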
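
The integration-test patches also lean on pyvespa's feed_iterable with a per-document callback that counts successful operations. A rough sketch of that usage follows, under the assumption of an already-running Vespa endpoint at a placeholder URL and an illustrative in-memory document list (both hypothetical; the real test feeds a streamed dbpedia sample into a Vespa Cloud deployment and maps it onto its own schema fields).

from vespa.application import Vespa
from vespa.io import VespaResponse

# Hypothetical endpoint; the test obtains its app handle from a Vespa Cloud deployment.
app = Vespa(url="http://localhost", port=8080)

# Illustrative documents; the field names are placeholders, not the test's schema.
docs = [
    {"id": f"doc-{i}", "fields": {"id": f"doc-{i}", "embedding": [0.0] * 1536}}
    for i in range(10)
]

ok = 0
callbacks = 0

def callback(response: VespaResponse, id: str):
    # Count every callback and every successful (HTTP 200) operation, as the test does.
    global ok, callbacks
    callbacks += 1
    if response.get_status_code() == 200:
        ok += 1

app.feed_iterable(
    iter=docs,
    schema="vector",
    namespace="benchmark",
    callback=callback,
)
print(f"{ok}/{callbacks} feed operations succeeded")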