Skip to content

Commit

Permalink
delete directly via selection instead of making multiple calls to get…
Browse files Browse the repository at this point in the history
… chunk ids and delete each one (#2666)
  • Loading branch information
rkuo-danswer authored Oct 3, 2024
1 parent 0c54d9d commit 3fdd233
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 1 deletion.
2 changes: 1 addition & 1 deletion backend/danswer/background/connector_deletion.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def document_by_cc_pair_cleanup_task(
if count == 1:
# count == 1 means this is the only remaining cc_pair reference to the doc
# delete it from vespa and the db
document_index.delete(doc_ids=[document_id])
document_index.delete_single(doc_id=document_id)
delete_documents_complete__no_commit(
db_session=db_session,
document_ids=[document_id],
Expand Down
10 changes: 10 additions & 0 deletions backend/danswer/document_index/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,16 @@ class Deletable(abc.ABC):
Class must implement the ability to delete document by their unique document ids.
"""

@abc.abstractmethod
def delete_single(self, doc_id: str) -> None:
    """
    Hard delete one document from the document index.

    Parameters:
    - doc_id: document id as specified by the connector
    """
    raise NotImplementedError

@abc.abstractmethod
def delete(self, doc_ids: list[str]) -> None:
"""
Expand Down
61 changes: 61 additions & 0 deletions backend/danswer/document_index/vespa/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import httpx
import requests

from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
from danswer.configs.chat_configs import DOC_TIME_DECAY
from danswer.configs.chat_configs import NUM_RETURNED_HITS
from danswer.configs.chat_configs import TITLE_CONTENT_RATIO
Expand Down Expand Up @@ -479,6 +480,66 @@ def delete(self, doc_ids: list[str]) -> None:
document_ids=doc_ids, index_name=index_name, http_client=http_client
)

def delete_single(self, doc_id: str) -> None:
    """Delete every chunk of one document using a selection-based delete.

    One DELETE request with a selection expression is sent per index (the
    primary index, plus the secondary index when one is configured) and is
    repeated for as long as the response carries a non-empty continuation
    token. This may be faster overall than `delete`, since it avoids looking
    up chunk ids and deleting them one at a time.

    Parameters:
    - doc_id: document id as specified by the connector

    Raises:
    - httpx.HTTPStatusError: logged and re-raised when a delete request
      returns an error status.
    """

    # Selection-based deletion is only lightly documented by Vespa; see
    # https://docs.vespa.ai/en/operations/batch-delete.html#example

    doc_id = replace_invalid_doc_id_characters(doc_id)

    target_indices = (
        [self.index_name, self.secondary_index_name]
        if self.secondary_index_name
        else [self.index_name]
    )

    # NOTE: `httpx` is used rather than `requests` because `requests` has no
    # HTTP2 support, which helps with the large volume of indexing / update /
    # delete requests.
    with httpx.Client(http2=True) as client:
        for index_name in target_indices:
            endpoint = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)
            query = httpx.QueryParams(
                {
                    "selection": f"{index_name}.document_id=='{doc_id}'",
                    "cluster": DOCUMENT_INDEX_NAME,
                }
            )

            total_chunks_deleted = 0
            while True:
                try:
                    response = client.delete(endpoint, params=query)
                    response.raise_for_status()
                except httpx.HTTPStatusError as e:
                    logger.error(
                        f"Failed to delete chunk, details: {e.response.text}"
                    )
                    raise

                payload = response.json()

                if "documentCount" in payload:
                    total_chunks_deleted += payload["documentCount"]

                # Pagination: stop once the continuation token is missing
                # or empty, otherwise resend the request with the token.
                if "continuation" not in payload or not payload["continuation"]:
                    break

                query = query.set("continuation", payload["continuation"])

            logger.debug(
                f"VespaIndex.delete_single: "
                f"index={index_name} "
                f"doc={doc_id} "
                f"chunks_deleted={total_chunks_deleted}"
            )

def id_based_retrieval(
self,
chunk_requests: list[VespaChunkRequest],
Expand Down

0 comments on commit 3fdd233

Please sign in to comment.