Skip to content

Commit

Permalink
Ingestion API now always updates regardless of document updated_at (#786)
Browse files Browse the repository at this point in the history
  • Loading branch information
yuhongsun96 authored Nov 29, 2023
1 parent 9b7069a commit 006fd4c
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 9 deletions.
23 changes: 15 additions & 8 deletions backend/danswer/indexing/indexing_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def _indexing_pipeline(
document_index: DocumentIndex,
documents: list[Document],
index_attempt_metadata: IndexAttemptMetadata,
ignore_time_skip: bool = False,
) -> tuple[int, int]:
"""Takes different pieces of the indexing pipeline and applies them to a batch of documents
Note that the documents should already be batched at this point so that it does not inflate the
Expand All @@ -87,14 +88,18 @@ def _indexing_pipeline(
}

updatable_docs: list[Document] = []
for doc in documents:
if (
doc.id in id_update_time_map
and doc.doc_updated_at
and doc.doc_updated_at <= id_update_time_map[doc.id]
):
continue
updatable_docs.append(doc)
if ignore_time_skip:
updatable_docs = documents
else:
for doc in documents:
if (
doc.id in id_update_time_map
and doc.doc_updated_at
and doc.doc_updated_at <= id_update_time_map[doc.id]
):
continue
updatable_docs.append(doc)

updatable_ids = [doc.id for doc in updatable_docs]

# Acquires a lock on the documents so that no other process can modify them
Expand Down Expand Up @@ -175,6 +180,7 @@ def build_indexing_pipeline(
chunker: Chunker | None = None,
embedder: Embedder | None = None,
document_index: DocumentIndex | None = None,
ignore_time_skip: bool = False,
) -> IndexingPipelineProtocol:
"""Builds a pipeline which takes in a list (batch) of docs and indexes them."""
chunker = chunker or DefaultChunker()
Expand All @@ -188,4 +194,5 @@ def build_indexing_pipeline(
chunker=chunker,
embedder=embedder,
document_index=document_index,
ignore_time_skip=ignore_time_skip,
)
2 changes: 1 addition & 1 deletion backend/danswer/server/danswer_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def document_ingestion(
if document.source == DocumentSource.INGESTION_API:
document.source = DocumentSource.FILE

indexing_pipeline = build_indexing_pipeline()
indexing_pipeline = build_indexing_pipeline(ignore_time_skip=True)

new_doc, chunks = indexing_pipeline(
documents=[document],
Expand Down

1 comment on commit 006fd4c

@vercel
Copy link

@vercel vercel bot commented on 006fd4c Nov 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.