
Commit

update document
pablonyx committed Dec 3, 2024
1 parent 0553062 commit 7fa3c9c
Showing 7 changed files with 26 additions and 0 deletions.
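The pattern across all seven files is the same: functions and properties on the search hot path gain a @log_function_time(print_only=True) decorator from danswer.utils.timing. That module is not shown in this diff, so the sketch below is only an assumption about its shape, not the repository's actual implementation; the names log_function_time and print_only come from the diff, everything else is invented.

import logging
import time
from collections.abc import Callable
from functools import wraps
from typing import Any, TypeVar

logger = logging.getLogger(__name__)

F = TypeVar("F", bound=Callable[..., Any])


def log_function_time(print_only: bool = False) -> Callable[[F], F]:
    """Decorator factory: log the wall-clock duration of each call."""

    def decorator(func: F) -> F:
        @wraps(func)
        def wrapped(*args: Any, **kwargs: Any) -> Any:
            start = time.monotonic()
            result = func(*args, **kwargs)
            # print_only=True presumably means "just emit a log line";
            # the real module may additionally record telemetry when False
            logger.debug("%s took %.3f seconds", func.__name__, time.monotonic() - start)
            return result

        return wrapped  # type: ignore[return-value]

    return decorator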
5 changes: 5 additions & 0 deletions backend/danswer/context/search/pipeline.py
@@ -135,6 +135,7 @@ def predicted_flow(self) -> QueryFlow:

"""Retrieval and Postprocessing"""

@log_function_time(print_only=True)
def _get_chunks(self) -> list[InferenceChunk]:
if self._retrieved_chunks is not None:
return self._retrieved_chunks
@@ -306,6 +307,7 @@ def _get_sections(self) -> list[InferenceSection]:
return expanded_inference_sections

@property
@log_function_time(print_only=True)
def reranked_sections(self) -> list[InferenceSection]:
"""Reranking is always done at the chunk level since section merging could create arbitrarily
long sections which could be:
@@ -331,6 +333,7 @@ def reranked_sections(self) -> list[InferenceSection]:
return self._reranked_sections

@property
@log_function_time(print_only=True)
def final_context_sections(self) -> list[InferenceSection]:
if self._final_context_sections is not None:
return self._final_context_sections
@@ -339,6 +342,7 @@ def final_context_sections(self) -> list[InferenceSection]:
return self._final_context_sections

@property
@log_function_time(print_only=True)
def section_relevance(self) -> list[SectionRelevancePiece] | None:
if self._section_relevance is not None:
return self._section_relevance
@@ -393,6 +397,7 @@ def section_relevance(self) -> list[SectionRelevancePiece] | None:
return self._section_relevance

@property
@log_function_time(print_only=True)
def section_relevance_list(self) -> list[bool]:
llm_indices = relevant_sections_to_indices(
relevance_sections=self.section_relevance,
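Note the decorator ordering on the properties in pipeline.py: @log_function_time sits below @property, so it is applied first, to the raw getter, and property then wraps the timed function. Reversed, the timing wrapper would receive a property object instead of a callable and attribute access would break at call time. A minimal illustration using the sketch above (the class and attribute names here are invented, not from the repository):

class PipelineStage:
    def __init__(self) -> None:
        self._cached: int | None = None

    @property
    @log_function_time(print_only=True)  # applied first, to the raw getter
    def expensive_value(self) -> int:
        # cache-and-return pattern mirroring the pipeline's getters
        if self._cached is None:
            self._cached = sum(range(1_000_000))
        return self._cached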
2 changes: 2 additions & 0 deletions
@@ -42,6 +42,7 @@ def _log_top_section_links(search_flow: str, sections: list[InferenceSection]) -
logger.debug(f"Top links from {search_flow} search: {', '.join(top_links)}")


@log_function_time(print_only=True)
def cleanup_chunks(chunks: list[InferenceChunkUncleaned]) -> list[InferenceChunk]:
def _remove_title(chunk: InferenceChunkUncleaned) -> str:
if not chunk.title or not chunk.content:
@@ -244,6 +245,7 @@ def filter_sections(
]


@log_function_time(print_only=True)
def search_postprocessing(
search_query: SearchQuery,
retrieved_sections: list[InferenceSection],
8 changes: 8 additions & 0 deletions backend/danswer/context/search/retrieval/search_runner.py
@@ -1,4 +1,5 @@
import string
import time
from collections.abc import Callable

import nltk # type:ignore
@@ -85,6 +86,7 @@ def remove_stop_words_and_punctuation(keywords: list[str]) -> list[str]:
return keywords


@log_function_time(print_only=True)
def combine_retrieval_results(
chunk_sets: list[list[InferenceChunk]],
) -> list[InferenceChunk]:
@@ -256,7 +258,13 @@ def retrieve_chunks(
(q_copy, document_index, db_session),
)
)

start_time = time.time()
parallel_search_results = run_functions_tuples_in_parallel(run_queries)
end_time = time.time()
logger.info(
f"Parallel search execution took {end_time - start_time:.2f} seconds"
)
top_chunks = combine_retrieval_results(parallel_search_results)

if not top_chunks:
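In retrieve_chunks the parallel fan-out is timed inline with a time.time() pair rather than the decorator, presumably because only this sub-step of the function is of interest. One way to keep the start/end bookkeeping tidy is a small context manager; the helper below is hypothetical and not part of this diff:

import logging
import time
from collections.abc import Iterator
from contextlib import contextmanager

logger = logging.getLogger(__name__)


@contextmanager
def log_elapsed(label: str) -> Iterator[None]:
    # logs the block's wall-clock duration on exit, even if it raises
    start = time.monotonic()
    try:
        yield
    finally:
        logger.info("%s took %.2f seconds", label, time.monotonic() - start)


# usage mirroring the hunk above:
#   with log_elapsed("Parallel search execution"):
#       parallel_search_results = run_functions_tuples_in_parallel(run_queries)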
2 changes: 2 additions & 0 deletions backend/danswer/context/search/utils.py
@@ -8,6 +8,7 @@
from danswer.context.search.models import SavedSearchDocWithContent
from danswer.context.search.models import SearchDoc
from danswer.db.models import SearchDoc as DBSearchDoc
from danswer.utils.timing import log_function_time


T = TypeVar(
@@ -88,6 +89,7 @@ def drop_llm_indices(
return [i for i, val in enumerate(llm_bools) if val]


@log_function_time(print_only=True)
def inference_section_from_chunks(
center_chunk: InferenceChunk,
chunks: list[InferenceChunk],
3 changes: 3 additions & 0 deletions backend/danswer/document_index/vespa/index.py
@@ -72,6 +72,7 @@
from danswer.key_value_store.factory import get_kv_store
from danswer.utils.batching import batch_generator
from danswer.utils.logger import setup_logger
from danswer.utils.timing import log_function_time
from shared_configs.configs import MULTI_TENANT
from shared_configs.model_server_models import Embedding

@@ -660,6 +661,7 @@ def delete_single(self, doc_id: str) -> int:

return total_chunks_deleted

@log_function_time(print_only=True)
def id_based_retrieval(
self,
chunk_requests: list[VespaChunkRequest],
@@ -681,6 +683,7 @@ def id_based_retrieval(
get_large_chunks=get_large_chunks,
)

@log_function_time(print_only=True)
def hybrid_retrieval(
self,
query: str,
4 changes: 4 additions & 0 deletions backend/danswer/llm/answering/prune_and_merge.py
@@ -21,6 +21,7 @@
from danswer.prompts.prompt_utils import build_doc_context_str
from danswer.tools.tool_implementations.search.search_utils import section_to_dict
from danswer.utils.logger import setup_logger
from danswer.utils.timing import log_function_time


logger = setup_logger()
@@ -43,6 +44,7 @@ class ChunkRange(BaseModel):
end: int


@log_function_time(print_only=True)
def merge_chunk_intervals(chunk_ranges: list[ChunkRange]) -> list[ChunkRange]:
"""
This acts on a single document to merge the overlapping ranges of chunks
@@ -300,6 +302,7 @@ def prune_sections(
)


@log_function_time(print_only=True)
def _merge_doc_chunks(chunks: list[InferenceChunk]) -> InferenceSection:
# Assuming there are no duplicates by this point
sorted_chunks = sorted(chunks, key=lambda x: x.chunk_id)
@@ -327,6 +330,7 @@ def _merge_doc_chunks(chunks: list[InferenceChunk]) -> InferenceSection:
)


@log_function_time(print_only=True)
def _merge_sections(sections: list[InferenceSection]) -> list[InferenceSection]:
docs_map: dict[str, dict[int, InferenceChunk]] = defaultdict(dict)
doc_order: dict[str, int] = {}
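Per its docstring, merge_chunk_intervals in prune_and_merge.py collapses overlapping chunk ranges within a single document. The diff only shows that ChunkRange is a Pydantic model with an end: int field, so the sketch below demonstrates the standard interval-merge technique on a simplified type rather than the repository's exact logic:

from dataclasses import dataclass


@dataclass
class Range:
    start: int
    end: int


def merge_intervals(ranges: list[Range]) -> list[Range]:
    merged: list[Range] = []
    for r in sorted(ranges, key=lambda r: r.start):
        if merged and r.start <= merged[-1].end:
            # overlap: extend the previous interval instead of appending
            merged[-1].end = max(merged[-1].end, r.end)
        else:
            merged.append(Range(r.start, r.end))
    return merged


# merge_intervals([Range(0, 2), Range(1, 4), Range(6, 7)])
# -> [Range(0, 4), Range(6, 7)]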
2 changes: 2 additions & 0 deletions backend/danswer/secondary_llm_flows/query_expansion.py
@@ -15,6 +15,7 @@
from danswer.utils.logger import setup_logger
from danswer.utils.text_processing import count_punctuation
from danswer.utils.threadpool_concurrency import run_functions_tuples_in_parallel
from danswer.utils.timing import log_function_time

logger = setup_logger()

@@ -48,6 +49,7 @@ def _get_rephrase_messages() -> list[dict[str, str]]:
return model_output


@log_function_time(print_only=True)
def multilingual_query_expansion(
query: str,
expansion_languages: list[str],

0 comments on commit 7fa3c9c
