From a150b2abe0477aae56bc32b65ccf1e46adcf8cac Mon Sep 17 00:00:00 2001
From: Kritin Vongthongsri
Date: Fri, 22 Nov 2024 15:01:56 +0700
Subject: [PATCH] fix issue #1174

---
 .../synthesizer/chunking/context_generator.py | 13 ++++++++++---
 tests/test_context_generator.py               | 18 ++++++++++++++++++
 2 files changed, 28 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_context_generator.py

diff --git a/deepeval/synthesizer/chunking/context_generator.py b/deepeval/synthesizer/chunking/context_generator.py
index 5e42bd446..96905ab63 100644
--- a/deepeval/synthesizer/chunking/context_generator.py
+++ b/deepeval/synthesizer/chunking/context_generator.py
@@ -37,12 +37,17 @@ def __init__(
     ):
         from chromadb.api.models.Collection import Collection
 
-        self.model, self.using_native_model = initialize_model(model)
-        self.embedder = embedder
+        # Chunking parameters
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
-        self.document_paths: List[str] = document_paths
         self.total_chunks = 0
+        self.document_paths: List[str] = document_paths
+
+        # Model parameters
+        self.model, self.using_native_model = initialize_model(model)
+        self.embedder = embedder
+
+        # Quality parameters
         self.max_retries = max_retries
         self.filter_threshold = filter_threshold
         self.similarity_threshold = similarity_threshold
@@ -106,6 +111,7 @@ def generate_contexts(
         )
 
         # Generate contexts
+        self.total_chunks = 0
         for path, collection in self.source_files_to_collections_map.items():
             num_chunks = collection.count()
             min_num_context = min(num_context_per_document, num_chunks)
@@ -181,6 +187,7 @@ async def a_chunk_and_store(key, chunker: DocumentChunker):
         )
 
         # Generate contexts
+        self.total_chunks = 0
         tasks = [
             self._a_process_document_async(
                 path,
diff --git a/tests/test_context_generator.py b/tests/test_context_generator.py
new file mode 100644
index 000000000..d0e50dcac
--- /dev/null
+++ b/tests/test_context_generator.py
@@ -0,0 +1,18 @@
+from deepeval.synthesizer.chunking.context_generator import ContextGenerator
+from deepeval.models.openai_embedding_model import OpenAIEmbeddingModel
+from itertools import chain
+
+context_generator = ContextGenerator(
+    document_paths=["./synthesizer_data/pdf_example.pdf"],
+    embedder=OpenAIEmbeddingModel()
+)
+context_generator._load_docs()
+context_generator._load_docs()
+context_generator._load_docs()
+
+contexts, source_files, context_scores = context_generator.generate_contexts(num_context_per_document=10)
+print(f"Utilizing {len(set(chain.from_iterable(contexts)))} out of {context_generator.total_chunks} chunks.")
+
+context_generator._load_docs()
+contexts, source_files, context_scores = context_generator.generate_contexts(num_context_per_document=10)
+print(f"Utilizing {len(set(chain.from_iterable(contexts)))} out of {context_generator.total_chunks} chunks.")