From ee4fa1779687dac0169e733f256b5680cd39c26e Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Tue, 12 Mar 2024 15:47:27 +0000 Subject: [PATCH 01/16] make document intelligence loader use prebuilt-layout --- .../doc_loader/documentIntelligenceLoader.py | 372 ++++++++++++++++-- .../doc_loader/pdfLoader.py | 32 +- .../doc_loader/structuredLoader.py | 26 +- requirements.txt | 1 + 4 files changed, 381 insertions(+), 50 deletions(-) diff --git a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py index 23027f16..33fcdb23 100644 --- a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py +++ b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py @@ -1,54 +1,352 @@ -from rag_experiment_accelerator.utils.logging import get_logger +from concurrent.futures import ThreadPoolExecutor, as_completed +from contextlib import ExitStack +from tqdm import tqdm +import re +import os +from azure.ai.documentintelligence import DocumentIntelligenceClient from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader +from azure.core.credentials import AzureKeyCredential +from langchain_core.documents import Document from pathlib import Path +from langchain_community.document_loaders.base import BaseLoader +from typing import List, Iterator +from rag_experiment_accelerator.utils.logging import get_logger +from azure.ai.documentintelligence.models import DocumentParagraph + logger = get_logger(__name__) -def azure_document_intelligence_loader(file_path, endpoint, key): +class DocumentIntelligenceLoader(BaseLoader): + """ + Analyzes and loads documents and directories using Azure Document Intelligence. """ - Load a file using Azure Document Intelligence. - Args: - file_path (str): The path to the file. - endpoint (str): The Azure Document Intelligence endpoint. - key (str): The Azure Document Intelligence key. 
+ def __init__( + self, + path: str, + endpoint: str, + key: str, + glob_patterns: List[str] = None, + excluded_paragraph_roles=[], + patterns_to_remove: List[str] = [], + ): + """ + Initializes an instance of the DocumentIntelligenceLoader class. - Returns: - Document: A Document object. - """ + Parameters: + path: path of the document or directory to load from, when a directory path is provided a glob_pattern has to be provided as well + end_point: Azure Document Intelligence endpoint + key: Azure Document Intelligence key + glob_patterns: when the given path is a directory, glob_patterns is used to match the files that should be loaded + excluded_paragraph_roles: a list of paragraph roles to exclude. The full list of paragraph roles can be viewed here: https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-layout?view=doc-intel-4.0.0#paragraph-roles + patterns_to_remove: a list of specific regex patterns to be removed from the extracted text + """ + self.client = DocumentIntelligenceClient( + endpoint=endpoint, credential=AzureKeyCredential(key) + ) + self.path = path + self.endpoint = endpoint + self.key = key + self.patterns_to_remove = patterns_to_remove + self.glob_patterns = glob_patterns + self.excluded_paragraph_roles = excluded_paragraph_roles - document = [] - try: - loader = AzureAIDocumentIntelligenceLoader(file_path=file_path, api_key=key, api_endpoint=endpoint, api_model="prebuilt-read") - document += loader.load() - except Exception as e: - logger.warning(f"Failed to load {file_path}: {e}") - pass + def load(self) -> List[Document]: + documents = [] + file_paths = self._get_file_paths() - return document + with ExitStack() as stack: + executor = stack.enter_context(ThreadPoolExecutor()) + progress_bar = stack.enter_context( + tqdm(total=len(file_paths), desc="Analyzing documents") + ) + futures = { + executor.submit(self._analyze_document, file_path) + for file_path in file_paths + } -def 
azure_document_intelligence_directory_loader(pattern, folder_path, endpoint, key): - """ - Load pdf files from a folder using Azure Document Intelligence. + for future in as_completed(futures): + try: + documents += future.result() + except Exception as exc: + logger.error(f"Processing document generated an exception: {exc}") + progress_bar.update(1) - Args: - pattern (str): The file extension to look for. - folder_path (str): The path to the folder containing the files. - endpoint (str): The Azure Document Intelligence endpoint. - key (str): The Azure Document Intelligence key. + return documents - Returns: - list[Document]: A list of Document objects. - """ + def lazy_load(self) -> Iterator[Document]: + file_paths = self._get_file_paths() + for file_path in file_paths: + yield self._analyze_document(file_path) + + def _get_file_paths(self): + if not os.path.isdir(self.path): + return [self.path] + + directory = Path(self.path) + file_paths = [] + for pattern in self.glob_patterns: + file_paths += list(directory.rglob(f"*.{pattern}")) + + return [str(path) for path in file_paths] + + def _analyze_document(self, file_path: str): + documents = [] + try: + with open(file_path, "rb") as file: + content = file.read() + poller = self.client.begin_analyze_document( + "prebuilt-layout", + content, + content_type="application/octet-stream", + output_content_format="markdown", + ) + + result = poller.result() + paragraphs = self._substitute_table_paragraphs( + result.paragraphs, result.tables + ) + + relevant_paragraphs = [ + paragraph + for paragraph in paragraphs + if paragraph.role not in self.excluded_paragraph_roles + ] + paragraphs_by_role = self._get_paragraphs_by_role(result) + + paragraphs_by_page = self._split_paragraphs_by_page(relevant_paragraphs) + for page_number, page_paragraphs in paragraphs_by_page.items(): + documents.append( + self._convert_to_langchain_document( + page_paragraphs, file_path, paragraphs_by_role, page_number + ) + ) + + return documents + 
except Exception as exc: + logger.warning( + f"Failed to load {file_path} with Azure Document Intelligence using the 'prebuilt-layout' model: {exc}. Attempting to load using the simpler 'prebuilt-read' model..." + ) + return self._load_with_ocr(file_path) + + def _clean_content(self, content: str): + # Remove AI doc intelligence traces. + pattern = re.compile(r":selected:|:unselected:") + content = pattern.sub("", content) + # Remove specific regex patterns. + for regex_pattern in self.patterns_to_remove: + content = regex_pattern.sub("", content) + + return content + + def _get_paragraphs_by_role(self, result): + dict = {} + for paragraph in result.paragraphs: + if not paragraph.role or paragraph.role in self.excluded_paragraph_roles: + continue + paragraph_item = { + "content": paragraph.content, + "page": paragraph.bounding_regions[0].get("pageNumber"), + } + dict[paragraph.role] = dict.get(paragraph.role, []) + [paragraph_item] + + tables = [] + for table in result.tables: + table_item = { + "cells": table.cells, + "page": table.bounding_regions[0].get("pageNumber"), + } + tables.append(table_item) + dict["tables"] = tables + + return dict + + def _convert_to_langchain_document( + self, paragraphs, file_path, paragraphs_by_role, page_number + ): + content = "\n\n".join([paragraph.content for paragraph in paragraphs]) + clean_content = self._clean_content(content) + return Document( + page_content=clean_content, + metadata={ + "source": file_path, + "paragraphs_by_role": paragraphs_by_role, + "page": page_number - 1, + }, + ) + + def _is_intersecting_regions(self, bounding_region1, bounding_region2): + """ + Returns whether two bounding regions intersect or not + """ + for region1 in bounding_region1: + for region2 in bounding_region2: + if region1["pageNumber"] == region2[ + "pageNumber" + ] and self._is_intersecting_polygons(region1.polygon, region2.polygon): + return True + return False + + def _is_intersecting_polygons(self, polygon1, polygon2): + """ + 
Returns whether two polygons intersect or not + """ + x1_1, y1_1, x2_1, y2_1, x3_1, y3_1, x4_1, y4_1 = polygon1 + x1_2, y1_2, x2_2, y2_2, x3_2, y3_2, x4_2, y4_2 = polygon2 + + # Check for overlap along the x-axis + if max(x1_1, x2_1, x3_1, x4_1) < min(x1_2, x2_2, x3_2, x4_2) or min( + x1_1, x2_1, x3_1, x4_1 + ) > max(x1_2, x2_2, x3_2, x4_2): + return False + + # Check for overlap along the y-axis + if max(y1_1, y2_1, y3_1, y4_1) < min(y1_2, y2_2, y3_2, y4_2) or min( + y1_1, y2_1, y3_1, y4_1 + ) > max(y1_2, y2_2, y3_2, y4_2): + return False + + # If the boxes overlap along both axes, they intersect + return True + + def _assign_tables_to_paragraphs(self, paragraphs, tables): + """ + Returns a list that maps paragraph indexes to their tables indexes. + Indexes in the returned list match the indexes of the `paragraphs` list and the value at that index contains the index of the table in the `tables` list that the paragraph belongs to. + If the paragraph is not intersecting with any table, the index will be -1. + + For example, this assignments: [-1, 0, 0, 1, -1, -1, -1, 2, 2, 2, -1] means: + The paragraph at index 0 does not belong to any table. + The paragraphs at indexes 1 and 2 belong to table at index 0. 
+ The paragraph at index 3 belongs to table 1 + The rest of the paragraphs in the example belong to the table at index 2, or do not belong to any table + """ + paragraph_to_table = [-1] * len(paragraphs) + + for paragraph_index, paragraph in enumerate(paragraphs): + for table_index, table in enumerate(tables): + if self._is_intersecting_regions( + paragraph.bounding_regions, table.bounding_regions + ): + paragraph_to_table[paragraph_index] = table_index + else: + continue + + return paragraph_to_table + + def _convert_to_paragraph(self, table): + content = self._format_table(table) + return DocumentParagraph( + content=content, bounding_regions=table.bounding_regions, role="table" + ) + + def _format_table(self, table): + """ + Formats Azure Document Intelligence's tables to the following format: + + : , : , : , ... + : , : , : , ... + : , : , : , ... + ... + """ + table_headers = [] + for cell in table["cells"]: + if cell.get("kind") == "columnHeader": + table_headers.append(cell["content"]) + + content = table.get("caption", {}).get("content", "") + + previous_row_index = -1 + rows_to_span = 0 + spanning_content = "" + for cell in table.cells: + if cell.get("kind") == "columnHeader": + continue + + header = ( + f"{table_headers[cell['columnIndex']]}: " + if cell["columnIndex"] < len(table_headers) + else "" + ) + + # If the cell spans multiple rows, we need to combine the content of the spanning cells + if rows_to_span > 0: + spanning_content += cell.content + rows_to_span -= 1 + if rows_to_span == 0: + content += f"{header}{spanning_content}" + spanning_content = "" + else: + spanning_content += ", " + continue + else: + rows_to_span = cell.get("rowSpan", 0) + + is_new_row = previous_row_index != cell["rowIndex"] + if is_new_row: + content += "\n" if content else "" + previous_row_index = cell["rowIndex"] + + content += f"{header}{cell.content}" + content += ", " if cell["columnIndex"] < len(table_headers) - 1 else "" + return content + + def 
_substitute_table_paragraphs(self, paragraphs, tables): + """ + Returns a modified version of the `paragraphs` list, where paragraphs that are part of a table are combined and replaced with a formatted table. + """ + result = [] + paragraphs_to_tables = self._assign_tables_to_paragraphs(paragraphs, tables) + + last_table_index = None + for paragraph_index, table_index in enumerate(paragraphs_to_tables): + is_table = table_index != -1 + if not is_table: + result.append(paragraphs[paragraph_index]) + continue + + is_new_table = table_index != last_table_index + if is_new_table: + table = tables[table_index] + formatted_table = self._convert_to_paragraph(table) + result.append(formatted_table) + last_table_index = table_index + + return result + + def _split_paragraphs_by_page(self, paragraphs): + paragraphs_by_page = {} + for paragraph in paragraphs: + page_number = paragraph.bounding_regions[0]["pageNumber"] + is_new_page = page_number not in paragraphs_by_page + if is_new_page: + paragraphs_by_page[page_number] = [] + paragraphs_by_page[page_number].append(paragraph) + return paragraphs_by_page + + def _load_with_ocr(self, file_path): + """ + Loads a file with a simpler 'prebuilt-read' model which uses a simple OCR approach to load the file. + Some files may not be supported by the 'prebuilt-layout' model, but can be loaded with the 'prebuilt-read' model. 
+ """ - glob = f"**/[!.]*.{pattern}" - p = Path(folder_path) - documents = [] - items = p.glob(glob) - for i in items: - if i.is_file(): - documents += azure_document_intelligence_loader(i, endpoint, key) + document = [] + try: + loader = AzureAIDocumentIntelligenceLoader( + file_path=file_path, + api_key=self.key, + api_endpoint=self.endpoint, + api_model="prebuilt-read", + ) + document += loader.load() + except Exception as e: + logger.error( + f"Failed to load {file_path} with Azure Document Intelligence using the 'prebuilt-read' model: {e}" + ) - return documents + logger.info( + f'Successfully loaded {file_path} with Azure Document Intelligence using the "prebuilt-read" model.' + ) + return document diff --git a/rag_experiment_accelerator/doc_loader/pdfLoader.py b/rag_experiment_accelerator/doc_loader/pdfLoader.py index 9af03cd3..bae012a2 100644 --- a/rag_experiment_accelerator/doc_loader/pdfLoader.py +++ b/rag_experiment_accelerator/doc_loader/pdfLoader.py @@ -2,8 +2,12 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter from rag_experiment_accelerator.utils.logging import get_logger -from rag_experiment_accelerator.doc_loader.documentIntelligenceLoader import azure_document_intelligence_directory_loader -from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials +from rag_experiment_accelerator.doc_loader.documentIntelligenceLoader import ( + DocumentIntelligenceLoader, +) +from rag_experiment_accelerator.config.credentials import ( + AzureDocumentIntelligenceCredentials, +) import uuid import re @@ -29,9 +33,9 @@ def preprocess_pdf_content(content: str): # Output: "hello world openai" """ - content = re.sub(r'\n{2,}', '\n', content) - content = re.sub(r'\n{1,}', '', content) - content = re.sub(r'\\u[0-9a-fA-F]{4}', '', content) + content = re.sub(r"\n{2,}", "\n", content) + content = re.sub(r"\n{1,}", "", content) + content = re.sub(r"\\u[0-9a-fA-F]{4}", "", content) content = content.lower() return 
content @@ -63,7 +67,19 @@ def load_pdf_files( documents = [] for pattern in glob_patterns: if chunking_strategy == "azure-document-intelligence": - documents += azure_document_intelligence_directory_loader(pattern, folder_path, AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT, AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY) + loader = DocumentIntelligenceLoader( + folder_path, + AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT, + AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY, + glob_patterns=[pattern], + excluded_paragraph_roles=[ + "pageHeader", + "pageFooter", + "footnote", + "pageNumber", + ], + ) + documents += loader.load() else: # using langchain loader = PyPDFDirectoryLoader( @@ -86,7 +102,9 @@ def load_pdf_files( docs = text_splitter.split_documents(documents) docsList = [] for doc in docs: - docsList.append(dict({str(uuid.uuid4()): preprocess_pdf_content(doc.page_content)})) + docsList.append( + dict({str(uuid.uuid4()): preprocess_pdf_content(doc.page_content)}) + ) logger.info(f"Split {len(documents)} PDF pages into {len(docs)} chunks") diff --git a/rag_experiment_accelerator/doc_loader/structuredLoader.py b/rag_experiment_accelerator/doc_loader/structuredLoader.py index 5dfe7f07..221d31e0 100644 --- a/rag_experiment_accelerator/doc_loader/structuredLoader.py +++ b/rag_experiment_accelerator/doc_loader/structuredLoader.py @@ -5,8 +5,12 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter from rag_experiment_accelerator.utils.logging import get_logger -from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials -from rag_experiment_accelerator.doc_loader.documentIntelligenceLoader import azure_document_intelligence_loader +from rag_experiment_accelerator.config.credentials import ( + AzureDocumentIntelligenceCredentials, +) +from rag_experiment_accelerator.doc_loader.documentIntelligenceLoader 
import ( + DocumentIntelligenceLoader, +) import uuid logger = get_logger(__name__) @@ -59,7 +63,19 @@ def load_structured_files( for file in matching_files: if chunking_strategy == "azure-document-intelligence": - document = azure_document_intelligence_loader(file, AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT, AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY) + loader = DocumentIntelligenceLoader( + folder_path, + AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT, + AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY, + glob_patterns=pattern, + excluded_paragraph_roles=[ + "pageHeader", + "pageFooter", + "footnote", + "pageNumber", + ], + ) + document = loader.load() else: # Use the loader defined in function call. document = loader(file, **loader_kwargs).load() @@ -89,8 +105,6 @@ def load_structured_files( for doc in docs: docsList.append(dict({str(uuid.uuid4()): doc.page_content})) - logger.info( - f"Split {len(documents)} {file_format} files into {len(docs)} chunks" - ) + logger.info(f"Split {len(documents)} {file_format} files into {len(docs)} chunks") return docsList diff --git a/requirements.txt b/requirements.txt index c565ebb2..c6ca1dd2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,6 +22,7 @@ hnswlib==0.7.0 pypdf==3.17.0 langchain==0.0.329 langchain-community==0.0.16 +azure-ai-documentintelligence==1.0.0b1 sentence-transformers==2.2.2 beautifulsoup4==4.12.2 lxml==4.9.3 From 1525a5aeb69a43a64214bbfd6ae96e4ec25c438a Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Sun, 17 Mar 2024 14:31:05 +0000 Subject: [PATCH 02/16] add option to split by page to doc intelligence loader --- .../doc_loader/documentIntelligenceLoader.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py index 
33fcdb23..91a3b2f5 100644 --- a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py +++ b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py @@ -28,6 +28,7 @@ def __init__( endpoint: str, key: str, glob_patterns: List[str] = None, + split_documents_by_page=False, excluded_paragraph_roles=[], patterns_to_remove: List[str] = [], ): @@ -39,6 +40,7 @@ def __init__( end_point: Azure Document Intelligence endpoint key: Azure Document Intelligence key glob_patterns: when the given path is a directory, glob_patterns is used to match the files that should be loaded + split_documents_by_page: if True, each page in the document will be loaded into separate LangChain document, otherwise (default) the entire document will be loaded into a single LangChain document excluded_paragraph_roles: a list of paragraph roles to exclude. The full list of paragraph roles can be viewed here: https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-layout?view=doc-intel-4.0.0#paragraph-roles patterns_to_remove: a list of specific regex patterns to be removed from the extracted text """ @@ -50,6 +52,7 @@ def __init__( self.key = key self.patterns_to_remove = patterns_to_remove self.glob_patterns = glob_patterns + self.split_documents_by_page = split_documents_by_page self.excluded_paragraph_roles = excluded_paragraph_roles def load(self) -> List[Document]: @@ -116,11 +119,18 @@ def _analyze_document(self, file_path: str): ] paragraphs_by_role = self._get_paragraphs_by_role(result) - paragraphs_by_page = self._split_paragraphs_by_page(relevant_paragraphs) - for page_number, page_paragraphs in paragraphs_by_page.items(): + if self.split_documents_by_page: + paragraphs_by_page = self._split_paragraphs_by_page(relevant_paragraphs) + for page_number, page_paragraphs in paragraphs_by_page.items(): + documents.append( + self._convert_to_langchain_document( + page_paragraphs, file_path, paragraphs_by_role, page_number + ) + ) + else: 
documents.append( self._convert_to_langchain_document( - page_paragraphs, file_path, paragraphs_by_role, page_number + relevant_paragraphs, file_path, paragraphs_by_role, 1 ) ) From 3d0fdf8137830090eaf3967f2775ee7a98588ca2 Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Mon, 18 Mar 2024 15:14:32 +0000 Subject: [PATCH 03/16] add unit tests --- .../doc_loader/documentIntelligenceLoader.py | 32 +- .../simple_response.json | 1555 +++++++++++++++++ .../test_document_intelligence_loader.py | 53 + 3 files changed, 1628 insertions(+), 12 deletions(-) create mode 100644 rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json create mode 100644 rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py diff --git a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py index 91a3b2f5..e7274523 100644 --- a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py +++ b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py @@ -98,16 +98,8 @@ def _get_file_paths(self): def _analyze_document(self, file_path: str): documents = [] try: - with open(file_path, "rb") as file: - content = file.read() - poller = self.client.begin_analyze_document( - "prebuilt-layout", - content, - content_type="application/octet-stream", - output_content_format="markdown", - ) + result = self._call_document_intelligence(file_path) - result = poller.result() paragraphs = self._substitute_table_paragraphs( result.paragraphs, result.tables ) @@ -115,7 +107,7 @@ def _analyze_document(self, file_path: str): relevant_paragraphs = [ paragraph for paragraph in paragraphs - if paragraph.role not in self.excluded_paragraph_roles + if paragraph["role"] not in self.excluded_paragraph_roles ] paragraphs_by_role = self._get_paragraphs_by_role(result) @@ -141,6 +133,19 @@ def _analyze_document(self, file_path: str): ) return 
self._load_with_ocr(file_path) + def _call_document_intelligence(self, file_path): + with open(file_path, "rb") as file: + content = file.read() + poller = self.client.begin_analyze_document( + "prebuilt-layout", + content, + content_type="application/octet-stream", + output_content_format="markdown", + ) + + result = poller.result() + return result + def _clean_content(self, content: str): # Remove AI doc intelligence traces. pattern = re.compile(r":selected:|:unselected:") @@ -154,13 +159,16 @@ def _clean_content(self, content: str): def _get_paragraphs_by_role(self, result): dict = {} for paragraph in result.paragraphs: - if not paragraph.role or paragraph.role in self.excluded_paragraph_roles: + if ( + not paragraph["role"] + or paragraph["role"] in self.excluded_paragraph_roles + ): continue paragraph_item = { "content": paragraph.content, "page": paragraph.bounding_regions[0].get("pageNumber"), } - dict[paragraph.role] = dict.get(paragraph.role, []) + [paragraph_item] + dict[paragraph["role"]] = dict.get(paragraph["role"], []) + [paragraph_item] tables = [] for table in result.tables: diff --git a/rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json b/rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json new file mode 100644 index 00000000..31cf0d86 --- /dev/null +++ b/rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json @@ -0,0 +1,1555 @@ +{ + "apiVersion": "2023-10-31-preview", + "modelId": "prebuilt-layout", + "content": "This is the Title\n\\nSome text\\n\\n| Col 1 | Col 2 | Col 3 |\\n| - | - | - |\\n| Row 1 Col 1 | Row 1 Col 2 | Row 1 Col 3 |\\n| Row 2 Col 1 | Row 2 Col 2 | Row 2 Col 3 |\\n\\nThis is the end.\\n", + "pages": [ + { + "pageNumber": 1, + "angle": 0, + "width": 8.5, + "height": 11, + "unit": "inch", + "words": [ + { + "content": "This", + "polygon": [ + 0.9869, + 1.0203, + 1.2815, + 
1.0214, + 1.2784, + 1.1667, + 0.9838, + 1.1739 + ], + "confidence": 0.993, + "span": { + "offset": 0, + "length": 4 + } + }, + { + "content": "is", + "polygon": [ + 1.3222, + 1.0215, + 1.4323, + 1.022, + 1.4293, + 1.1648, + 1.3191, + 1.1661 + ], + "confidence": 0.993, + "span": { + "offset": 5, + "length": 2 + } + }, + { + "content": "the", + "polygon": [ + 1.4659, + 1.0221, + 1.691, + 1.0232, + 1.688, + 1.1655, + 1.4628, + 1.1647 + ], + "confidence": 0.999, + "span": { + "offset": 8, + "length": 3 + } + }, + { + "content": "Title", + "polygon": [ + 1.7221, + 1.0234, + 2.0438, + 1.0253, + 2.0438, + 1.1731, + 1.7191, + 1.1659 + ], + "confidence": 0.993, + "span": { + "offset": 12, + "length": 5 + } + }, + { + "content": "Some", + "polygon": [ + 0.9898, + 1.3361, + 1.3741, + 1.3411, + 1.3723, + 1.4728, + 0.9898, + 1.4753 + ], + "confidence": 0.993, + "span": { + "offset": 19, + "length": 4 + } + }, + { + "content": "text", + "polygon": [ + 1.4114, + 1.3414, + 1.6761, + 1.3426, + 1.6761, + 1.4727, + 1.4095, + 1.4726 + ], + "confidence": 0.993, + "span": { + "offset": 24, + "length": 4 + } + }, + { + "content": "Col", + "polygon": [ + 1.0697, + 1.6519, + 1.3192, + 1.6519, + 1.3184, + 1.7848, + 1.0697, + 1.7831 + ], + "confidence": 0.997, + "span": { + "offset": 32, + "length": 3 + } + }, + { + "content": "1", + "polygon": [ + 1.3594, + 1.6519, + 1.4351, + 1.6519, + 1.4342, + 1.7856, + 1.3585, + 1.7851 + ], + "confidence": 0.996, + "span": { + "offset": 36, + "length": 1 + } + }, + { + "content": "Col", + "polygon": [ + 3.2281, + 1.6519, + 3.4843, + 1.6519, + 3.4834, + 1.7896, + 3.2281, + 1.7879 + ], + "confidence": 0.998, + "span": { + "offset": 40, + "length": 3 + } + }, + { + "content": "2", + "polygon": [ + 3.5166, + 1.6519, + 3.6066, + 1.6519, + 3.6057, + 1.7903, + 3.5157, + 1.7898 + ], + "confidence": 0.996, + "span": { + "offset": 44, + "length": 1 + } + }, + { + "content": "Col", + "polygon": [ + 5.3961, + 1.6567, + 5.6483, + 1.6534, + 5.65, + 1.7871, + 5.3961, 
+ 1.7903 + ], + "confidence": 0.999, + "span": { + "offset": 48, + "length": 3 + } + }, + { + "content": "3", + "polygon": [ + 5.6817, + 1.653, + 5.7775, + 1.6519, + 5.7781, + 1.7855, + 5.6834, + 1.7867 + ], + "confidence": 0.996, + "span": { + "offset": 52, + "length": 1 + } + }, + { + "content": "Row", + "polygon": [ + 1.0698, + 1.848, + 1.3652, + 1.8458, + 1.366, + 1.9852, + 1.0728, + 1.9854 + ], + "confidence": 0.999, + "span": { + "offset": 72, + "length": 3 + } + }, + { + "content": "1", + "polygon": [ + 1.4061, + 1.8457, + 1.4516, + 1.8455, + 1.4517, + 1.9852, + 1.4066, + 1.9852 + ], + "confidence": 0.993, + "span": { + "offset": 76, + "length": 1 + } + }, + { + "content": "Col", + "polygon": [ + 1.513, + 1.8457, + 1.7402, + 1.847, + 1.7381, + 1.9856, + 1.5125, + 1.9853 + ], + "confidence": 0.997, + "span": { + "offset": 78, + "length": 3 + } + }, + { + "content": "1", + "polygon": [ + 1.7834, + 1.8475, + 1.8576, + 1.8482, + 1.8576, + 1.9859, + 1.7809, + 1.9857 + ], + "confidence": 0.996, + "span": { + "offset": 82, + "length": 1 + } + }, + { + "content": "Row", + "polygon": [ + 3.2327, + 1.8463, + 3.5305, + 1.8466, + 3.5294, + 1.9861, + 3.2327, + 1.9849 + ], + "confidence": 0.999, + "span": { + "offset": 86, + "length": 3 + } + }, + { + "content": "1", + "polygon": [ + 3.5741, + 1.8467, + 3.6199, + 1.8467, + 3.6184, + 1.9861, + 3.5727, + 1.9861 + ], + "confidence": 0.997, + "span": { + "offset": 90, + "length": 1 + } + }, + { + "content": "Col", + "polygon": [ + 3.6794, + 1.8469, + 3.9039, + 1.8475, + 3.9013, + 1.9861, + 3.6777, + 1.9861 + ], + "confidence": 0.996, + "span": { + "offset": 92, + "length": 3 + } + }, + { + "content": "2", + "polygon": [ + 3.9406, + 1.8476, + 4.0256, + 1.8479, + 4.0256, + 1.9861, + 3.9379, + 1.9861 + ], + "confidence": 0.975, + "span": { + "offset": 96, + "length": 1 + } + }, + { + "content": "Row", + "polygon": [ + 5.3987, + 1.8475, + 5.6963, + 1.8469, + 5.697, + 1.9878, + 5.4017, + 1.9837 + ], + "confidence": 0.995, + 
"span": { + "offset": 100, + "length": 3 + } + }, + { + "content": "1", + "polygon": [ + 5.7378, + 1.8468, + 5.7839, + 1.8467, + 5.784, + 1.9885, + 5.7382, + 1.9881 + ], + "confidence": 0.973, + "span": { + "offset": 104, + "length": 1 + } + }, + { + "content": "Col", + "polygon": [ + 5.8416, + 1.8464, + 6.07, + 1.8451, + 6.0678, + 1.9881, + 5.8412, + 1.9886 + ], + "confidence": 0.996, + "span": { + "offset": 106, + "length": 3 + } + }, + { + "content": "3", + "polygon": [ + 6.1046, + 1.8449, + 6.1888, + 1.8443, + 6.1888, + 1.9872, + 6.1022, + 1.9878 + ], + "confidence": 0.997, + "span": { + "offset": 110, + "length": 1 + } + }, + { + "content": "Row", + "polygon": [ + 1.0657, + 2.0421, + 1.3659, + 2.0438, + 1.3629, + 2.1768, + 1.0627, + 2.1771 + ], + "confidence": 0.995, + "span": { + "offset": 116, + "length": 3 + } + }, + { + "content": "2", + "polygon": [ + 1.4032, + 2.0439, + 1.4776, + 2.0441, + 1.4746, + 2.1768, + 1.4001, + 2.1768 + ], + "confidence": 0.996, + "span": { + "offset": 120, + "length": 1 + } + }, + { + "content": "Col", + "polygon": [ + 1.5149, + 2.0441, + 1.7647, + 2.0439, + 1.7616, + 2.1771, + 1.5119, + 2.1769 + ], + "confidence": 0.999, + "span": { + "offset": 122, + "length": 3 + } + }, + { + "content": "1", + "polygon": [ + 1.7932, + 2.0439, + 1.8576, + 2.0438, + 1.8576, + 2.1771, + 1.7901, + 2.1771 + ], + "confidence": 0.996, + "span": { + "offset": 126, + "length": 1 + } + }, + { + "content": "Row", + "polygon": [ + 3.2317, + 2.0408, + 3.5306, + 2.0421, + 3.5283, + 2.1808, + 3.2316, + 2.1796 + ], + "confidence": 0.998, + "span": { + "offset": 130, + "length": 3 + } + }, + { + "content": "2", + "polygon": [ + 3.5671, + 2.0422, + 3.6401, + 2.0422, + 3.6369, + 2.181, + 3.5645, + 2.1809 + ], + "confidence": 0.996, + "span": { + "offset": 134, + "length": 1 + } + }, + { + "content": "Col", + "polygon": [ + 3.6812, + 2.042, + 3.907, + 2.0407, + 3.9019, + 2.1804, + 3.6777, + 2.181 + ], + "confidence": 0.993, + "span": { + "offset": 136, + 
"length": 3 + } + }, + { + "content": "2", + "polygon": [ + 3.9413, + 2.0404, + 4.0303, + 2.0396, + 4.0287, + 2.1799, + 3.9358, + 2.1803 + ], + "confidence": 0.996, + "span": { + "offset": 140, + "length": 1 + } + }, + { + "content": "Row", + "polygon": [ + 5.3976, + 2.042, + 5.6964, + 2.0422, + 5.6964, + 2.1815, + 5.3976, + 2.1789 + ], + "confidence": 0.999, + "span": { + "offset": 144, + "length": 3 + } + }, + { + "content": "2", + "polygon": [ + 5.7286, + 2.0422, + 5.8021, + 2.042, + 5.8021, + 2.1819, + 5.7286, + 2.1817 + ], + "confidence": 0.995, + "span": { + "offset": 148, + "length": 1 + } + }, + { + "content": "Col", + "polygon": [ + 5.8435, + 2.0417, + 6.0664, + 2.04, + 6.0664, + 2.1819, + 5.8435, + 2.1819 + ], + "confidence": 0.997, + "span": { + "offset": 150, + "length": 3 + } + }, + { + "content": "3", + "polygon": [ + 6.1032, + 2.0396, + 6.1935, + 2.0387, + 6.1935, + 2.1819, + 6.1032, + 2.1819 + ], + "confidence": 0.997, + "span": { + "offset": 154, + "length": 1 + } + }, + { + "content": "This", + "polygon": [ + 0.9809, + 2.5439, + 1.2621, + 2.5463, + 1.2612, + 2.6912, + 0.9809, + 2.6958 + ], + "confidence": 0.993, + "span": { + "offset": 159, + "length": 4 + } + }, + { + "content": "is", + "polygon": [ + 1.305, + 2.5466, + 1.4051, + 2.547, + 1.4038, + 2.6897, + 1.304, + 2.6907 + ], + "confidence": 0.997, + "span": { + "offset": 164, + "length": 2 + } + }, + { + "content": "the", + "polygon": [ + 1.4408, + 2.547, + 1.6481, + 2.5469, + 1.6461, + 2.689, + 1.4394, + 2.6895 + ], + "confidence": 0.999, + "span": { + "offset": 167, + "length": 3 + } + }, + { + "content": "end.", + "polygon": [ + 1.691, + 2.5468, + 1.9961, + 2.5447, + 1.9961, + 2.6912, + 1.6889, + 2.6891 + ], + "confidence": 0.993, + "span": { + "offset": 171, + "length": 4 + } + } + ], + "lines": [ + { + "content": "This is the Title", + "polygon": [ + 0.9837, + 1.0169, + 2.039, + 1.0169, + 2.039, + 1.1697, + 0.9837, + 1.1697 + ], + "spans": [ + { + "offset": 0, + "length": 17 + } + ] + }, 
+ { + "content": "Some text", + "polygon": [ + 0.9885, + 1.332, + 1.6713, + 1.3368, + 1.6713, + 1.4753, + 0.9885, + 1.4705 + ], + "spans": [ + { + "offset": 19, + "length": 9 + } + ] + }, + { + "content": "Col 1", + "polygon": [ + 1.0744, + 1.6519, + 1.4326, + 1.6567, + 1.4326, + 1.7856, + 1.0697, + 1.7856 + ], + "spans": [ + { + "offset": 32, + "length": 5 + } + ] + }, + { + "content": "Col 2", + "polygon": [ + 3.2281, + 1.6519, + 3.6006, + 1.6519, + 3.6053, + 1.7904, + 3.2281, + 1.7856 + ], + "spans": [ + { + "offset": 40, + "length": 5 + } + ] + }, + { + "content": "Col 3", + "polygon": [ + 5.3961, + 1.6567, + 5.7733, + 1.6519, + 5.7685, + 1.7856, + 5.3961, + 1.7904 + ], + "spans": [ + { + "offset": 48, + "length": 5 + } + ] + }, + { + "content": "Row 1 Col 1", + "polygon": [ + 1.0697, + 1.8429, + 1.8528, + 1.8429, + 1.8528, + 1.9813, + 1.0697, + 1.9813 + ], + "spans": [ + { + "offset": 72, + "length": 11 + } + ] + }, + { + "content": "Row 1 Col 2", + "polygon": [ + 3.2281, + 1.8429, + 4.0208, + 1.8429, + 4.0208, + 1.9813, + 3.2281, + 1.9813 + ], + "spans": [ + { + "offset": 86, + "length": 11 + } + ] + }, + { + "content": "Row 1 Col 3", + "polygon": [ + 5.3961, + 1.8429, + 6.184, + 1.8429, + 6.184, + 1.9861, + 5.3961, + 1.9861 + ], + "spans": [ + { + "offset": 100, + "length": 11 + } + ] + }, + { + "content": "Row 2 Col 1", + "polygon": [ + 1.0601, + 2.0386, + 1.8528, + 2.0386, + 1.8528, + 2.1723, + 1.0601, + 2.1723 + ], + "spans": [ + { + "offset": 116, + "length": 11 + } + ] + }, + { + "content": "Row 2 Col 2", + "polygon": [ + 3.2281, + 2.0386, + 4.0256, + 2.0386, + 4.0256, + 2.1771, + 3.2281, + 2.1771 + ], + "spans": [ + { + "offset": 130, + "length": 11 + } + ] + }, + { + "content": "Row 2 Col 3", + "polygon": [ + 5.3961, + 2.0386, + 6.1888, + 2.0386, + 6.1888, + 2.1771, + 5.3961, + 2.1771 + ], + "spans": [ + { + "offset": 144, + "length": 11 + } + ] + }, + { + "content": "This is the end.", + "polygon": [ + 0.9789, + 2.5399, + 1.9913, + 2.5399, + 1.9913, 
+ 2.6879, + 0.9789, + 2.6927 + ], + "spans": [ + { + "offset": 159, + "length": 16 + } + ] + } + ], + "spans": [ + { + "offset": 0, + "length": 176 + } + ] + } + ], + "tables": [ + { + "rowCount": 3, + "columnCount": 3, + "cells": [ + { + "kind": "columnHeader", + "rowIndex": 0, + "columnIndex": 0, + "content": "Col 1", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9964, + 1.6192, + 3.1622, + 1.6192, + 3.1622, + 1.8195, + 0.9964, + 1.8195 + ] + } + ], + "spans": [ + { + "offset": 32, + "length": 5 + } + ], + "elements": [ + "/paragraphs/2" + ] + }, + { + "kind": "columnHeader", + "rowIndex": 0, + "columnIndex": 1, + "content": "Col 2", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 3.1622, + 1.6192, + 5.328, + 1.6192, + 5.3346, + 1.8195, + 3.1622, + 1.8195 + ] + } + ], + "spans": [ + { + "offset": 40, + "length": 5 + } + ], + "elements": [ + "/paragraphs/3" + ] + }, + { + "kind": "columnHeader", + "rowIndex": 0, + "columnIndex": 2, + "content": "Col 3", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 5.328, + 1.6192, + 7.4871, + 1.6192, + 7.4871, + 1.8195, + 5.3346, + 1.8195 + ] + } + ], + "spans": [ + { + "offset": 48, + "length": 5 + } + ], + "elements": [ + "/paragraphs/4" + ] + }, + { + "rowIndex": 1, + "columnIndex": 0, + "content": "Row 1 Col 1", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9964, + 1.8195, + 3.1622, + 1.8195, + 3.1622, + 2.0065, + 0.9964, + 2.0065 + ] + } + ], + "spans": [ + { + "offset": 72, + "length": 11 + } + ], + "elements": [ + "/paragraphs/5" + ] + }, + { + "rowIndex": 1, + "columnIndex": 1, + "content": "Row 1 Col 2", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 3.1622, + 1.8195, + 5.3346, + 1.8195, + 5.3346, + 2.0065, + 3.1622, + 2.0065 + ] + } + ], + "spans": [ + { + "offset": 86, + "length": 11 + } + ], + "elements": [ + "/paragraphs/6" + ] + }, + { + "rowIndex": 1, + "columnIndex": 2, + "content": "Row 1 Col 3", + "bounding_regions": [ + { 
+ "pageNumber": 1, + "polygon": [ + 5.3346, + 1.8195, + 7.4871, + 1.8195, + 7.4937, + 2.0065, + 5.3346, + 2.0065 + ] + } + ], + "spans": [ + { + "offset": 100, + "length": 11 + } + ], + "elements": [ + "/paragraphs/7" + ] + }, + { + "rowIndex": 2, + "columnIndex": 0, + "content": "Row 2 Col 1", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9964, + 2.0065, + 3.1622, + 2.0065, + 3.1555, + 2.2068, + 0.9964, + 2.2068 + ] + } + ], + "spans": [ + { + "offset": 116, + "length": 11 + } + ], + "elements": [ + "/paragraphs/8" + ] + }, + { + "rowIndex": 2, + "columnIndex": 1, + "content": "Row 2 Col 2", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 3.1622, + 2.0065, + 5.3346, + 2.0065, + 5.3346, + 2.2068, + 3.1555, + 2.2068 + ] + } + ], + "spans": [ + { + "offset": 130, + "length": 11 + } + ], + "elements": [ + "/paragraphs/9" + ] + }, + { + "rowIndex": 2, + "columnIndex": 2, + "content": "Row 2 Col 3", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 5.3346, + 2.0065, + 7.4937, + 2.0065, + 7.4937, + 2.2135, + 5.3346, + 2.2068 + ] + } + ], + "spans": [ + { + "offset": 144, + "length": 11 + } + ], + "elements": [ + "/paragraphs/10" + ] + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.999, + 1.6078, + 7.4993, + 1.61, + 7.5003, + 2.2221, + 1.0011, + 2.2207 + ] + } + ], + "spans": [ + { + "offset": 30, + "length": 127 + } + ] + } + ], + "paragraphs": [ + { + "spans": [ + { + "offset": 0, + "length": 17 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9837, + 1.0169, + 2.039, + 1.0169, + 2.039, + 1.1697, + 0.9837, + 1.1697 + ] + } + ], + "content": "This is the Title" + }, + { + "spans": [ + { + "offset": 19, + "length": 9 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9885, + 1.332, + 1.6723, + 1.3368, + 1.6713, + 1.4753, + 0.9875, + 1.4705 + ] + } + ], + "content": "Some text" + }, + { + "spans": [ + { + "offset": 32, + "length": 5 + } + ], + 
"bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9964, + 1.6192, + 3.1622, + 1.6192, + 3.1622, + 1.8195, + 0.9964, + 1.8195 + ] + } + ], + "content": "Col 1" + }, + { + "spans": [ + { + "offset": 40, + "length": 5 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 3.1622, + 1.6192, + 5.328, + 1.6192, + 5.3346, + 1.8195, + 3.1622, + 1.8195 + ] + } + ], + "content": "Col 2" + }, + { + "spans": [ + { + "offset": 48, + "length": 5 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 5.328, + 1.6192, + 7.4871, + 1.6192, + 7.4871, + 1.8195, + 5.3346, + 1.8195 + ] + } + ], + "content": "Col 3" + }, + { + "spans": [ + { + "offset": 72, + "length": 11 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9964, + 1.8195, + 3.1622, + 1.8195, + 3.1622, + 2.0065, + 0.9964, + 2.0065 + ] + } + ], + "content": "Row 1 Col 1" + }, + { + "spans": [ + { + "offset": 86, + "length": 11 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 3.1622, + 1.8195, + 5.3346, + 1.8195, + 5.3346, + 2.0065, + 3.1622, + 2.0065 + ] + } + ], + "content": "Row 1 Col 2" + }, + { + "spans": [ + { + "offset": 100, + "length": 11 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 5.3346, + 1.8195, + 7.4871, + 1.8195, + 7.4937, + 2.0065, + 5.3346, + 2.0065 + ] + } + ], + "content": "Row 1 Col 3" + }, + { + "spans": [ + { + "offset": 116, + "length": 11 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9964, + 2.0065, + 3.1622, + 2.0065, + 3.1555, + 2.2068, + 0.9964, + 2.2068 + ] + } + ], + "content": "Row 2 Col 1" + }, + { + "spans": [ + { + "offset": 130, + "length": 11 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 3.1622, + 2.0065, + 5.3346, + 2.0065, + 5.3346, + 2.2068, + 3.1555, + 2.2068 + ] + } + ], + "content": "Row 2 Col 2" + }, + { + "spans": [ + { + "offset": 144, + "length": 11 + } + ], + "bounding_regions": [ + { + "pageNumber": 
1, + "polygon": [ + 5.3346, + 2.0065, + 7.4937, + 2.0065, + 7.4937, + 2.2135, + 5.3346, + 2.2068 + ] + } + ], + "content": "Row 2 Col 3" + }, + { + "spans": [ + { + "offset": 159, + "length": 16 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9789, + 2.5399, + 1.9913, + 2.5399, + 1.9913, + 2.6927, + 0.9789, + 2.6927 + ] + } + ], + "content": "This is the end." + } + ] +} \ No newline at end of file diff --git a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py new file mode 100644 index 00000000..e85806ef --- /dev/null +++ b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py @@ -0,0 +1,53 @@ +import json +from rag_experiment_accelerator.doc_loader.documentIntelligenceLoader import ( + DocumentIntelligenceLoader, +) +from unittest.mock import patch +from types import SimpleNamespace + + +class SimplePythonObject: + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + + def __getitem__(self, key): + return getattr(self, key, None) + + def get(self, key, default=None): + return getattr(self, key, default) + + +def mock_simple_response(): + with open( + "rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json", + "r", + ) as f: + return json.load(f, object_hook=lambda d: SimplePythonObject(**d)) + + +@patch( + "rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths", + return_value=["path/to/some/file"], +) +@patch( + "rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence" +) +def test_DocumentIntelligenceLoader(mock_document_intelligence, _): + mock_document_intelligence.return_value = mock_simple_response() + + loader = DocumentIntelligenceLoader( + path="path", + endpoint="endpoint", + key="key", + glob_patterns=["pdf"], + 
) + + documents = loader.load() + + assert len(documents) == 1, "No documents were loaded" + assert ( + documents[0].page_content + == "This is the Title\n\nSome text\n\nCol 1: Row 1 Col 1, Col 2: Row 1 Col 2, Col 3: Row 1 Col 3\nCol 1: Row 2 Col 1, Col 2: Row 2 Col 2, Col 3: Row 2 Col 3\n\nThis is the end." + ) + assert documents[0].metadata["source"] == "path/to/some/file" + assert documents[0].metadata["page"] == 0 From 7f01c5ee395fde360b6232d16f85e352cc9e4063 Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Mon, 18 Mar 2024 15:26:41 +0000 Subject: [PATCH 04/16] add unit test --- .../test_document_intelligence_loader.py | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py index e85806ef..8a013762 100644 --- a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py +++ b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py @@ -3,7 +3,6 @@ DocumentIntelligenceLoader, ) from unittest.mock import patch -from types import SimpleNamespace class SimplePythonObject: @@ -25,14 +24,9 @@ def mock_simple_response(): return json.load(f, object_hook=lambda d: SimplePythonObject(**d)) -@patch( - "rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths", - return_value=["path/to/some/file"], -) -@patch( - "rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence" -) -def test_DocumentIntelligenceLoader(mock_document_intelligence, _): +@patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths", return_value=["path/to/some/file"],) +@patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence") 
+def test__load(mock_document_intelligence, _): mock_document_intelligence.return_value = mock_simple_response() loader = DocumentIntelligenceLoader( @@ -51,3 +45,21 @@ def test_DocumentIntelligenceLoader(mock_document_intelligence, _): ) assert documents[0].metadata["source"] == "path/to/some/file" assert documents[0].metadata["page"] == 0 + + +@patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths", return_value=["path/to/some/file"],) +@patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence", side_effect=Exception("Error")) +@patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._load_with_ocr") +def test_load_with_ocr_is_used_as_fallback(mock_load_with_ocr, _, __): + + loader = DocumentIntelligenceLoader( + path="path", + endpoint="endpoint", + key="key", + glob_patterns=["pdf"], + ) + + loader.load() + + mock_load_with_ocr.assert_called_once() + mock_load_with_ocr.assert_called_with('path/to/some/file') From f7b59b0cff951cb14f5880af7fffb84893cc1dba Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Mon, 18 Mar 2024 15:38:16 +0000 Subject: [PATCH 05/16] add unit test --- .../doc_loader/documentIntelligenceLoader.py | 3 ++- .../test_document_intelligence_loader.py | 21 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py index e7274523..c8044be8 100644 --- a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py +++ b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py @@ -152,7 +152,8 @@ def _clean_content(self, content: str): content = pattern.sub("", content) # Remove specific regex patterns. 
for regex_pattern in self.patterns_to_remove: - content = regex_pattern.sub("", content) + pattern = re.compile(regex_pattern) + content = pattern.sub("", content) return content diff --git a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py index 8a013762..4727155a 100644 --- a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py +++ b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py @@ -63,3 +63,24 @@ def test_load_with_ocr_is_used_as_fallback(mock_load_with_ocr, _, __): mock_load_with_ocr.assert_called_once() mock_load_with_ocr.assert_called_with('path/to/some/file') + + +@patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths", return_value=["path/to/some/file"],) +@patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence") +def test_content_cleaning(mock_document_intelligence, _): + mock_document_intelligence.return_value = mock_simple_response() + + loader = DocumentIntelligenceLoader( + path="path", + endpoint="endpoint", + key="key", + glob_patterns=["pdf"], + patterns_to_remove=["Ti.*e"] + ) + + documents = loader.load() + + assert ( + documents[0].page_content + == "This is the \n\nSome text\n\nCol 1: Row 1 Col 1, Col 2: Row 1 Col 2, Col 3: Row 1 Col 3\nCol 1: Row 2 Col 1, Col 2: Row 2 Col 2, Col 3: Row 2 Col 3\n\nThis is the end." 
+ ) From a440d2da4f9d5c9834e49fcf3e16f5a2bbd45c3c Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Mon, 18 Mar 2024 16:25:05 +0000 Subject: [PATCH 06/16] remove formatting of spanning rows --- .../doc_loader/documentIntelligenceLoader.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py index c8044be8..f6a22a4a 100644 --- a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py +++ b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py @@ -278,8 +278,6 @@ def _format_table(self, table): content = table.get("caption", {}).get("content", "") previous_row_index = -1 - rows_to_span = 0 - spanning_content = "" for cell in table.cells: if cell.get("kind") == "columnHeader": continue @@ -290,19 +288,6 @@ def _format_table(self, table): else "" ) - # If the cell spans multiple rows, we need to combine the content of the spanning cells - if rows_to_span > 0: - spanning_content += cell.content - rows_to_span -= 1 - if rows_to_span == 0: - content += f"{header}{spanning_content}" - spanning_content = "" - else: - spanning_content += ", " - continue - else: - rows_to_span = cell.get("rowSpan", 0) - is_new_row = previous_row_index != cell["rowIndex"] if is_new_row: content += "\n" if content else "" From 87c06802d91e80df16bdb25286047f9ff321e812 Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Mon, 18 Mar 2024 16:44:29 +0000 Subject: [PATCH 07/16] add unit test for tables without headers --- .../doc_loader/documentIntelligenceLoader.py | 2 +- .../table_without_headers.json | 1158 +++++++++++++++++ .../test_document_intelligence_loader.py | 36 +- 3 files changed, 1189 insertions(+), 7 deletions(-) create mode 100644 rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/table_without_headers.json diff --git 
a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py index f6a22a4a..f950357d 100644 --- a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py +++ b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py @@ -294,7 +294,7 @@ def _format_table(self, table): previous_row_index = cell["rowIndex"] content += f"{header}{cell.content}" - content += ", " if cell["columnIndex"] < len(table_headers) - 1 else "" + content += ", " if cell["columnIndex"] < len(table_headers) - 1 else " " return content def _substitute_table_paragraphs(self, paragraphs, tables): diff --git a/rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/table_without_headers.json b/rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/table_without_headers.json new file mode 100644 index 00000000..d7bae5ae --- /dev/null +++ b/rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/table_without_headers.json @@ -0,0 +1,1158 @@ +{ + "apiVersion": "2023-10-31-preview", + "modelId": "prebuilt-layout", + "content": "Table without Headers\n===\\n\\nTesting a table that has no headers\\n\\n|||\\n| - | - |\\n| A | B |\\n| C | D |\\n| E | F |\\n| G | H |\\n\\nThis is the end.\\n", + "pages": [ + { + "pageNumber": 1, + "angle": 0, + "width": 8.5, + "height": 11, + "unit": "inch", + "words": [ + { + "content": "Table", + "polygon": [ + 0.9839, + 1.026, + 1.3776, + 1.0228, + 1.3768, + 1.1729, + 0.9839, + 1.1712 + ], + "confidence": 0.997, + "span": { + "offset": 0, + "length": 5 + } + }, + { + "content": "without", + "polygon": [ + 1.4098, + 1.0226, + 1.9396, + 1.0214, + 1.9378, + 1.1745, + 1.4089, + 1.173 + ], + "confidence": 0.995, + "span": { + "offset": 6, + "length": 7 + } + }, + { + "content": "Headers", + "polygon": [ + 1.9842, + 1.0215, + 2.5739, + 1.0245, + 2.5739, + 1.1745, + 1.9823, + 1.1745 + ], + 
"confidence": 0.997, + "span": { + "offset": 14, + "length": 7 + } + }, + { + "content": "Testing", + "polygon": [ + 0.9898, + 1.3413, + 1.4657, + 1.3387, + 1.4626, + 1.4834, + 0.9868, + 1.4877 + ], + "confidence": 0.996, + "span": { + "offset": 27, + "length": 7 + } + }, + { + "content": "a", + "polygon": [ + 1.4956, + 1.3386, + 1.5668, + 1.3383, + 1.5638, + 1.4826, + 1.4925, + 1.4832 + ], + "confidence": 0.995, + "span": { + "offset": 35, + "length": 1 + } + }, + { + "content": "table", + "polygon": [ + 1.6059, + 1.3381, + 1.9346, + 1.3375, + 1.9316, + 1.4799, + 1.6029, + 1.4823 + ], + "confidence": 0.997, + "span": { + "offset": 37, + "length": 5 + } + }, + { + "content": "that", + "polygon": [ + 1.9737, + 1.3374, + 2.2404, + 1.3376, + 2.2373, + 1.4781, + 1.9707, + 1.4796 + ], + "confidence": 0.993, + "span": { + "offset": 43, + "length": 4 + } + }, + { + "content": "has", + "polygon": [ + 2.268, + 1.3377, + 2.507, + 1.3383, + 2.504, + 1.4768, + 2.2649, + 1.4779 + ], + "confidence": 0.999, + "span": { + "offset": 48, + "length": 3 + } + }, + { + "content": "no", + "polygon": [ + 2.5346, + 1.3384, + 2.6978, + 1.339, + 2.6948, + 1.476, + 2.5316, + 1.4767 + ], + "confidence": 0.997, + "span": { + "offset": 52, + "length": 2 + } + }, + { + "content": "headers", + "polygon": [ + 2.7369, + 1.3392, + 3.2949, + 1.343, + 3.2948, + 1.4744, + 2.7339, + 1.4759 + ], + "confidence": 0.995, + "span": { + "offset": 55, + "length": 7 + } + }, + { + "content": "A", + "polygon": [ + 1.0697, + 1.6615, + 1.1843, + 1.6615, + 1.1843, + 1.7856, + 1.0697, + 1.7856 + ], + "confidence": 0.995, + "span": { + "offset": 80, + "length": 1 + } + }, + { + "content": "B", + "polygon": [ + 3.2376, + 1.6615, + 3.3379, + 1.6615, + 3.3379, + 1.7808, + 3.2376, + 1.7808 + ], + "confidence": 0.993, + "span": { + "offset": 84, + "length": 1 + } + }, + { + "content": "C", + "polygon": [ + 1.084, + 1.8524, + 1.189, + 1.8524, + 1.189, + 1.9766, + 1.084, + 1.9766 + ], + "confidence": 0.975, + "span": { + 
"offset": 90, + "length": 1 + } + }, + { + "content": "D", + "polygon": [ + 3.2281, + 1.8524, + 3.3522, + 1.8524, + 3.3496, + 1.9765, + 3.2281, + 1.9739 + ], + "confidence": 0.993, + "span": { + "offset": 94, + "length": 1 + } + }, + { + "content": "E", + "polygon": [ + 1.0744, + 2.0482, + 1.1699, + 2.0482, + 1.1666, + 2.1674, + 1.0744, + 2.1648 + ], + "confidence": 0.995, + "span": { + "offset": 100, + "length": 1 + } + }, + { + "content": "F", + "polygon": [ + 3.2329, + 2.053, + 3.3246, + 2.053, + 3.3246, + 2.1675, + 3.2329, + 2.1675 + ], + "confidence": 0.994, + "span": { + "offset": 104, + "length": 1 + } + }, + { + "content": "G", + "polygon": [ + 1.0792, + 2.2439, + 1.189, + 2.2439, + 1.1864, + 2.3536, + 1.0792, + 2.351 + ], + "confidence": 0.995, + "span": { + "offset": 110, + "length": 1 + } + }, + { + "content": "H", + "polygon": [ + 3.236, + 2.2439, + 3.3522, + 2.2439, + 3.3522, + 2.3633, + 3.236, + 2.3633 + ], + "confidence": 0.993, + "span": { + "offset": 114, + "length": 1 + } + }, + { + "content": "This", + "polygon": [ + 0.9839, + 2.7348, + 1.2519, + 2.737, + 1.2511, + 2.8816, + 0.9839, + 2.887 + ], + "confidence": 0.993, + "span": { + "offset": 119, + "length": 4 + } + }, + { + "content": "is", + "polygon": [ + 1.2806, + 2.7371, + 1.4051, + 2.7376, + 1.4038, + 2.8798, + 1.2797, + 2.8812 + ], + "confidence": 0.997, + "span": { + "offset": 124, + "length": 2 + } + }, + { + "content": "the", + "polygon": [ + 1.441, + 2.7375, + 1.6492, + 2.7372, + 1.6471, + 2.8798, + 1.4396, + 2.8797 + ], + "confidence": 0.999, + "span": { + "offset": 127, + "length": 3 + } + }, + { + "content": "end.", + "polygon": [ + 1.6898, + 2.737, + 1.9865, + 2.7346, + 1.9859, + 2.8841, + 1.6877, + 2.88 + ], + "confidence": 0.993, + "span": { + "offset": 131, + "length": 4 + } + } + ], + "lines": [ + { + "content": "Table without Headers\n===", + "polygon": [ + 0.9837, + 1.0169, + 2.5691, + 1.0217, + 2.5691, + 1.1697, + 0.9837, + 1.1697 + ], + "spans": [ + { + "offset": 0, + 
"length": 25 + } + ] + }, + { + "content": "Testing a table that has no headers", + "polygon": [ + 0.9837, + 1.3368, + 3.2902, + 1.3368, + 3.2902, + 1.4753, + 0.9837, + 1.4848 + ], + "spans": [ + { + "offset": 27, + "length": 35 + } + ] + }, + { + "content": "A", + "polygon": [ + 1.0744, + 1.6615, + 1.1795, + 1.6615, + 1.1747, + 1.7856, + 1.0697, + 1.7856 + ], + "spans": [ + { + "offset": 80, + "length": 1 + } + ] + }, + { + "content": "B", + "polygon": [ + 3.2424, + 1.6615, + 3.3331, + 1.6615, + 3.3331, + 1.7808, + 3.2376, + 1.7808 + ], + "spans": [ + { + "offset": 84, + "length": 1 + } + ] + }, + { + "content": "C", + "polygon": [ + 1.0888, + 1.8524, + 1.1843, + 1.8524, + 1.1843, + 1.9766, + 1.084, + 1.9766 + ], + "spans": [ + { + "offset": 90, + "length": 1 + } + ] + }, + { + "content": "D", + "polygon": [ + 3.2329, + 1.8524, + 3.3475, + 1.8524, + 3.3427, + 1.9766, + 3.2281, + 1.9718 + ], + "spans": [ + { + "offset": 94, + "length": 1 + } + ] + }, + { + "content": "E", + "polygon": [ + 1.0792, + 2.0482, + 1.1652, + 2.053, + 1.1652, + 2.1675, + 1.0744, + 2.1675 + ], + "spans": [ + { + "offset": 100, + "length": 1 + } + ] + }, + { + "content": "F", + "polygon": [ + 3.2329, + 2.053, + 3.3236, + 2.053, + 3.3236, + 2.1675, + 3.2329, + 2.1675 + ], + "spans": [ + { + "offset": 104, + "length": 1 + } + ] + }, + { + "content": "G", + "polygon": [ + 1.084, + 2.2439, + 1.1843, + 2.2439, + 1.1795, + 2.3537, + 1.0792, + 2.349 + ], + "spans": [ + { + "offset": 110, + "length": 1 + } + ] + }, + { + "content": "H", + "polygon": [ + 3.2281, + 2.2439, + 3.3475, + 2.2439, + 3.3475, + 2.3633, + 3.2281, + 2.3633 + ], + "spans": [ + { + "offset": 114, + "length": 1 + } + ] + }, + { + "content": "This is the end.", + "polygon": [ + 0.9837, + 2.7309, + 1.9817, + 2.7309, + 1.9817, + 2.8789, + 0.9837, + 2.8837 + ], + "spans": [ + { + "offset": 119, + "length": 16 + } + ] + } + ], + "spans": [ + { + "offset": 0, + "length": 136 + } + ] + } + ], + "tables": [ + { + "rowCount": 4, + 
"columnCount": 2, + "cells": [ + { + "rowIndex": 0, + "columnIndex": 0, + "content": "A", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9933, + 1.6233, + 3.166, + 1.628, + 3.166, + 1.819, + 0.9933, + 1.8142 + ] + } + ], + "spans": [ + { + "offset": 80, + "length": 1 + } + ], + "elements": [ + "/paragraphs/2" + ] + }, + { + "rowIndex": 0, + "columnIndex": 1, + "content": "B", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 3.166, + 1.628, + 5.3197, + 1.628, + 5.3244, + 1.8142, + 3.166, + 1.819 + ] + } + ], + "spans": [ + { + "offset": 84, + "length": 1 + } + ], + "elements": [ + "/paragraphs/3" + ] + }, + { + "rowIndex": 1, + "columnIndex": 0, + "content": "C", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9933, + 1.8142, + 3.166, + 1.819, + 3.1612, + 2.01, + 0.9933, + 2.01 + ] + } + ], + "spans": [ + { + "offset": 90, + "length": 1 + } + ], + "elements": [ + "/paragraphs/4" + ] + }, + { + "rowIndex": 1, + "columnIndex": 1, + "content": "D", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 3.166, + 1.819, + 5.3244, + 1.8142, + 5.3292, + 2.01, + 3.1612, + 2.01 + ] + } + ], + "spans": [ + { + "offset": 94, + "length": 1 + } + ], + "elements": [ + "/paragraphs/5" + ] + }, + { + "rowIndex": 2, + "columnIndex": 0, + "content": "E", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9933, + 2.01, + 3.1612, + 2.01, + 3.1612, + 2.2057, + 0.9933, + 2.2057 + ] + } + ], + "spans": [ + { + "offset": 100, + "length": 1 + } + ], + "elements": [ + "/paragraphs/6" + ] + }, + { + "rowIndex": 2, + "columnIndex": 1, + "content": "F", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 3.1612, + 2.01, + 5.3292, + 2.01, + 5.3292, + 2.2057, + 3.1612, + 2.2057 + ] + } + ], + "spans": [ + { + "offset": 104, + "length": 1 + } + ], + "elements": [ + "/paragraphs/7" + ] + }, + { + "rowIndex": 3, + "columnIndex": 0, + "content": "G", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 
0.9933, + 2.2057, + 3.1612, + 2.2057, + 3.1612, + 2.3967, + 0.998, + 2.3967 + ] + } + ], + "spans": [ + { + "offset": 110, + "length": 1 + } + ], + "elements": [ + "/paragraphs/8" + ] + }, + { + "rowIndex": 3, + "columnIndex": 1, + "content": "H", + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 3.1612, + 2.2057, + 5.3292, + 2.2057, + 5.334, + 2.3967, + 3.1612, + 2.3967 + ] + } + ], + "spans": [ + { + "offset": 114, + "length": 1 + } + ], + "elements": [ + "/paragraphs/9" + ] + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 1.0016, + 1.6182, + 5.3415, + 1.6188, + 5.3409, + 2.3975, + 1.0018, + 2.3976 + ] + } + ], + "spans": [ + { + "offset": 64, + "length": 53 + } + ] + } + ], + "paragraphs": [ + { + "spans": [ + { + "offset": 0, + "length": 25 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9837, + 1.0169, + 2.5691, + 1.0169, + 2.5691, + 1.1697, + 0.9837, + 1.1697 + ] + } + ], + "role": "title", + "content": "Table without Headers\n===" + }, + { + "spans": [ + { + "offset": 27, + "length": 35 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9837, + 1.3368, + 3.2902, + 1.3368, + 3.2902, + 1.4848, + 0.9837, + 1.4848 + ] + } + ], + "content": "Testing a table that has no headers" + }, + { + "spans": [ + { + "offset": 80, + "length": 1 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9933, + 1.6233, + 3.166, + 1.628, + 3.166, + 1.819, + 0.9933, + 1.8142 + ] + } + ], + "content": "A" + }, + { + "spans": [ + { + "offset": 84, + "length": 1 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 3.166, + 1.628, + 5.3197, + 1.628, + 5.3244, + 1.8142, + 3.166, + 1.819 + ] + } + ], + "content": "B" + }, + { + "spans": [ + { + "offset": 90, + "length": 1 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9933, + 1.8142, + 3.166, + 1.819, + 3.1612, + 2.01, + 0.9933, + 2.01 + ] + } + ], + "content": "C" + }, + { + 
"spans": [ + { + "offset": 94, + "length": 1 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 3.166, + 1.819, + 5.3244, + 1.8142, + 5.3292, + 2.01, + 3.1612, + 2.01 + ] + } + ], + "content": "D" + }, + { + "spans": [ + { + "offset": 100, + "length": 1 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9933, + 2.01, + 3.1612, + 2.01, + 3.1612, + 2.2057, + 0.9933, + 2.2057 + ] + } + ], + "content": "E" + }, + { + "spans": [ + { + "offset": 104, + "length": 1 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 3.1612, + 2.01, + 5.3292, + 2.01, + 5.3292, + 2.2057, + 3.1612, + 2.2057 + ] + } + ], + "content": "F" + }, + { + "spans": [ + { + "offset": 110, + "length": 1 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9933, + 2.2057, + 3.1612, + 2.2057, + 3.1612, + 2.3967, + 0.998, + 2.3967 + ] + } + ], + "content": "G" + }, + { + "spans": [ + { + "offset": 114, + "length": 1 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 3.1612, + 2.2057, + 5.3292, + 2.2057, + 5.334, + 2.3967, + 3.1612, + 2.3967 + ] + } + ], + "content": "H" + }, + { + "spans": [ + { + "offset": 119, + "length": 16 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9837, + 2.7309, + 1.9817, + 2.7309, + 1.9817, + 2.8837, + 0.9837, + 2.8837 + ] + } + ], + "content": "This is the end." 
+ } + ], + "contentFormat": "markdown" +} \ No newline at end of file diff --git a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py index 4727155a..08d71f28 100644 --- a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py +++ b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py @@ -16,9 +16,9 @@ def get(self, key, default=None): return getattr(self, key, default) -def mock_simple_response(): +def mock_simple_response(file_name): with open( - "rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json", + f"rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/{file_name}", "r", ) as f: return json.load(f, object_hook=lambda d: SimplePythonObject(**d)) @@ -27,7 +27,8 @@ def mock_simple_response(): @patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths", return_value=["path/to/some/file"],) @patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence") def test__load(mock_document_intelligence, _): - mock_document_intelligence.return_value = mock_simple_response() + mock_document_intelligence.return_value = mock_simple_response( + 'simple_response.json') loader = DocumentIntelligenceLoader( path="path", @@ -41,7 +42,7 @@ def test__load(mock_document_intelligence, _): assert len(documents) == 1, "No documents were loaded" assert ( documents[0].page_content - == "This is the Title\n\nSome text\n\nCol 1: Row 1 Col 1, Col 2: Row 1 Col 2, Col 3: Row 1 Col 3\nCol 1: Row 2 Col 1, Col 2: Row 2 Col 2, Col 3: Row 2 Col 3\n\nThis is the end." 
+ == "This is the Title\n\nSome text\n\nCol 1: Row 1 Col 1, Col 2: Row 1 Col 2, Col 3: Row 1 Col 3 \nCol 1: Row 2 Col 1, Col 2: Row 2 Col 2, Col 3: Row 2 Col 3 \n\nThis is the end." ) assert documents[0].metadata["source"] == "path/to/some/file" assert documents[0].metadata["page"] == 0 @@ -68,7 +69,8 @@ def test_load_with_ocr_is_used_as_fallback(mock_load_with_ocr, _, __): @patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths", return_value=["path/to/some/file"],) @patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence") def test_content_cleaning(mock_document_intelligence, _): - mock_document_intelligence.return_value = mock_simple_response() + mock_document_intelligence.return_value = mock_simple_response( + 'simple_response.json') loader = DocumentIntelligenceLoader( path="path", @@ -82,5 +84,27 @@ def test_content_cleaning(mock_document_intelligence, _): assert ( documents[0].page_content - == "This is the \n\nSome text\n\nCol 1: Row 1 Col 1, Col 2: Row 1 Col 2, Col 3: Row 1 Col 3\nCol 1: Row 2 Col 1, Col 2: Row 2 Col 2, Col 3: Row 2 Col 3\n\nThis is the end." + == "This is the \n\nSome text\n\nCol 1: Row 1 Col 1, Col 2: Row 1 Col 2, Col 3: Row 1 Col 3 \nCol 1: Row 2 Col 1, Col 2: Row 2 Col 2, Col 3: Row 2 Col 3 \n\nThis is the end." 
+ ) + + +@patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths", return_value=["path/to/some/file"],) +@patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence") +def test_table_without_headers(mock_document_intelligence, _): + mock_document_intelligence.return_value = mock_simple_response( + 'table_without_headers.json') + + loader = DocumentIntelligenceLoader( + path="path", + endpoint="endpoint", + key="key", + glob_patterns=["pdf"], + patterns_to_remove=["Ti.*e"] + ) + + documents = loader.load() + + assert ( + documents[0].page_content + == "Table without Headers\n===\n\nTesting a table that has no headers\n\nA B \nC D \nE F \nG H \n\nThis is the end." ) From 419620909625782d093a4598fc19b866e9dce4dd Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Mon, 18 Mar 2024 17:20:51 +0000 Subject: [PATCH 08/16] add unit tests for multipages --- .../multiple_pages.json | 1581 +++++++++++++++++ .../test_document_intelligence_loader.py | 46 +- 2 files changed, 1626 insertions(+), 1 deletion(-) create mode 100644 rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/multiple_pages.json diff --git a/rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/multiple_pages.json b/rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/multiple_pages.json new file mode 100644 index 00000000..c23a5910 --- /dev/null +++ b/rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/multiple_pages.json @@ -0,0 +1,1581 @@ +{ + "apiVersion": "2023-10-31-preview", + "modelId": "prebuilt-layout", + "content": "Title for page number one Some text for the first page\n\\n# Title for page number two\\n\\nSome text for the 2nd page. 
Here we also have a table:\\n\\n| Name | Age |\\n| - | - |\\n| Alice | 25 |\\n| Bob | 32 |\\n\\nTitle for page number three This is the end - at page 3.\\n===\\n", + "pages": [ + { + "pageNumber": 1, + "angle": 0.30557239055633545, + "width": 8.5, + "height": 11, + "unit": "inch", + "words": [ + { + "content": "Title", + "polygon": [ + 0.9798, + 1.0215, + 1.2837, + 1.0215, + 1.2837, + 1.1807, + 0.9798, + 1.1762 + ], + "confidence": 0.996, + "span": { + "offset": 0, + "length": 5 + } + }, + { + "content": "for", + "polygon": [ + 1.3152, + 1.0215, + 1.5143, + 1.0215, + 1.5143, + 1.1831, + 1.3152, + 1.181 + ], + "confidence": 0.999, + "span": { + "offset": 6, + "length": 3 + } + }, + { + "content": "page", + "polygon": [ + 1.5457, + 1.0215, + 1.8811, + 1.0217, + 1.8811, + 1.184, + 1.5457, + 1.1833 + ], + "confidence": 0.993, + "span": { + "offset": 10, + "length": 4 + } + }, + { + "content": "number", + "polygon": [ + 1.9125, + 1.0217, + 2.4679, + 1.0222, + 2.4679, + 1.1824, + 1.9125, + 1.184 + ], + "confidence": 0.998, + "span": { + "offset": 15, + "length": 6 + } + }, + { + "content": "one", + "polygon": [ + 2.4994, + 1.0222, + 2.7792, + 1.0225, + 2.7792, + 1.1786, + 2.4994, + 1.1821 + ], + "confidence": 0.999, + "span": { + "offset": 22, + "length": 3 + } + }, + { + "content": "Some", + "polygon": [ + 0.9828, + 1.3283, + 1.3779, + 1.3283, + 1.3755, + 1.4882, + 0.9798, + 1.4837 + ], + "confidence": 0.993, + "span": { + "offset": 26, + "length": 4 + } + }, + { + "content": "text", + "polygon": [ + 1.4119, + 1.3284, + 1.6579, + 1.3292, + 1.656, + 1.4909, + 1.4096, + 1.4886 + ], + "confidence": 0.993, + "span": { + "offset": 31, + "length": 4 + } + }, + { + "content": "for", + "polygon": [ + 1.6893, + 1.3293, + 1.8619, + 1.3303, + 1.8604, + 1.4925, + 1.6874, + 1.4912 + ], + "confidence": 0.999, + "span": { + "offset": 36, + "length": 3 + } + }, + { + "content": "the", + "polygon": [ + 1.8933, + 1.3305, + 2.1105, + 1.3321, + 2.1094, + 1.494, + 1.8918, + 1.4927 + ], + 
"confidence": 0.999, + "span": { + "offset": 40, + "length": 3 + } + }, + { + "content": "first", + "polygon": [ + 2.1471, + 1.3324, + 2.4035, + 1.335, + 2.4029, + 1.4944, + 2.1461, + 1.4942 + ], + "confidence": 0.995, + "span": { + "offset": 44, + "length": 5 + } + }, + { + "content": "page", + "polygon": [ + 2.4428, + 1.3354, + 2.7744, + 1.3397, + 2.7744, + 1.4944, + 2.4422, + 1.4944 + ], + "confidence": 0.993, + "span": { + "offset": 50, + "length": 4 + } + } + ], + "lines": [ + { + "content": "Title for page number one", + "polygon": [ + 0.9789, + 1.0169, + 2.7744, + 1.0217, + 2.7744, + 1.1793, + 0.9789, + 1.1793 + ], + "spans": [ + { + "offset": 0, + "length": 25 + } + ] + }, + { + "content": "Some text for the first page", + "polygon": [ + 0.9789, + 1.3273, + 2.7697, + 1.332, + 2.7697, + 1.4944, + 0.9789, + 1.48 + ], + "spans": [ + { + "offset": 26, + "length": 28 + } + ] + } + ], + "spans": [ + { + "offset": 0, + "length": 55 + } + ] + }, + { + "pageNumber": 2, + "angle": 0.07827287167310715, + "width": 8.5, + "height": 11, + "unit": "inch", + "words": [ + { + "content": "Title", + "polygon": [ + 0.9828, + 1.0245, + 1.2902, + 1.0232, + 1.2902, + 1.1789, + 0.9828, + 1.1735 + ], + "confidence": 0.997, + "span": { + "offset": 58, + "length": 5 + } + }, + { + "content": "for", + "polygon": [ + 1.3238, + 1.0231, + 1.5201, + 1.0224, + 1.5201, + 1.1817, + 1.3238, + 1.1794 + ], + "confidence": 0.995, + "span": { + "offset": 64, + "length": 3 + } + }, + { + "content": "page", + "polygon": [ + 1.5511, + 1.0223, + 1.8894, + 1.0215, + 1.8894, + 1.1834, + 1.5511, + 1.182 + ], + "confidence": 0.993, + "span": { + "offset": 68, + "length": 4 + } + }, + { + "content": "number", + "polygon": [ + 1.923, + 1.0214, + 2.4706, + 1.0207, + 2.4706, + 1.1797, + 1.923, + 1.1834 + ], + "confidence": 0.998, + "span": { + "offset": 73, + "length": 6 + } + }, + { + "content": "two", + "polygon": [ + 2.5016, + 1.0206, + 2.7744, + 1.0204, + 2.7744, + 1.1746, + 2.5016, + 1.1794 + ], + 
"confidence": 0.999, + "span": { + "offset": 80, + "length": 3 + } + }, + { + "content": "Some", + "polygon": [ + 0.9898, + 1.3293, + 1.3766, + 1.3298, + 1.3735, + 1.4844, + 0.9868, + 1.4803 + ], + "confidence": 0.993, + "span": { + "offset": 85, + "length": 4 + } + }, + { + "content": "text", + "polygon": [ + 1.4127, + 1.3298, + 1.6498, + 1.3301, + 1.6468, + 1.4869, + 1.4096, + 1.4848 + ], + "confidence": 0.993, + "span": { + "offset": 90, + "length": 4 + } + }, + { + "content": "for", + "polygon": [ + 1.6808, + 1.3301, + 1.8716, + 1.3304, + 1.8685, + 1.4886, + 1.6777, + 1.4871 + ], + "confidence": 0.999, + "span": { + "offset": 95, + "length": 3 + } + }, + { + "content": "the", + "polygon": [ + 1.9025, + 1.3304, + 2.1113, + 1.3306, + 2.1083, + 1.49, + 1.8994, + 1.4888 + ], + "confidence": 0.999, + "span": { + "offset": 99, + "length": 3 + } + }, + { + "content": "2nd", + "polygon": [ + 2.1526, + 1.3307, + 2.3279, + 1.3309, + 2.3248, + 1.491, + 2.1495, + 1.4902 + ], + "confidence": 0.938, + "span": { + "offset": 103, + "length": 3 + } + }, + { + "content": "page.", + "polygon": [ + 2.3691, + 1.3309, + 2.7172, + 1.3312, + 2.7141, + 1.4921, + 2.3661, + 1.4912 + ], + "confidence": 0.996, + "span": { + "offset": 107, + "length": 5 + } + }, + { + "content": "Here", + "polygon": [ + 2.7687, + 1.3313, + 3.0755, + 1.3316, + 3.0725, + 1.4924, + 2.7657, + 1.4922 + ], + "confidence": 0.993, + "span": { + "offset": 113, + "length": 4 + } + }, + { + "content": "we", + "polygon": [ + 3.1116, + 1.3316, + 3.2972, + 1.3317, + 3.2942, + 1.4921, + 3.1086, + 1.4924 + ], + "confidence": 0.997, + "span": { + "offset": 118, + "length": 2 + } + }, + { + "content": "also", + "polygon": [ + 3.3385, + 1.3318, + 3.6066, + 1.332, + 3.6036, + 1.4913, + 3.3354, + 1.4921 + ], + "confidence": 0.993, + "span": { + "offset": 121, + "length": 4 + } + }, + { + "content": "have", + "polygon": [ + 3.6504, + 1.332, + 3.9521, + 1.3322, + 3.949, + 1.4897, + 3.6474, + 1.4911 + ], + "confidence": 0.993, + 
"span": { + "offset": 126, + "length": 4 + } + }, + { + "content": "a", + "polygon": [ + 3.9933, + 1.3322, + 4.0603, + 1.3323, + 4.0573, + 1.4889, + 3.9903, + 1.4894 + ], + "confidence": 0.996, + "span": { + "offset": 131, + "length": 1 + } + }, + { + "content": "table:", + "polygon": [ + 4.099, + 1.3323, + 4.484, + 1.3324, + 4.484, + 1.4857, + 4.096, + 1.4887 + ], + "confidence": 0.993, + "span": { + "offset": 133, + "length": 6 + } + }, + { + "content": "Name", + "polygon": [ + 1.0687, + 1.657, + 1.5042, + 1.6553, + 1.5042, + 1.7996, + 1.0627, + 1.7933 + ], + "confidence": 0.993, + "span": { + "offset": 143, + "length": 4 + } + }, + { + "content": "Age", + "polygon": [ + 3.2185, + 1.6615, + 3.5146, + 1.6615, + 3.5134, + 1.8094, + 3.2185, + 1.807 + ], + "confidence": 0.993, + "span": { + "offset": 150, + "length": 3 + } + }, + { + "content": "Alice", + "polygon": [ + 1.0553, + 1.8429, + 1.4135, + 1.8429, + 1.4135, + 1.9813, + 1.0553, + 1.9789 + ], + "confidence": 0.995, + "span": { + "offset": 168, + "length": 5 + } + }, + { + "content": "25", + "polygon": [ + 3.2281, + 1.8524, + 3.4185, + 1.8524, + 3.4185, + 1.9766, + 3.2281, + 1.9766 + ], + "confidence": 0.997, + "span": { + "offset": 176, + "length": 2 + } + }, + { + "content": "Bob", + "polygon": [ + 1.0693, + 2.0386, + 1.3463, + 2.0386, + 1.3451, + 2.1771, + 1.0682, + 2.1747 + ], + "confidence": 0.999, + "span": { + "offset": 183, + "length": 3 + } + }, + { + "content": "32", + "polygon": [ + 3.2281, + 2.0434, + 3.4067, + 2.0434, + 3.4085, + 2.1699, + 3.2281, + 2.1723 + ], + "confidence": 0.997, + "span": { + "offset": 189, + "length": 2 + } + } + ], + "lines": [ + { + "content": "# Title for page number two", + "polygon": [ + 0.9789, + 1.0217, + 2.7697, + 1.0169, + 2.7697, + 1.1793, + 0.9789, + 1.1793 + ], + "spans": [ + { + "offset": 56, + "length": 27 + } + ] + }, + { + "content": "Some text for the 2nd page. 
Here we also have a table:", + "polygon": [ + 0.9837, + 1.3273, + 4.4792, + 1.332, + 4.4792, + 1.4896, + 0.9837, + 1.4848 + ], + "spans": [ + { + "offset": 85, + "length": 54 + } + ] + }, + { + "content": "Name", + "polygon": [ + 1.0601, + 1.6519, + 1.4994, + 1.6519, + 1.4994, + 1.7951, + 1.0601, + 1.7951 + ], + "spans": [ + { + "offset": 143, + "length": 4 + } + ] + }, + { + "content": "Age", + "polygon": [ + 3.2185, + 1.6615, + 3.5098, + 1.6615, + 3.5051, + 1.8095, + 3.2185, + 1.8047 + ], + "spans": [ + { + "offset": 150, + "length": 3 + } + ] + }, + { + "content": "Alice", + "polygon": [ + 1.0601, + 1.8429, + 1.4087, + 1.8477, + 1.4039, + 1.9813, + 1.0553, + 1.9813 + ], + "spans": [ + { + "offset": 168, + "length": 5 + } + ] + }, + { + "content": "25", + "polygon": [ + 3.2281, + 1.8524, + 3.4143, + 1.8524, + 3.4096, + 1.9766, + 3.2281, + 1.9766 + ], + "spans": [ + { + "offset": 176, + "length": 2 + } + ] + }, + { + "content": "Bob", + "polygon": [ + 1.0649, + 2.0386, + 1.3419, + 2.0386, + 1.3371, + 2.1771, + 1.0601, + 2.1723 + ], + "spans": [ + { + "offset": 183, + "length": 3 + } + ] + }, + { + "content": "32", + "polygon": [ + 3.2281, + 2.0482, + 3.4048, + 2.0434, + 3.4048, + 2.1723, + 3.2281, + 2.1723 + ], + "spans": [ + { + "offset": 189, + "length": 2 + } + ] + } + ], + "spans": [ + { + "offset": 56, + "length": 138 + } + ] + }, + { + "pageNumber": 3, + "angle": 0.07891959697008133, + "width": 8.5, + "height": 11, + "unit": "inch", + "words": [ + { + "content": "Title", + "polygon": [ + 0.9798, + 1.0234, + 1.2894, + 1.0226, + 1.2899, + 1.181, + 0.9798, + 1.1762 + ], + "confidence": 0.997, + "span": { + "offset": 195, + "length": 5 + } + }, + { + "content": "for", + "polygon": [ + 1.3232, + 1.0226, + 1.5105, + 1.0221, + 1.5114, + 1.1833, + 1.3238, + 1.1814 + ], + "confidence": 0.999, + "span": { + "offset": 201, + "length": 3 + } + }, + { + "content": "page", + "polygon": [ + 1.5418, + 1.0221, + 1.8826, + 1.0213, + 1.884, + 1.184, + 1.5427, + 1.1835 + ], + 
"confidence": 0.993, + "span": { + "offset": 205, + "length": 4 + } + }, + { + "content": "number", + "polygon": [ + 1.9164, + 1.0212, + 2.4575, + 1.02, + 2.4599, + 1.1797, + 1.9179, + 1.184 + ], + "confidence": 0.995, + "span": { + "offset": 210, + "length": 6 + } + }, + { + "content": "three", + "polygon": [ + 2.4887, + 1.0199, + 2.8841, + 1.019, + 2.8843, + 1.1717, + 2.4911, + 1.1793 + ], + "confidence": 0.998, + "span": { + "offset": 217, + "length": 5 + } + }, + { + "content": "This", + "polygon": [ + 0.9701, + 1.321, + 1.2405, + 1.3233, + 1.2369, + 1.4938, + 0.9658, + 1.4923 + ], + "confidence": 0.993, + "span": { + "offset": 223, + "length": 4 + } + }, + { + "content": "is", + "polygon": [ + 1.2743, + 1.3235, + 1.4067, + 1.3242, + 1.4035, + 1.4944, + 1.2708, + 1.494 + ], + "confidence": 0.996, + "span": { + "offset": 228, + "length": 2 + } + }, + { + "content": "the", + "polygon": [ + 1.4405, + 1.3244, + 1.649, + 1.3251, + 1.6464, + 1.4944, + 1.4374, + 1.4944 + ], + "confidence": 0.999, + "span": { + "offset": 231, + "length": 3 + } + }, + { + "content": "end", + "polygon": [ + 1.6913, + 1.3253, + 1.9279, + 1.3254, + 1.926, + 1.4944, + 1.6888, + 1.4944 + ], + "confidence": 0.999, + "span": { + "offset": 235, + "length": 3 + } + }, + { + "content": "-", + "polygon": [ + 1.9702, + 1.3254, + 2.0152, + 1.3253, + 2.0136, + 1.4944, + 1.9684, + 1.4944 + ], + "confidence": 0.995, + "span": { + "offset": 239, + "length": 1 + } + }, + { + "content": "at", + "polygon": [ + 2.0575, + 1.3252, + 2.1645, + 1.3249, + 2.1632, + 1.4944, + 2.0559, + 1.4944 + ], + "confidence": 0.996, + "span": { + "offset": 241, + "length": 2 + } + }, + { + "content": "page", + "polygon": [ + 2.1983, + 1.3249, + 2.528, + 1.323, + 2.5275, + 1.4944, + 2.1971, + 1.4944 + ], + "confidence": 0.993, + "span": { + "offset": 244, + "length": 4 + } + }, + { + "content": "3.", + "polygon": [ + 2.5702, + 1.3227, + 2.7028, + 1.3216, + 2.7028, + 1.4944, + 2.5699, + 1.4944 + ], + "confidence": 0.993, + 
"span": { + "offset": 249, + "length": 2 + } + } + ], + "lines": [ + { + "content": "Title for page number three", + "polygon": [ + 0.9789, + 1.0217, + 2.8795, + 1.0169, + 2.8795, + 1.1793, + 0.9789, + 1.1793 + ], + "spans": [ + { + "offset": 195, + "length": 27 + } + ] + }, + { + "content": "This is the end - at page 3.\n===", + "polygon": [ + 0.9646, + 1.3177, + 2.698, + 1.3177, + 2.698, + 1.4944, + 0.9646, + 1.4896 + ], + "spans": [ + { + "offset": 223, + "length": 32 + } + ] + } + ], + "spans": [ + { + "offset": 195, + "length": 61 + } + ] + } + ], + "tables": [ + { + "rowCount": 3, + "columnCount": 2, + "cells": [ + { + "kind": "columnHeader", + "rowIndex": 0, + "columnIndex": 0, + "content": "Name", + "bounding_regions": [ + { + "pageNumber": 2, + "polygon": [ + 1.0028, + 1.6185, + 3.1612, + 1.6233, + 3.1612, + 1.819, + 1.0028, + 1.819 + ] + } + ], + "spans": [ + { + "offset": 143, + "length": 4 + } + ], + "elements": [ + "/paragraphs/3" + ] + }, + { + "kind": "columnHeader", + "rowIndex": 0, + "columnIndex": 1, + "content": "Age", + "bounding_regions": [ + { + "pageNumber": 2, + "polygon": [ + 3.1612, + 1.6233, + 5.3244, + 1.6233, + 5.3292, + 1.819, + 3.1612, + 1.819 + ] + } + ], + "spans": [ + { + "offset": 150, + "length": 3 + } + ], + "elements": [ + "/paragraphs/4" + ] + }, + { + "rowIndex": 1, + "columnIndex": 0, + "content": "Alice", + "bounding_regions": [ + { + "pageNumber": 2, + "polygon": [ + 1.0028, + 1.819, + 3.1612, + 1.819, + 3.1612, + 2.01, + 1.0028, + 2.01 + ] + } + ], + "spans": [ + { + "offset": 168, + "length": 5 + } + ], + "elements": [ + "/paragraphs/5" + ] + }, + { + "rowIndex": 1, + "columnIndex": 1, + "content": "25", + "bounding_regions": [ + { + "pageNumber": 2, + "polygon": [ + 3.1612, + 1.819, + 5.3292, + 1.819, + 5.334, + 2.01, + 3.1612, + 2.01 + ] + } + ], + "spans": [ + { + "offset": 176, + "length": 2 + } + ], + "elements": [ + "/paragraphs/6" + ] + }, + { + "rowIndex": 2, + "columnIndex": 0, + "content": "Bob", + 
"bounding_regions": [ + { + "pageNumber": 2, + "polygon": [ + 1.0028, + 2.01, + 3.1612, + 2.01, + 3.1612, + 2.2057, + 1.0028, + 2.2057 + ] + } + ], + "spans": [ + { + "offset": 183, + "length": 3 + } + ], + "elements": [ + "/paragraphs/7" + ] + }, + { + "rowIndex": 2, + "columnIndex": 1, + "content": "32", + "bounding_regions": [ + { + "pageNumber": 2, + "polygon": [ + 3.1612, + 2.01, + 5.334, + 2.01, + 5.334, + 2.2057, + 3.1612, + 2.2057 + ] + } + ], + "spans": [ + { + "offset": 189, + "length": 2 + } + ], + "elements": [ + "/paragraphs/8" + ] + } + ], + "bounding_regions": [ + { + "pageNumber": 2, + "polygon": [ + 0.9968, + 1.6109, + 5.3383, + 1.6132, + 5.3382, + 2.2191, + 0.9976, + 2.2177 + ] + } + ], + "spans": [ + { + "offset": 141, + "length": 52 + } + ] + } + ], + "paragraphs": [ + { + "spans": [ + { + "offset": 0, + "length": 54 + } + ], + "bounding_regions": [ + { + "pageNumber": 1, + "polygon": [ + 0.9789, + 1.0169, + 2.7749, + 1.0217, + 2.7736, + 1.4944, + 0.9777, + 1.4896 + ] + } + ], + "content": "Title for page number one Some text for the first page" + }, + { + "spans": [ + { + "offset": 56, + "length": 27 + } + ], + "bounding_regions": [ + { + "pageNumber": 2, + "polygon": [ + 0.9789, + 1.0169, + 2.7697, + 1.0169, + 2.7697, + 1.1793, + 0.9789, + 1.1793 + ] + } + ], + "role": "sectionHeading", + "content": "# Title for page number two" + }, + { + "spans": [ + { + "offset": 85, + "length": 54 + } + ], + "bounding_regions": [ + { + "pageNumber": 2, + "polygon": [ + 0.9837, + 1.3273, + 4.4794, + 1.332, + 4.4792, + 1.4896, + 0.9835, + 1.4848 + ] + } + ], + "content": "Some text for the 2nd page. 
Here we also have a table:" + }, + { + "spans": [ + { + "offset": 143, + "length": 4 + } + ], + "bounding_regions": [ + { + "pageNumber": 2, + "polygon": [ + 1.0028, + 1.6185, + 3.1612, + 1.6233, + 3.1612, + 1.819, + 1.0028, + 1.819 + ] + } + ], + "content": "Name" + }, + { + "spans": [ + { + "offset": 150, + "length": 3 + } + ], + "bounding_regions": [ + { + "pageNumber": 2, + "polygon": [ + 3.1612, + 1.6233, + 5.3244, + 1.6233, + 5.3292, + 1.819, + 3.1612, + 1.819 + ] + } + ], + "content": "Age" + }, + { + "spans": [ + { + "offset": 168, + "length": 5 + } + ], + "bounding_regions": [ + { + "pageNumber": 2, + "polygon": [ + 1.0028, + 1.819, + 3.1612, + 1.819, + 3.1612, + 2.01, + 1.0028, + 2.01 + ] + } + ], + "content": "Alice" + }, + { + "spans": [ + { + "offset": 176, + "length": 2 + } + ], + "bounding_regions": [ + { + "pageNumber": 2, + "polygon": [ + 3.1612, + 1.819, + 5.3292, + 1.819, + 5.334, + 2.01, + 3.1612, + 2.01 + ] + } + ], + "content": "25" + }, + { + "spans": [ + { + "offset": 183, + "length": 3 + } + ], + "bounding_regions": [ + { + "pageNumber": 2, + "polygon": [ + 1.0028, + 2.01, + 3.1612, + 2.01, + 3.1612, + 2.2057, + 1.0028, + 2.2057 + ] + } + ], + "content": "Bob" + }, + { + "spans": [ + { + "offset": 189, + "length": 2 + } + ], + "bounding_regions": [ + { + "pageNumber": 2, + "polygon": [ + 3.1612, + 2.01, + 5.334, + 2.01, + 5.334, + 2.2057, + 3.1612, + 2.2057 + ] + } + ], + "content": "32" + }, + { + "spans": [ + { + "offset": 195, + "length": 60 + } + ], + "bounding_regions": [ + { + "pageNumber": 3, + "polygon": [ + 0.9634, + 1.0217, + 2.8795, + 1.0169, + 2.8807, + 1.4939, + 0.9646, + 1.4987 + ] + } + ], + "role": "title", + "content": "Title for page number three This is the end - at page 3.\n===" + } + ] +} \ No newline at end of file diff --git a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py index 08d71f28..480a2494 
100644 --- a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py +++ b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py @@ -77,7 +77,6 @@ def test_content_cleaning(mock_document_intelligence, _): endpoint="endpoint", key="key", glob_patterns=["pdf"], - patterns_to_remove=["Ti.*e"] ) documents = loader.load() @@ -108,3 +107,48 @@ def test_table_without_headers(mock_document_intelligence, _): documents[0].page_content == "Table without Headers\n===\n\nTesting a table that has no headers\n\nA B \nC D \nE F \nG H \n\nThis is the end." ) + + +@patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths", return_value=["path/to/some/file"],) +@patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence") +def test_document_with_multiple_pages_without_splitting_documents_by_page(mock_document_intelligence, _): + mock_document_intelligence.return_value = mock_simple_response( + 'multiple_pages.json') + + loader = DocumentIntelligenceLoader( + path="path", + endpoint="endpoint", + key="key", + glob_patterns=["pdf"], + split_documents_by_page=False + ) + + documents = loader.load() + + assert ( + documents[0].page_content + == "Title for page number one Some text for the first page\n\n# Title for page number two\n\nSome text for the 2nd page. 
Here we also have a table:\n\nName: Alice, Age: 25 \nName: Bob, Age: 32 \n\nTitle for page number three This is the end - at page 3.\n===" + ) + assert len(documents) == 1 + + +@patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths", return_value=["path/to/some/file"],) +@patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence") +def test_document_with_multiple_pages_with_split_documents_by_page(mock_document_intelligence, _): + mock_document_intelligence.return_value = mock_simple_response( + 'multiple_pages.json') + + loader = DocumentIntelligenceLoader( + path="path", + endpoint="endpoint", + key="key", + glob_patterns=["pdf"], + split_documents_by_page=True + ) + + documents = loader.load() + + assert len(documents) == 3 + assert documents[0].page_content == "Title for page number one Some text for the first page" + assert documents[1].page_content == "# Title for page number two\n\nSome text for the 2nd page. 
Here we also have a table:\n\nName: Alice, Age: 25 \nName: Bob, Age: 32 " + assert documents[2].page_content == "Title for page number three This is the end - at page 3.\n===" From ebb7b52a21b1b3c0cfc0986e1b6e90fe81796f75 Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Mon, 18 Mar 2024 17:22:00 +0000 Subject: [PATCH 09/16] fix test --- .../doc_loader/tests/test_document_intelligence_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py index 480a2494..02908fd3 100644 --- a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py +++ b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py @@ -77,6 +77,7 @@ def test_content_cleaning(mock_document_intelligence, _): endpoint="endpoint", key="key", glob_patterns=["pdf"], + patterns_to_remove=["Ti.*e"] ) documents = loader.load() @@ -98,7 +99,6 @@ def test_table_without_headers(mock_document_intelligence, _): endpoint="endpoint", key="key", glob_patterns=["pdf"], - patterns_to_remove=["Ti.*e"] ) documents = loader.load() From 735318a24e8316d07dde0e31d487cf7b48271b0f Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Mon, 18 Mar 2024 17:27:38 +0000 Subject: [PATCH 10/16] add test for excluding roles --- .../test_document_intelligence_loader.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py index 02908fd3..831d3706 100644 --- a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py +++ b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py @@ -152,3 +152,25 @@ def test_document_with_multiple_pages_with_split_documents_by_page(mock_document assert 
documents[0].page_content == "Title for page number one Some text for the first page" assert documents[1].page_content == "# Title for page number two\n\nSome text for the 2nd page. Here we also have a table:\n\nName: Alice, Age: 25 \nName: Bob, Age: 32 " assert documents[2].page_content == "Title for page number three This is the end - at page 3.\n===" + + +@patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._get_file_paths", return_value=["path/to/some/file"],) +@patch("rag_experiment_accelerator.doc_loader.documentIntelligenceLoader.DocumentIntelligenceLoader._call_document_intelligence") +def test_excluding_paragraphs(mock_document_intelligence, _): + mock_document_intelligence.return_value = mock_simple_response( + 'multiple_pages.json') + + loader = DocumentIntelligenceLoader( + path="path", + endpoint="endpoint", + key="key", + glob_patterns=["pdf"], + excluded_paragraph_roles=['sectionHeading'] + ) + + documents = loader.load() + + assert ( + documents[0].page_content + == "Title for page number one Some text for the first page\n\nSome text for the 2nd page. 
Here we also have a table:\n\nName: Alice, Age: 25 \nName: Bob, Age: 32 \n\nTitle for page number three This is the end - at page 3.\n===" + ) From 1dac9f08ae6027de775885c4b3ab199dc3339003 Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Mon, 18 Mar 2024 17:33:58 +0000 Subject: [PATCH 11/16] add test for get_file_paths --- .../test_document_intelligence_loader.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py index 831d3706..a693c39b 100644 --- a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py +++ b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py @@ -174,3 +174,38 @@ def test_excluding_paragraphs(mock_document_intelligence, _): documents[0].page_content == "Title for page number one Some text for the first page\n\nSome text for the 2nd page. 
Here we also have a table:\n\nName: Alice, Age: 25 \nName: Bob, Age: 32 \n\nTitle for page number three This is the end - at page 3.\n===" ) + + +def test_get_file_paths(): + loader = DocumentIntelligenceLoader( + path="rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response", + endpoint="endpoint", + key="key", + glob_patterns=["json"], + ) + + assert set(loader._get_file_paths()) == set([ + "rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json", + "rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/table_without_headers.json", + "rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/multiple_pages.json" + ]) + + +def test_get_file_paths_returns_according_to_glob(): + loader = DocumentIntelligenceLoader( + path="rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response", + endpoint="endpoint", + key="key", + glob_patterns=["pdf"], + ) + + assert loader._get_file_paths() == [] + +def test_get_file_paths_works_for_single_files(): + loader = DocumentIntelligenceLoader( + path="rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json", + endpoint="endpoint", + key="key", + ) + + assert loader._get_file_paths() == ['rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json'] \ No newline at end of file From c53166804f08e52707fbf42abc6846569c32b6a9 Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Mon, 18 Mar 2024 17:34:13 +0000 Subject: [PATCH 12/16] fix format --- .../doc_loader/tests/test_document_intelligence_loader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py index a693c39b..1027908e 100644 --- 
a/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py +++ b/rag_experiment_accelerator/doc_loader/tests/test_document_intelligence_loader.py @@ -201,6 +201,7 @@ def test_get_file_paths_returns_according_to_glob(): assert loader._get_file_paths() == [] + def test_get_file_paths_works_for_single_files(): loader = DocumentIntelligenceLoader( path="rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json", @@ -208,4 +209,5 @@ def test_get_file_paths_works_for_single_files(): key="key", ) - assert loader._get_file_paths() == ['rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json'] \ No newline at end of file + assert loader._get_file_paths() == [ + 'rag_experiment_accelerator/doc_loader/tests/test_data/document_intelligence_response/simple_response.json'] From 92ebab584b069541d24f33101c5192dc9463b67b Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Tue, 19 Mar 2024 09:30:56 +0000 Subject: [PATCH 13/16] remove duplicate dependency --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3b201c05..d3e4df9b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,6 @@ fuzzywuzzy==0.18.0 hnswlib==0.8.0 langchain==0.1.12 langchain-community==0.0.28 -azure-ai-documentintelligence==1.0.0b1 levenshtein==0.25.0 lxml==5.1.0 matplotlib==3.8.3 From ebf96f84656b2345fa4ec91b3827d4c59f008ee0 Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Tue, 26 Mar 2024 09:33:22 +0000 Subject: [PATCH 14/16] fix conflicts with development --- .../doc_loader/documentIntelligenceLoader.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py index f950357d..246a8c1f 100644 --- a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py +++ 
b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py @@ -5,6 +5,7 @@ import os from azure.ai.documentintelligence import DocumentIntelligenceClient from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader +from rag_experiment_accelerator.config.environment import Environment from azure.core.credentials import AzureKeyCredential from langchain_core.documents import Document from pathlib import Path @@ -17,6 +18,46 @@ logger = get_logger(__name__) +def load_with_azure_document_intelligence( + environment: Environment, + file_paths: list[str], + chunk_size: int, + overlap_size: int, +) -> list[Document]: + """ + Load pdf files from a folder using Azure Document Intelligence. + + Args: + environment (Environment): The environment class + file_paths (list[str]): Sequence of paths to load. + chunk_size (int): Unused. + overlap_size (int): Unused. + + Returns: + list[Document]: A list of Document objects. + """ + documents: list[Document] = [] + for file_path in file_paths: + try: + loader = DocumentIntelligenceLoader( + file_path, + environment.azure_document_intelligence_endpoint, + environment.azure_document_intelligence_admin_key, + glob_patterns=["*"], + excluded_paragraph_roles=[ + "pageHeader", + "pageFooter", + "footnote", + "pageNumber", + ], + ) + documents += loader.load() + except Exception as e: + logger.warning(f"Failed to load {file_path}: {e}") + + return documents + + class DocumentIntelligenceLoader(BaseLoader): """ Analyzes and loads documents and directories using Azure Document Intelligence. @@ -349,6 +390,7 @@ def _load_with_ocr(self, file_path): logger.error( f"Failed to load {file_path} with Azure Document Intelligence using the 'prebuilt-read' model: {e}" ) + raise e logger.info( f'Successfully loaded {file_path} with Azure Document Intelligence using the "prebuilt-read" model.' 
From 916d7e0d3f27c42b512f423f7252cbda0377f42b Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Tue, 2 Apr 2024 19:22:39 +0000 Subject: [PATCH 15/16] remove progress bar --- .../doc_loader/documentIntelligenceLoader.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py index 5d8f1e2f..3e41835e 100644 --- a/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py +++ b/rag_experiment_accelerator/doc_loader/documentIntelligenceLoader.py @@ -1,6 +1,5 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from contextlib import ExitStack -from tqdm import tqdm import re import os import uuid @@ -126,9 +125,6 @@ def load(self) -> List[Document]: with ExitStack() as stack: executor = stack.enter_context(ThreadPoolExecutor()) - progress_bar = stack.enter_context( - tqdm(total=len(file_paths), desc="Analyzing documents") - ) futures = { executor.submit(self._analyze_document, file_path) @@ -140,7 +136,6 @@ def load(self) -> List[Document]: documents += future.result() except Exception as exc: logger.error(f"Processing document generated an exception: {exc}") - progress_bar.update(1) return documents From 4622b68377e63e477bc4bb8c054635040359fe98 Mon Sep 17 00:00:00 2001 From: Yuval Yaron Date: Wed, 3 Apr 2024 14:57:10 +0000 Subject: [PATCH 16/16] Update README --- README.md | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 6d70af09..151cb268 100644 --- a/README.md +++ b/README.md @@ -28,25 +28,27 @@ The **RAG Experiment Accelerator** is config driven and offers a rich set of fea 1. **Experiment Setup**: You can define and configure experiments by specifying a range of search engine parameters, search types, query sets, and evaluation metrics. -2. 
**Integration**: It integrates seamlessly with Azure AI Search, Azure Machine Learning, MLFlow and Azure OpenAI. +1. **Integration**: It integrates seamlessly with Azure AI Search, Azure Machine Learning, MLFlow and Azure OpenAI. -3. **Rich Search Index**: It creates multiple search indexes based on hyperparameter configurations available in the config file. +1. **Rich Search Index**: It creates multiple search indexes based on hyperparameter configurations available in the config file. -4. **Multiple Document Chunking Strategies**: The tool supports multiple chunking strategies, including using Azure Document Intelligence and basic chunking using langchain. This gives you the flexibility to experiment with different chunking strategies and evaluate their effectiveness. +1. **Multiple Document Chunking Strategies**: The tool supports multiple chunking strategies, including using Azure Document Intelligence and basic chunking using LangChain. This gives you the flexibility to experiment with different chunking strategies and evaluate their effectiveness. -5. **Query Generation**: The tool can generate a variety of diverse and customizable query sets, which can be tailored for specific experimentation needs. +1. **Custom Document Intelligence Loader**: When selecting the 'prebuilt-layout' API model for Document Intelligence, the tool utilizes a custom Document Intelligence loader to load the data. This custom loader supports formatting of tables into key-value pairs (to enhance readability for the LLM), excludes irrelevant parts of the file for the LLM (such as page numbers and footers), removes recurring patterns in the file using regex, and more. The custom loader resorts to the simpler 'prebuilt-read' API model as a fallback when the 'prebuilt-layout' model fails. Any other API model will utilize LangChain's implementation, which returns the raw response from Document Intelligence's API. -6. 
**Multiple Search Types**: It supports multiple search types, including pure text, pure vector, cross-vector, multi-vector, hybrid, and more. This gives you the ability to conduct comprehensive analysis on search capabilities and results. +1. **Query Generation**: The tool can generate a variety of diverse and customizable query sets, which can be tailored for specific experimentation needs. -7. **Sub-Querying**: The pattern evaluates the user query and if it finds it complex enough, it breaks it down into smaller sub-queries to generate relevant context. +1. **Multiple Search Types**: It supports multiple search types, including pure text, pure vector, cross-vector, multi-vector, hybrid, and more. This gives you the ability to conduct comprehensive analysis on search capabilities and results. -8. **Re-Ranking**: The query responses from Azure AI Search are re-evaluated using LLM and ranked according to the relevance between the query and the context. +1. **Sub-Querying**: The pattern evaluates the user query and if it finds it complex enough, it breaks it down into smaller sub-queries to generate relevant context. -9. **Metrics and Evaluation**: You can define custom evaluation metrics, which enable precise and granular assessment of search algorithm performance. It includes distance-based, cosine, semantic similarity, and more metrics out of the box. +1. **Re-Ranking**: The query responses from Azure AI Search are re-evaluated using LLM and ranked according to the relevance between the query and the context. -10. **Report Generation**: The **RAG Experiment Accelerator** automates the process of report generation, complete with visualizations that make it easy to analyze and share experiment findings. +1. **Metrics and Evaluation**: You can define custom evaluation metrics, which enable precise and granular assessment of search algorithm performance. It includes distance-based, cosine, semantic similarity, and more metrics out of the box. -11. 
**Multi-Lingual**: The tool supports language analyzers for linguistic support on individual languages and specialized (language-agnostic) analyzers for user-defined patterns on search indexes. For more information, see [Types of Analyzers](https://learn.microsoft.com/en-us/azure/search/search-analyzers#types-of-analyzers). +1. **Report Generation**: The **RAG Experiment Accelerator** automates the process of report generation, complete with visualizations that make it easy to analyze and share experiment findings. + +1. **Multi-Lingual**: The tool supports language analyzers for linguistic support on individual languages and specialized (language-agnostic) analyzers for user-defined patterns on search indexes. For more information, see [Types of Analyzers](https://learn.microsoft.com/en-us/azure/search/search-analyzers#types-of-analyzers). ## Products used @@ -229,7 +231,7 @@ Alternatively, you can run the above steps (apart from `02_qa_generation.py`) us "openai_temperature": "determines the OpenAI temperature. Valid value ranges from 0 to 1.", "search_relevancy_threshold": "the similarity threshold to determine if a doc is relevant. Valid ranges are from 0.0 to 1.0", "chunking_strategy": "determines the chunking strategy. Valid values are 'azure-document-intelligence' or 'basic'", - "azure_document_intelligence_model": "represents the Azure Document Intelligence Model. Used when chunking strategy is 'azure-document-intelligence'." + "azure_document_intelligence_model": "represents the Azure Document Intelligence Model. Used when chunking strategy is 'azure-document-intelligence'. When set to 'prebuilt-layout', provides additional features (see 'Custom Document Intelligence Loader' above)." } ```