Update requirements, fix warnings #393

Closed
8 changes: 5 additions & 3 deletions rag_experiment_accelerator/config/config.py
@@ -124,7 +124,9 @@ def _initialize(self, config_dir: str, data_dir: str, filename: str) -> None:
         self.AzureSearchCredentials = AzureSearchCredentials.from_env()
         self.AzureMLCredentials = AzureMLCredentials.from_env()
         self.AzureSkillsCredentials = AzureSkillsCredentials.from_env()
-        self.AzureDocumentIntelligenceCredentials = AzureDocumentIntelligenceCredentials.from_env()
+        self.AzureDocumentIntelligenceCredentials = (
+            AzureDocumentIntelligenceCredentials.from_env()
+        )
 
         self.embedding_models: list[EmbeddingModel] = []
         embedding_model_config = data.get("embedding_models", [])
@@ -145,11 +147,11 @@ def _initialize(self, config_dir: str, data_dir: str, filename: str) -> None:
 
             self.MAIN_PROMPT_INSTRUCTION = data["main_prompt_instruction"]
             if self.MAIN_PROMPT_INSTRUCTION is None:
-                logger.warn(
+                logger.warning(
                     "prompt_config.json found but main_prompt_instruction is"
                     " not set. Using default prompts"
                 )
                 self.MAIN_PROMPT_INSTRUCTION = main_prompt_instruction
         except OSError:
-            logger.warn("prompt_config.json not found. Using default prompts")
+            logger.warning("prompt_config.json not found. Using default prompts")
             self.MAIN_PROMPT_INSTRUCTION = main_prompt_instruction
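A note on the warn → warning rename: Logger.warn has been a deprecated alias of Logger.warning in the standard library since Python 3.3, and CPython emits a DeprecationWarning when it is called. A minimal standalone sketch of the supported spelling:

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

# logger.warn(...) still works but raises a DeprecationWarning;
# logger.warning(...) is the supported spelling.
logger.warning("prompt_config.json not found. Using default prompts")
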
6 changes: 4 additions & 2 deletions rag_experiment_accelerator/doc_loader/docxLoader.py
@@ -1,10 +1,12 @@
-from langchain.document_loaders import Docx2txtLoader
+from langchain_community.document_loaders import Docx2txtLoader
 
 from rag_experiment_accelerator.doc_loader.structuredLoader import (
     load_structured_files,
 )
 from rag_experiment_accelerator.utils.logging import get_logger
-from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials
+from rag_experiment_accelerator.config.credentials import (
+    AzureDocumentIntelligenceCredentials,
+)
 
 logger = get_logger(__name__)
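For context on the import change: document loaders were split out of the core langchain package into langchain-community, and the old langchain.document_loaders paths now emit deprecation warnings. The same one-line change is applied in the html, markdown, pdf, and text loaders below. A minimal sketch of the new import in use; the file path is a made-up example:

# Old path, now deprecated:
#   from langchain.document_loaders import Docx2txtLoader
# New path (requires the langchain-community package, plus docx2txt for this loader):
from langchain_community.document_loaders import Docx2txtLoader

loader = Docx2txtLoader("sample_docs/report.docx")  # hypothetical file
documents = loader.load()  # list of Document objects with page_content/metadata
print(documents[0].page_content[:100])
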
6 changes: 4 additions & 2 deletions rag_experiment_accelerator/doc_loader/htmlLoader.py
@@ -1,10 +1,12 @@
-from langchain.document_loaders import BSHTMLLoader
+from langchain_community.document_loaders import BSHTMLLoader
 
 from rag_experiment_accelerator.doc_loader.structuredLoader import (
     load_structured_files,
 )
 from rag_experiment_accelerator.utils.logging import get_logger
-from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials
+from rag_experiment_accelerator.config.credentials import (
+    AzureDocumentIntelligenceCredentials,
+)
 
 logger = get_logger(__name__)
6 changes: 4 additions & 2 deletions rag_experiment_accelerator/doc_loader/markdownLoader.py
@@ -1,10 +1,12 @@
-from langchain.document_loaders import UnstructuredMarkdownLoader
+from langchain_community.document_loaders import UnstructuredMarkdownLoader
 
 from rag_experiment_accelerator.doc_loader.structuredLoader import (
     load_structured_files,
 )
 from rag_experiment_accelerator.utils.logging import get_logger
-from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials
+from rag_experiment_accelerator.config.credentials import (
+    AzureDocumentIntelligenceCredentials,
+)
 
 logger = get_logger(__name__)
27 changes: 19 additions & 8 deletions rag_experiment_accelerator/doc_loader/pdfLoader.py
@@ -1,9 +1,13 @@
-from langchain.document_loaders import PyPDFDirectoryLoader
+from langchain_community.document_loaders import PyPDFDirectoryLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 from rag_experiment_accelerator.utils.logging import get_logger
-from rag_experiment_accelerator.doc_loader.documentIntelligenceLoader import azure_document_intelligence_directory_loader
-from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials
+from rag_experiment_accelerator.doc_loader.documentIntelligenceLoader import (
+    azure_document_intelligence_directory_loader,
+)
+from rag_experiment_accelerator.config.credentials import (
+    AzureDocumentIntelligenceCredentials,
+)
 import uuid
 import re
 
@@ -29,9 +33,9 @@ def preprocess_pdf_content(content: str):
     # Output: "hello world openai"
     """
 
-    content = re.sub(r'\n{2,}', '\n', content)
-    content = re.sub(r'\n{1,}', '', content)
-    content = re.sub(r'\\u[0-9a-fA-F]{4}', '', content)
+    content = re.sub(r"\n{2,}", "\n", content)
+    content = re.sub(r"\n{1,}", "", content)
+    content = re.sub(r"\\u[0-9a-fA-F]{4}", "", content)
     content = content.lower()
 
     return content
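Taken together, the three substitutions collapse blank-line runs to a single newline, strip every remaining newline, drop literal \uXXXX escape sequences left behind by PDF extraction, and lowercase the result; the quote-style change in the hunk itself is only Black formatting. A standalone sketch with a made-up input:

import re

def preprocess_pdf_content(content: str) -> str:
    content = re.sub(r"\n{2,}", "\n", content)  # collapse blank-line runs to one newline
    content = re.sub(r"\n{1,}", "", content)    # then remove the remaining newlines
    content = re.sub(r"\\u[0-9a-fA-F]{4}", "", content)  # strip literal \uXXXX escapes
    return content.lower()

raw = "Hello \n\n\nworld \\u00a9\nOpenAI"  # made-up PDF-extracted text
print(preprocess_pdf_content(raw))  # -> "hello world openai"
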
@@ -63,7 +67,12 @@ def load_pdf_files(
     documents = []
     for pattern in glob_patterns:
         if chunking_strategy == "azure-document-intelligence":
-            documents += azure_document_intelligence_directory_loader(pattern, folder_path, AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT, AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY)
+            documents += azure_document_intelligence_directory_loader(
+                pattern,
+                folder_path,
+                AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT,
+                AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY,
+            )
         else:
             # using langchain
             loader = PyPDFDirectoryLoader(
@@ -86,7 +95,9 @@ def load_pdf_files(
     docs = text_splitter.split_documents(documents)
     docsList = []
     for doc in docs:
-        docsList.append(dict({str(uuid.uuid4()): preprocess_pdf_content(doc.page_content)}))
+        docsList.append(
+            dict({str(uuid.uuid4()): preprocess_pdf_content(doc.page_content)})
+        )
 
     logger.info(f"Split {len(documents)} PDF pages into {len(docs)} chunks")
 
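For orientation, the non-Document-Intelligence path loads a directory of PDFs, splits the pages with RecursiveCharacterTextSplitter, and keys each chunk by a fresh UUID. A condensed sketch; the folder name and chunk sizes are illustrative (the accelerator takes them from its config):

import uuid
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader

loader = PyPDFDirectoryLoader("data/pdfs")  # hypothetical folder of PDFs
documents = loader.load()                   # one Document per page

# chunk_size/chunk_overlap here are made-up values for illustration
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
docs = text_splitter.split_documents(documents)

# One {uuid: text} dict per chunk, mirroring the loop in load_pdf_files
docs_list = [{str(uuid.uuid4()): d.page_content} for d in docs]
print(f"Split {len(documents)} PDF pages into {len(docs)} chunks")
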
6 changes: 4 additions & 2 deletions rag_experiment_accelerator/doc_loader/textLoader.py
@@ -1,10 +1,12 @@
-from langchain.document_loaders import TextLoader
+from langchain_community.document_loaders import TextLoader
 
 from rag_experiment_accelerator.doc_loader.structuredLoader import (
     load_structured_files,
 )
 from rag_experiment_accelerator.utils.logging import get_logger
-from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials
+from rag_experiment_accelerator.config.credentials import (
+    AzureDocumentIntelligenceCredentials,
+)
 
 logger = get_logger(__name__)
27 changes: 13 additions & 14 deletions rag_experiment_accelerator/init_Index/create_index.py
@@ -35,7 +35,6 @@ def create_acs_index(
     analyzers,
 ):
     try:
-
         credential = AzureKeyCredential(key)
 
         # Apply checks on analyzer settings. Search analyzer and index analyzer must be set together
@@ -44,15 +43,20 @@ def create_acs_index(
 
         if bool(index_analyzer_name) != bool(search_analyzer_name):
             raise ValueError(
-                "Both 'index_analyzer_name' and 'search_analyzer_name' must be set together")
+                "Both 'index_analyzer_name' and 'search_analyzer_name' must be set together"
+            )
 
         index_analyzer = index_analyzer_name if index_analyzer_name else ""
         search_analyzer = search_analyzer_name if search_analyzer_name else ""
 
         # Analyzer can only be used if neither the search analyzer nor the index analyzer is set
-        if analyzers.get("analyzer_name") and (analyzers.get("search_analyzer_name") or analyzers.get("index_analyzer_name")):
+        if analyzers.get("analyzer_name") and (
+            analyzers.get("search_analyzer_name")
+            or analyzers.get("index_analyzer_name")
+        ):
             raise ValueError(
-                "analyzer_name should be empty if either search_analyzer_name or index_analyzer_name is not empty")
+                "analyzer_name should be empty if either search_analyzer_name or index_analyzer_name is not empty"
+            )
         analyzer = analyzers.get("analyzer_name") or ""
 
         # Create a search index
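The first guard above is a boolean XOR: bool(x) != bool(y) is true exactly when one analyzer name is provided without the other (None and "" both count as unset). A standalone illustration of the check:

def validate_analyzer_pair(index_analyzer_name, search_analyzer_name):
    # XOR on truthiness: fails exactly when one name is set without the other.
    if bool(index_analyzer_name) != bool(search_analyzer_name):
        raise ValueError(
            "Both 'index_analyzer_name' and 'search_analyzer_name' must be set together"
        )

validate_analyzer_pair("en.microsoft", "en.microsoft")  # ok: both set
validate_analyzer_pair("", "")                          # ok: both unset
try:
    validate_analyzer_pair("en.microsoft", "")          # one without the other
except ValueError as err:
    print(err)
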
@@ -100,24 +104,21 @@ def create_acs_index(
             ),
             SearchField(
                 name="contentVector",
-                type=SearchFieldDataType.Collection(
-                    SearchFieldDataType.Single),
+                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                 searchable=True,
                 vector_search_dimensions=int(dimension),
                 vector_search_profile="my-vector-search-profile",
             ),
             SearchField(
                 name="contentTitle",
-                type=SearchFieldDataType.Collection(
-                    SearchFieldDataType.Single),
+                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                 searchable=True,
                 vector_search_dimensions=int(dimension),
                 vector_search_profile="my-vector-search-profile",
             ),
             SearchField(
                 name="contentSummary",
-                type=SearchFieldDataType.Collection(
-                    SearchFieldDataType.Single),
+                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                 searchable=True,
                 vector_search_dimensions=int(dimension),
                 vector_search_profile="my-vector-search-profile",
@@ -154,8 +155,7 @@ def create_acs_index(
         semantic_config = SemanticConfiguration(
             name="my-semantic-config",
             prioritized_fields=PrioritizedFields(
-                prioritized_content_fields=[
-                    SemanticField(field_name="content")]
+                prioritized_content_fields=[SemanticField(field_name="content")]
             ),
         )

@@ -190,8 +190,7 @@ def create_acs_index(
                 for char_filter in analyzers["char_filters"]
             ]
 
-            cors_options = CorsOptions(
-                allowed_origins=["*"], max_age_in_seconds=60)
+            cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)
             scoring_profiles = []
 
             # Create the search index with the semantic, tokenizer, and filter settings
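For context, the fields, semantic configuration, and CORS options assembled above feed a SearchIndex that is pushed through a SearchIndexClient. A minimal, hedged sketch of that final step; endpoint, key, and index name are placeholders, and note this file targets a pre-GA azure-search-documents release (names such as PrioritizedFields and vector_search_profile were later renamed in the GA API):

from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    CorsOptions,
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
)

# Placeholder endpoint/key; the accelerator reads these from env credentials.
client = SearchIndexClient(
    endpoint="https://<service>.search.windows.net",
    credential=AzureKeyCredential("<admin-key>"),
)

fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
]
index = SearchIndex(
    name="my-index",
    fields=fields,
    cors_options=CorsOptions(allowed_origins=["*"], max_age_in_seconds=60),
)
client.create_or_update_index(index)  # idempotent create-or-update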