Update requirements and adapt code to breaking changes
julia-meshcheryakova committed Mar 14, 2024
1 parent 3a063cb · commit 49146b8
Showing 7 changed files with 59 additions and 40 deletions.
6 changes: 4 additions & 2 deletions rag_experiment_accelerator/doc_loader/docxLoader.py
@@ -1,10 +1,12 @@
-from langchain.document_loaders import Docx2txtLoader
+from langchain_community.document_loaders import Docx2txtLoader
 
 from rag_experiment_accelerator.doc_loader.structuredLoader import (
     load_structured_files,
 )
 from rag_experiment_accelerator.utils.logging import get_logger
-from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials
+from rag_experiment_accelerator.config.credentials import (
+    AzureDocumentIntelligenceCredentials,
+)
 
 logger = get_logger(__name__)
 
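This file and the loader modules below (htmlLoader.py, markdownLoader.py, pdfLoader.py, textLoader.py) change only import paths: langchain 0.1.x moved its bundled integrations into the separate langchain-community package, while the loader classes themselves behave the same; the multi-line credentials import is pure Black-style reformatting. A minimal sketch of the migration (the input file is illustrative):

# Before (deprecated shim kept in langchain 0.1.x):
# from langchain.document_loaders import Docx2txtLoader
# After (forward-compatible import path):
from langchain_community.document_loaders import Docx2txtLoader

loader = Docx2txtLoader("example.docx")  # hypothetical input file
docs = loader.load()  # returns a list of Document objects, unchanged behavior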
6 changes: 4 additions & 2 deletions rag_experiment_accelerator/doc_loader/htmlLoader.py
@@ -1,10 +1,12 @@
-from langchain.document_loaders import BSHTMLLoader
+from langchain_community.document_loaders import BSHTMLLoader
 
 from rag_experiment_accelerator.doc_loader.structuredLoader import (
     load_structured_files,
 )
 from rag_experiment_accelerator.utils.logging import get_logger
-from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials
+from rag_experiment_accelerator.config.credentials import (
+    AzureDocumentIntelligenceCredentials,
+)
 
 logger = get_logger(__name__)
 
6 changes: 4 additions & 2 deletions rag_experiment_accelerator/doc_loader/markdownLoader.py
@@ -1,10 +1,12 @@
-from langchain.document_loaders import UnstructuredMarkdownLoader
+from langchain_community.document_loaders import UnstructuredMarkdownLoader
 
 from rag_experiment_accelerator.doc_loader.structuredLoader import (
     load_structured_files,
 )
 from rag_experiment_accelerator.utils.logging import get_logger
-from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials
+from rag_experiment_accelerator.config.credentials import (
+    AzureDocumentIntelligenceCredentials,
+)
 
 logger = get_logger(__name__)
 
27 changes: 19 additions & 8 deletions rag_experiment_accelerator/doc_loader/pdfLoader.py
@@ -1,9 +1,13 @@
-from langchain.document_loaders import PyPDFDirectoryLoader
+from langchain_community.document_loaders import PyPDFDirectoryLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 from rag_experiment_accelerator.utils.logging import get_logger
-from rag_experiment_accelerator.doc_loader.documentIntelligenceLoader import azure_document_intelligence_directory_loader
-from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials
+from rag_experiment_accelerator.doc_loader.documentIntelligenceLoader import (
+    azure_document_intelligence_directory_loader,
+)
+from rag_experiment_accelerator.config.credentials import (
+    AzureDocumentIntelligenceCredentials,
+)
 import uuid
 import re
 
@@ -29,9 +33,9 @@ def preprocess_pdf_content(content: str):
     # Output: "hello world openai"
     """
 
-    content = re.sub(r'\n{2,}', '\n', content)
-    content = re.sub(r'\n{1,}', '', content)
-    content = re.sub(r'\\u[0-9a-fA-F]{4}', '', content)
+    content = re.sub(r"\n{2,}", "\n", content)
+    content = re.sub(r"\n{1,}", "", content)
+    content = re.sub(r"\\u[0-9a-fA-F]{4}", "", content)
     content = content.lower()
 
     return content
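The regex rewrite above is purely stylistic (double quotes, per Black); the patterns themselves are unchanged. A quick sanity check of what the function does, consistent with its docstring (the input is illustrative):

from rag_experiment_accelerator.doc_loader.pdfLoader import preprocess_pdf_content

# collapsed newlines, a literal \uXXXX escape sequence, and mixed case
raw = "Hello World\n\n\\u00a9 OpenAI"
print(preprocess_pdf_content(raw))  # -> "hello world openai"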
@@ -63,7 +67,12 @@ def load_pdf_files(
     documents = []
     for pattern in glob_patterns:
         if chunking_strategy == "azure-document-intelligence":
-            documents += azure_document_intelligence_directory_loader(pattern, folder_path, AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT, AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY)
+            documents += azure_document_intelligence_directory_loader(
+                pattern,
+                folder_path,
+                AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT,
+                AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY,
+            )
         else:
             # using langchain
             loader = PyPDFDirectoryLoader(
@@ -86,7 +95,9 @@ def load_pdf_files(
     docs = text_splitter.split_documents(documents)
     docsList = []
     for doc in docs:
-        docsList.append(dict({str(uuid.uuid4()): preprocess_pdf_content(doc.page_content)}))
+        docsList.append(
+            dict({str(uuid.uuid4()): preprocess_pdf_content(doc.page_content)})
+        )
 
     logger.info(f"Split {len(documents)} PDF pages into {len(docs)} chunks")
 
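The reformatted append builds the same structure as before: docsList ends up as a list of single-entry dicts, each mapping a fresh UUID string to one preprocessed chunk. A minimal sketch of the shape (the chunk texts are stand-ins):

import uuid

chunks = ["first preprocessed chunk", "second preprocessed chunk"]
docs_list = [dict({str(uuid.uuid4()): text}) for text in chunks]
print(docs_list)  # e.g. [{'6f1c9b2e-...': 'first preprocessed chunk'}, ...]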
6 changes: 4 additions & 2 deletions rag_experiment_accelerator/doc_loader/textLoader.py
@@ -1,10 +1,12 @@
-from langchain.document_loaders import TextLoader
+from langchain_community.document_loaders import TextLoader
 
 from rag_experiment_accelerator.doc_loader.structuredLoader import (
     load_structured_files,
 )
 from rag_experiment_accelerator.utils.logging import get_logger
-from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials
+from rag_experiment_accelerator.config.credentials import (
+    AzureDocumentIntelligenceCredentials,
+)
 
 logger = get_logger(__name__)
 
44 changes: 22 additions & 22 deletions rag_experiment_accelerator/init_Index/create_index.py
@@ -4,16 +4,16 @@
     CharFilter,
     CorsOptions,
     HnswParameters,
-    HnswVectorSearchAlgorithmConfiguration,
+    VectorSearchAlgorithmConfiguration,
     LexicalTokenizer,
-    PrioritizedFields,
+    SemanticPrioritizedFields,
     SearchableField,
     SearchField,
     SearchFieldDataType,
     SearchIndex,
     SemanticConfiguration,
     SemanticField,
-    SemanticSettings,
+    SemanticSearch,
     SimpleField,
     TokenFilter,
     VectorSearch,
@@ -35,7 +35,6 @@ def create_acs_index(
     analyzers,
 ):
     try:
-
         credential = AzureKeyCredential(key)
 
         # Apply checks on analyzer settings. Search analyzer and index analyzer must be set together
@@ -44,15 +43,20 @@ def create_acs_index(
 
         if bool(index_analyzer_name) != bool(search_analyzer_name):
             raise ValueError(
-                "Both 'index_analyzer_name' and 'search_analyzer_name' must be set together")
+                "Both 'index_analyzer_name' and 'search_analyzer_name' must be set together"
+            )
 
         index_analyzer = index_analyzer_name if index_analyzer_name else ""
         search_analyzer = search_analyzer_name if search_analyzer_name else ""
 
         # Analyzer can only be used if neither search analyzer or index analyzer are set
-        if analyzers.get("analyzer_name") and (analyzers.get("search_analyzer_name") or analyzers.get("index_analyzer_name")):
+        if analyzers.get("analyzer_name") and (
+            analyzers.get("search_analyzer_name")
+            or analyzers.get("index_analyzer_name")
+        ):
             raise ValueError(
-                "analyzer_name should be empty if either search_analyzer_name or index_analyzer_name is not empty")
+                "analyzer_name should be empty if either search_analyzer_name or index_analyzer_name is not empty"
+            )
         analyzer = analyzers.get("analyzer_name") or ""
 
         # Create a search index
@@ -100,24 +104,21 @@ def create_acs_index(
             ),
             SearchField(
                 name="contentVector",
-                type=SearchFieldDataType.Collection(
-                    SearchFieldDataType.Single),
+                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                 searchable=True,
                 vector_search_dimensions=int(dimension),
                 vector_search_profile="my-vector-search-profile",
             ),
             SearchField(
                 name="contentTitle",
-                type=SearchFieldDataType.Collection(
-                    SearchFieldDataType.Single),
+                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                 searchable=True,
                 vector_search_dimensions=int(dimension),
                 vector_search_profile="my-vector-search-profile",
             ),
             SearchField(
                 name="contentSummary",
-                type=SearchFieldDataType.Collection(
-                    SearchFieldDataType.Single),
+                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                 searchable=True,
                 vector_search_dimensions=int(dimension),
                 vector_search_profile="my-vector-search-profile",
@@ -134,9 +135,10 @@ def create_acs_index(
 
         vector_search = VectorSearch(
             algorithms=[
-                HnswVectorSearchAlgorithmConfiguration(
+                VectorSearchAlgorithmConfiguration(
                     name="my-vector-config",
-                    parameters=HnswParameters(
+                    kind="hnsw",
+                    hnsw_parameters=HnswParameters(
                         m=4,
                         ef_construction=int(ef_construction),
                         ef_search=int(ef_search),
@@ -153,14 +155,13 @@ def create_acs_index(
 
         semantic_config = SemanticConfiguration(
             name="my-semantic-config",
-            prioritized_fields=PrioritizedFields(
-                prioritized_content_fields=[
-                    SemanticField(field_name="content")]
+            prioritized_fields=SemanticPrioritizedFields(
+                content_fields=[SemanticField(field_name="content")]
             ),
         )
 
         # Create the semantic settings with the configuration
-        semantic_settings = SemanticSettings(configurations=[semantic_config])
+        semantic_search = SemanticSearch(configurations=[semantic_config])
 
         # Define a custom tokenizer, token filter and char filter
         tokenizers = []
@@ -190,16 +191,15 @@ def create_acs_index(
             for char_filter in analyzers["char_filters"]
         ]
 
-        cors_options = CorsOptions(
-            allowed_origins=["*"], max_age_in_seconds=60)
+        cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)
         scoring_profiles = []
 
         # Create the search index with the semantic, tokenizer, and filter settings
         index = SearchIndex(
             name=index_name,
             fields=fields,
             vector_search=vector_search,
-            semantic_settings=semantic_settings,
+            semantic_search=semantic_search,
             scoring_profiles=scoring_profiles,
             cors_options=cors_options,
             tokenizers=tokenizers,
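Taken together, the create_index.py changes track the azure-search-documents model renames: HnswVectorSearchAlgorithmConfiguration becomes a VectorSearchAlgorithmConfiguration with an explicit kind, PrioritizedFields/prioritized_content_fields become SemanticPrioritizedFields/content_fields, and SemanticSettings becomes SemanticSearch, wired into SearchIndex via semantic_search instead of semantic_settings. A condensed sketch of the new shape as used in this commit, with illustrative parameter values:

from azure.search.documents.indexes.models import (
    HnswParameters,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    VectorSearchAlgorithmConfiguration,
)

# vector algorithm: kind + hnsw_parameters replace the old parameters= argument
algorithm = VectorSearchAlgorithmConfiguration(
    name="my-vector-config",
    kind="hnsw",
    hnsw_parameters=HnswParameters(m=4, ef_construction=400, ef_search=500),
)

# semantic ranking: renamed container class and field-list argument
semantic_search = SemanticSearch(
    configurations=[
        SemanticConfiguration(
            name="my-semantic-config",
            prioritized_fields=SemanticPrioritizedFields(
                content_fields=[SemanticField(field_name="content")]
            ),
        )
    ]
)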
4 changes: 2 additions & 2 deletions requirements.txt
@@ -3,7 +3,7 @@ azure-ai-textanalytics==5.3.0
 azure-core==1.30.1
 azure-identity==1.15.0
 azure-keyvault-secrets==4.8.*
-azure-search-documents==11.4.0b11
+azure-search-documents==11.6.0b2
 azure.ai.documentintelligence==1.0.0b2
 azureml-mlflow==1.55.0
 beautifulsoup4==4.12.3
@@ -12,7 +12,7 @@ docx2txt==0.8
 evaluate==0.4.1
 fuzzywuzzy==0.18.0
 hnswlib==0.8.0
-langchain==0.1.9
+langchain==0.1.11
 langchain-community==0.0.28
 levenshtein==0.25.0
 lxml==5.1.0
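The pin bumps pair azure-search-documents 11.6.0b2 with langchain 0.1.11 and langchain-community 0.0.28, which is what drives the import and model renames above. A quick way to confirm the installed versions match (standard library only):

import importlib.metadata as metadata

for package in ("azure-search-documents", "langchain", "langchain-community"):
    print(package, metadata.version(package))  # expect 11.6.0b2, 0.1.11, 0.0.28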
