Update requirements, fix warnings (#394)
* Update requirements, fix warnings

* Upd
julia-meshcheryakova authored Mar 18, 2024
1 parent 3a063cb commit 78c68f2
Showing 8 changed files with 44 additions and 23 deletions.
2 changes: 1 addition & 1 deletion dev-requirements.txt
@@ -4,4 +4,4 @@ pytest==8.1.1
 pytest-cov==4.1.0
 flake8==7.0.0
 pre-commit==3.6.2
-black==24.2.0
+black==24.3.0
8 changes: 5 additions & 3 deletions rag_experiment_accelerator/config/config.py
@@ -124,7 +124,9 @@ def _initialize(self, config_dir: str, data_dir: str, filename: str) -> None:
         self.AzureSearchCredentials = AzureSearchCredentials.from_env()
         self.AzureMLCredentials = AzureMLCredentials.from_env()
         self.AzureSkillsCredentials = AzureSkillsCredentials.from_env()
-        self.AzureDocumentIntelligenceCredentials = AzureDocumentIntelligenceCredentials.from_env()
+        self.AzureDocumentIntelligenceCredentials = (
+            AzureDocumentIntelligenceCredentials.from_env()
+        )

         self.embedding_models: list[EmbeddingModel] = []
         embedding_model_config = data.get("embedding_models", [])
@@ -145,11 +147,11 @@ def _initialize(self, config_dir: str, data_dir: str, filename: str) -> None:

             self.MAIN_PROMPT_INSTRUCTION = data["main_prompt_instruction"]
             if self.MAIN_PROMPT_INSTRUCTION is None:
-                logger.warn(
+                logger.warning(
                     "prompt_config.json found but main_prompt_instruction is"
                     " not set. Using default prompts"
                 )
                 self.MAIN_PROMPT_INSTRUCTION = main_prompt_instruction
         except OSError:
-            logger.warn("prompt_config.json not found. Using default prompts")
+            logger.warning("prompt_config.json not found. Using default prompts")
             self.MAIN_PROMPT_INSTRUCTION = main_prompt_instruction
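For context on the warning fixed in this hunk: logging.Logger.warn is a long-deprecated alias for Logger.warning and emits a DeprecationWarning on current CPython versions, which is why the two calls above were renamed. A minimal standalone sketch of the difference (not code from this repository):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Deprecated alias: logs the message but also emits
# "DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead".
logger.warn("prompt_config.json not found. Using default prompts")

# Non-deprecated spelling used by this commit: same log record, no DeprecationWarning.
logger.warning("prompt_config.json not found. Using default prompts")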
6 changes: 4 additions & 2 deletions rag_experiment_accelerator/doc_loader/docxLoader.py
@@ -1,10 +1,12 @@
-from langchain.document_loaders import Docx2txtLoader
+from langchain_community.document_loaders import Docx2txtLoader

 from rag_experiment_accelerator.doc_loader.structuredLoader import (
     load_structured_files,
 )
 from rag_experiment_accelerator.utils.logging import get_logger
-from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials
+from rag_experiment_accelerator.config.credentials import (
+    AzureDocumentIntelligenceCredentials,
+)

 logger = get_logger(__name__)

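The same import move is repeated in the loader files below (html, markdown, pdf, text): on langchain 0.1.x the bundled document loaders live in the separate langchain-community package, and importing them through langchain.document_loaders still works but emits a LangChainDeprecationWarning. A minimal sketch of the migrated import in use; the .docx path is hypothetical and only for illustration:

# Old location, deprecated on langchain 0.1.x:
# from langchain.document_loaders import Docx2txtLoader

# New location used by this commit (langchain-community is pinned in
# requirements.txt further down in this diff):
from langchain_community.document_loaders import Docx2txtLoader

loader = Docx2txtLoader("example.docx")  # hypothetical sample file
documents = loader.load()  # returns a list of langchain Document objects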
6 changes: 4 additions & 2 deletions rag_experiment_accelerator/doc_loader/htmlLoader.py
@@ -1,10 +1,12 @@
-from langchain.document_loaders import BSHTMLLoader
+from langchain_community.document_loaders import BSHTMLLoader

 from rag_experiment_accelerator.doc_loader.structuredLoader import (
     load_structured_files,
 )
 from rag_experiment_accelerator.utils.logging import get_logger
-from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials
+from rag_experiment_accelerator.config.credentials import (
+    AzureDocumentIntelligenceCredentials,
+)

 logger = get_logger(__name__)

6 changes: 4 additions & 2 deletions rag_experiment_accelerator/doc_loader/markdownLoader.py
@@ -1,10 +1,12 @@
-from langchain.document_loaders import UnstructuredMarkdownLoader
+from langchain_community.document_loaders import UnstructuredMarkdownLoader

 from rag_experiment_accelerator.doc_loader.structuredLoader import (
     load_structured_files,
 )
 from rag_experiment_accelerator.utils.logging import get_logger
-from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials
+from rag_experiment_accelerator.config.credentials import (
+    AzureDocumentIntelligenceCredentials,
+)

 logger = get_logger(__name__)

27 changes: 19 additions & 8 deletions rag_experiment_accelerator/doc_loader/pdfLoader.py
@@ -1,9 +1,13 @@
-from langchain.document_loaders import PyPDFDirectoryLoader
+from langchain_community.document_loaders import PyPDFDirectoryLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter

 from rag_experiment_accelerator.utils.logging import get_logger
-from rag_experiment_accelerator.doc_loader.documentIntelligenceLoader import azure_document_intelligence_directory_loader
-from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials
+from rag_experiment_accelerator.doc_loader.documentIntelligenceLoader import (
+    azure_document_intelligence_directory_loader,
+)
+from rag_experiment_accelerator.config.credentials import (
+    AzureDocumentIntelligenceCredentials,
+)
 import uuid
 import re

@@ -29,9 +33,9 @@ def preprocess_pdf_content(content: str):
     # Output: "hello world openai"
     """

-    content = re.sub(r'\n{2,}', '\n', content)
-    content = re.sub(r'\n{1,}', '', content)
-    content = re.sub(r'\\u[0-9a-fA-F]{4}', '', content)
+    content = re.sub(r"\n{2,}", "\n", content)
+    content = re.sub(r"\n{1,}", "", content)
+    content = re.sub(r"\\u[0-9a-fA-F]{4}", "", content)
     content = content.lower()

     return content
@@ -63,7 +67,12 @@ def load_pdf_files(
     documents = []
     for pattern in glob_patterns:
         if chunking_strategy == "azure-document-intelligence":
-            documents += azure_document_intelligence_directory_loader(pattern, folder_path, AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT, AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY)
+            documents += azure_document_intelligence_directory_loader(
+                pattern,
+                folder_path,
+                AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT,
+                AzureDocumentIntelligenceCredentials.AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY,
+            )
         else:
             # using langchain
             loader = PyPDFDirectoryLoader(
@@ -86,7 +95,9 @@ def load_pdf_files(
     docs = text_splitter.split_documents(documents)
     docsList = []
     for doc in docs:
-        docsList.append(dict({str(uuid.uuid4()): preprocess_pdf_content(doc.page_content)}))
+        docsList.append(
+            dict({str(uuid.uuid4()): preprocess_pdf_content(doc.page_content)})
+        )

     logger.info(f"Split {len(documents)} PDF pages into {len(docs)} chunks")

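The quoting change in the preprocess_pdf_content hunk above is purely stylistic (Black normalizes string literals to double quotes); the behaviour is unchanged: collapse runs of blank lines, strip the remaining newlines and any literal \uXXXX escape sequences, and lowercase the result. A standalone sketch of that pipeline with an illustrative input:

import re


def preprocess_pdf_content(content: str):
    # Collapse runs of two or more newlines into one, then drop the rest.
    content = re.sub(r"\n{2,}", "\n", content)
    content = re.sub(r"\n{1,}", "", content)
    # Remove literal backslash-u escape sequences such as "\u00a0".
    content = re.sub(r"\\u[0-9a-fA-F]{4}", "", content)
    content = content.lower()
    return content


print(preprocess_pdf_content("Hello\n\n world\n openai"))  # prints "hello world openai"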
6 changes: 4 additions & 2 deletions rag_experiment_accelerator/doc_loader/textLoader.py
@@ -1,10 +1,12 @@
-from langchain.document_loaders import TextLoader
+from langchain_community.document_loaders import TextLoader

 from rag_experiment_accelerator.doc_loader.structuredLoader import (
     load_structured_files,
 )
 from rag_experiment_accelerator.utils.logging import get_logger
-from rag_experiment_accelerator.config.credentials import AzureDocumentIntelligenceCredentials
+from rag_experiment_accelerator.config.credentials import (
+    AzureDocumentIntelligenceCredentials,
+)

 logger = get_logger(__name__)

6 changes: 3 additions & 3 deletions requirements.txt
@@ -12,14 +12,14 @@ docx2txt==0.8
 evaluate==0.4.1
 fuzzywuzzy==0.18.0
 hnswlib==0.8.0
-langchain==0.1.9
+langchain==0.1.12
 langchain-community==0.0.28
 levenshtein==0.25.0
 lxml==5.1.0
 matplotlib==3.8.3
 mlflow==2.11.1
-openai==1.13.3
-plotly==5.19.0
+openai==1.14.1
+plotly==5.20.0
 pypdf==4.1.0
 pytesseract==0.3.10
 python-dotenv==1.0.1

