diff --git a/.github/contribute_guide.md b/.github/contribute_guide.md
new file mode 100644
index 0000000..c5f4798
--- /dev/null
+++ b/.github/contribute_guide.md
@@ -0,0 +1,25 @@
+# Contribution Guide
+This folder holds the CI pipeline. It currently only runs code-style checks (pre-commit), triggered when a PR is opened and when merging to `main`.
+
+We follow GitHub Flow for contributing. The steps are as follows:
+
+1. **Claim an issue**: Start by picking an issue from GitHub.
+2. **Create a branch**: Open a new branch with a clear name related to the issue (e.g., `feat/xxxxx-feature`).
+3. **Development**: After completing the feature, ensure you run pre-commit hooks:
+   ```
+   pre-commit run --all-files
+   ```
+4. **Create a Pull Request (PR)**:
+   - Ensure your PR is small and easily reviewable.
+   - Add the GitHub issue number to the PR title in the format `feat(#123): xxxxxx` for easy reference.
+   - Write a clear description including the reason for the change and what was modified (`Reason & Changes`).
+5. **Review & Approval**:
+   - Assign the PR to all members of the team for review.
+   - Wait for at least one approval.
+   - Ensure all CI checks pass.
+6. **Merge**: Once approved and CI passes, merge the branch into `main` yourself.
+
+## Additional Notes
+- Keep your commits focused and ensure meaningful commit messages.
+- Always rebase your branch on top of `main` before merging.
+- Avoid large, multi-purpose PRs. Smaller changes are easier to review and help prevent issues.
diff --git a/Model/README.md b/Model/README.md
new file mode 100644
index 0000000..884c41b
--- /dev/null
+++ b/Model/README.md
@@ -0,0 +1,10 @@
+# Main retrieval programs
+
+## flask_app.py
+Exposes an API for main.py to call; each request sends in one question and returns one answer pid.
+
+## utils/retrieval_agent.py
+Calls Weaviate and the Voyage reranker to perform retrieval.
+
+## utils/config_log.py
+Handles the config file and sets up logging.
diff --git a/src/flask_app.py b/Model/flask_app.py
similarity index 86%
rename from src/flask_app.py
rename to Model/flask_app.py
index 991bfa2..e53164b 100644
--- a/src/flask_app.py
+++ b/Model/flask_app.py
@@ -59,18 +59,18 @@ def get(self):
         return response


-# TODO: Modify the output format for general RAG purposes
 @ns.route('/chat')
 class ChatBot(Resource):
     @api.doc('chat_bot')
     @api.expect(model)
     def post(self):
+        """Retrieve-and-rank API entry point."""
         qid = request.json.get('qid')
         source = request.json.get('source')
         question = request.json.get('query')
         category = request.json.get('category')

-        # for alpha testing
+        # for alpha testing (finding the best hybrid-search alpha)
         # alpha = request.json.get('alpha')

         # input template
@@ -81,9 +81,10 @@ def post(self):
         #     "category": "insurance"
         # },

-        alpha = 0.5
+        alpha = 0.5  # since the reranker ultimately processes all the sources, the alpha value has no effect on accuracy

         if not question:
+            # just to be safe, always return a result no matter what; no error logging
             response = jsonify({'qid': '1', 'retrieve': '1'})
             response.status_code = 200
             return response
@@ -103,19 +104,24 @@ def post(self):
             response.status_code = 200
             return response
         except TypeError:
+            # just to be safe, always return a result no matter what; no error logging
             response = jsonify({'qid': qid, 'retrieve': source[-1]})
             response.status_code = 200
             return response


+# For API Docs
 @app.before_request
 def require_auth_for_docs():
+    """Require authentication for the API docs."""
     if request.path == '/':
         return auth.login_required()(swagger_ui)()


+# For API Docs
 @app.route('/')
 def swagger_ui():
+    """Redirect to the Swagger UI."""
     return api.render_doc()
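For quick reference, the `/chat` endpoint above can be exercised the same way `main.py` does it; a minimal sketch, assuming the Flask default port 5000 (the pids and query text below are made up):

```python
import requests

# Hypothetical payload following the input template documented in flask_app.py
payload = {
    'qid': '1',
    'source': ['442', '115', '440'],  # candidate pids the answer must come from
    'query': 'an example question',
    'category': 'insurance',
}
resp = requests.post('http://127.0.0.1:5000/api/chat', json=payload)
print(resp.json())  # e.g. {'qid': '1', 'retrieve': '442'}
```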
diff --git a/Model/utils/README.md b/Model/utils/README.md
new file mode 100644
index 0000000..04f9b54
--- /dev/null
+++ b/Model/utils/README.md
@@ -0,0 +1,7 @@
+# All helper modules called by the flask_app main program
+
+## retrieval_agent.py
+Calls Weaviate and the Voyage reranker to perform retrieval.
+
+## config_log.py
+Handles the config file and sets up logging.
diff --git a/src/utils/__init__.py b/Model/utils/__init__.py
similarity index 100%
rename from src/utils/__init__.py
rename to Model/utils/__init__.py
diff --git a/src/utils/config_log.py b/Model/utils/config_log.py
similarity index 92%
rename from src/utils/config_log.py
rename to Model/utils/config_log.py
index 655d224..0ad925c 100644
--- a/src/utils/config_log.py
+++ b/Model/utils/config_log.py
@@ -6,6 +6,7 @@


 def setup_config_and_logging():
+    """Set up the configuration and logging."""
     config = configparser.ConfigParser()
     logger = logging.getLogger()

diff --git a/src/utils/retrieval_agent.py b/Model/utils/retrieval_agent.py
similarity index 74%
rename from src/utils/retrieval_agent.py
rename to Model/utils/retrieval_agent.py
index 4400670..6534092 100644
--- a/src/utils/retrieval_agent.py
+++ b/Model/utils/retrieval_agent.py
@@ -6,30 +6,34 @@
 import utils.config_log as config_log

-# 載入設定檔案和日誌設定
 config, logger, CONFIG_PATH = config_log.setup_config_and_logging()
 config.read(CONFIG_PATH)

-# 從 config 中取得 Weaviate URL 和 API 金鑰
-wea_url = config.get('Weaviate', 'weaviate_url')
-voyage_api_key = config.get('VoyageAI', 'api_key')
-PROPERTIES = ['pid', 'content']
+wea_url = config.get('Weaviate', 'weaviate_url')  # the vector database used in this project
+voyage_api_key = config.get('VoyageAI', 'api_key')  # API key for the Voyage reranker
+PROPERTIES = ['pid', 'content']  # the fields of this class in the vector database

 # 設定 OpenAI API 金鑰
 os.environ['OPENAI_API_KEY'] = config.get('OpenAI', 'api_key')


 class WeaviateSemanticSearch:
+    """Search class for the Weaviate vector database."""
+
     def __init__(self, classnm):
+        """Initialize the Weaviate search class."""
         self.url = wea_url
+        # the OpenAI embedding model chosen for this project
         self.embeddings = OpenAIEmbeddings(chunk_size=1, model='text-embedding-3-large')
         self.client = weaviate.Client(url=wea_url)
         self.classnm = classnm

     def hybrid_search(self, query, source, num, alpha):
+        """Hybrid search against the Weaviate vector database."""
         query_vector = self.embeddings.embed_query(query)
         vector_str = ','.join(map(str, query_vector))

+        # The two query expressions below filter on the pids listed in source, so that
+        # retrieval & rerank only consider documents whose pid appears in source
         where_conditions = ' '.join([f'{{path: ["pid"], operator: Equal, valueText: "{pid}"}}' for pid in source])

         gql_query = f"""
@@ -63,16 +67,20 @@ def hybrid_search(self, query, source, num, alpha):


 def rerank_with_voyage(query, documents, pids, api_key):
+    """Rerank the Weaviate hybrid-search retrieval results with the Voyage reranker."""
     vo = voyageai.Client(api_key=api_key)
+    # use voyage rerank-2 to pick the final top 1 from all documents returned by the hybrid-search retrieval
     reranking = vo.rerank(query, documents, model='rerank-2', top_k=1)
     top_result = reranking.results[0]

-    # 根據內容找到相對應的 pid
     top_pid = pids[documents.index(top_result.document)]

     return {'pid': top_pid, 'relevance_score': top_result.relevance_score}


 def search_do(question, category, source, alpha):
+    """The 'search' entry point called by flask_app.py."""
+
+    # Pick the vector-database class that matches the question's category
     if category == 'finance':
         vdb_named = 'Financedev'
     elif category == 'insurance':
@@ -81,16 +89,16 @@ def search_do(question, category, source, alpha):
         vdb_named = 'Faqdev'

     searcher = WeaviateSemanticSearch(vdb_named)
-    # 從 Weaviate 取得前 100 筆結果
+    # Retrieve the top 100 results from the Weaviate hybrid search
     top_100_results = searcher.hybrid_search(question, source, 100, alpha=alpha)

-    # 準備文件和 pid 列表供 rerank 使用
     documents = [result['content'] for result in top_100_results]
     pids = [result['pid'] for result in top_100_results]

     # 使用 VoyageAI 重新排序,並取得排名最高的 pid
     top_reranked_result = rerank_with_voyage(question, documents, pids, voyage_api_key)

+    # Log the result
     print('最相關文件的 PID:')
     print(f"PID: {top_reranked_result['pid']}")
     print(f"相關性分數: {top_reranked_result['relevance_score']}")
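To tie the pieces together: `search_do` runs the hybrid search restricted to the given pids, then lets the reranker pick a single winner. A minimal usage sketch (the question and pids are made up; it assumes a running Weaviate instance already populated by `Preprocess/insert_data.py`):

```python
from utils.retrieval_agent import search_do

# Hypothetical inputs; real values come from the competition's questions.json
search_do('an example insurance question', 'insurance', ['442', '115', '440'], alpha=0.5)
# hybrid_search filters to the given pids and retrieves the top 100 candidates,
# then rerank_with_voyage picks the single most relevant pid, which search_do logs
```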
diff --git a/Preprocess/README.md b/Preprocess/README.md
new file mode 100644
index 0000000..082c981
--- /dev/null
+++ b/Preprocess/README.md
@@ -0,0 +1,8 @@
+# All data-processing code lives in this folder
+It covers both data preprocessing and writing the data into the database.
+
+## data_process/
+OCR and direct PDF text extraction.
+
+## insert_data.py
+Writes the data into the database; it also creates the database class, embeds the data, and uses a text_splitter to chunk entries whose token count is too large.
diff --git a/Preprocess/data_process/README.md b/Preprocess/data_process/README.md
new file mode 100644
index 0000000..39a494e
--- /dev/null
+++ b/Preprocess/data_process/README.md
@@ -0,0 +1,204 @@
+---
+title: Data Preprocessing Guide
+This folder contains the data-preprocessing code
+- OCR & direct PDF text extraction
+
+---
+
+
+# Data Preprocessing Guide
+
+## Overview
+
+This folder contains Python code for reading and processing the FAQ (JSON) file and the Finance and Insurance (PDF) documents in the Reference folder. Its main functions are:
+
+- First, extract the PDF files in the specified folders from the ZIP archive, convert each page to an image, and run Tesseract OCR to extract the text. The extracted text is saved as `.txt` files, organized by category.
+- Then, read the FAQ JSON file and the OCR-generated text files, format everything, and merge it into a single unified JSON file for later retrieval and processing.
+
+## Environment and Dependencies
+
+### Python packages
+
+- `pytesseract`
+- `pdf2image`
+- `zipfile` (standard library)
+- `json` (standard library)
+- `os` (standard library)
+
+### External dependencies
+
+- **Tesseract-OCR**: used for OCR.
+  - Download: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract)
+  - Example install path: `C:\Program Files\Tesseract-OCR\tesseract.exe`
+- **Poppler**: used for PDF-to-image conversion.
+  - Download: [Poppler for Windows](http://blog.alivate.com.au/poppler-windows/)
+  - Example install path: `C:\Program Files\poppler-24.08.0\Library\bin`
+
+## Installation
+
+### 1. Clone or download the project
+
+If you do not have the project code yet, clone or download it locally:
+
+```bash
+git clone https://github.com/yourusername/your-repo-name.git
+cd your-repo-name
+```
+
+
+### 2. Install the external dependencies
+
+- **Tesseract-OCR**:
+  - Download and install Tesseract-OCR.
+  - After installation, note the install path (e.g., `C:\Program Files\Tesseract-OCR\tesseract.exe`).
+
+- **Poppler**:
+  - Download and install Poppler.
+  - After installation, note the `poppler_path` (e.g., `C:\Program Files\poppler-24.08.0\Library\bin`).
+
+### 3. Install the Python packages
+
+Install the required Python packages:
+
+```bash
+pip install pytesseract==0.3.13
+pip install pdf2image==1.17.0
+```
+
+## Configuration
+
+Configure the Tesseract and Poppler paths in the code:
+
+```python
+# Configure Tesseract path if necessary (update this path as needed)
+pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
+
+# Specify the path to the Poppler binaries
+poppler_path = r"C:\Program Files\poppler-24.08.0\Library\bin"
+```
+
+Make sure to replace the paths above with the actual install paths on your machine.
+
+## Usage
+
+### 1. Prepare the data
+
+Make sure your ZIP file contains the following folders and files:
+
+- `競賽資料集/reference/faq/pid_map_content.json`
+- `競賽資料集/reference/finance/*.pdf`
+- `競賽資料集/reference/insurance/*.pdf`
+
+### 2. Run the OCR extraction
+
+Run the following command to perform OCR:
+
+```bash
+python data_preprocess.py
+```
+
+The script performs the following steps:
+
+1. Extract the Finance and Insurance PDF files from the specified ZIP archive.
+2. Convert every page of each PDF into an image.
+3. Run Tesseract OCR to extract the text content.
+4. Save the extracted text as `.txt` files, stored by category under `dataset/output_text/finance/` and `dataset/output_text/insurance/`.
+
+### 3. Format the data
+
+The script then continues with the following steps:
+
+1. Read the FAQ file `pid_map_content.json` and extract the questions and answers.
+2. Read the OCR-generated text files and merge their contents in PDF-file and page order.
+3. Format and merge all the data into a single JSON file, `dataset/formatted_reference_ocr.json`.
+
+### 4. Inspect the output
+
+- **OCR output text files**:
+  - Finance text files are saved under `dataset/output_text/finance/`.
+  - Insurance text files are saved under `dataset/output_text/insurance/`.
+
+- **Merged JSON file**:
+  - `dataset/formatted_reference_ocr.json` contains all of the formatted FAQ, Finance, and Insurance data.
+
+## File structure
+
+```
+project/
+├── dataset/
+│   ├── output_text/
+│   │   ├── finance/
+│   │   │   ├── 0.pdf_page_1.txt
+│   │   │   ├── 1.pdf_page_1.txt
+│   │   │   ├── 1.pdf_page_2.txt
+│   │   │   └── ...
+│   │   └── insurance/
+│   │       ├── 1.pdf_page_1.txt
+│   │       ├── 1.pdf_page_2.txt
+│   │       └── ...
+│   └── formatted_reference_ocr.json
+├── datazip.zip
+│   └── 競賽資料集/
+│       └── reference/
+│           ├── faq/
+│           │   └── pid_map_content.json
+│           ├── finance/
+│           │   ├── 0.pdf
+│           │   ├── 1.pdf
+│           │   └── ...
+│           └── insurance/
+│               ├── 1.pdf
+│               ├── 2.pdf
+│               └── ...
+├── data_preprocess.py
+└── README.md
+```
+
+## Example output
+
+An example of the structure of the generated `formatted_reference_ocr.json`:
+
+```json
+[
+    {
+        "category": "faq",
+        "qid": "0",
+        "content": {
+            "question": "什麼是跨境手機掃碼支付?",
+            "answers": [
+                "允許大陸消費者可以用手機支付寶App在台灣實體商店購買商品或服務"
+            ]
+        }
+    },  // other FAQ entries...
+    {
+        "category": "finance",
+        "qid": "0",
+        "content": "註 1U ﹕ 本 雄 團 於 民 國 111] 年 第 1 季 投 賁 成 立 寶 元 智 造 公 司 , 由 本 集 圖 持\n有 100% 股 權 , 另 於 民 國 111 年 第 3 季 及 112 年 第 1 季 未 依 持 股 比..."
+    },  // other Finance entries...
+    {
+        "category": "insurance",
+        "qid": "1",
+        "content": "延 期 間 內 發 生 第 十 六 條 或 第 十 七 條 本 公 司 應 負 係 險 貫 任 之 事 故 時 , 其 約 定 之 係 險 金 計 算 方 式 將 不 適 用 , 本 公\n..."
+    }  // other Insurance entries...
+]
+```
+
+(The `content` strings above are raw OCR output, so they contain genuine OCR misreads; the `//` comments are illustrative and not valid JSON.)
+
+## Notes
+
+- **Encoding**: make sure all text files use UTF-8 so that Chinese characters are handled correctly and no garbled text appears.
+- **Path configuration**:
+  - Update the `tesseract_cmd` and `poppler_path` variables in the code to match your local install paths.
+- **File naming**:
+  - OCR text files must follow the `{filename}.pdf_page_{page_number}.txt` naming convention so that the code can correctly read and merge the pages.
+- **Dependencies**:
+  - Make sure Tesseract-OCR and Poppler are correctly installed and configured, otherwise the code will not run.
+
+## License
+
+This project is released under the [MIT License](LICENSE). You are free to use, modify, and distribute it.
+
+---
+
+**Thanks for using this project!**
diff --git a/src/data/conbine_readpdf_result.py b/Preprocess/data_process/conbine_readpdf_result.py
similarity index 93%
rename from src/data/conbine_readpdf_result.py
rename to Preprocess/data_process/conbine_readpdf_result.py
index a021ced..7976b17 100644
--- a/src/data/conbine_readpdf_result.py
+++ b/Preprocess/data_process/conbine_readpdf_result.py
@@ -4,7 +4,7 @@
 with open('data/aicup_noocr.json', encoding='utf-8') as file:
     noocr_data = json.load(file)

-with open('data/aicup_ref.json', encoding='utf-8') as file:
+with open('data/formatted_reference_ocr.json', encoding='utf-8') as file:
     ref_data = json.load(file)

 # 建立 ref_data 的 dictionary,並檢查 content 是否為字串,再去除空格
diff --git a/Preprocess/data_process/data_preprocess.py b/Preprocess/data_process/data_preprocess.py
new file mode 100644
index 0000000..e82c118
--- /dev/null
+++ b/Preprocess/data_process/data_preprocess.py
@@ -0,0 +1,159 @@
+import json
+import os
+import zipfile
+
+import pytesseract
+from pdf2image import convert_from_bytes
+
+# Configure Tesseract path if necessary (update this path as needed)
+pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
+
+
+def ocr_in_folder(zip_path, folder, output_dir):
+    """
+    Extracts PDF files from a ZIP archive, performs OCR, and saves the output text.
+
+    Args:
+        zip_path (str): The path to the ZIP file containing the documents.
+        folder (str): The folder path inside the ZIP to search for PDF files.
+        output_dir (str): The directory to save the OCR output text files.
+
+    Returns:
+        None
+    """
+    folder_path = f'{folder}/'
+
+    with zipfile.ZipFile(zip_path, 'r') as zipf:
+        for zip_info in zipf.infolist():
+            if zip_info.filename.startswith(folder_path) and not zip_info.is_dir():
+                with zipf.open(zip_info.filename) as pdf_file:
+                    pdf_bytes = pdf_file.read()
+
+                # Specify the path to the Poppler binaries if needed
+                poppler_path = r'C:\Program Files\poppler-24.08.0\Library\bin'
+
+                # Convert the PDF bytes to images
+                images = convert_from_bytes(pdf_bytes, dpi=300, poppler_path=poppler_path)
+
+                os.makedirs(output_dir, exist_ok=True)
+
+                # Extract only the base filename (e.g., "file1.pdf" instead of the full path)
+                base_filename = os.path.basename(zip_info.filename)
+
+                # Perform OCR on each page and save the text
+                for i, image in enumerate(images):
+                    text = pytesseract.image_to_string(image, lang='chi_tra')
+                    output_file_path = os.path.join(output_dir, f'{base_filename}_page_{i + 1}.txt')
+                    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
+                    with open(output_file_path, 'w', encoding='utf-8') as f:
+                        f.write(text)
+                print(f'OCR completed for {base_filename}')
+
+
+# OCR extraction paths
+zip_path = 'datazip.zip'
+ocr_in_folder(zip_path, '競賽資料集/reference/insurance', 'dataset/output_text/insurance')
+ocr_in_folder(zip_path, '競賽資料集/reference/finance', 'dataset/output_text/finance')
+
+# File paths
+FAQ_FILEPATH = 'datazip/競賽資料集/reference/faq/pid_map_content.json'
+FINANCE_OCR_FOLDER_PATH = 'dataset/output_text/finance'
+INSURANCE_OCR_FOLDER_PATH = 'dataset/output_text/insurance'
+
+
+def check_text(file_path, category):
+    """
+    Reads a JSON FAQ file, processes it, and returns formatted data.
+
+    Args:
+        file_path (str): Path to the FAQ JSON file.
+        category (str): Category label for the FAQ data.
+
+    Returns:
+        list: A list of dictionaries containing formatted FAQ data.
+    """
+    formatted_data = []
+    with open(file_path, encoding='utf-8') as faq_file:
+        loaded_faq = json.load(faq_file)
+
+    for qid, questions in loaded_faq.items():
+        for question_item in questions:
+            formatted_entry = {
+                'category': category,
+                'qid': qid,
+                'content': {'question': question_item['question'], 'answers': question_item['answers']},
+            }
+            formatted_data.append(formatted_entry)
+            print(formatted_entry)
+    return formatted_data
+
+
+def read_ocr_files(ocr_folder_path, category):
+    """
+    Reads text files generated from OCR, consolidates them, and returns formatted data.
+
+    Args:
+        ocr_folder_path (str): Path to the folder containing OCR text files.
+        category (str): Category label for the OCR data.
+
+    Returns:
+        list: A list of dictionaries containing consolidated OCR data.
+    """
+    formatted_data = []
+
+    # Collect the base filenames of the OCR text files
+    file_basenames = set()
+    for filename in os.listdir(ocr_folder_path):
+        if filename.endswith('.txt'):
+            basename = filename.split('.pdf_page_')[0]
+            file_basenames.add(basename)
+
+    for basename in sorted(file_basenames, key=lambda x: int(x)):
+        all_text = ''
+        page_files = []
+
+        for filename in os.listdir(ocr_folder_path):
+            if filename.startswith(basename) and filename.endswith('.txt'):
+                page_files.append(filename)
+
+        page_files = sorted(page_files, key=lambda x: int(x.split('.pdf_page_')[1].split('.txt')[0]))
+
+        for page_file in page_files:
+            ocr_file_path = os.path.join(ocr_folder_path, page_file)
+            with open(ocr_file_path, encoding='utf-8') as ocr_file:
+                content = ocr_file.read()
+                all_text += content + '\n\n'
+
+        formatted_entry = {'category': category, 'qid': basename, 'content': all_text.strip()}
+        formatted_data.append(formatted_entry)
+        print(formatted_entry)
+
+    return formatted_data
+
+
+if __name__ == '__main__':
+    """
+    Main entry point of the script. Processes FAQ, finance, and insurance OCR data,
+    consolidates them, and saves the result to a JSON file.
+    """
+    total_formatted_data = []
+
+    # handle the faq data
+    faq_data = check_text(FAQ_FILEPATH, 'faq')
+    total_formatted_data.extend(faq_data)
+
+    # read the finance ocr output
+    finance_data = read_ocr_files(FINANCE_OCR_FOLDER_PATH, 'finance')
+    total_formatted_data.extend(finance_data)
+
+    # read the insurance ocr output
+    insurance_data = read_ocr_files(INSURANCE_OCR_FOLDER_PATH, 'insurance')
+    total_formatted_data.extend(insurance_data)
+
+    # store the cleaned data in formatted_reference_ocr.json
+    output_json_path = 'data/formatted_reference_ocr.json'
+    # os.makedirs(os.path.dirname(output_json_path), exist_ok=True)
+    with open(output_json_path, 'w', encoding='utf-8') as formatted_file:
+        json.dump(total_formatted_data, formatted_file, ensure_ascii=False, indent=4)
+
+    print(f'The process is finished and the result is saved in {output_json_path}')
diff --git a/src/data/read_pdf_noocr.py b/Preprocess/data_process/read_pdf_noocr.py
similarity index 100%
rename from src/data/read_pdf_noocr.py
rename to Preprocess/data_process/read_pdf_noocr.py
diff --git a/src/db_insert.py b/Preprocess/insert_data.py
similarity index 94%
rename from src/db_insert.py
rename to Preprocess/insert_data.py
index 76ec8a0..61de115 100644
--- a/src/db_insert.py
+++ b/Preprocess/insert_data.py
@@ -16,13 +16,17 @@


 class WeaviateManager:
+    """Manager for inserting data into Weaviate."""
+
     def __init__(self, classnm):
+        """Initialize the Weaviate connection."""
         self.url = wea_url
         self.client = weaviate.Client(url=wea_url, additional_headers={'X-OpenAI-Api-Key': openai_api_key})
         self.classnm = classnm
         self.check_class_exist()

     def check_class_exist(self):
+        """Check whether the class exists."""
         if self.client.schema.exists(self.classnm):
             print(f'{self.classnm} is ready')
             return True
@@ -47,6 +51,7 @@ def check_class_exist(self):
         return True

     def insert_data(self, pid, content):
+        """Insert a record into Weaviate."""
         data_object = {'pid': pid, 'content': content}
         max_retries = 5
         for attempt in range(max_retries):
@@ -73,6 +78,7 @@ def insert_data(self, pid, content):
         return False

     def split_and_insert(self, pid, content, category):
+        """Handle the special case: split the data into chunks and insert them."""
         # 使用 TextSplitter 分割長文本
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=500)
         split_content = text_splitter.split_text(content)
@@ -118,7 +124,7 @@ def split_and_insert(self, pid, content, category):
             elif not result:  # 如果失敗且非長度問題
                 failed_records.append({'pid': pid, 'category': category})

-    # 將失敗的資料寫入 JSON 檔案
+    # Write the failed records to a JSON file; it can be used later to re-import them if needed
     if failed_records:
         with open('failed_imports.json', 'w', encoding='utf-8') as f:
             json.dump(failed_records, f, ensure_ascii=False, indent=4)
diff --git a/README.md b/README.md
index 0a74b05..2a8c3fc 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,52 @@
 # AI CUP 2024 玉山人工智慧公開挑戰賽-RAG與LLM在金融問答的應用
-** High-Accuracy RAG Retriever Template **
-## Rankings
+## Repo Structure
+```
+.
+├── .github
+│   ├── contribute_guide.md
+│   └── workflows
+│       └── ci.yml
+├── .gitignore               # files and directories for git to ignore (e.g. cache, logs, etc.)
+├── .pre-commit-config.yaml  # pre-commit hooks that check and format code, environment config, Git settings, and scan for secrets
+├── .ruff.toml               # ruff config; lint: pep8-naming, pycodestyle, pyflakes, etc.
+├── LICENSE                  # MIT License
+├── Model
+│   ├── README.md
+│   ├── flask_app.py
+│   └── utils
+│       ├── README.md
+│       ├── __init__.py
+│       ├── config_log.py
+│       └── retrieval_agent.py
+├── Preprocess
+│   ├── README.md
+│   ├── data_process
+│   │   ├── README.md
+│   │   ├── conbine_readpdf_result.py
+│   │   ├── merge_with_ocr_pdfminer.py
+│   │   ├── read_pdf_noocr.py
+│   │   └── read_pdf_ocr.py
+│   └── insert_data.py
+├── README.md
+├── config_example.ini       # example config; copy it to config.ini and edit it yourself
+├── data
+│   └── README.md
+├── docker
+│   ├── README.md
+│   ├── docker-compose.yml
+│   └── docker_install.sh
+├── main.py                  # main program
+├── requirements.txt         # Python pip requirements
+└── testing
+    ├── README.md
+    ├── checkans.py
+    └── get_best_alpha.py
+```
+
+## Setup Environment
+- **OS:** Except for the data processing, which was done on Windows, everything targets macOS/Linux; on Windows, install WSL2 or similar to provide a Linux environment.

-- Overall Ranking: 38th out of 487 teams (~7.8%)
-  - Leaderboard: 38th out of 222
-
-![AI Cup Result](img/aicup_result.png)
-
-## Development Mode
 To set up the development environment, follow these steps:

 1. Create a virtual environment:
@@ -17,95 +55,95 @@ To set up the development environment, follow these steps:
    source aicup_venv/bin/activate
    ```

-2. Install the required dependencies:
+2. Clone our repo:
+   ```
+   git clone https://github.com/JustinHsu1019/AICUP2024-RAG-LLM.git
+   cd AICUP2024-RAG-LLM
+   ```
+
+3. Install the required dependencies:
    ```
    pip install -r requirements.txt
    ```

-3. Copy the configuration example and create your own config file:
+4. Copy the configuration example and create your own config file:
    ```
    cp config_example.ini config.ini
    ```

-4. Manually add your `secret key` to the `config.ini`.
+5. Manually add your `secret key` to the `config.ini`:
+
+- The [OpenAI] api_key can be obtained by registering on the OpenAI website
+- The [VoyageAI] api_key can be obtained by registering on the VoyageAI website
+- The [Api_docs] password can be anything you like
+  - Once flask_app.py is running, visit http://127.0.0.1:5000/ to see the Swagger API docs page

-5. Create a `logs` directory:
+6. Create a `logs` directory:
    ```
    mkdir logs
    ```

-6. Navigate to the `docker` directory (optional):
+7. Navigate to the `docker` directory:
    ```
    cd docker
    ```

-7. Start the Docker environment (optional):
+8. Start the Docker environment (weaviate database):
    ```
    docker-compose up -d
    ```

-8. Run the Flask app:
-   ```
-   python3 src/flask_app.py
-   ```
+9. Data preprocessing (this stage was handled by other team members, so the OS environment is Windows):
+- **Tesseract-OCR**:
+  - Download and install Tesseract-OCR.
+  - After installation, note the install path (e.g., `C:\Program Files\Tesseract-OCR\tesseract.exe`).

-## Docker Production Mode
+- **Poppler**:
+  - Download and install Poppler.
+  - After installation, note the `poppler_path` (e.g., `C:\Program Files\poppler-24.08.0\Library\bin`).

-1. Copy the configuration example and create your own config file:
-   ```
-   cp config_example.ini config.ini
-   ```
+Configure the Tesseract and Poppler paths in the code:

-2. Manually add your `secret key` to the `config.ini`.
+```python
+# Configure Tesseract path if necessary (update this path as needed)
+pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

-3. Create a `logs` directory:
-   ```
-   mkdir logs
-   ```
+# Specify the path to the Poppler binaries
+poppler_path = r"C:\Program Files\poppler-24.08.0\Library\bin"
+```
+
+Make sure to replace the paths above with the actual install paths on your machine.
+
+Make sure your ZIP file contains the following folders and files (after downloading the official dataset):
+
+- `競賽資料集/reference/faq/pid_map_content.json`
+- `競賽資料集/reference/finance/*.pdf`
+- `競賽資料集/reference/insurance/*.pdf`
+
+Run the data preprocessing scripts:

-4. Navigate to the `docker` directory:
    ```
-   cd docker
+   python3 Preprocess/data_process/data_preprocess.py
+   python3 Preprocess/data_process/read_pdf_noocr.py
+   python3 Preprocess/data_process/conbine_readpdf_result.py
    ```

-5. Start the Docker environment:
+10. Insert the data into Weaviate:
    ```
-   docker-compose up -d
+   python3 Preprocess/insert_data.py
    ```

-6. Build the Docker image:
+11. Run the Flask app (/ serves the API docs; /api/chat/ is our retrieval API):
    ```
-   docker build -t aicup_img -f dockerfile .
+   python3 Model/flask_app.py
    ```

-7. Run the Docker container:
+12. Rename the question JSON file provided by the organizers to questions.json and place it in data/
+
+13. Run main.py to produce data/pred_retrieve.json, the final result submitted to the organizers:
    ```
-   docker run -d -p 5001:5001 --name aicup_cont aicup_img
+   python3 main.py
    ```

 ## Folder-specific Details
 For more detailed information about each folder and its purpose, refer to the individual `README.md` files located in their respective directories.
-
-## Contribution Guide
-We follow GitHub Flow for contributing. The steps are as follows:
-
-1. **Claim an issue**: Start by picking an issue from GitHub.
-2. **Create a branch**: Open a new branch with a clear name related to the issue (e.g., `feat/xxxxx-feature`).
-3. **Development**: After completing the feature, ensure you run pre-commit hooks:
-   ```
-   pre-commit run --all-files
-   ```
-4. **Create PR Request (PR)**:
-   - Ensure your PR is small and easily reviewable.
-   - Add the GitHub issue number to the PR title in the format `feat(#123): xxxxxx` for easy reference.
-   - Write a clear description including the reason for the change and what was modified (`Reason & Changes`).
-5. **Review & Approval**:
-   - Assign the PR to all members of the team for review.
-   - Wait for at least one approval.
-   - Ensure all CI checks pass.
-6. **Merge**: Once approved and CI passes, merge the branch into `main` yourself.
-
-## Additional Notes
-- Keep your commits focused and ensure meaningful commit messages.
-- Always rebase your branch on top of `main` before merging.
-- Avoid large, multi-purpose PRs. Smaller changes are easier to review and help prevent issues.
diff --git a/config_example.ini b/config_example.ini
index 365639d..a64b35a 100644
--- a/config_example.ini
+++ b/config_example.ini
@@ -1,8 +1,5 @@
 [Weaviate]
-weaviate_url =
-
-[Gemini]
-api_key =
+weaviate_url = http://127.0.0.1:8882

 [OpenAI]
 api_key =
diff --git a/data/README.md b/data/README.md
index 5f31f8f..29008c4 100644
--- a/data/README.md
+++ b/data/README.md
@@ -1,3 +1 @@
-# Data that needs to be saved in Weaviate
-
-應競賽主辦單位要求,無法將資料集上傳至公開網路,請自行準備資料集,並用 `src/data` 內的 Scripts 處理
+# All "data" (questions, answers, references, etc.) is stored in this folder
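Based on the input template documented in `Model/flask_app.py` and the way `main.py` reads the file, `data/questions.json` is expected to look roughly like this (all values below are made up):

```json
{
    "questions": [
        {
            "qid": "1",
            "source": ["442", "115", "440"],
            "query": "an example insurance question",
            "category": "insurance"
        }
    ]
}
```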
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 0000000..567d04c
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,7 @@
+# This folder holds everything related to Docker
+
+## docker_install.sh
+Installation script for Docker; running this single script sets up docker & docker-compose.
+
+## docker-compose.yml
+The docker-compose configuration, invoked via the docker-compose command; it only contains the startup of the Weaviate database.
diff --git a/docker/dockerfile b/docker/dockerfile
deleted file mode 100644
index 853149f..0000000
--- a/docker/dockerfile
+++ /dev/null
@@ -1,24 +0,0 @@
-# 使用官方的 Python 3.12.3 Slim 作為基礎映像
-FROM python:3.12.3-slim
-
-# 設定工作目錄為 /app
-WORKDIR /app
-
-# 將當前目錄的內容複製到容器中的 /app
-COPY . /app
-
-# 安裝 requirements.txt 中的必要套件
-RUN pip install --no-cache-dir -r requirements.txt
-
-# 暴露應用程式運行的埠
-EXPOSE 5001
-
-# 複製 entrypoint 腳本
-COPY entrypoint.sh /entrypoint.sh
-RUN chmod +x /entrypoint.sh
-
-# 設定環境變數
-ENV NAME World
-
-# 當容器啟動時運行 entrypoint.sh
-ENTRYPOINT ["/entrypoint.sh"]
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
deleted file mode 100755
index 2d874a5..0000000
--- a/docker/entrypoint.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-# 啟動 Flask API
-cd /app
-exec python3 src/flask_app.py
diff --git a/img/aicup_result.png b/img/aicup_result.png
deleted file mode 100644
index b4a3e7f..0000000
Binary files a/img/aicup_result.png and /dev/null differ
diff --git a/src/tools/automate.py b/main.py
similarity index 59%
rename from src/tools/automate.py
rename to main.py
index b61aa2b..1015c61 100644
--- a/src/tools/automate.py
+++ b/main.py
@@ -1,48 +1,51 @@
 import json
-import time  # Import time module for timing
+import time

 import requests

-# Load questions from the JSON file
-with open('data/questions_example.json', encoding='utf-8') as file:
+# Load the question JSON file provided by the organizers
+with open('data/questions.json', encoding='utf-8') as file:
     questions = json.load(file)['questions']

-output_data = {'answers': []}  # Initialize output format with "answers" array
+# Initialize the output data format
+output_data = {'answers': []}

+# URL of the Flask application
 url = 'http://127.0.0.1:5000/api/chat'

-total_start_time = time.time()  # Start timing for the entire process
+# Track the total elapsed time
+total_start_time = time.time()

 for question in questions:
-    question_start_time = time.time()  # Start timing for each question
+    # Track the processing time of each question
+    question_start_time = time.time()

-    # Send POST request
+    # Send a POST request to the retrieve API endpoint in Model/flask_app.py
     response = requests.post(url, json=question)

     if response.status_code == 200:
         response_json = response.json()

-        # Extract qid and retrieve from the API response
-        qid = question.get('qid')  # Assuming each question has a unique "qid" field
+        # Extract the qid and retrieve fields from the response
+        qid = question.get('qid')
         retrieve = response_json.get('retrieve')

-        # Append formatted result to the answers array
+        # Append the qid and retrieve to the output data
         output_data['answers'].append({'qid': qid, 'retrieve': retrieve})

         print('成功取得 JSON:', response_json)
     else:
         print('請求失敗,狀態碼:', response.status_code)

-    # Calculate and print time for each question
+    # Compute the processing time of each question
     question_end_time = time.time()
     question_duration = question_end_time - question_start_time
     print(f'QID: {qid} - 花費時間: {question_duration:.2f} 秒')

-# Calculate and print total time
 total_end_time = time.time()
 total_duration = total_end_time - total_start_time
 print(f'全部題目處理完成,總共花費時間: {total_duration:.2f} 秒')

-# Save the output data to a new JSON file
+# Write the output data to a JSON file
 with open('data/pred_retrieve.json', 'w', encoding='utf-8') as output_file:
     json.dump(output_data, output_file, ensure_ascii=False, indent=4)
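The submission file written at the end of `main.py` therefore has the following shape (values illustrative):

```json
{
    "answers": [
        {"qid": "1", "retrieve": "442"},
        {"qid": "2", "retrieve": "115"}
    ]
}
```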
diff --git a/requirements.txt b/requirements.txt
index 743e682..c332b34 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+# Python 3.12.0
 Flask==2.3.2
 Flask_Cors==4.0.0
 keyboard==0.13.5
@@ -7,10 +8,11 @@ selenium==4.21.0
 weaviate_client==3.22.1
 tiktoken==0.7.0
 langchain-community==0.2.0
-sentence-transformers==2.7.0
 flask_limiter==3.7.0
 flask_restx==1.3.0
-python-dateutil
+python-dateutil==2.9.0.post0
 redis==5.0.8
 flask-httpauth==4.8.0
-voyageai
+voyageai==0.3.1
+pytesseract==0.3.13
+pdf2image==1.17.0
diff --git a/src/batch/README.md b/src/batch/README.md
deleted file mode 100644
index 93b7dae..0000000
--- a/src/batch/README.md
+++ /dev/null
@@ -1 +0,0 @@
-# Scripts for spidering announcement data
diff --git a/src/batch/time.txt b/src/batch/time.txt
deleted file mode 100644
index 04d1b01..0000000
--- a/src/batch/time.txt
+++ /dev/null
@@ -1 +0,0 @@
-2024-01-01 00:00:00
diff --git a/src/data/README.md b/src/data/README.md
deleted file mode 100644
index 197793b..0000000
--- a/src/data/README.md
+++ /dev/null
@@ -1 +0,0 @@
-# Scripts to process data automatically
diff --git a/src/data/read_pdf_ocr.py b/src/data/read_pdf_ocr.py
deleted file mode 100644
index db6c31c..0000000
--- a/src/data/read_pdf_ocr.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# TODO: Add OCR processing script
-# This script was accidentally lost
diff --git a/src/tools/monitor.py b/src/tools/monitor.py
deleted file mode 100644
index d89f7ca..0000000
--- a/src/tools/monitor.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import subprocess
-import time
-
-import requests
-
-
-def check_service():
-    url = 'http://xxxx/api/'
-    try:
-        response = requests.get(url)
-        if response.status_code != 200:
-            restart_service()
-    except requests.RequestException:
-        restart_service()
-
-
-def restart_service():
-    print('Service is down. Restarting service...')
-    subprocess.run(['nohup', 'python3', 'src/flask_app.py', '&'])
-
-
-if __name__ == '__main__':
-    while True:
-        check_service()
-        time.sleep(600)
diff --git a/src/utils/ai/__init__.py b/src/utils/ai/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/utils/ai/call_ai.py b/src/utils/ai/call_ai.py
deleted file mode 100644
index 1551113..0000000
--- a/src/utils/ai/call_ai.py
+++ /dev/null
@@ -1,53 +0,0 @@
-from datetime import datetime
-
-from utils.ai.gemini_tem import gemini_template
-from utils.ai.gpt_tem import gpt_template
-
-
-def call_aied(wait, quest, use_gpt: bool):
-    current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-
-    prompt = f"""【今日日期】: {current_time}
-You are a helpful and informative bot that answers questions using text from the reference passage included below. \
-Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
-However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
-strike a friendly and conversational tone. \
-If the passage is irrelevant to the answer, you may ignore it.
-請用繁體中文回答
-並在回應有關時間/時程的問題時,要考慮【今日日期】
-
-
-'{quest}'
-
-PASSAGE:
-'{wait[0]}
-
-{wait[1]}
-
-{wait[2]}
-
-{wait[3]}
-
-{wait[4]}
-
-{wait[5]}
-
-{wait[6]}
-
-{wait[7]}
-
-{wait[8]}
-
-{wait[9]}'
-
-ANSWER:
-"""
-    try:
-        if use_gpt:
-            res = gpt_template(prompt)
-        else:
-            res = gemini_template(prompt)
-    except Exception:
-        res = '太多使用者請求了!請等待幾秒後再重新詢問'
-
-    return res
diff --git a/src/utils/ai/gemini_tem.py b/src/utils/ai/gemini_tem.py
deleted file mode 100644
index a098852..0000000
--- a/src/utils/ai/gemini_tem.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import json
-
-import requests
-
-import utils.config_log as config_log
-
-config, logger, CONFIG_PATH = config_log.setup_config_and_logging()
-config.read(CONFIG_PATH)
-
-
-def gemini_template(prompt):
-    api_key = config.get('Gemini', 'api_key')
-    url = f'https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?key={api_key}'
-    payload = {'contents': [{'parts': [{'text': prompt}]}]}
-    headers = {'Content-Type': 'application/json'}
-    response = (requests.post(url, headers=headers, data=json.dumps(payload))).json()
-    return response['candidates'][0]['content']['parts'][0]['text']
-
-
-if __name__ == '__main__':
-    prompt = """告訴我 CTF 逆向分析的 3 個訣竅,用 json 格式輸出: {"訣竅1": ,"訣竅2": ,"訣竅3": }"""
-    response = gemini_template(prompt)
-    print(response)
diff --git a/src/utils/ai/gpt_tem.py b/src/utils/ai/gpt_tem.py
deleted file mode 100644
index 0486fa8..0000000
--- a/src/utils/ai/gpt_tem.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import textwrap
-
-import openai
-
-import utils.config_log as config_log
-
-config, logger, CONFIG_PATH = config_log.setup_config_and_logging()
-config.read(CONFIG_PATH)
-
-
-def gpt_template(prompt, output_way='json'):
-    openai.api_key = config.get('OpenAI', 'api_key')
-
-    userprompt = textwrap.dedent(
-        f"""
-        {prompt}
-        """
-    )
-
-    response = openai.ChatCompletion.create(
-        model='gpt-4o-mini',
-        messages=[
-            {'role': 'system', 'content': '使用繁體中文回答'},
-            {'role': 'user', 'content': userprompt},
-        ],
-    )
-
-    return response.choices[0].message['content']
-
-
-def main():
-    """範例: GPT 模板使用"""
-    # import utils.gpt_integration as gpt_call
-    # gpt_call.gpt_template()
-    print(gpt_template('問題: 太陽系有哪些行星?請用 json 格式回傳,{"回傳內容": "_回答_"}'))
-
-
-if __name__ == '__main__':
-    main()
diff --git a/src/utils/ckip.py b/src/utils/ckip.py
deleted file mode 100644
index 23d192e..0000000
--- a/src/utils/ckip.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from ckip_transformers.nlp import CkipPosTagger, CkipWordSegmenter
-
-ws_driver = CkipWordSegmenter(model='albert-base')
-pos_driver = CkipPosTagger(model='albert-base')
-
-
-def clean(sentence_ws, sentence_pos):
-    short_sentence = []
-    stop_pos = set(['Nep', 'Nh', 'Nb'])
-    for word_ws, word_pos in zip(sentence_ws, sentence_pos):
-        is_n_or_v = word_pos.startswith('V') or word_pos.startswith('N')
-        is_not_stop_pos = word_pos not in stop_pos
-        is_not_one_charactor = not (len(word_ws) == 1)
-        if is_n_or_v and is_not_stop_pos and is_not_one_charactor:
-            short_sentence.append(f'{word_ws}')
-    return ' '.join(short_sentence)
diff --git a/testing/README.md b/testing/README.md
new file mode 100644
index 0000000..c63f11c
--- /dev/null
+++ b/testing/README.md
@@ -0,0 +1,8 @@
+# This folder contains code for testing and checks
+These scripts do not affect the actual competition runs or the data processing; they are only used for testing, to find the best hybrid-search weighting (what % text2vec vs. what % bm25), plus a script for checking answers against the public test set.
+
+## get_best_alpha.py
+Tests the accuracy of different alpha values to find the best hybrid-search weighting (what % text2vec vs. what % bm25).
+
+## checkans.py
+A script for checking answers against the public test set.
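As a rough sketch of what the answer checking involves, the following compares `data/pred_retrieve.json` against a ground-truth file; the ground-truth filename and its exact schema are assumptions here, and `checkans.py` remains the authoritative version:

```python
import json

# pred_retrieve.json has the shape {"answers": [{"qid": ..., "retrieve": ...}, ...]}
with open('data/pred_retrieve.json', encoding='utf-8') as f:
    preds = {a['qid']: a['retrieve'] for a in json.load(f)['answers']}

# Hypothetical ground-truth file and schema
with open('data/ground_truths_example.json', encoding='utf-8') as f:
    truths = {g['qid']: g['retrieve'] for g in json.load(f)['ground_truths']}

correct = sum(1 for qid, pid in preds.items() if truths.get(qid) == pid)
print(f'accuracy: {correct}/{len(preds)} = {correct / len(preds):.2%}')
```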
diff --git a/src/tools/checkans.py b/testing/checkans.py
similarity index 100%
rename from src/tools/checkans.py
rename to testing/checkans.py
diff --git a/src/tools/get_best_alpha.py b/testing/get_best_alpha.py
similarity index 97%
rename from src/tools/get_best_alpha.py
rename to testing/get_best_alpha.py
index b2dec99..b552275 100644
--- a/src/tools/get_best_alpha.py
+++ b/testing/get_best_alpha.py
@@ -4,7 +4,7 @@
 import requests

 # Load questions from the JSON file
-with open('data/questions_example.json', encoding='utf-8') as file:
+with open('data/questions.json', encoding='utf-8') as file:
     questions = json.load(file)['questions']

 # Load ground truth data
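For context on how an alpha sweep could be driven end to end, here is a hypothetical outline; it assumes the `/api/chat` endpoint accepts an `alpha` field (the commented-out `request.json.get('alpha')` in `flask_app.py` suggests this was the alpha-testing mechanism), and the ground-truth schema is the same assumption as above:

```python
import json

import requests

with open('data/questions.json', encoding='utf-8') as f:
    questions = json.load(f)['questions']
with open('data/ground_truths_example.json', encoding='utf-8') as f:  # assumed filename/schema
    truths = {g['qid']: g['retrieve'] for g in json.load(f)['ground_truths']}

best_alpha, best_acc = None, -1.0
for alpha in (0.0, 0.25, 0.5, 0.75, 1.0):  # candidate text2vec/bm25 weightings
    hits = 0
    for q in questions:
        resp = requests.post('http://127.0.0.1:5000/api/chat', json={**q, 'alpha': alpha})
        if resp.json().get('retrieve') == truths.get(q['qid']):
            hits += 1
    acc = hits / len(questions)
    print(f'alpha={alpha}: {acc:.2%}')
    if acc > best_acc:
        best_alpha, best_acc = alpha, acc

print(f'best alpha: {best_alpha} (accuracy {best_acc:.2%})')
```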