From f11c27c9b7a39eff7a41b15cea436f9e083d7e43 Mon Sep 17 00:00:00 2001 From: "justin.hsu" Date: Wed, 13 Nov 2024 14:15:46 +0800 Subject: [PATCH] refactor: docstring, readme, update structure --- .github/contribute_guide.md | 25 +++++++++++ .github/pre-commit.md | 0 Model/README.md | 10 +++++ Model/flask_app.py | 12 +++-- Model/utils/README.md | 7 +++ Model/utils/config_log.py | 1 + Model/utils/retrieval_agent.py | 24 ++++++---- Preprocess/README.md | 9 +++- Preprocess/data_process/README.md | 2 + Preprocess/insert_data.py | 6 +++ README.md | 73 +++++++------------------------ config_example.ini | 3 -- data/README.md | 1 + docker/README.md | 7 +++ main.py | 17 +++---- requirements.txt | 5 +-- testing/README.md | 8 ++++ 17 files changed, 122 insertions(+), 88 deletions(-) create mode 100644 .github/contribute_guide.md delete mode 100644 .github/pre-commit.md create mode 100644 Preprocess/data_process/README.md create mode 100644 data/README.md diff --git a/.github/contribute_guide.md b/.github/contribute_guide.md new file mode 100644 index 0000000..c5f4798 --- /dev/null +++ b/.github/contribute_guide.md @@ -0,0 +1,25 @@ +# Contribution Guide +這個資料夾主要處理 CI Pipeline, 目前僅有檢測程式碼規範 (pre-commit), 且在發 PR & merge to main 才會觸發 + +We follow GitHub Flow for contributing. The steps are as follows: + +1. **Claim an issue**: Start by picking an issue from GitHub. +2. **Create a branch**: Open a new branch with a clear name related to the issue (e.g., `feat/xxxxx-feature`). +3. **Development**: After completing the feature, ensure you run pre-commit hooks: + ``` + pre-commit run --all-files + ``` +4. **Create a Pull Request (PR)**: + - Ensure your PR is small and easily reviewable. + - Add the GitHub issue number to the PR title in the format `feat(#123): xxxxxx` for easy reference. + - Write a clear description including the reason for the change and what was modified (`Reason & Changes`). +5. **Review & Approval**: + - Assign the PR to all members of the team for review. 
+ - Wait for at least one approval. + - Ensure all CI checks pass. +6. **Merge**: Once approved and CI passes, merge the branch into `main` yourself. + +## Additional Notes +- Keep your commits focused and ensure meaningful commit messages. +- Always rebase your branch on top of `main` before merging. +- Avoid large, multi-purpose PRs. Smaller changes are easier to review and help prevent issues. diff --git a/.github/pre-commit.md b/.github/pre-commit.md deleted file mode 100644 index e69de29..0000000 diff --git a/Model/README.md b/Model/README.md index e69de29..884c41b 100644 --- a/Model/README.md +++ b/Model/README.md @@ -0,0 +1,10 @@ +# 進行檢索的主程式 + +## flask_app.py +會開出一個 API 供 main.py 呼叫,每次呼叫會送入一題問題,並回傳一個答案 pid + +## utils/retrieval_agent.py +負責呼叫 weaviate & voyage reranker 進行檢索 + +## utils/config_log.py +負責處理 config 檔案,並設定 log 檔案 diff --git a/Model/flask_app.py b/Model/flask_app.py index 991bfa2..e53164b 100644 --- a/Model/flask_app.py +++ b/Model/flask_app.py @@ -59,18 +59,18 @@ def get(self): return response -# TODO: Modify the output format for general RAG purposes @ns.route('/chat') class ChatBot(Resource): @api.doc('chat_bot') @api.expect(model) def post(self): + """retrieve and rank api entry point""" qid = request.json.get('qid') source = request.json.get('source') question = request.json.get('query') category = request.json.get('category') - # for alpha testing + # for alpha testing (finding best hybrid search alpha) # alpha = request.json.get('alpha') # input template @@ -81,9 +81,10 @@ def post(self): # "category": "insurance" # }, - alpha = 0.5 + alpha = 0.5 # 最終因使用 Reranker 全盤處理 sources,故任何 alpha 對準確率都無影響 if not question: + # 為避免任何萬一,無論如何都須回傳一個結果,不做 Error logging response = jsonify({'qid': '1', 'retrieve': '1'}) response.status_code = 200 return response @@ -103,19 +104,24 @@ def post(self): response.status_code = 200 return response except TypeError: + # 為避免任何萬一,無論如何都須回傳一個結果,不做 Error logging response = jsonify({'qid': qid, 'retrieve': source[-1]}) 
response.status_code = 200 return response +# For API Docs @app.before_request def require_auth_for_docs(): + """Require authentication for API Docs""" if request.path == '/': return auth.login_required()(swagger_ui)() +# For API Docs @app.route('/') def swagger_ui(): + """Redirect to the Swagger UI""" return api.render_doc() diff --git a/Model/utils/README.md b/Model/utils/README.md index e69de29..04f9b54 100644 --- a/Model/utils/README.md +++ b/Model/utils/README.md @@ -0,0 +1,7 @@ +# flask_app 主程式需呼叫的所有輔助程式 + +## retrieval_agent.py +負責呼叫 weaviate & voyage reranker 進行檢索 + +## config_log.py +負責處理 config 檔案,並設定 log 檔案 diff --git a/Model/utils/config_log.py b/Model/utils/config_log.py index 655d224..0ad925c 100644 --- a/Model/utils/config_log.py +++ b/Model/utils/config_log.py @@ -6,6 +6,7 @@ def setup_config_and_logging(): + """Set up the configuration and logging.""" config = configparser.ConfigParser() logger = logging.getLogger() diff --git a/Model/utils/retrieval_agent.py b/Model/utils/retrieval_agent.py index 4400670..6534092 100644 --- a/Model/utils/retrieval_agent.py +++ b/Model/utils/retrieval_agent.py @@ -6,30 +6,34 @@ import utils.config_log as config_log -# 載入設定檔案和日誌設定 config, logger, CONFIG_PATH = config_log.setup_config_and_logging() config.read(CONFIG_PATH) -# 從 config 中取得 Weaviate URL 和 API 金鑰 -wea_url = config.get('Weaviate', 'weaviate_url') -voyage_api_key = config.get('VoyageAI', 'api_key') -PROPERTIES = ['pid', 'content'] +wea_url = config.get('Weaviate', 'weaviate_url') # 此次所使用的向量資料庫 +voyage_api_key = config.get('VoyageAI', 'api_key') # Voyage Reranker 所使用的 API Key +PROPERTIES = ['pid', 'content'] # 向量資料庫中此 Class 的欄位 # 設定 OpenAI API 金鑰 os.environ['OPENAI_API_KEY'] = config.get('OpenAI', 'api_key') class WeaviateSemanticSearch: + """Weaviate 向量資料庫的搜尋類別""" + def __init__(self, classnm): + """初始化 Weaviate 向量資料庫的搜尋類別""" self.url = wea_url + # 選擇的 OpenAI embedding model self.embeddings = OpenAIEmbeddings(chunk_size=1, model='text-embedding-3-large') 
self.client = weaviate.Client(url=wea_url) self.classnm = classnm def hybrid_search(self, query, source, num, alpha): + """Weaviate 向量資料庫的搜尋方法""" query_vector = self.embeddings.embed_query(query) vector_str = ','.join(map(str, query_vector)) + # 下述兩搜索式主要為過濾出 source 中的 pid,並只針對 source 中的 pid 的文件進行 retrieval & rerank where_conditions = ' '.join([f'{{path: ["pid"], operator: Equal, valueText: "{pid}"}}' for pid in source]) gql_query = f""" @@ -63,16 +67,20 @@ def hybrid_search(self, query, source, num, alpha): def rerank_with_voyage(query, documents, pids, api_key): + """利用 Voyage Reranker 對 Weaviate hybrid search retrieval 的結果進行 rerank""" vo = voyageai.Client(api_key=api_key) + # 利用 voyage rerank-2 從 hybrid search retrieval 中篩出的所有文件取出最終的 top 1 reranking = vo.rerank(query, documents, model='rerank-2', top_k=1) top_result = reranking.results[0] - # 根據內容找到相對應的 pid top_pid = pids[documents.index(top_result.document)] return {'pid': top_pid, 'relevance_score': top_result.relevance_score} def search_do(question, category, source, alpha): + """flask_app.py 呼叫的 '搜尋' 主程式""" + + # 先根據題目給定的 category 選擇對應的向量資料庫 Class if category == 'finance': vdb_named = 'Financedev' elif category == 'insurance': @@ -81,16 +89,16 @@ def search_do(question, category, source, alpha): vdb_named = 'Faqdev' searcher = WeaviateSemanticSearch(vdb_named) - # 從 Weaviate 取得前 100 筆結果 + # 從 Weaviate hybrid search retrieval 前 100 筆結果 top_100_results = searcher.hybrid_search(question, source, 100, alpha=alpha) - # 準備文件和 pid 列表供 rerank 使用 documents = [result['content'] for result in top_100_results] pids = [result['pid'] for result in top_100_results] # 使用 VoyageAI 重新排序,並取得排名最高的 pid top_reranked_result = rerank_with_voyage(question, documents, pids, voyage_api_key) + # Log print('最相關文件的 PID:') print(f"PID: {top_reranked_result['pid']}") print(f"相關性分數: {top_reranked_result['relevance_score']}") diff --git a/Preprocess/README.md b/Preprocess/README.md index 197793b..082c981 100644 --- a/Preprocess/README.md +++ 
b/Preprocess/README.md @@ -1 +1,8 @@ -# Scripts to process data automatically +# 此資料夾為所有處理資料的程式碼 +包含 資料預處理 及 資料寫入資料庫 + +## data_process/ +OCR & PDF 文字直接讀取 + +## insert_data.py +此程式為寫入資料庫的程式碼,並包含建立資料庫 class、對資料進行 embedding、利用 text_splitter 去 chunk tokens 數過多的資料 diff --git a/Preprocess/data_process/README.md b/Preprocess/data_process/README.md new file mode 100644 index 0000000..f065c58 --- /dev/null +++ b/Preprocess/data_process/README.md @@ -0,0 +1,2 @@ +# 此資料夾為資料預處理的程式碼 +OCR & PDF 文字直接讀取 diff --git a/Preprocess/insert_data.py b/Preprocess/insert_data.py index 76ec8a0..6cb5a24 100644 --- a/Preprocess/insert_data.py +++ b/Preprocess/insert_data.py @@ -16,13 +16,17 @@ class WeaviateManager: + """Weaviate Insert data 管理器""" + def __init__(self, classnm): + """初始化 Weaviate 連接""" self.url = wea_url self.client = weaviate.Client(url=wea_url, additional_headers={'X-OpenAI-Api-Key': openai_api_key}) self.classnm = classnm self.check_class_exist() def check_class_exist(self): + """檢查 class 是否存在""" if self.client.schema.exists(self.classnm): print(f'{self.classnm} is ready') return True @@ -47,6 +51,7 @@ def check_class_exist(self): return True def insert_data(self, pid, content): + """插入資料到 Weaviate""" data_object = {'pid': pid, 'content': content} max_retries = 5 for attempt in range(max_retries): @@ -73,6 +78,7 @@ def insert_data(self, pid, content): return False def split_and_insert(self, pid, content, category): + """處理特例:分割並插入資料""" # 使用 TextSplitter 分割長文本 text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=500) split_content = text_splitter.split_text(content) diff --git a/README.md b/README.md index 8069444..df51130 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,11 @@ # AI CUP 2024 玉山人工智慧公開挑戰賽-RAG與LLM在金融問答的應用 -## Development Mode +## Repo Structure +``` +(主要用 # 介紹沒有在 folder 內獨立檔案) +``` + +## Setup Environment To set up the development environment, follow these steps: 1. 
Create a virtual environment: @@ -20,84 +25,36 @@ To set up the development environment, follow these steps: ``` 4. Manually add your `secret key` to the `config.ini`. + (需展開解釋 config.ini 內的每一項 key) 5. Create a `logs` directory: ``` mkdir logs ``` -6. Navigate to the `docker` directory (optional): +6. Navigate to the `docker` directory: ``` cd docker ``` -7. Start the Docker environment (optional): +7. Start the Docker environment (weaviate database): ``` docker-compose up -d ``` -8. Run the Flask app: - ``` - python3 src/flask_app.py - ``` +8. Data preprocessing: -## Docker Production Mode - -1. Copy the configuration example and create your own config file: - ``` - cp config_example.ini config.ini - ``` - -2. Manually add your `secret key` to the `config.ini`. - -3. Create a `logs` directory: - ``` - mkdir logs - ``` +9. Data insert to weaviate: -4. Navigate to the `docker` directory: +10. Run the Flask app: ``` - cd docker + python3 src/flask_app.py ``` -5. Start the Docker environment: - ``` - docker-compose up -d - ``` +11. 將主辦方提供的 questions.json 測試資料塞入 data/: -6. Build the Docker image: - ``` - docker build -t aicup_img -f dockerfile . - ``` +12. 運行 main.py 進行測試得出 data/pred_retrieve.json 提交最終結果給主辦方: -7. Run the Docker container: - ``` - docker run -d -p 5001:5001 --name aicup_cont aicup_img - ``` ## Folder-specific Details For more detailed information about each folder and its purpose, refer to the individual `README.md` files located in their respective directories. - -## Contribution Guide -We follow GitHub Flow for contributing. The steps are as follows: - -1. **Claim an issue**: Start by picking an issue from GitHub. -2. **Create a branch**: Open a new branch with a clear name related to the issue (e.g., `feat/xxxxx-feature`). -3. **Development**: After completing the feature, ensure you run pre-commit hooks: - ``` - pre-commit run --all-files - ``` -4. **Create PR Request (PR)**: - - Ensure your PR is small and easily reviewable. 
- - Add the GitHub issue number to the PR title in the format `feat(#123): xxxxxx` for easy reference. - - Write a clear description including the reason for the change and what was modified (`Reason & Changes`). -5. **Review & Approval**: - - Assign the PR to all members of the team for review. - - Wait for at least one approval. - - Ensure all CI checks pass. -6. **Merge**: Once approved and CI passes, merge the branch into `main` yourself. - -## Additional Notes -- Keep your commits focused and ensure meaningful commit messages. -- Always rebase your branch on top of `main` before merging. -- Avoid large, multi-purpose PRs. Smaller changes are easier to review and help prevent issues. diff --git a/config_example.ini b/config_example.ini index 365639d..5c8955a 100644 --- a/config_example.ini +++ b/config_example.ini @@ -1,9 +1,6 @@ [Weaviate] weaviate_url = -[Gemini] -api_key = - [OpenAI] api_key = diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..29008c4 --- /dev/null +++ b/data/README.md @@ -0,0 +1 @@ +# 所有 Questions, Answers, References, etc. 
等“資料”都會存於此資料夾 diff --git a/docker/README.md b/docker/README.md index e69de29..567d04c 100644 --- a/docker/README.md +++ b/docker/README.md @@ -0,0 +1,7 @@ +# 此資料夾紀錄所有docker相關的內容 + +## docker_install.sh +為 docker 安裝的腳本,只需運行此 sh 便可 setup docker & docker-compose + +## docker-compose.yml +為 docker-compose 的設定檔,可透過 docker-compose 指令來觸發,裡面僅包含 weaviate 資料庫的啟動 diff --git a/main.py b/main.py index b61aa2b..217d1c3 100644 --- a/main.py +++ b/main.py @@ -1,48 +1,41 @@ import json -import time # Import time module for timing +import time import requests -# Load questions from the JSON file with open('data/questions_example.json', encoding='utf-8') as file: questions = json.load(file)['questions'] -output_data = {'answers': []} # Initialize output format with "answers" array +output_data = {'answers': []} url = 'http://127.0.0.1:5000/api/chat' -total_start_time = time.time() # Start timing for the entire process +total_start_time = time.time() for question in questions: - question_start_time = time.time() # Start timing for each question + question_start_time = time.time() - # Send POST request response = requests.post(url, json=question) if response.status_code == 200: response_json = response.json() - # Extract qid and retrieve from the API response - qid = question.get('qid') # Assuming each question has a unique "qid" field + qid = question.get('qid') retrieve = response_json.get('retrieve') - # Append formatted result to the answers array output_data['answers'].append({'qid': qid, 'retrieve': retrieve}) print('成功取得 JSON:', response_json) else: print('請求失敗,狀態碼:', response.status_code) - # Calculate and print time for each question question_end_time = time.time() question_duration = question_end_time - question_start_time print(f'QID: {qid} - 花費時間: {question_duration:.2f} 秒') -# Calculate and print total time total_end_time = time.time() total_duration = total_end_time - total_start_time print(f'全部題目處理完成,總共花費時間: {total_duration:.2f} 秒') -# Save the output data to a new JSON 
file with open('data/pred_retrieve.json', 'w', encoding='utf-8') as output_file: json.dump(output_data, output_file, ensure_ascii=False, indent=4) diff --git a/requirements.txt b/requirements.txt index 884c6e0..3cdbe00 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -Python==3.12.0 +# Python 3.12.0 Flask==2.3.2 Flask_Cors==4.0.0 keyboard==0.13.5 @@ -8,10 +8,9 @@ selenium==4.21.0 weaviate_client==3.22.1 tiktoken==0.7.0 langchain-community==0.2.0 -sentence-transformers==2.7.0 flask_limiter==3.7.0 flask_restx==1.3.0 -python-dateutil +python-dateutil==2.9.0.post0 redis==5.0.8 flask-httpauth==4.8.0 voyageai==0.3.1 diff --git a/testing/README.md b/testing/README.md index e69de29..c63f11c 100644 --- a/testing/README.md +++ b/testing/README.md @@ -0,0 +1,8 @@ +# 此資料夾為進行一些測試及檢測的程式碼 +這些程式不影響正式比賽時的運行,也不影響資料處理,僅是做測試尋找最佳 hybrid search 比重 (多少 % text2vec, 多少 % bm25);還有用來為公開測資對答案的腳本。 + +## get_best_alpha.py +用來測試不同 alpha 的準確率,尋找最佳 hybrid search 比重 (多少 % text2vec, 多少 % bm25) + +## checkans.py +用來為公開測資對答案的腳本