Refactor: for aicup review (#5)

* refactor: for aicup review structure * rename .github/ md * refactor: docstring, readme, update structure * Add repo structure * Add comments for main.py * docs: setup environment * update readme * fix: pre-commit issue * update readme (os) * Update README.md * Update README.md * Update README.md * Update README.md * add preprocess * fix: pre-commit issue * Update README.md
JustinHsu1019 · Nov 14, 2024 · 8367da8 · 8367da8
1 parent b44ae8d
commit 8367da8
Show file tree

Hide file tree

Showing 35 changed files with 584 additions and 287 deletions.
diff --git a/.github/contribute_guide.md b/.github/contribute_guide.md
@@ -0,0 +1,25 @@
+# Contribution Guide
+這個資料夾主要處理 CI Pipeline, 目前僅有檢測程式碼規範 (pre-commit), 且在發 PR & merge to main 才會觸發
+
+We follow GitHub Flow for contributing. The steps are as follows:
+
+1. **Claim an issue**: Start by picking an issue from GitHub.
+2. **Create a branch**: Open a new branch with a clear name related to the issue (e.g., `feat/xxxxx-feature`).
+3. **Development**: After completing the feature, ensure you run pre-commit hooks:
+   ```
+   pre-commit run --all-files
+   ```
+4. **Create a Pull Request (PR)**:
+   - Ensure your PR is small and easily reviewable.
+   - Add the GitHub issue number to the PR title in the format `feat(#123): xxxxxx` for easy reference.
+   - Write a clear description including the reason for the change and what was modified (`Reason & Changes`).
+5. **Review & Approval**:
+   - Assign the PR to all members of the team for review.
+   - Wait for at least one approval.
+   - Ensure all CI checks pass.
+6. **Merge**: Once approved and CI passes, merge the branch into `main` yourself.
+
+## Additional Notes
+- Keep your commits focused and write meaningful commit messages.
+- Always rebase your branch on top of `main` before merging.
+- Avoid large, multi-purpose PRs. Smaller changes are easier to review and help prevent issues.
diff --git a/Model/README.md b/Model/README.md
@@ -0,0 +1,10 @@
+# 進行檢索的主程式
+
+## flask_app.py
+會開出一個 API 供 main.py 呼叫，每次呼叫會送入一題問題，並回傳一個答案 pid
+
+## utils/retrieval_agent.py
+負責呼叫 weaviate & voyage reranker 進行檢索
+
+## utils/config_log.py
+負責處理 config 檔案，並設定 log 檔案
diff --git a/src/flask_app.py → Model/flask_app.py b/src/flask_app.py → Model/flask_app.py
@@ -59,18 +59,18 @@ def get(self):
         return response
 
 
-# TODO: Modify the output format for general RAG purposes
 @ns.route('/chat')
 class ChatBot(Resource):
     @api.doc('chat_bot')
     @api.expect(model)
     def post(self):
+        """retrieve and rank api entry point"""
         qid = request.json.get('qid')
         source = request.json.get('source')
         question = request.json.get('query')
         category = request.json.get('category')
 
-        # for alpha testing
+        # for alpha testing (finding best hybrid search alpha)
         # alpha = request.json.get('alpha')
 
         # input template
@@ -81,9 +81,10 @@ def post(self):
         # "category": "insurance"
         # },
 
-        alpha = 0.5
+        alpha = 0.5  # 最終因使用 Reranker 全盤處理 sources，故任何 alpha 對準確率都無影響
 
         if not question:
+            # 為避免任何萬一，無論如何都須回傳一個結果，不做 Error logging
             response = jsonify({'qid': '1', 'retrieve': '1'})
             response.status_code = 200
             return response
@@ -103,19 +104,24 @@ def post(self):
             response.status_code = 200
             return response
         except TypeError:
+            # 為避免任何萬一，無論如何都須回傳一個結果，不做 Error logging
             response = jsonify({'qid': qid, 'retrieve': source[-1]})
             response.status_code = 200
             return response
 
 
+# For API Docs
 @app.before_request
 def require_auth_for_docs():
+    """Require authentication for API Docs"""
     if request.path == '/':
         return auth.login_required()(swagger_ui)()
 
 
+# For API Docs
 @app.route('/')
 def swagger_ui():
+    """Redirect to the Swagger UI"""
     return api.render_doc()
 
 

diff --git a/Model/utils/README.md b/Model/utils/README.md
@@ -0,0 +1,7 @@
+# flask_app 主程式需呼叫的所有輔助程式
+
+## retrieval_agent.py
+負責呼叫 weaviate & voyage reranker 進行檢索
+
+## config_log.py
+負責處理 config 檔案，並設定 log 檔案
diff --git a/src/utils/__init__.py → Model/utils/__init__.py b/src/utils/__init__.py → Model/utils/__init__.py
diff --git a/src/utils/config_log.py → Model/utils/config_log.py b/src/utils/config_log.py → Model/utils/config_log.py
@@ -6,6 +6,7 @@
 
 
 def setup_config_and_logging():
+    """Set up the configuration and logging."""
     config = configparser.ConfigParser()
 
     logger = logging.getLogger()

diff --git a/src/utils/retrieval_agent.py → Model/utils/retrieval_agent.py b/src/utils/retrieval_agent.py → Model/utils/retrieval_agent.py
@@ -6,30 +6,34 @@
 
 import utils.config_log as config_log
 
-# 載入設定檔案和日誌設定
 config, logger, CONFIG_PATH = config_log.setup_config_and_logging()
 config.read(CONFIG_PATH)
 
-# 從 config 中取得 Weaviate URL 和 API 金鑰
-wea_url = config.get('Weaviate', 'weaviate_url')
-voyage_api_key = config.get('VoyageAI', 'api_key')
-PROPERTIES = ['pid', 'content']
+wea_url = config.get('Weaviate', 'weaviate_url')  # 此次所使用的向量資料庫
+voyage_api_key = config.get('VoyageAI', 'api_key')  # Voyage Reranker 所使用的 API Key
+PROPERTIES = ['pid', 'content']  # 向量資料庫中此 Class 的欄位
 
 # 設定 OpenAI API 金鑰
 os.environ['OPENAI_API_KEY'] = config.get('OpenAI', 'api_key')
 
 
 class WeaviateSemanticSearch:
+    """Weaviate 向量資料庫的搜尋類別"""
+
     def __init__(self, classnm):
+        """初始化 Weaviate 向量資料庫的搜尋類別"""
         self.url = wea_url
+        # 選擇的 OpenAI embedding model
         self.embeddings = OpenAIEmbeddings(chunk_size=1, model='text-embedding-3-large')
         self.client = weaviate.Client(url=wea_url)
         self.classnm = classnm
 
     def hybrid_search(self, query, source, num, alpha):
+        """Weaviate 向量資料庫的搜尋方法"""
         query_vector = self.embeddings.embed_query(query)
         vector_str = ','.join(map(str, query_vector))
 
+        # 下述兩搜索式主要為過濾出 source 中的 pid，並只針對 source 中的 pid 的文件進行 retrieval & rerank
         where_conditions = ' '.join([f'{{path: ["pid"], operator: Equal, valueText: "{pid}"}}' for pid in source])
 
         gql_query = f"""
@@ -63,16 +67,20 @@ def hybrid_search(self, query, source, num, alpha):
 
 
 def rerank_with_voyage(query, documents, pids, api_key):
+    """利用 Voyage Reranker 對 Weaviate hybrid search retrieval 的結果進行 rerank"""
     vo = voyageai.Client(api_key=api_key)
+    # 利用 voyage rerank-2 從 hybrid search retrieval 中篩出的所有文件取出最終的 top 1
     reranking = vo.rerank(query, documents, model='rerank-2', top_k=1)
     top_result = reranking.results[0]
 
-    # 根據內容找到相對應的 pid
     top_pid = pids[documents.index(top_result.document)]
     return {'pid': top_pid, 'relevance_score': top_result.relevance_score}
 
 
 def search_do(question, category, source, alpha):
+    """flask_app.py 呼叫的 '搜尋' 主程式"""
+
+    # 先根據題目給定的 category 選擇對應的向量資料庫 Class
     if category == 'finance':
         vdb_named = 'Financedev'
     elif category == 'insurance':
@@ -81,16 +89,16 @@ def search_do(question, category, source, alpha):
         vdb_named = 'Faqdev'
 
     searcher = WeaviateSemanticSearch(vdb_named)
-    # 從 Weaviate 取得前 100 筆結果
+    # 從 Weaviate hybrid search retrieval 前 100 筆結果
     top_100_results = searcher.hybrid_search(question, source, 100, alpha=alpha)
 
-    # 準備文件和 pid 列表供 rerank 使用
     documents = [result['content'] for result in top_100_results]
     pids = [result['pid'] for result in top_100_results]
 
     # 使用 VoyageAI 重新排序，並取得排名最高的 pid
     top_reranked_result = rerank_with_voyage(question, documents, pids, voyage_api_key)
 
+    # Log
     print('最相關文件的 PID:')
     print(f"PID: {top_reranked_result['pid']}")
     print(f"相關性分數: {top_reranked_result['relevance_score']}")

diff --git a/Preprocess/README.md b/Preprocess/README.md
@@ -0,0 +1,8 @@
+# 此資料夾為所有處理資料的程式碼
+包含 資料預處理 及 資料寫入資料庫
+
+## data_process/
+OCR & PDF 文字直接讀取
+
+## insert_data.py
+此程式為寫入資料庫的程式碼，並包含建立資料庫 class、對資料進行 embedding、利用 text_splitter 去 chunk tokens 數過多的資料