refactor: docstring, readme, update structure

JustinHsu1019 · Nov 13, 2024 · f11c27c · f11c27c
1 parent 095d292
commit f11c27c
Show file tree

Hide file tree

Showing 17 changed files with 122 additions and 88 deletions.
diff --git a/.github/contribute_guide.md b/.github/contribute_guide.md
@@ -0,0 +1,25 @@
+# Contribution Guide
+這個資料夾主要處理 CI Pipeline, 目前僅有檢測程式碼規範 (pre-commit), 且在發 PR & merge to main 才會觸發
+
+We follow GitHub Flow for contributing. The steps are as follows:
+
+1. **Claim an issue**: Start by picking an issue from GitHub.
+2. **Create a branch**: Open a new branch with a clear name related to the issue (e.g., `feat/xxxxx-feature`).
+3. **Development**: After completing the feature, ensure you run pre-commit hooks:
+   ```
+   pre-commit run --all-files
+   ```
+4. **Create Pull Request (PR)**:
+   - Ensure your PR is small and easily reviewable.
+   - Add the GitHub issue number to the PR title in the format `feat(#123): xxxxxx` for easy reference.
+   - Write a clear description including the reason for the change and what was modified (`Reason & Changes`).
+5. **Review & Approval**:
+   - Assign the PR to all members of the team for review.
+   - Wait for at least one approval.
+   - Ensure all CI checks pass.
+6. **Merge**: Once approved and CI passes, merge the branch into `main` yourself.
+
+## Additional Notes
+- Keep your commits focused and ensure meaningful commit messages.
+- Always rebase your branch on top of `main` before merging.
+- Avoid large, multi-purpose PRs. Smaller changes are easier to review and help prevent issues.
diff --git a/.github/pre-commit.md b/.github/pre-commit.md
diff --git a/Model/README.md b/Model/README.md
@@ -0,0 +1,10 @@
+# 進行檢索的主程式
+
+## flask_app.py
+會開出一個 API 供 main.py 呼叫，每次呼叫會送入一題問題，並回傳一個答案 pid
+
+## utils/retrieval_agent.py
+負責呼叫 weaviate & voyage reranker 進行檢索
+
+## utils/config_log.py
+負責處理 config 檔案，並設定 log 檔案
diff --git a/Model/flask_app.py b/Model/flask_app.py
@@ -59,18 +59,18 @@ def get(self):
         return response
 
 
-# TODO: Modify the output format for general RAG purposes
 @ns.route('/chat')
 class ChatBot(Resource):
     @api.doc('chat_bot')
     @api.expect(model)
     def post(self):
+        """retrieve and rank api entry point"""
         qid = request.json.get('qid')
         source = request.json.get('source')
         question = request.json.get('query')
         category = request.json.get('category')
 
-        # for alpha testing
+        # for alpha testing (finding best hybrid search alpha)
         # alpha = request.json.get('alpha')
 
         # input template
@@ -81,9 +81,10 @@ def post(self):
         # "category": "insurance"
         # },
 
-        alpha = 0.5
+        alpha = 0.5  # 最終因使用 Reranker 全盤處理 sources，故任何 alpha 對準確率都無影響
 
         if not question:
+            # 為避免任何萬一，無論如何都須回傳一個結果，不做 Error logging
             response = jsonify({'qid': '1', 'retrieve': '1'})
             response.status_code = 200
             return response
@@ -103,19 +104,24 @@ def post(self):
             response.status_code = 200
             return response
         except TypeError:
+            # 為避免任何萬一，無論如何都須回傳一個結果，不做 Error logging
             response = jsonify({'qid': qid, 'retrieve': source[-1]})
             response.status_code = 200
             return response
 
 
+# For API Docs
 @app.before_request
 def require_auth_for_docs():
+    """Require authentication for API Docs"""
     if request.path == '/':
         return auth.login_required()(swagger_ui)()
 
 
+# For API Docs
 @app.route('/')
 def swagger_ui():
+    """Redirect to the Swagger UI"""
     return api.render_doc()
 
 

diff --git a/Model/utils/README.md b/Model/utils/README.md
@@ -0,0 +1,7 @@
+# flask_app 主程式需呼叫的所有輔助程式
+
+## retrieval_agent.py
+負責呼叫 weaviate & voyage reranker 進行檢索
+
+## config_log.py
+負責處理 config 檔案，並設定 log 檔案
diff --git a/Model/utils/config_log.py b/Model/utils/config_log.py
@@ -6,6 +6,7 @@
 
 
 def setup_config_and_logging():
+    """Set up the configuration and logging."""
     config = configparser.ConfigParser()
 
     logger = logging.getLogger()

diff --git a/Model/utils/retrieval_agent.py b/Model/utils/retrieval_agent.py
@@ -6,30 +6,34 @@
 
 import utils.config_log as config_log
 
-# 載入設定檔案和日誌設定
 config, logger, CONFIG_PATH = config_log.setup_config_and_logging()
 config.read(CONFIG_PATH)
 
-# 從 config 中取得 Weaviate URL 和 API 金鑰
-wea_url = config.get('Weaviate', 'weaviate_url')
-voyage_api_key = config.get('VoyageAI', 'api_key')
-PROPERTIES = ['pid', 'content']
+wea_url = config.get('Weaviate', 'weaviate_url')  # 此次所使用的向量資料庫
+voyage_api_key = config.get('VoyageAI', 'api_key')  # Voyage Reranker 所使用的 API Key
+PROPERTIES = ['pid', 'content']  # 向量資料庫中此 Class 的欄位
 
 # 設定 OpenAI API 金鑰
 os.environ['OPENAI_API_KEY'] = config.get('OpenAI', 'api_key')
 
 
 class WeaviateSemanticSearch:
+    """Weaviate 向量資料庫的搜尋類別"""
+
     def __init__(self, classnm):
+        """初始化 Weaviate 向量資料庫的搜尋類別"""
         self.url = wea_url
+        # 選擇的 OpenAI embedding model
         self.embeddings = OpenAIEmbeddings(chunk_size=1, model='text-embedding-3-large')
         self.client = weaviate.Client(url=wea_url)
         self.classnm = classnm
 
     def hybrid_search(self, query, source, num, alpha):
+        """Weaviate 向量資料庫的搜尋方法"""
         query_vector = self.embeddings.embed_query(query)
         vector_str = ','.join(map(str, query_vector))
 
+        # 下述兩搜索式主要為過濾出 source 中的 pid，並只針對 source 中的 pid 的文件進行 retrieval & rerank
         where_conditions = ' '.join([f'{{path: ["pid"], operator: Equal, valueText: "{pid}"}}' for pid in source])
 
         gql_query = f"""
@@ -63,16 +67,20 @@ def hybrid_search(self, query, source, num, alpha):
 
 
 def rerank_with_voyage(query, documents, pids, api_key):
+    """利用 Voyage Reranker 對 Weaviate hybrid search retrieval 的結果進行 rerank"""
     vo = voyageai.Client(api_key=api_key)
+    # 利用 voyage rerank-2 從 hybrid search retrieval 中篩出的所有文件取出最終的 top 1
     reranking = vo.rerank(query, documents, model='rerank-2', top_k=1)
     top_result = reranking.results[0]
 
-    # 根據內容找到相對應的 pid
     top_pid = pids[documents.index(top_result.document)]
     return {'pid': top_pid, 'relevance_score': top_result.relevance_score}
 
 
 def search_do(question, category, source, alpha):
+    """flask_app.py 呼叫的 '搜尋' 主程式"""
+
+    # 先根據題目給定的 category 選擇對應的向量資料庫 Class
     if category == 'finance':
         vdb_named = 'Financedev'
     elif category == 'insurance':
@@ -81,16 +89,16 @@ def search_do(question, category, source, alpha):
         vdb_named = 'Faqdev'
 
     searcher = WeaviateSemanticSearch(vdb_named)
-    # 從 Weaviate 取得前 100 筆結果
+    # 從 Weaviate hybrid search retrieval 前 100 筆結果
     top_100_results = searcher.hybrid_search(question, source, 100, alpha=alpha)
 
-    # 準備文件和 pid 列表供 rerank 使用
     documents = [result['content'] for result in top_100_results]
     pids = [result['pid'] for result in top_100_results]
 
     # 使用 VoyageAI 重新排序，並取得排名最高的 pid
     top_reranked_result = rerank_with_voyage(question, documents, pids, voyage_api_key)
 
+    # Log
     print('最相關文件的 PID:')
     print(f"PID: {top_reranked_result['pid']}")
     print(f"相關性分數: {top_reranked_result['relevance_score']}")

diff --git a/Preprocess/README.md b/Preprocess/README.md
@@ -1 +1,8 @@
-# Scripts to process data automatically
+# 此資料夾為所有處理資料的程式碼
+包含 資料預處理 及 資料寫入資料庫
+
+## data_process/
+OCR & PDF 文字直接讀取
+
+## insert_data.py
+此程式為寫入資料庫的程式碼，並包含建立資料庫 class、對資料進行 embedding、利用 text_splitter 去 chunk tokens 數過多的資料
diff --git a/Preprocess/data_process/README.md b/Preprocess/data_process/README.md
@@ -0,0 +1,2 @@
+# 此資料夾為資料預處理的程式碼
+OCR & PDF 文字直接讀取
diff --git a/Preprocess/insert_data.py b/Preprocess/insert_data.py
@@ -16,13 +16,17 @@
 
 
 class WeaviateManager:
+    """Weaviate Insert data 管理器"""
+
     def __init__(self, classnm):
+        """初始化 Weaviate 連接"""
         self.url = wea_url
         self.client = weaviate.Client(url=wea_url, additional_headers={'X-OpenAI-Api-Key': openai_api_key})
         self.classnm = classnm
         self.check_class_exist()
 
     def check_class_exist(self):
+        """檢查 class 是否存在"""
         if self.client.schema.exists(self.classnm):
             print(f'{self.classnm} is ready')
             return True
@@ -47,6 +51,7 @@ def check_class_exist(self):
         return True
 
     def insert_data(self, pid, content):
+        """插入資料到 Weaviate"""
         data_object = {'pid': pid, 'content': content}
         max_retries = 5
         for attempt in range(max_retries):
@@ -73,6 +78,7 @@ def insert_data(self, pid, content):
         return False
 
     def split_and_insert(self, pid, content, category):
+        """處理特例：分割並插入資料"""
         # 使用 TextSplitter 分割長文本
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=500)
         split_content = text_splitter.split_text(content)

diff --git a/README.md b/README.md
@@ -1,6 +1,11 @@
 # AI CUP 2024 玉山人工智慧公開挑戰賽－RAG與LLM在金融問答的應用
 
-## Development Mode
+## Repo Structure
+```
+(主要用 # 介紹不在 folder 內的獨立檔案)
+```
+
+## Setup Environment
 To set up the development environment, follow these steps:
 
 1. Create a virtual environment:
@@ -20,84 +25,36 @@ To set up the development environment, follow these steps:
    ```
 
 4. Manually add your `secret key` to the `config.ini`.
+   (需展開解釋 config.ini 內的每一項 key)
 
 5. Create a `logs` directory:
    ```
    mkdir logs
    ```
 
-6. Navigate to the `docker` directory (optional):
+6. Navigate to the `docker` directory:
    ```
    cd docker
    ```
 
-7. Start the Docker environment (optional):
+7. Start the Docker environment (weaviate database):
    ```
    docker-compose up -d
    ```
 
-8. Run the Flask app:
-   ```
-   python3 src/flask_app.py
-   ```
+8. Data preprocessing:
 
-## Docker Production Mode
-
-1. Copy the configuration example and create your own config file:
-   ```
-   cp config_example.ini config.ini
-   ```
-
-2. Manually add your `secret key` to the `config.ini`.
-
-3. Create a `logs` directory:
-   ```
-   mkdir logs
-   ```
+9. Insert data into Weaviate:
 
-4. Navigate to the `docker` directory:
+10. Run the Flask app:
    ```
-   cd docker
+   python3 Model/flask_app.py
    ```
 
-5. Start the Docker environment:
-   ```
-   docker-compose up -d
-   ```
+11. 將主辦方提供的 questions.json 測試資料塞入 data/:
 
-6. Build the Docker image:
-   ```
-   docker build -t aicup_img -f dockerfile .
-   ```
+12. 運行 main.py 進行測試，得出 data/pred_retrieve.json，並將最終結果提交給主辦方:
 
-7. Run the Docker container:
-   ```
-   docker run -d -p 5001:5001 --name aicup_cont aicup_img
-   ```
 
 ## Folder-specific Details
 For more detailed information about each folder and its purpose, refer to the individual `README.md` files located in their respective directories.
-
-## Contribution Guide
-We follow GitHub Flow for contributing. The steps are as follows:
-
-1. **Claim an issue**: Start by picking an issue from GitHub.
-2. **Create a branch**: Open a new branch with a clear name related to the issue (e.g., `feat/xxxxx-feature`).
-3. **Development**: After completing the feature, ensure you run pre-commit hooks:
-   ```
-   pre-commit run --all-files
-   ```
-4. **Create PR Request (PR)**:
-   - Ensure your PR is small and easily reviewable.
-   - Add the GitHub issue number to the PR title in the format `feat(#123): xxxxxx` for easy reference.
-   - Write a clear description including the reason for the change and what was modified (`Reason & Changes`).
-5. **Review & Approval**:
-   - Assign the PR to all members of the team for review.
-   - Wait for at least one approval.
-   - Ensure all CI checks pass.
-6. **Merge**: Once approved and CI passes, merge the branch into `main` yourself.
-
-## Additional Notes
-- Keep your commits focused and ensure meaningful commit messages.
-- Always rebase your branch on top of `main` before merging.
-- Avoid large, multi-purpose PRs. Smaller changes are easier to review and help prevent issues.
diff --git a/config_example.ini b/config_example.ini
@@ -1,9 +1,6 @@
 [Weaviate]
 weaviate_url =
 
-[Gemini]
-api_key =
-
 [OpenAI]
 api_key =
 

diff --git a/data/README.md b/data/README.md
@@ -0,0 +1 @@
+# 所有 Questions, Answers, References 等“資料”都會存於此資料夾
diff --git a/docker/README.md b/docker/README.md
@@ -0,0 +1,7 @@
+# 此資料夾記錄所有 docker 相關的內容
+
+## docker_install.sh
+為 docker 安裝的腳本，只需運行此 sh 便可 setup docker & docker-compose
+
+## docker-compose.yml
+為 docker-compose 的設定檔，可透過 docker-compose 指令來觸發，裡面僅包含 weaviate 資料庫的啟動
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# 此資料夾為資料預處理的程式碼
		OCR & PDF 文字直接讀取
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		# 所有 Questions, Answers, References 等“資料”都會存於此資料夾