From f11c27c9b7a39eff7a41b15cea436f9e083d7e43 Mon Sep 17 00:00:00 2001
From: "justin.hsu" <justin.hsu.1019@gmail.com>
Date: Wed, 13 Nov 2024 14:15:46 +0800
Subject: [PATCH] refactor: docstring, readme, update structure

---
 .github/contribute_guide.md       | 25 +++++++++++
 .github/pre-commit.md             |  0
 Model/README.md                   | 10 +++++
 Model/flask_app.py                | 12 +++--
 Model/utils/README.md             |  7 +++
 Model/utils/config_log.py         |  1 +
 Model/utils/retrieval_agent.py    | 24 ++++++----
 Preprocess/README.md              |  9 +++-
 Preprocess/data_process/README.md |  2 +
 Preprocess/insert_data.py         |  6 +++
 README.md                         | 73 +++++++------------------------
 config_example.ini                |  3 --
 data/README.md                    |  1 +
 docker/README.md                  |  7 +++
 main.py                           | 17 +++----
 requirements.txt                  |  5 +--
 testing/README.md                 |  8 ++++
 17 files changed, 122 insertions(+), 88 deletions(-)
 create mode 100644 .github/contribute_guide.md
 delete mode 100644 .github/pre-commit.md
 create mode 100644 Preprocess/data_process/README.md
 create mode 100644 data/README.md

diff --git a/.github/contribute_guide.md b/.github/contribute_guide.md
new file mode 100644
index 0000000..c5f4798
--- /dev/null
+++ b/.github/contribute_guide.md
@@ -0,0 +1,25 @@
+# Contribution Guide
+這個資料夾主要處理 CI Pipeline, 目前僅有檢測程式碼規範 (pre-commit), 且在發 PR & merge to main 才會觸發
+
+We follow GitHub Flow for contributing. The steps are as follows:
+
+1. **Claim an issue**: Start by picking an issue from GitHub.
+2. **Create a branch**: Open a new branch with a clear name related to the issue (e.g., `feat/xxxxx-feature`).
+3. **Development**: After completing the feature, ensure you run pre-commit hooks:
+   ```
+   pre-commit run --all-files
+   ```
+4. **Create Pull Request (PR)**:
+   - Ensure your PR is small and easily reviewable.
+   - Add the GitHub issue number to the PR title in the format `feat(#123): xxxxxx` for easy reference.
+   - Write a clear description including the reason for the change and what was modified (`Reason & Changes`).
+5. **Review & Approval**:
+   - Assign the PR to all members of the team for review.
+   - Wait for at least one approval.
+   - Ensure all CI checks pass.
+6. **Merge**: Once approved and CI passes, merge the branch into `main` yourself.
+
+## Additional Notes
+- Keep your commits focused and ensure meaningful commit messages.
+- Always rebase your branch on top of `main` before merging.
+- Avoid large, multi-purpose PRs. Smaller changes are easier to review and help prevent issues.
diff --git a/.github/pre-commit.md b/.github/pre-commit.md
deleted file mode 100644
index e69de29..0000000
diff --git a/Model/README.md b/Model/README.md
index e69de29..884c41b 100644
--- a/Model/README.md
+++ b/Model/README.md
@@ -0,0 +1,10 @@
+# 進行檢索的主程式
+
+## flask_app.py
+會開出一個 API 供 main.py 呼叫，每次呼叫會送入一題問題，並回傳一個答案 pid
+
+## utils/retrieval_agent.py
+負責呼叫 weaviate & voyage reranker 進行檢索
+
+## utils/config_log.py
+負責處理 config 檔案，並設定 log 檔案
diff --git a/Model/flask_app.py b/Model/flask_app.py
index 991bfa2..e53164b 100644
--- a/Model/flask_app.py
+++ b/Model/flask_app.py
@@ -59,18 +59,18 @@ def get(self):
         return response
 
 
-# TODO: Modify the output format for general RAG purposes
 @ns.route('/chat')
 class ChatBot(Resource):
     @api.doc('chat_bot')
     @api.expect(model)
     def post(self):
+        """retrieve and rank api entry point"""
         qid = request.json.get('qid')
         source = request.json.get('source')
         question = request.json.get('query')
         category = request.json.get('category')
 
-        # for alpha testing
+        # for alpha testing (finding best hybrid search alpha)
         # alpha = request.json.get('alpha')
 
         # input template
@@ -81,9 +81,10 @@ def post(self):
         # "category": "insurance"
         # },
 
-        alpha = 0.5
+        alpha = 0.5  # 最終因使用 Reranker 全盤處理 sources，故任何 alpha 對準確率都無影響
 
         if not question:
+            # 為避免任何萬一，無論如何都須回傳一個結果，不做 Error logging
             response = jsonify({'qid': '1', 'retrieve': '1'})
             response.status_code = 200
             return response
@@ -103,19 +104,24 @@ def post(self):
             response.status_code = 200
             return response
         except TypeError:
+            # 為避免任何萬一，無論如何都須回傳一個結果，不做 Error logging
             response = jsonify({'qid': qid, 'retrieve': source[-1]})
             response.status_code = 200
             return response
 
 
+# For API Docs
 @app.before_request
 def require_auth_for_docs():
+    """Require authentication for API Docs"""
     if request.path == '/':
         return auth.login_required()(swagger_ui)()
 
 
+# For API Docs
 @app.route('/')
 def swagger_ui():
+    """Redirect to the Swagger UI"""
     return api.render_doc()
 
 
diff --git a/Model/utils/README.md b/Model/utils/README.md
index e69de29..04f9b54 100644
--- a/Model/utils/README.md
+++ b/Model/utils/README.md
@@ -0,0 +1,7 @@
+# flask_app 主程式需呼叫的所有輔助程式
+
+## retrieval_agent.py
+負責呼叫 weaviate & voyage reranker 進行檢索
+
+## config_log.py
+負責處理 config 檔案，並設定 log 檔案
diff --git a/Model/utils/config_log.py b/Model/utils/config_log.py
index 655d224..0ad925c 100644
--- a/Model/utils/config_log.py
+++ b/Model/utils/config_log.py
@@ -6,6 +6,7 @@
 
 
 def setup_config_and_logging():
+    """Set up the configuration and logging."""
     config = configparser.ConfigParser()
 
     logger = logging.getLogger()
diff --git a/Model/utils/retrieval_agent.py b/Model/utils/retrieval_agent.py
index 4400670..6534092 100644
--- a/Model/utils/retrieval_agent.py
+++ b/Model/utils/retrieval_agent.py
@@ -6,30 +6,34 @@
 
 import utils.config_log as config_log
 
-# 載入設定檔案和日誌設定
 config, logger, CONFIG_PATH = config_log.setup_config_and_logging()
 config.read(CONFIG_PATH)
 
-# 從 config 中取得 Weaviate URL 和 API 金鑰
-wea_url = config.get('Weaviate', 'weaviate_url')
-voyage_api_key = config.get('VoyageAI', 'api_key')
-PROPERTIES = ['pid', 'content']
+wea_url = config.get('Weaviate', 'weaviate_url')  # 此次所使用的向量資料庫
+voyage_api_key = config.get('VoyageAI', 'api_key')  # Voyage Reranker 所使用的 API Key
+PROPERTIES = ['pid', 'content']  # 向量資料庫中此 Class 的欄位
 
 # 設定 OpenAI API 金鑰
 os.environ['OPENAI_API_KEY'] = config.get('OpenAI', 'api_key')
 
 
 class WeaviateSemanticSearch:
+    """Weaviate 向量資料庫的搜尋類別"""
+
     def __init__(self, classnm):
+        """初始化 Weaviate 向量資料庫的搜尋類別"""
         self.url = wea_url
+        # 選擇的 OpenAI embedding model
         self.embeddings = OpenAIEmbeddings(chunk_size=1, model='text-embedding-3-large')
         self.client = weaviate.Client(url=wea_url)
         self.classnm = classnm
 
     def hybrid_search(self, query, source, num, alpha):
+        """Weaviate 向量資料庫的搜尋方法"""
         query_vector = self.embeddings.embed_query(query)
         vector_str = ','.join(map(str, query_vector))
 
+        # 下述兩搜索式主要為過濾出 source 中的 pid，並只針對 source 中的 pid 的文件進行 retrieval & rerank
         where_conditions = ' '.join([f'{{path: ["pid"], operator: Equal, valueText: "{pid}"}}' for pid in source])
 
         gql_query = f"""
@@ -63,16 +67,20 @@ def hybrid_search(self, query, source, num, alpha):
 
 
 def rerank_with_voyage(query, documents, pids, api_key):
+    """利用 Voyage Reranker 對 Weaviate hybrid search retrieval 的結果進行 rerank"""
     vo = voyageai.Client(api_key=api_key)
+    # 利用 voyage rerank-2 從 hybrid search retrieval 中篩出的所有文件取出最終的 top 1
     reranking = vo.rerank(query, documents, model='rerank-2', top_k=1)
     top_result = reranking.results[0]
 
-    # 根據內容找到相對應的 pid
     top_pid = pids[documents.index(top_result.document)]
     return {'pid': top_pid, 'relevance_score': top_result.relevance_score}
 
 
 def search_do(question, category, source, alpha):
+    """flask_app.py 呼叫的 '搜尋' 主程式"""
+
+    # 先根據題目給定的 category 選擇對應的向量資料庫 Class
     if category == 'finance':
         vdb_named = 'Financedev'
     elif category == 'insurance':
@@ -81,16 +89,16 @@ def search_do(question, category, source, alpha):
         vdb_named = 'Faqdev'
 
     searcher = WeaviateSemanticSearch(vdb_named)
-    # 從 Weaviate 取得前 100 筆結果
+    # 從 Weaviate hybrid search retrieval 前 100 筆結果
     top_100_results = searcher.hybrid_search(question, source, 100, alpha=alpha)
 
-    # 準備文件和 pid 列表供 rerank 使用
     documents = [result['content'] for result in top_100_results]
     pids = [result['pid'] for result in top_100_results]
 
     # 使用 VoyageAI 重新排序，並取得排名最高的 pid
     top_reranked_result = rerank_with_voyage(question, documents, pids, voyage_api_key)
 
+    # Log
     print('最相關文件的 PID:')
     print(f"PID: {top_reranked_result['pid']}")
     print(f"相關性分數: {top_reranked_result['relevance_score']}")
diff --git a/Preprocess/README.md b/Preprocess/README.md
index 197793b..082c981 100644
--- a/Preprocess/README.md
+++ b/Preprocess/README.md
@@ -1 +1,8 @@
-# Scripts to process data automatically
+# 此資料夾為所有處理資料的程式碼
+包含 資料預處理 及 資料寫入資料庫
+
+## data_process/
+OCR & PDF 文字直接讀取
+
+## insert_data.py
+此程式為寫入資料庫的程式碼，並包含建立資料庫 class、對資料進行 embedding、利用 text_splitter 去 chunk tokens 數過多的資料
diff --git a/Preprocess/data_process/README.md b/Preprocess/data_process/README.md
new file mode 100644
index 0000000..f065c58
--- /dev/null
+++ b/Preprocess/data_process/README.md
@@ -0,0 +1,2 @@
+# 此資料夾為資料預處理的程式碼
+OCR & PDF 文字直接讀取
diff --git a/Preprocess/insert_data.py b/Preprocess/insert_data.py
index 76ec8a0..6cb5a24 100644
--- a/Preprocess/insert_data.py
+++ b/Preprocess/insert_data.py
@@ -16,13 +16,17 @@
 
 
 class WeaviateManager:
+    """Weaviate Insert data 管理器"""
+
     def __init__(self, classnm):
+        """初始化 Weaviate 連接"""
         self.url = wea_url
         self.client = weaviate.Client(url=wea_url, additional_headers={'X-OpenAI-Api-Key': openai_api_key})
         self.classnm = classnm
         self.check_class_exist()
 
     def check_class_exist(self):
+        """檢查 class 是否存在"""
         if self.client.schema.exists(self.classnm):
             print(f'{self.classnm} is ready')
             return True
@@ -47,6 +51,7 @@ def check_class_exist(self):
         return True
 
     def insert_data(self, pid, content):
+        """插入資料到 Weaviate"""
         data_object = {'pid': pid, 'content': content}
         max_retries = 5
         for attempt in range(max_retries):
@@ -73,6 +78,7 @@ def insert_data(self, pid, content):
         return False
 
     def split_and_insert(self, pid, content, category):
+        """處理特例：分割並插入資料"""
         # 使用 TextSplitter 分割長文本
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=500)
         split_content = text_splitter.split_text(content)
diff --git a/README.md b/README.md
index 8069444..df51130 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,11 @@
 # AI CUP 2024 玉山人工智慧公開挑戰賽－RAG與LLM在金融問答的應用
 
-## Development Mode
+## Repo Structure
+```
+(主要用 # 介紹沒有在 folder 內獨立檔案)
+```
+
+## Setup Environment
 To set up the development environment, follow these steps:
 
 1. Create a virtual environment:
@@ -20,84 +25,36 @@ To set up the development environment, follow these steps:
    ```
 
 4. Manually add your `secret key` to the `config.ini`.
+   (需展開解釋 config.ini 內的每一項 key)
 
 5. Create a `logs` directory:
    ```
    mkdir logs
    ```
 
-6. Navigate to the `docker` directory (optional):
+6. Navigate to the `docker` directory:
    ```
    cd docker
    ```
 
-7. Start the Docker environment (optional):
+7. Start the Docker environment (weaviate database):
    ```
    docker-compose up -d
    ```
 
-8. Run the Flask app:
-   ```
-   python3 src/flask_app.py
-   ```
+8. Data preprocessing:
 
-## Docker Production Mode
-
-1. Copy the configuration example and create your own config file:
-   ```
-   cp config_example.ini config.ini
-   ```
-
-2. Manually add your `secret key` to the `config.ini`.
-
-3. Create a `logs` directory:
-   ```
-   mkdir logs
-   ```
+9. Insert data into Weaviate:
 
-4. Navigate to the `docker` directory:
+10. Run the Flask app:
    ```
-   cd docker
+   python3 Model/flask_app.py
    ```
 
-5. Start the Docker environment:
-   ```
-   docker-compose up -d
-   ```
+11. 將主辦方提供的 questions.json 測試資料塞入 data/:
 
-6. Build the Docker image:
-   ```
-   docker build -t aicup_img -f dockerfile .
-   ```
+12. 運行 main.py 進行測試，得出 data/pred_retrieve.json 後提交最終結果給主辦方:
 
-7. Run the Docker container:
-   ```
-   docker run -d -p 5001:5001 --name aicup_cont aicup_img
-   ```
 
 ## Folder-specific Details
 For more detailed information about each folder and its purpose, refer to the individual `README.md` files located in their respective directories.
-
-## Contribution Guide
-We follow GitHub Flow for contributing. The steps are as follows:
-
-1. **Claim an issue**: Start by picking an issue from GitHub.
-2. **Create a branch**: Open a new branch with a clear name related to the issue (e.g., `feat/xxxxx-feature`).
-3. **Development**: After completing the feature, ensure you run pre-commit hooks:
-   ```
-   pre-commit run --all-files
-   ```
-4. **Create PR Request (PR)**:
-   - Ensure your PR is small and easily reviewable.
-   - Add the GitHub issue number to the PR title in the format `feat(#123): xxxxxx` for easy reference.
-   - Write a clear description including the reason for the change and what was modified (`Reason & Changes`).
-5. **Review & Approval**:
-   - Assign the PR to all members of the team for review.
-   - Wait for at least one approval.
-   - Ensure all CI checks pass.
-6. **Merge**: Once approved and CI passes, merge the branch into `main` yourself.
-
-## Additional Notes
-- Keep your commits focused and ensure meaningful commit messages.
-- Always rebase your branch on top of `main` before merging.
-- Avoid large, multi-purpose PRs. Smaller changes are easier to review and help prevent issues.
diff --git a/config_example.ini b/config_example.ini
index 365639d..5c8955a 100644
--- a/config_example.ini
+++ b/config_example.ini
@@ -1,9 +1,6 @@
 [Weaviate]
 weaviate_url =
 
-[Gemini]
-api_key =
-
 [OpenAI]
 api_key =
 
diff --git a/data/README.md b/data/README.md
new file mode 100644
index 0000000..29008c4
--- /dev/null
+++ b/data/README.md
@@ -0,0 +1 @@
+# 所有 Questions, Answers, References 等「資料」都會存於此資料夾
diff --git a/docker/README.md b/docker/README.md
index e69de29..567d04c 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -0,0 +1,7 @@
+# 此資料夾紀錄所有docker相關的內容
+
+## docker_install.sh
+為 docker 安裝的腳本，只需運行此 sh 便可 setup docker & docker-compose
+
+## docker-compose.yml
+為 docker-compose 的設定檔，可透過 docker-compose 指令來觸發，裡面僅包含 weaviate 資料庫的啟動
diff --git a/main.py b/main.py
index b61aa2b..217d1c3 100644
--- a/main.py
+++ b/main.py
@@ -1,48 +1,41 @@
 import json
-import time  # Import time module for timing
+import time
 
 import requests
 
-# Load questions from the JSON file
 with open('data/questions_example.json', encoding='utf-8') as file:
     questions = json.load(file)['questions']
 
-output_data = {'answers': []}  # Initialize output format with "answers" array
+output_data = {'answers': []}
 
 url = 'http://127.0.0.1:5000/api/chat'
 
-total_start_time = time.time()  # Start timing for the entire process
+total_start_time = time.time()
 
 for question in questions:
-    question_start_time = time.time()  # Start timing for each question
+    question_start_time = time.time()
 
-    # Send POST request
     response = requests.post(url, json=question)
 
     if response.status_code == 200:
         response_json = response.json()
 
-        # Extract qid and retrieve from the API response
-        qid = question.get('qid')  # Assuming each question has a unique "qid" field
+        qid = question.get('qid')
         retrieve = response_json.get('retrieve')
 
-        # Append formatted result to the answers array
         output_data['answers'].append({'qid': qid, 'retrieve': retrieve})
         print('成功取得 JSON:', response_json)
     else:
         print('請求失敗，狀態碼:', response.status_code)
 
-    # Calculate and print time for each question
     question_end_time = time.time()
     question_duration = question_end_time - question_start_time
     print(f'QID: {qid} - 花費時間: {question_duration:.2f} 秒')
 
-# Calculate and print total time
 total_end_time = time.time()
 total_duration = total_end_time - total_start_time
 print(f'全部題目處理完成，總共花費時間: {total_duration:.2f} 秒')
 
-# Save the output data to a new JSON file
 with open('data/pred_retrieve.json', 'w', encoding='utf-8') as output_file:
     json.dump(output_data, output_file, ensure_ascii=False, indent=4)
 
diff --git a/requirements.txt b/requirements.txt
index 884c6e0..3cdbe00 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-Python==3.12.0
+# Python 3.12.0
 Flask==2.3.2
 Flask_Cors==4.0.0
 keyboard==0.13.5
@@ -8,10 +8,9 @@ selenium==4.21.0
 weaviate_client==3.22.1
 tiktoken==0.7.0
 langchain-community==0.2.0
-sentence-transformers==2.7.0
 flask_limiter==3.7.0
 flask_restx==1.3.0
-python-dateutil
+python-dateutil==2.9.0.post0
 redis==5.0.8
 flask-httpauth==4.8.0
 voyageai==0.3.1
diff --git a/testing/README.md b/testing/README.md
index e69de29..c63f11c 100644
--- a/testing/README.md
+++ b/testing/README.md
@@ -0,0 +1,8 @@
+# 此資料夾為進行一些測試及檢測的程式碼
+這些程式不影響正式比賽時的運行，也不影響資料處理，僅是做測試尋找最佳 hybrid search 比重 (多少 % text2vec, 多少 % bm25)；還有用來為公開測資對答案的腳本。
+
+## get_best_alpha.py
+用來測試不同 alpha 的準確率，尋找最佳 hybrid search 比重 (多少 % text2vec, 多少 % bm25)
+
+## checkans.py
+用來為公開測資對答案的腳本