diff --git a/.github/contribute_guide.md b/.github/contribute_guide.md
new file mode 100644
index 0000000..c5f4798
--- /dev/null
+++ b/.github/contribute_guide.md
@@ -0,0 +1,25 @@
+# Contribution Guide
+This folder holds the CI pipeline. It currently only runs code-style checks (pre-commit), triggered when a PR is opened and when merging to `main`.
+
+We follow GitHub Flow for contributing. The steps are as follows:
+
+1. **Claim an issue**: Start by picking an issue from GitHub.
+2. **Create a branch**: Open a new branch with a clear name related to the issue (e.g., `feat/xxxxx-feature`).
+3. **Development**: After completing the feature, ensure you run pre-commit hooks:
+   ```
+   pre-commit run --all-files
+   ```
+4. **Create a Pull Request (PR)**:
+   - Ensure your PR is small and easily reviewable.
+   - Add the GitHub issue number to the PR title in the format `feat(#123): xxxxxx` for easy reference.
+   - Write a clear description including the reason for the change and what was modified (`Reason & Changes`).
+5. **Review & Approval**:
+   - Assign the PR to all members of the team for review.
+   - Wait for at least one approval.
+   - Ensure all CI checks pass.
+6. **Merge**: Once approved and CI passes, merge the branch into `main` yourself.
+
+## Additional Notes
+- Keep your commits focused and ensure meaningful commit messages.
+- Always rebase your branch on top of `main` before merging.
+- Avoid large, multi-purpose PRs. Smaller changes are easier to review and help prevent issues.
diff --git a/Model/README.md b/Model/README.md
new file mode 100644
index 0000000..884c41b
--- /dev/null
+++ b/Model/README.md
@@ -0,0 +1,10 @@
+# Main retrieval programs
+
+## flask_app.py
+Exposes an API for main.py to call; each request sends in one question and returns one answer pid.
+
+## utils/retrieval_agent.py
+Calls Weaviate and the Voyage reranker to perform retrieval.
+
+## utils/config_log.py
+Handles the config file and sets up logging.
diff --git a/src/flask_app.py b/Model/flask_app.py
similarity index 86%
rename from src/flask_app.py
rename to Model/flask_app.py
index 991bfa2..e53164b 100644
--- a/src/flask_app.py
+++ b/Model/flask_app.py
@@ -59,18 +59,18 @@ def get(self):
         return response


-# TODO: Modify the output format for general RAG purposes
 @ns.route('/chat')
 class ChatBot(Resource):
     @api.doc('chat_bot')
     @api.expect(model)
     def post(self):
+        """Retrieve-and-rank API entry point."""
         qid = request.json.get('qid')
         source = request.json.get('source')
         question = request.json.get('query')
         category = request.json.get('category')

-        # for alpha testing
+        # for alpha testing (finding the best hybrid-search alpha)
         # alpha = request.json.get('alpha')

         # input template
@@ -81,9 +81,10 @@ def post(self):
         #     "category": "insurance"
         # },

-        alpha = 0.5
+        alpha = 0.5  # since the reranker ultimately processes all the sources, the alpha value has no effect on accuracy

         if not question:
+            # just to be safe, always return a result no matter what; no error logging
             response = jsonify({'qid': '1', 'retrieve': '1'})
             response.status_code = 200
             return response
@@ -103,19 +104,24 @@ def post(self):
             response.status_code = 200
             return response
         except TypeError:
+            # just to be safe, always return a result no matter what; no error logging
             response = jsonify({'qid': qid, 'retrieve': source[-1]})
             response.status_code = 200
             return response


+# For API Docs
 @app.before_request
 def require_auth_for_docs():
+    """Require authentication for the API docs."""
     if request.path == '/':
         return auth.login_required()(swagger_ui)()


+# For API Docs
 @app.route('/')
 def swagger_ui():
+    """Redirect to the Swagger UI."""
     return api.render_doc()
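For quick reference, the `/chat` endpoint above can be exercised the same way `main.py` does it; a minimal sketch, assuming the Flask default port 5000 (the pids and query text below are made up):

```python
import requests

# Hypothetical payload following the input template documented in flask_app.py
payload = {
    'qid': '1',
    'source': ['442', '115', '440'],  # candidate pids the answer must come from
    'query': 'an example question',
    'category': 'insurance',
}
resp = requests.post('http://127.0.0.1:5000/api/chat', json=payload)
print(resp.json())  # e.g. {'qid': '1', 'retrieve': '442'}
```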
diff --git a/Model/utils/README.md b/Model/utils/README.md
new file mode 100644
index 0000000..04f9b54
--- /dev/null
+++ b/Model/utils/README.md
@@ -0,0 +1,7 @@
+# All helper modules called by the flask_app main program
+
+## retrieval_agent.py
+Calls Weaviate and the Voyage reranker to perform retrieval.
+
+## config_log.py
+Handles the config file and sets up logging.
diff --git a/src/utils/__init__.py b/Model/utils/__init__.py
similarity index 100%
rename from src/utils/__init__.py
rename to Model/utils/__init__.py
diff --git a/src/utils/config_log.py b/Model/utils/config_log.py
similarity index 92%
rename from src/utils/config_log.py
rename to Model/utils/config_log.py
index 655d224..0ad925c 100644
--- a/src/utils/config_log.py
+++ b/Model/utils/config_log.py
@@ -6,6 +6,7 @@


 def setup_config_and_logging():
+    """Set up the configuration and logging."""
     config = configparser.ConfigParser()
     logger = logging.getLogger()

diff --git a/src/utils/retrieval_agent.py b/Model/utils/retrieval_agent.py
similarity index 74%
rename from src/utils/retrieval_agent.py
rename to Model/utils/retrieval_agent.py
index 4400670..6534092 100644
--- a/src/utils/retrieval_agent.py
+++ b/Model/utils/retrieval_agent.py
@@ -6,30 +6,34 @@
 import utils.config_log as config_log

-# 載入設定檔案和日誌設定
 config, logger, CONFIG_PATH = config_log.setup_config_and_logging()
 config.read(CONFIG_PATH)

-# 從 config 中取得 Weaviate URL 和 API 金鑰
-wea_url = config.get('Weaviate', 'weaviate_url')
-voyage_api_key = config.get('VoyageAI', 'api_key')
-PROPERTIES = ['pid', 'content']
+wea_url = config.get('Weaviate', 'weaviate_url')  # the vector database used in this project
+voyage_api_key = config.get('VoyageAI', 'api_key')  # API key for the Voyage reranker
+PROPERTIES = ['pid', 'content']  # the fields of this class in the vector database

 # 設定 OpenAI API 金鑰
 os.environ['OPENAI_API_KEY'] = config.get('OpenAI', 'api_key')


 class WeaviateSemanticSearch:
+    """Search class for the Weaviate vector database."""
+
     def __init__(self, classnm):
+        """Initialize the Weaviate search class."""
         self.url = wea_url
+        # the OpenAI embedding model chosen for this project
         self.embeddings = OpenAIEmbeddings(chunk_size=1, model='text-embedding-3-large')
         self.client = weaviate.Client(url=wea_url)
         self.classnm = classnm

     def hybrid_search(self, query, source, num, alpha):
+        """Hybrid search against the Weaviate vector database."""
         query_vector = self.embeddings.embed_query(query)
         vector_str = ','.join(map(str, query_vector))

+        # The two query expressions below filter on the pids listed in source, so that
+        # retrieval & rerank only consider documents whose pid appears in source
         where_conditions = ' '.join([f'{{path: ["pid"], operator: Equal, valueText: "{pid}"}}' for pid in source])

         gql_query = f"""
@@ -63,16 +67,20 @@ def hybrid_search(self, query, source, num, alpha):


 def rerank_with_voyage(query, documents, pids, api_key):
+    """Rerank the Weaviate hybrid-search retrieval results with the Voyage reranker."""
     vo = voyageai.Client(api_key=api_key)
+    # use voyage rerank-2 to pick the final top 1 from all documents returned by the hybrid-search retrieval
     reranking = vo.rerank(query, documents, model='rerank-2', top_k=1)
     top_result = reranking.results[0]

-    # 根據內容找到相對應的 pid
     top_pid = pids[documents.index(top_result.document)]

     return {'pid': top_pid, 'relevance_score': top_result.relevance_score}


 def search_do(question, category, source, alpha):
+    """The 'search' entry point called by flask_app.py."""
+
+    # Pick the vector-database class that matches the question's category
     if category == 'finance':
         vdb_named = 'Financedev'
     elif category == 'insurance':
@@ -81,16 +89,16 @@ def search_do(question, category, source, alpha):
         vdb_named = 'Faqdev'

     searcher = WeaviateSemanticSearch(vdb_named)
-    # 從 Weaviate 取得前 100 筆結果
+    # Retrieve the top 100 results from the Weaviate hybrid search
     top_100_results = searcher.hybrid_search(question, source, 100, alpha=alpha)

-    # 準備文件和 pid 列表供 rerank 使用
     documents = [result['content'] for result in top_100_results]
     pids = [result['pid'] for result in top_100_results]

     # 使用 VoyageAI 重新排序,並取得排名最高的 pid
     top_reranked_result = rerank_with_voyage(question, documents, pids, voyage_api_key)

+    # Log the result
     print('最相關文件的 PID:')
     print(f"PID: {top_reranked_result['pid']}")
     print(f"相關性分數: {top_reranked_result['relevance_score']}")
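To tie the pieces together: `search_do` runs the hybrid search restricted to the given pids, then lets the reranker pick a single winner. A minimal usage sketch (the question and pids are made up; it assumes a running Weaviate instance already populated by `Preprocess/insert_data.py`):

```python
from utils.retrieval_agent import search_do

# Hypothetical inputs; real values come from the competition's questions.json
search_do('an example insurance question', 'insurance', ['442', '115', '440'], alpha=0.5)
# hybrid_search filters to the given pids and retrieves the top 100 candidates,
# then rerank_with_voyage picks the single most relevant pid, which search_do logs
```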
diff --git a/Preprocess/README.md b/Preprocess/README.md
new file mode 100644
index 0000000..082c981
--- /dev/null
+++ b/Preprocess/README.md
@@ -0,0 +1,8 @@
+# All data-processing code lives in this folder
+It covers both data preprocessing and writing the data into the database.
+
+## data_process/
+OCR and direct PDF text extraction.
+
+## insert_data.py
+Writes the data into the database; it also creates the database class, embeds the data, and uses a text_splitter to chunk entries whose token count is too large.
diff --git a/Preprocess/data_process/README.md b/Preprocess/data_process/README.md
new file mode 100644
index 0000000..39a494e
--- /dev/null
+++ b/Preprocess/data_process/README.md
@@ -0,0 +1,204 @@
+---
+title: Data Preprocessing Guide
+This folder contains the data-preprocessing code
+- OCR & direct PDF text extraction
+
+---
+
+
+# Data Preprocessing Guide
+
+## Overview
+
+This folder contains Python code for reading and processing the FAQ (JSON) file and the Finance and Insurance (PDF) documents in the Reference folder. Its main functions are:
+
+- First, extract the PDF files in the specified folders from the ZIP archive, convert each page to an image, and run Tesseract OCR to extract the text. The extracted text is saved as `.txt` files, organized by category.
+- Then, read the FAQ JSON file and the OCR-generated text files, format everything, and merge it into a single unified JSON file for later retrieval and processing.
+
+## Environment and Dependencies
+
+### Python packages
+
+- `pytesseract`
+- `pdf2image`
+- `zipfile` (standard library)
+- `json` (standard library)
+- `os` (standard library)
+
+### External dependencies
+
+- **Tesseract-OCR**: used for OCR.
+  - Download: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract)
+  - Example install path: `C:\Program Files\Tesseract-OCR\tesseract.exe`
+- **Poppler**: used for PDF-to-image conversion.
+  - Download: [Poppler for Windows](http://blog.alivate.com.au/poppler-windows/)
+  - Example install path: `C:\Program Files\poppler-24.08.0\Library\bin`
+
+## Installation
+
+### 1. Clone or download the project
+
+If you do not have the project code yet, clone or download it locally:
+
+```bash
+git clone https://github.com/yourusername/your-repo-name.git
+cd your-repo-name
+```
+
+
+### 2. Install the external dependencies
+
+- **Tesseract-OCR**:
+  - Download and install Tesseract-OCR.
+  - After installation, note the install path (e.g., `C:\Program Files\Tesseract-OCR\tesseract.exe`).
+
+- **Poppler**:
+  - Download and install Poppler.
+  - After installation, note the `poppler_path` (e.g., `C:\Program Files\poppler-24.08.0\Library\bin`).
+
+### 3. Install the Python packages
+
+Install the required Python packages:
+
+```bash
+pip install pytesseract==0.3.13
+pip install pdf2image==1.17.0
+```
+
+## Configuration
+
+Configure the Tesseract and Poppler paths in the code:
+
+```python
+# Configure Tesseract path if necessary (update this path as needed)
+pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
+
+# Specify the path to the Poppler binaries
+poppler_path = r"C:\Program Files\poppler-24.08.0\Library\bin"
+```
+
+Make sure to replace the paths above with the actual install paths on your machine.
+
+## Usage
+
+### 1. Prepare the data
+
+Make sure your ZIP file contains the following folders and files:
+
+- `競賽資料集/reference/faq/pid_map_content.json`
+- `競賽資料集/reference/finance/*.pdf`
+- `競賽資料集/reference/insurance/*.pdf`
+
+### 2. Run the OCR extraction
+
+Run the following command to perform OCR:
+
+```bash
+python data_preprocess.py
+```
+
+The script performs the following steps:
+
+1. Extract the Finance and Insurance PDF files from the specified ZIP archive.
+2. Convert every page of each PDF into an image.
+3. Run Tesseract OCR to extract the text content.
+4. Save the extracted text as `.txt` files, stored by category under `dataset/output_text/finance/` and `dataset/output_text/insurance/`.
+
+### 3. Format the data
+
+The script then continues with the following steps:
+
+1. Read the FAQ file `pid_map_content.json` and extract the questions and answers.
+2. Read the OCR-generated text files and merge their contents in PDF-file and page order.
+3. Format and merge all the data into a single JSON file, `dataset/formatted_reference_ocr.json`.
+
+### 4. Inspect the output
+
+- **OCR output text files**:
+  - Finance text files are saved under `dataset/output_text/finance/`.
+  - Insurance text files are saved under `dataset/output_text/insurance/`.
+
+- **Merged JSON file**:
+  - `dataset/formatted_reference_ocr.json` contains all of the formatted FAQ, Finance, and Insurance data.
+
+## File structure
+
+```
+project/
+├── dataset/
+│   ├── output_text/
+│   │   ├── finance/
+│   │   │   ├── 0.pdf_page_1.txt
+│   │   │   ├── 1.pdf_page_1.txt
+│   │   │   ├── 1.pdf_page_2.txt
+│   │   │   └── ...
+│   │   └── insurance/
+│   │       ├── 1.pdf_page_1.txt
+│   │       ├── 1.pdf_page_2.txt
+│   │       └── ...
+│   └── formatted_reference_ocr.json
+├── datazip.zip
+│   └── 競賽資料集/
+│       └── reference/
+│           ├── faq/
+│           │   └── pid_map_content.json
+│           ├── finance/
+│           │   ├── 0.pdf
+│           │   ├── 1.pdf
+│           │   └── ...
+│           └── insurance/
+│               ├── 1.pdf
+│               ├── 2.pdf
+│               └── ...
+├── data_preprocess.py
+└── README.md
+```
+
+## Example output
+
+An example of the structure of the generated `formatted_reference_ocr.json`:
+
+```json
+[
+    {
+        "category": "faq",
+        "qid": "0",
+        "content": {
+            "question": "什麼是跨境手機掃碼支付?",
+            "answers": [
+                "允許大陸消費者可以用手機支付寶App在台灣實體商店購買商品或服務"
+            ]
+        }
+    },  // other FAQ entries...
+    {
+        "category": "finance",
+        "qid": "0",
+        "content": "註 1U ﹕ 本 雄 團 於 民 國 111] 年 第 1 季 投 賁 成 立 寶 元 智 造 公 司 , 由 本 集 圖 持\n有 100% 股 權 , 另 於 民 國 111 年 第 3 季 及 112 年 第 1 季 未 依 持 股 比..."
+    },  // other Finance entries...
+    {
+        "category": "insurance",
+        "qid": "1",
+        "content": "延 期 間 內 發 生 第 十 六 條 或 第 十 七 條 本 公 司 應 負 係 險 貫 任 之 事 故 時 , 其 約 定 之 係 險 金 計 算 方 式 將 不 適 用 , 本 公\n..."
+    }  // other Insurance entries...
+]
+```
+
+(The `content` strings above are raw OCR output, so they contain genuine OCR misreads; the `//` comments are illustrative and not valid JSON.)
+
+## Notes
+
+- **Encoding**: make sure all text files use UTF-8 so that Chinese characters are handled correctly and no garbled text appears.
+- **Path configuration**:
+  - Update the `tesseract_cmd` and `poppler_path` variables in the code to match your local install paths.
+- **File naming**:
+  - OCR text files must follow the `{filename}.pdf_page_{page_number}.txt` naming convention so that the code can correctly read and merge the pages.
+- **Dependencies**:
+  - Make sure Tesseract-OCR and Poppler are correctly installed and configured, otherwise the code will not run.
+
+## License
+
+This project is released under the [MIT License](LICENSE). You are free to use, modify, and distribute it.
+
+---
+
+**Thanks for using this project!**
diff --git a/src/data/conbine_readpdf_result.py b/Preprocess/data_process/conbine_readpdf_result.py
similarity index 93%
rename from src/data/conbine_readpdf_result.py
rename to Preprocess/data_process/conbine_readpdf_result.py
index a021ced..7976b17 100644
--- a/src/data/conbine_readpdf_result.py
+++ b/Preprocess/data_process/conbine_readpdf_result.py
@@ -4,7 +4,7 @@
 with open('data/aicup_noocr.json', encoding='utf-8') as file:
     noocr_data = json.load(file)

-with open('data/aicup_ref.json', encoding='utf-8') as file:
+with open('data/formatted_reference_ocr.json', encoding='utf-8') as file:
     ref_data = json.load(file)

 # 建立 ref_data 的 dictionary,並檢查 content 是否為字串,再去除空格
diff --git a/Preprocess/data_process/data_preprocess.py b/Preprocess/data_process/data_preprocess.py
new file mode 100644
index 0000000..e82c118
--- /dev/null
+++ b/Preprocess/data_process/data_preprocess.py
@@ -0,0 +1,159 @@
+import json
+import os
+import zipfile
+
+import pytesseract
+from pdf2image import convert_from_bytes
+
+# Configure Tesseract path if necessary (update this path as needed)
+pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
+
+
+def ocr_in_folder(zip_path, folder, output_dir):
+    """
+    Extracts PDF files from a ZIP archive, performs OCR, and saves the output text.
+
+    Args:
+        zip_path (str): The path to the ZIP file containing the documents.
+        folder (str): The folder path inside the ZIP to search for PDF files.
+        output_dir (str): The directory to save the OCR output text files.
+
+    Returns:
+        None
+    """
+    folder_path = f'{folder}/'
+
+    with zipfile.ZipFile(zip_path, 'r') as zipf:
+        for zip_info in zipf.infolist():
+            if zip_info.filename.startswith(folder_path) and not zip_info.is_dir():
+                with zipf.open(zip_info.filename) as pdf_file:
+                    pdf_bytes = pdf_file.read()
+
+                # Specify the path to the Poppler binaries if needed
+                poppler_path = r'C:\Program Files\poppler-24.08.0\Library\bin'
+
+                # Convert the PDF bytes to images
+                images = convert_from_bytes(pdf_bytes, dpi=300, poppler_path=poppler_path)
+
+                os.makedirs(output_dir, exist_ok=True)
+
+                # Extract only the base filename (e.g., "file1.pdf" instead of the full path)
+                base_filename = os.path.basename(zip_info.filename)
+
+                # Perform OCR on each page and save the text
+                for i, image in enumerate(images):
+                    text = pytesseract.image_to_string(image, lang='chi_tra')
+                    output_file_path = os.path.join(output_dir, f'{base_filename}_page_{i + 1}.txt')
+                    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
+                    with open(output_file_path, 'w', encoding='utf-8') as f:
+                        f.write(text)
+                print(f'OCR completed for {base_filename}')
+
+
+# OCR extraction paths
+zip_path = 'datazip.zip'
+ocr_in_folder(zip_path, '競賽資料集/reference/insurance', 'dataset/output_text/insurance')
+ocr_in_folder(zip_path, '競賽資料集/reference/finance', 'dataset/output_text/finance')
+
+# File paths
+FAQ_FILEPATH = 'datazip/競賽資料集/reference/faq/pid_map_content.json'
+FINANCE_OCR_FOLDER_PATH = 'dataset/output_text/finance'
+INSURANCE_OCR_FOLDER_PATH = 'dataset/output_text/insurance'
+
+
+def check_text(file_path, category):
+    """
+    Reads a JSON FAQ file, processes it, and returns formatted data.
+
+    Args:
+        file_path (str): Path to the FAQ JSON file.
+        category (str): Category label for the FAQ data.
+
+    Returns:
+        list: A list of dictionaries containing formatted FAQ data.
+    """
+    formatted_data = []
+    with open(file_path, encoding='utf-8') as faq_file:
+        loaded_faq = json.load(faq_file)
+
+    for qid, questions in loaded_faq.items():
+        for question_item in questions:
+            formatted_entry = {
+                'category': category,
+                'qid': qid,
+                'content': {'question': question_item['question'], 'answers': question_item['answers']},
+            }
+            formatted_data.append(formatted_entry)
+            print(formatted_entry)
+    return formatted_data
+
+
+def read_ocr_files(ocr_folder_path, category):
+    """
+    Reads text files generated from OCR, consolidates them, and returns formatted data.
+
+    Args:
+        ocr_folder_path (str): Path to the folder containing OCR text files.
+        category (str): Category label for the OCR data.
+
+    Returns:
+        list: A list of dictionaries containing consolidated OCR data.
+    """
+    formatted_data = []
+
+    # Collect the base filenames of the OCR text files
+    file_basenames = set()
+    for filename in os.listdir(ocr_folder_path):
+        if filename.endswith('.txt'):
+            basename = filename.split('.pdf_page_')[0]
+            file_basenames.add(basename)
+
+    for basename in sorted(file_basenames, key=lambda x: int(x)):
+        all_text = ''
+        page_files = []
+
+        for filename in os.listdir(ocr_folder_path):
+            if filename.startswith(basename) and filename.endswith('.txt'):
+                page_files.append(filename)
+
+        page_files = sorted(page_files, key=lambda x: int(x.split('.pdf_page_')[1].split('.txt')[0]))
+
+        for page_file in page_files:
+            ocr_file_path = os.path.join(ocr_folder_path, page_file)
+            with open(ocr_file_path, encoding='utf-8') as ocr_file:
+                content = ocr_file.read()
+                all_text += content + '\n\n'
+
+        formatted_entry = {'category': category, 'qid': basename, 'content': all_text.strip()}
+        formatted_data.append(formatted_entry)
+        print(formatted_entry)
+
+    return formatted_data
+
+
+if __name__ == '__main__':
+    """
+    Main entry point of the script. Processes FAQ, finance, and insurance OCR data,
+    consolidates them, and saves the result to a JSON file.
+    """
+    total_formatted_data = []
+
+    # handle the faq data
+    faq_data = check_text(FAQ_FILEPATH, 'faq')
+    total_formatted_data.extend(faq_data)
+
+    # read the finance ocr output
+    finance_data = read_ocr_files(FINANCE_OCR_FOLDER_PATH, 'finance')
+    total_formatted_data.extend(finance_data)
+
+    # read the insurance ocr output
+    insurance_data = read_ocr_files(INSURANCE_OCR_FOLDER_PATH, 'insurance')
+    total_formatted_data.extend(insurance_data)
+
+    # store the cleaned data in formatted_reference_ocr.json
+    output_json_path = 'data/formatted_reference_ocr.json'
+    # os.makedirs(os.path.dirname(output_json_path), exist_ok=True)
+    with open(output_json_path, 'w', encoding='utf-8') as formatted_file:
+        json.dump(total_formatted_data, formatted_file, ensure_ascii=False, indent=4)
+
+    print(f'The process is finished and the result is saved in {output_json_path}')
diff --git a/src/data/read_pdf_noocr.py b/Preprocess/data_process/read_pdf_noocr.py
similarity index 100%
rename from src/data/read_pdf_noocr.py
rename to Preprocess/data_process/read_pdf_noocr.py
diff --git a/src/db_insert.py b/Preprocess/insert_data.py
similarity index 94%
rename from src/db_insert.py
rename to Preprocess/insert_data.py
index 76ec8a0..61de115 100644
--- a/src/db_insert.py
+++ b/Preprocess/insert_data.py
@@ -16,13 +16,17 @@


 class WeaviateManager:
+    """Manager for inserting data into Weaviate."""
+
     def __init__(self, classnm):
+        """Initialize the Weaviate connection."""
         self.url = wea_url
         self.client = weaviate.Client(url=wea_url, additional_headers={'X-OpenAI-Api-Key': openai_api_key})
         self.classnm = classnm
         self.check_class_exist()

     def check_class_exist(self):
+        """Check whether the class exists."""
         if self.client.schema.exists(self.classnm):
             print(f'{self.classnm} is ready')
             return True
@@ -47,6 +51,7 @@ def check_class_exist(self):
         return True

     def insert_data(self, pid, content):
+        """Insert a record into Weaviate."""
         data_object = {'pid': pid, 'content': content}
         max_retries = 5
         for attempt in range(max_retries):
@@ -73,6 +78,7 @@ def insert_data(self, pid, content):
         return False

     def split_and_insert(self, pid, content, category):
+        """Handle the special case: split the data into chunks and insert them."""
         # 使用 TextSplitter 分割長文本
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=500)
         split_content = text_splitter.split_text(content)
@@ -118,7 +124,7 @@ def split_and_insert(self, pid, content, category):
             elif not result:  # 如果失敗且非長度問題
                 failed_records.append({'pid': pid, 'category': category})

-    # 將失敗的資料寫入 JSON 檔案
+    # Write the failed records to a JSON file; it can be used later to re-import them if needed
     if failed_records:
         with open('failed_imports.json', 'w', encoding='utf-8') as f:
             json.dump(failed_records, f, ensure_ascii=False, indent=4)
diff --git a/README.md b/README.md
index 0a74b05..2a8c3fc 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,52 @@
 # AI CUP 2024 玉山人工智慧公開挑戰賽-RAG與LLM在金融問答的應用
-** High-Accuracy RAG Retriever Template **
-## Rankings
+## Repo Structure
+```
+.
+├── .github
+│   ├── contribute_guide.md
+│   └── workflows
+│       └── ci.yml
+├── .gitignore               # files and directories for git to ignore (e.g. cache, logs, etc.)
+├── .pre-commit-config.yaml  # pre-commit hooks that check and format code, environment config, Git settings, and scan for secrets
+├── .ruff.toml               # ruff config; lint: pep8-naming, pycodestyle, pyflakes, etc.
+├── LICENSE                  # MIT License
+├── Model
+│   ├── README.md
+│   ├── flask_app.py
+│   └── utils
+│       ├── README.md
+│       ├── __init__.py
+│       ├── config_log.py
+│       └── retrieval_agent.py
+├── Preprocess
+│   ├── README.md
+│   ├── data_process
+│   │   ├── README.md
+│   │   ├── conbine_readpdf_result.py
+│   │   ├── merge_with_ocr_pdfminer.py
+│   │   ├── read_pdf_noocr.py
+│   │   └── read_pdf_ocr.py
+│   └── insert_data.py
+├── README.md
+├── config_example.ini       # example config; copy it to config.ini and edit it yourself
+├── data
+│   └── README.md
+├── docker
+│   ├── README.md
+│   ├── docker-compose.yml
+│   └── docker_install.sh
+├── main.py                  # main program
+├── requirements.txt         # Python pip requirements
+└── testing
+    ├── README.md
+    ├── checkans.py
+    └── get_best_alpha.py
+```
+
+## Setup Environment
+- **OS:** Except for the data processing, which was done on Windows, everything targets macOS/Linux; on Windows, install WSL2 or similar to provide a Linux environment.

-- Overall Ranking: 38th out of 487 teams (~7.8%)
-  - Leaderboard: 38th out of 222
-
-![AI Cup Result](img/aicup_result.png)
-
-## Development Mode
 To set up the development environment, follow these steps:

 1. Create a virtual environment:
@@ -17,95 +55,95 @@ To set up the development environment, follow these steps:
    source aicup_venv/bin/activate
    ```

-2. Install the required dependencies:
+2. Clone our repo:
+   ```
+   git clone https://github.com/JustinHsu1019/AICUP2024-RAG-LLM.git
+   cd AICUP2024-RAG-LLM
+   ```
+
+3. Install the required dependencies:
    ```
    pip install -r requirements.txt
    ```

-3. Copy the configuration example and create your own config file:
+4. Copy the configuration example and create your own config file:
    ```
    cp config_example.ini config.ini
    ```

-4. Manually add your `secret key` to the `config.ini`.
+5. Manually add your `secret key` to the `config.ini`:
+
+- The [OpenAI] api_key can be obtained by registering on the OpenAI website
+- The [VoyageAI] api_key can be obtained by registering on the VoyageAI website
+- The [Api_docs] password can be anything you like
+  - Once flask_app.py is running, visit http://127.0.0.1:5000/ to see the Swagger API docs page

-5. Create a `logs` directory:
+6. Create a `logs` directory:
    ```
    mkdir logs
    ```

-6. Navigate to the `docker` directory (optional):
+7. Navigate to the `docker` directory:
    ```
    cd docker
    ```

-7. Start the Docker environment (optional):
+8. Start the Docker environment (weaviate database):
    ```
    docker-compose up -d
    ```

-8. Run the Flask app:
-   ```
-   python3 src/flask_app.py
-   ```
+9. Data preprocessing (this stage was handled by other team members, so the OS environment is Windows):
+- **Tesseract-OCR**:
+  - Download and install Tesseract-OCR.
+  - After installation, note the install path (e.g., `C:\Program Files\Tesseract-OCR\tesseract.exe`).

-## Docker Production Mode
+- **Poppler**:
+  - Download and install Poppler.
+  - After installation, note the `poppler_path` (e.g., `C:\Program Files\poppler-24.08.0\Library\bin`).

-1. Copy the configuration example and create your own config file:
-   ```
-   cp config_example.ini config.ini
-   ```
+Configure the Tesseract and Poppler paths in the code:

-2. Manually add your `secret key` to the `config.ini`.
+```python
+# Configure Tesseract path if necessary (update this path as needed)
+pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

-3. Create a `logs` directory:
-   ```
-   mkdir logs
-   ```
+# Specify the path to the Poppler binaries
+poppler_path = r"C:\Program Files\poppler-24.08.0\Library\bin"
+```
+
+Make sure to replace the paths above with the actual install paths on your machine.
+
+Make sure your ZIP file contains the following folders and files (after downloading the official dataset):
+
+- `競賽資料集/reference/faq/pid_map_content.json`
+- `競賽資料集/reference/finance/*.pdf`
+- `競賽資料集/reference/insurance/*.pdf`
+
+Run the data preprocessing scripts:

-4. Navigate to the `docker` directory:
    ```
-   cd docker
+   python3 Preprocess/data_process/data_preprocess.py
+   python3 Preprocess/data_process/read_pdf_noocr.py
+   python3 Preprocess/data_process/conbine_readpdf_result.py
    ```

-5. Start the Docker environment:
+10. Insert the data into Weaviate:
    ```
-   docker-compose up -d
+   python3 Preprocess/insert_data.py
    ```

-6. Build the Docker image:
+11. Run the Flask app (/ serves the API docs; /api/chat/ is our retrieval API):
    ```
-   docker build -t aicup_img -f dockerfile .
+   python3 Model/flask_app.py
    ```

-7. Run the Docker container:
+12. Rename the question JSON file provided by the organizers to questions.json and place it in data/
+
+13. Run main.py to produce data/pred_retrieve.json, the final result submitted to the organizers:
    ```
-   docker run -d -p 5001:5001 --name aicup_cont aicup_img
+   python3 main.py
    ```

 ## Folder-specific Details
 For more detailed information about each folder and its purpose, refer to the individual `README.md` files located in their respective directories.
-
-## Contribution Guide
-We follow GitHub Flow for contributing. The steps are as follows:
-
-1. **Claim an issue**: Start by picking an issue from GitHub.
-2. **Create a branch**: Open a new branch with a clear name related to the issue (e.g., `feat/xxxxx-feature`).
-3. **Development**: After completing the feature, ensure you run pre-commit hooks:
-   ```
-   pre-commit run --all-files
-   ```
-4. **Create PR Request (PR)**:
-   - Ensure your PR is small and easily reviewable.
-   - Add the GitHub issue number to the PR title in the format `feat(#123): xxxxxx` for easy reference.
-   - Write a clear description including the reason for the change and what was modified (`Reason & Changes`).
-5. **Review & Approval**:
-   - Assign the PR to all members of the team for review.
-   - Wait for at least one approval.
-   - Ensure all CI checks pass.
-6. **Merge**: Once approved and CI passes, merge the branch into `main` yourself.
-
-## Additional Notes
-- Keep your commits focused and ensure meaningful commit messages.
-- Always rebase your branch on top of `main` before merging.
-- Avoid large, multi-purpose PRs. Smaller changes are easier to review and help prevent issues.
diff --git a/config_example.ini b/config_example.ini
index 365639d..a64b35a 100644
--- a/config_example.ini
+++ b/config_example.ini
@@ -1,8 +1,5 @@
 [Weaviate]
-weaviate_url =
-
-[Gemini]
-api_key =
+weaviate_url = http://127.0.0.1:8882

 [OpenAI]
 api_key =
diff --git a/data/README.md b/data/README.md
index 5f31f8f..29008c4 100644
--- a/data/README.md
+++ b/data/README.md
@@ -1,3 +1 @@
-# Data that needs to be saved in Weaviate
-
-應競賽主辦單位要求,無法將資料集上傳至公開網路,請自行準備資料集,並用 `src/data` 內的 Scripts 處理
+# All "data" (questions, answers, references, etc.) is stored in this folder
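Based on the input template documented in `Model/flask_app.py` and the way `main.py` reads the file, `data/questions.json` is expected to look roughly like this (all values below are made up):

```json
{
    "questions": [
        {
            "qid": "1",
            "source": ["442", "115", "440"],
            "query": "an example insurance question",
            "category": "insurance"
        }
    ]
}
```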
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 0000000..567d04c
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,7 @@
+# This folder holds everything related to Docker
+
+## docker_install.sh
+Installation script for Docker; running this single script sets up docker & docker-compose.
+
+## docker-compose.yml
+The docker-compose configuration, invoked via the docker-compose command; it only contains the startup of the Weaviate database.
diff --git a/docker/dockerfile b/docker/dockerfile
deleted file mode 100644
index 853149f..0000000
--- a/docker/dockerfile
+++ /dev/null
@@ -1,24 +0,0 @@
-# 使用官方的 Python 3.12.3 Slim 作為基礎映像
-FROM python:3.12.3-slim
-
-# 設定工作目錄為 /app
-WORKDIR /app
-
-# 將當前目錄的內容複製到容器中的 /app
-COPY . /app
-
-# 安裝 requirements.txt 中的必要套件
-RUN pip install --no-cache-dir -r requirements.txt
-
-# 暴露應用程式運行的埠
-EXPOSE 5001
-
-# 複製 entrypoint 腳本
-COPY entrypoint.sh /entrypoint.sh
-RUN chmod +x /entrypoint.sh
-
-# 設定環境變數
-ENV NAME World
-
-# 當容器啟動時運行 entrypoint.sh
-ENTRYPOINT ["/entrypoint.sh"]
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
deleted file mode 100755
index 2d874a5..0000000
--- a/docker/entrypoint.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-# 啟動 Flask API
-cd /app
-exec python3 src/flask_app.py
diff --git a/img/aicup_result.png b/img/aicup_result.png
deleted file mode 100644
index b4a3e7f..0000000
Binary files a/img/aicup_result.png and /dev/null differ
diff --git a/src/tools/automate.py b/main.py
similarity index 59%
rename from src/tools/automate.py
rename to main.py
index b61aa2b..1015c61 100644
--- a/src/tools/automate.py
+++ b/main.py
@@ -1,48 +1,51 @@
 import json
-import time  # Import time module for timing
+import time

 import requests

-# Load questions from the JSON file
-with open('data/questions_example.json', encoding='utf-8') as file:
+# Load the question JSON file provided by the organizers
+with open('data/questions.json', encoding='utf-8') as file:
     questions = json.load(file)['questions']

-output_data = {'answers': []}  # Initialize output format with "answers" array
+# Initialize the output data format
+output_data = {'answers': []}

+# URL of the Flask application
 url = 'http://127.0.0.1:5000/api/chat'

-total_start_time = time.time()  # Start timing for the entire process
+# Track the total elapsed time
+total_start_time = time.time()

 for question in questions:
-    question_start_time = time.time()  # Start timing for each question
+    # Track the processing time of each question
+    question_start_time = time.time()

-    # Send POST request
+    # Send a POST request to the retrieve API endpoint in Model/flask_app.py
     response = requests.post(url, json=question)

     if response.status_code == 200:
         response_json = response.json()

-        # Extract qid and retrieve from the API response
-        qid = question.get('qid')  # Assuming each question has a unique "qid" field
+        # Extract the qid and retrieve fields from the response
+        qid = question.get('qid')
         retrieve = response_json.get('retrieve')

-        # Append formatted result to the answers array
+        # Append the qid and retrieve to the output data
         output_data['answers'].append({'qid': qid, 'retrieve': retrieve})

         print('成功取得 JSON:', response_json)
     else:
         print('請求失敗,狀態碼:', response.status_code)

-    # Calculate and print time for each question
+    # Compute the processing time of each question
     question_end_time = time.time()
     question_duration = question_end_time - question_start_time
     print(f'QID: {qid} - 花費時間: {question_duration:.2f} 秒')

-# Calculate and print total time
 total_end_time = time.time()
 total_duration = total_end_time - total_start_time
 print(f'全部題目處理完成,總共花費時間: {total_duration:.2f} 秒')

-# Save the output data to a new JSON file
+# Write the output data to a JSON file
 with open('data/pred_retrieve.json', 'w', encoding='utf-8') as output_file:
     json.dump(output_data, output_file, ensure_ascii=False, indent=4)
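The submission file written at the end of `main.py` therefore has the following shape (values illustrative):

```json
{
    "answers": [
        {"qid": "1", "retrieve": "442"},
        {"qid": "2", "retrieve": "115"}
    ]
}
```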
diff --git a/requirements.txt b/requirements.txt
index 743e682..c332b34 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+# Python 3.12.0
 Flask==2.3.2
 Flask_Cors==4.0.0
 keyboard==0.13.5
@@ -7,10 +8,11 @@ selenium==4.21.0
 weaviate_client==3.22.1
 tiktoken==0.7.0
 langchain-community==0.2.0
-sentence-transformers==2.7.0
 flask_limiter==3.7.0
 flask_restx==1.3.0
-python-dateutil
+python-dateutil==2.9.0.post0
 redis==5.0.8
 flask-httpauth==4.8.0
-voyageai
+voyageai==0.3.1
+pytesseract==0.3.13
+pdf2image==1.17.0
diff --git a/src/batch/README.md b/src/batch/README.md
deleted file mode 100644
index 93b7dae..0000000
--- a/src/batch/README.md
+++ /dev/null
@@ -1 +0,0 @@
-# Scripts for spidering announcement data
diff --git a/src/batch/time.txt b/src/batch/time.txt
deleted file mode 100644
index 04d1b01..0000000
--- a/src/batch/time.txt
+++ /dev/null
@@ -1 +0,0 @@
-2024-01-01 00:00:00
diff --git a/src/data/README.md b/src/data/README.md
deleted file mode 100644
index 197793b..0000000
--- a/src/data/README.md
+++ /dev/null
@@ -1 +0,0 @@
-# Scripts to process data automatically
diff --git a/src/data/read_pdf_ocr.py b/src/data/read_pdf_ocr.py
deleted file mode 100644
index db6c31c..0000000
--- a/src/data/read_pdf_ocr.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# TODO: Add OCR processing script
-# This script was accidentally lost
diff --git a/src/tools/monitor.py b/src/tools/monitor.py
deleted file mode 100644
index d89f7ca..0000000
--- a/src/tools/monitor.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import subprocess
-import time
-
-import requests
-
-
-def check_service():
-    url = 'http://xxxx/api/'
-    try:
-        response = requests.get(url)
-        if response.status_code != 200:
-            restart_service()
-    except requests.RequestException:
-        restart_service()
-
-
-def restart_service():
-    print('Service is down. Restarting service...')
-    subprocess.run(['nohup', 'python3', 'src/flask_app.py', '&'])
-
-
-if __name__ == '__main__':
-    while True:
-        check_service()
-        time.sleep(600)
diff --git a/src/utils/ai/__init__.py b/src/utils/ai/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/utils/ai/call_ai.py b/src/utils/ai/call_ai.py
deleted file mode 100644
index 1551113..0000000
--- a/src/utils/ai/call_ai.py
+++ /dev/null
@@ -1,53 +0,0 @@
-from datetime import datetime
-
-from utils.ai.gemini_tem import gemini_template
-from utils.ai.gpt_tem import gpt_template
-
-
-def call_aied(wait, quest, use_gpt: bool):
-    current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-
-    prompt = f"""【今日日期】: {current_time}
-You are a helpful and informative bot that answers questions using text from the reference passage included below. \
-Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
-However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
-strike a friendly and conversational tone. \
-If the passage is irrelevant to the answer, you may ignore it.
-請用繁體中文回答
-並在回應有關時間/時程的問題時,要考慮【今日日期】
-
-
-'{quest}'
-
-PASSAGE:
-'{wait[0]}
-
-{wait[1]}
-
-{wait[2]}
-
-{wait[3]}
-
-{wait[4]}
-
-{wait[5]}
-
-{wait[6]}
-
-{wait[7]}
-
-{wait[8]}
-
-{wait[9]}'
-
-ANSWER:
-"""
-    try:
-        if use_gpt:
-            res = gpt_template(prompt)
-        else:
-            res = gemini_template(prompt)
-    except Exception:
-        res = '太多使用者請求了!請等待幾秒後再重新詢問'
-
-    return res
diff --git a/src/utils/ai/gemini_tem.py b/src/utils/ai/gemini_tem.py
deleted file mode 100644
index a098852..0000000
--- a/src/utils/ai/gemini_tem.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import json
-
-import requests
-
-import utils.config_log as config_log
-
-config, logger, CONFIG_PATH = config_log.setup_config_and_logging()
-config.read(CONFIG_PATH)
-
-
-def gemini_template(prompt):
-    api_key = config.get('Gemini', 'api_key')
-    url = f'https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?key={api_key}'
-    payload = {'contents': [{'parts': [{'text': prompt}]}]}
-    headers = {'Content-Type': 'application/json'}
-    response = (requests.post(url, headers=headers, data=json.dumps(payload))).json()
-    return response['candidates'][0]['content']['parts'][0]['text']
-
-
-if __name__ == '__main__':
-    prompt = """告訴我 CTF 逆向分析的 3 個訣竅,用 json 格式輸出: {"訣竅1": ,"訣竅2": ,"訣竅3": }"""
-    response = gemini_template(prompt)
-    print(response)
diff --git a/src/utils/ai/gpt_tem.py b/src/utils/ai/gpt_tem.py
deleted file mode 100644
index 0486fa8..0000000
--- a/src/utils/ai/gpt_tem.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import textwrap
-
-import openai
-
-import utils.config_log as config_log
-
-config, logger, CONFIG_PATH = config_log.setup_config_and_logging()
-config.read(CONFIG_PATH)
-
-
-def gpt_template(prompt, output_way='json'):
-    openai.api_key = config.get('OpenAI', 'api_key')
-
-    userprompt = textwrap.dedent(
-        f"""
-        {prompt}
-        """
-    )
-
-    response = openai.ChatCompletion.create(
-        model='gpt-4o-mini',
-        messages=[
-            {'role': 'system', 'content': '使用繁體中文回答'},
-            {'role': 'user', 'content': userprompt},
-        ],
-    )
-
-    return response.choices[0].message['content']
-
-
-def main():
-    """範例: GPT 模板使用"""
-    # import utils.gpt_integration as gpt_call
-    # gpt_call.gpt_template()
-    print(gpt_template('問題: 太陽系有哪些行星?請用 json 格式回傳,{"回傳內容": "_回答_"}'))
-
-
-if __name__ == '__main__':
-    main()
diff --git a/src/utils/ckip.py b/src/utils/ckip.py
deleted file mode 100644
index 23d192e..0000000
--- a/src/utils/ckip.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from ckip_transformers.nlp import CkipPosTagger, CkipWordSegmenter
-
-ws_driver = CkipWordSegmenter(model='albert-base')
-pos_driver = CkipPosTagger(model='albert-base')
-
-
-def clean(sentence_ws, sentence_pos):
-    short_sentence = []
-    stop_pos = set(['Nep', 'Nh', 'Nb'])
-    for word_ws, word_pos in zip(sentence_ws, sentence_pos):
-        is_n_or_v = word_pos.startswith('V') or word_pos.startswith('N')
-        is_not_stop_pos = word_pos not in stop_pos
-        is_not_one_charactor = not (len(word_ws) == 1)
-        if is_n_or_v and is_not_stop_pos and is_not_one_charactor:
-            short_sentence.append(f'{word_ws}')
-    return ' '.join(short_sentence)
diff --git a/testing/README.md b/testing/README.md
new file mode 100644
index 0000000..c63f11c
--- /dev/null
+++ b/testing/README.md
@@ -0,0 +1,8 @@
+# This folder contains code for testing and checks
+These scripts do not affect the actual competition runs or the data processing; they are only used for testing, to find the best hybrid-search weighting (what % text2vec vs. what % bm25), plus a script for checking answers against the public test set.
+
+## get_best_alpha.py
+Tests the accuracy of different alpha values to find the best hybrid-search weighting (what % text2vec vs. what % bm25).
+
+## checkans.py
+A script for checking answers against the public test set.
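As a rough sketch of what the answer checking involves, the following compares `data/pred_retrieve.json` against a ground-truth file; the ground-truth filename and its exact schema are assumptions here, and `checkans.py` remains the authoritative version:

```python
import json

# pred_retrieve.json has the shape {"answers": [{"qid": ..., "retrieve": ...}, ...]}
with open('data/pred_retrieve.json', encoding='utf-8') as f:
    preds = {a['qid']: a['retrieve'] for a in json.load(f)['answers']}

# Hypothetical ground-truth file and schema
with open('data/ground_truths_example.json', encoding='utf-8') as f:
    truths = {g['qid']: g['retrieve'] for g in json.load(f)['ground_truths']}

correct = sum(1 for qid, pid in preds.items() if truths.get(qid) == pid)
print(f'accuracy: {correct}/{len(preds)} = {correct / len(preds):.2%}')
```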
diff --git a/src/tools/checkans.py b/testing/checkans.py
similarity index 100%
rename from src/tools/checkans.py
rename to testing/checkans.py
diff --git a/src/tools/get_best_alpha.py b/testing/get_best_alpha.py
similarity index 97%
rename from src/tools/get_best_alpha.py
rename to testing/get_best_alpha.py
index b2dec99..b552275 100644
--- a/src/tools/get_best_alpha.py
+++ b/testing/get_best_alpha.py
@@ -4,7 +4,7 @@
 import requests

 # Load questions from the JSON file
-with open('data/questions_example.json', encoding='utf-8') as file:
+with open('data/questions.json', encoding='utf-8') as file:
     questions = json.load(file)['questions']

 # Load ground truth data
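For context on how an alpha sweep could be driven end to end, here is a hypothetical outline; it assumes the `/api/chat` endpoint accepts an `alpha` field (the commented-out `request.json.get('alpha')` in `flask_app.py` suggests this was the alpha-testing mechanism), and the ground-truth schema is the same assumption as above:

```python
import json

import requests

with open('data/questions.json', encoding='utf-8') as f:
    questions = json.load(f)['questions']
with open('data/ground_truths_example.json', encoding='utf-8') as f:  # assumed filename/schema
    truths = {g['qid']: g['retrieve'] for g in json.load(f)['ground_truths']}

best_alpha, best_acc = None, -1.0
for alpha in (0.0, 0.25, 0.5, 0.75, 1.0):  # candidate text2vec/bm25 weightings
    hits = 0
    for q in questions:
        resp = requests.post('http://127.0.0.1:5000/api/chat', json={**q, 'alpha': alpha})
        if resp.json().get('retrieve') == truths.get(q['qid']):
            hits += 1
    acc = hits / len(questions)
    print(f'alpha={alpha}: {acc:.2%}')
    if acc > best_acc:
        best_alpha, best_acc = alpha, acc

print(f'best alpha: {best_alpha} (accuracy {best_acc:.2%})')
```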