diff --git a/README.md b/README.md index 2366626..c14cd42 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,11 @@ -# aicup-rag +# AI CUP 2024 玉山人工智慧公開挑戰賽-RAG與LLM在金融問答的應用 + +## Result + +- Total: 38 / 487 Teams + - Leaderboard: 38 / 222 + +![AI Cup Result](img/aicup_result.png) ## Development Mode To set up the development environment, follow these steps: diff --git a/data/conbine_result.py b/data/conbine_result.py index 0f72467..a021ced 100644 --- a/data/conbine_result.py +++ b/data/conbine_result.py @@ -1,31 +1,33 @@ import json # 載入 aicup_noocr.json 和 aicup_ref.json -with open('data/aicup_noocr.json', 'r', encoding='utf-8') as file: +with open('data/aicup_noocr.json', encoding='utf-8') as file: noocr_data = json.load(file) -with open('data/aicup_ref.json', 'r', encoding='utf-8') as file: +with open('data/aicup_ref.json', encoding='utf-8') as file: ref_data = json.load(file) # 建立 ref_data 的 dictionary,並檢查 content 是否為字串,再去除空格 ref_dict = { - (item["category"], item["pid"]): ''.join(item["content"].split()) if isinstance(item["content"], str) else item["content"] + (item['category'], item['pid']): ''.join(item['content'].split()) + if isinstance(item['content'], str) + else item['content'] for item in ref_data } # 更新 noocr_data 中空的 content for item in noocr_data: - category = item["category"] - pid = item["pid"] - content = item["content"] + category = item['category'] + pid = item['pid'] + content = item['content'] # 如果 content 是 string 並且為空,則從 ref_data 裡填入去掉空格的 content - if isinstance(content, str) and content == "": + if isinstance(content, str) and content == '': if (category, pid) in ref_dict: - item["content"] = ref_dict[(category, pid)] + item['content'] = ref_dict[(category, pid)] # 將結果寫入 aicup_noocr_sec.json with open('data/aicup_noocr_sec.json', 'w', encoding='utf-8') as file: json.dump(noocr_data, file, ensure_ascii=False, indent=4) -print("已完成比對並生成 aicup_noocr_sec.json,並移除轉入的 content 中的空格(如果 content 是字串)") +print('已完成比對並生成 aicup_noocr_sec.json,並移除轉入的 content 中的空格(如果 content 是字串)') diff --git a/data/read_data_noocr.py b/data/read_data_noocr.py index 7b254d9..cbc744e 100644 --- a/data/read_data_noocr.py +++ b/data/read_data_noocr.py @@ -1,8 +1,10 @@ -import os import json +import os + import pdfplumber from tqdm import tqdm + # 讀取單個PDF文件並返回其文本內容 def read_pdf(pdf_loc): pdf = pdfplumber.open(pdf_loc) @@ -14,6 +16,7 @@ def read_pdf(pdf_loc): pdf.close() return pdf_text + # 從指定資料夾載入PDF文件,並根據資料夾名稱設定category def load_data_by_category(source_path, category): pdf_files = [f for f in os.listdir(source_path) if f.endswith('.pdf')] @@ -21,17 +24,18 @@ def load_data_by_category(source_path, category): for file in tqdm(pdf_files): pid = file.replace('.pdf', '') # 擷取檔案名稱作為pid content = read_pdf(os.path.join(source_path, file)) # 讀取PDF內文 - data.append({"category": category, "pid": pid, "content": content}) + data.append({'category': category, 'pid': pid, 'content': content}) return data + # 主程式 def generate_json(output_path): all_data = [] - + # 載入不同類別的PDF資料 source_paths = { - "finance": "reference/finance", # finance 資料夾的路徑 - "insurance": "reference/insurance" # insurance 資料夾的路徑 + 'finance': 'reference/finance', # finance 資料夾的路徑 + 'insurance': 'reference/insurance', # insurance 資料夾的路徑 } # 遍歷每個類別的資料夾並載入資料 @@ -43,6 +47,7 @@ def generate_json(output_path): with open(output_path, 'w', encoding='utf8') as f: json.dump(all_data, f, ensure_ascii=False, indent=4) + # 設定輸出路徑 output_path = 'data/aicup_noocr.json' generate_json(output_path) diff --git a/img/aicup_result.png b/img/aicup_result.png new file mode 100644 index 0000000..b4a3e7f Binary files /dev/null and b/img/aicup_result.png differ diff --git a/src/db_insert.py b/src/db_insert.py index ca5d6d0..76ec8a0 100644 --- a/src/db_insert.py +++ b/src/db_insert.py @@ -1,8 +1,9 @@ -import time import json -from langchain.text_splitter import RecursiveCharacterTextSplitter +import time + import utils.config_log as config_log import weaviate +from langchain.text_splitter import RecursiveCharacterTextSplitter config, logger, CONFIG_PATH = config_log.setup_config_and_logging() config.read(CONFIG_PATH) @@ -13,6 +14,7 @@ # Token limit for OpenAI model TOKEN_LIMIT = 8192 + class WeaviateManager: def __init__(self, classnm): self.url = wea_url @@ -28,7 +30,11 @@ def check_class_exist(self): 'class': self.classnm, 'properties': [ {'name': 'pid', 'dataType': ['text']}, - {'name': 'content', 'dataType': ['text'], "tokenization": "gse"}, # `gse` implements the "Jieba" algorithm, which is a popular Chinese text segmentation algorithm. + { + 'name': 'content', + 'dataType': ['text'], + 'tokenization': 'gse', + }, # `gse` implements the "Jieba" algorithm, which is a popular Chinese text segmentation algorithm. ], 'vectorizer': 'text2vec-openai', 'moduleConfig': { @@ -51,19 +57,19 @@ def insert_data(self, pid, content): error_msg = str(e) # 檢查是否是因為 token 長度過長 if 'maximum context length' in error_msg: - print(f"Content too long for pid: {pid}. Splitting content.") - return "TOO_LONG" # 特殊回傳值表達需要分割 + print(f'Content too long for pid: {pid}. Splitting content.') + return 'TOO_LONG' # 特殊回傳值表達需要分割 elif '429' in error_msg: print(f'Rate limit exceeded, retrying in 5 seconds... (Attempt {attempt + 1}/{max_retries})') time.sleep(5) else: - print(f"Unexpected Error for pid: {pid} - {error_msg}") + print(f'Unexpected Error for pid: {pid} - {error_msg}') return False except Exception as e: print(f'Error inserting data for pid: {pid}, category: {self.classnm} - {str(e)}') return False # 超過最大重試次數 - print(f"Failed to insert data for pid: {pid} after {max_retries} attempts.") + print(f'Failed to insert data for pid: {pid} after {max_retries} attempts.') return False def split_and_insert(self, pid, content, category): @@ -73,10 +79,10 @@ def split_and_insert(self, pid, content, category): # 逐段插入分割後的文本,保持相同的 pid 和 category for idx, part in enumerate(split_content): - print(f"Inserting split content part {idx + 1} for pid: {pid}") + print(f'Inserting split content part {idx + 1} for pid: {pid}') success = self.insert_data(pid, part) if not success: - failed_records.append({"pid": pid, "category": category}) + failed_records.append({'pid': pid, 'category': category}) if __name__ == '__main__': @@ -90,27 +96,27 @@ def split_and_insert(self, pid, content, category): pid = item['pid'] content = item['content'] - if category == "faq": - classnm = "faqdev" + if category == 'faq': + classnm = 'faqdev' content_str = json.dumps(content, ensure_ascii=False, indent=4) - elif category == "insurance": - classnm = "insurancedev" + elif category == 'insurance': + classnm = 'insurancedev' content_str = content - elif category == "finance": - classnm = "financedev" + elif category == 'finance': + classnm = 'financedev' content_str = json.dumps(content, ensure_ascii=False, indent=4) if isinstance(content, dict) else content else: - print("Unknown category, skipping item.") + print('Unknown category, skipping item.') continue manager = WeaviateManager(classnm) result = manager.insert_data(pid, content_str) # 如果內容過長需要切割 - if result == "TOO_LONG": + if result == 'TOO_LONG': manager.split_and_insert(pid, content_str, category) elif not result: # 如果失敗且非長度問題 - failed_records.append({"pid": pid, "category": category}) + failed_records.append({'pid': pid, 'category': category}) # 將失敗的資料寫入 JSON 檔案 if failed_records: @@ -118,4 +124,4 @@ def split_and_insert(self, pid, content, category): json.dump(failed_records, f, ensure_ascii=False, indent=4) print("Failed records have been written to 'failed_imports.json'") else: - print("All records imported successfully.") + print('All records imported successfully.') diff --git a/src/flask_app.py b/src/flask_app.py index 2e4b0f5..84058ea 100644 --- a/src/flask_app.py +++ b/src/flask_app.py @@ -5,6 +5,7 @@ from flask_limiter import Limiter from flask_limiter.util import get_remote_address from flask_restx import Api, Resource, fields + # from utils.weaviate_op import search_do from utils.weaviatexreranker import search_do from werkzeug.security import check_password_hash, generate_password_hash @@ -45,7 +46,7 @@ def verify_password(username, password): 'qid': fields.Integer(required=True, description='qid of the question'), 'source': fields.List(fields.Integer, required=True, description='source of the question'), 'query': fields.String(required=True, description='The message to the chatbot'), - 'category': fields.String(required=True, description='The category of the question') + 'category': fields.String(required=True, description='The category of the question'), }, ) @@ -77,7 +78,7 @@ def post(self): # "query": "匯款銀行及中間行所收取之相關費用由誰負擔?", # "category": "insurance" # }, - + alpha = 0.5 if not question: @@ -87,10 +88,7 @@ def post(self): else: try: response = search_do(question, category, source, alpha) - response = { - 'qid': qid, - 'retrieve': int(response) - } + response = {'qid': qid, 'retrieve': int(response)} response = jsonify(response) diff --git a/src/tools/automate.py b/src/tools/automate.py index b2b8a04..b61aa2b 100644 --- a/src/tools/automate.py +++ b/src/tools/automate.py @@ -1,51 +1,49 @@ -import requests import json import time # Import time module for timing +import requests + # Load questions from the JSON file -with open('data/questions_example.json', 'r', encoding='utf-8') as file: +with open('data/questions_example.json', encoding='utf-8') as file: questions = json.load(file)['questions'] -output_data = {"answers": []} # Initialize output format with "answers" array +output_data = {'answers': []} # Initialize output format with "answers" array -url = "http://127.0.0.1:5000/api/chat" +url = 'http://127.0.0.1:5000/api/chat' total_start_time = time.time() # Start timing for the entire process for question in questions: question_start_time = time.time() # Start timing for each question - + # Send POST request response = requests.post(url, json=question) if response.status_code == 200: response_json = response.json() - + # Extract qid and retrieve from the API response - qid = question.get("qid") # Assuming each question has a unique "qid" field - retrieve = response_json.get("retrieve") - + qid = question.get('qid') # Assuming each question has a unique "qid" field + retrieve = response_json.get('retrieve') + # Append formatted result to the answers array - output_data["answers"].append({ - "qid": qid, - "retrieve": retrieve - }) - print("成功取得 JSON:", response_json) + output_data['answers'].append({'qid': qid, 'retrieve': retrieve}) + print('成功取得 JSON:', response_json) else: - print("請求失敗,狀態碼:", response.status_code) - + print('請求失敗,狀態碼:', response.status_code) + # Calculate and print time for each question question_end_time = time.time() question_duration = question_end_time - question_start_time - print(f"QID: {qid} - 花費時間: {question_duration:.2f} 秒") + print(f'QID: {qid} - 花費時間: {question_duration:.2f} 秒') # Calculate and print total time total_end_time = time.time() total_duration = total_end_time - total_start_time -print(f"全部題目處理完成,總共花費時間: {total_duration:.2f} 秒") +print(f'全部題目處理完成,總共花費時間: {total_duration:.2f} 秒') # Save the output data to a new JSON file with open('data/pred_retrieve.json', 'w', encoding='utf-8') as output_file: json.dump(output_data, output_file, ensure_ascii=False, indent=4) -print("合併輸出已保存到 pred_retrieve.json 文件中。") +print('合併輸出已保存到 pred_retrieve.json 文件中。') diff --git a/src/tools/checkans.py b/src/tools/checkans.py index 5eeb05a..547700e 100644 --- a/src/tools/checkans.py +++ b/src/tools/checkans.py @@ -2,39 +2,39 @@ from collections import defaultdict # Load ground truth data -with open('data/ground_truths_example.json', 'r') as f: - ground_truths = json.load(f)["ground_truths"] +with open('data/ground_truths_example.json') as f: + ground_truths = json.load(f)['ground_truths'] # Load predicted data with the new format -with open('data/pred_retrieve.json', 'r') as f: - pred_retrieves = json.load(f)["answers"] +with open('data/pred_retrieve.json') as f: + pred_retrieves = json.load(f)['answers'] # Create a dictionary from predictions for easy lookup -pred_dict = {item["qid"]: item["retrieve"] for item in pred_retrieves} +pred_dict = {item['qid']: item['retrieve'] for item in pred_retrieves} # Initialize counters and data structures incorrect_qids = [] correct_count = 0 -category_counts = defaultdict(lambda: {"correct": 0, "total": 0}) +category_counts = defaultdict(lambda: {'correct': 0, 'total': 0}) # Compare predictions to ground truth for ground in ground_truths: - qid = ground["qid"] - category = ground["category"] - correct_retrieve = ground["retrieve"] + qid = ground['qid'] + category = ground['category'] + correct_retrieve = ground['retrieve'] predicted_retrieve = pred_dict.get(qid) if predicted_retrieve == correct_retrieve: correct_count += 1 - category_counts[category]["correct"] += 1 + category_counts[category]['correct'] += 1 else: incorrect_qids.append(qid) - category_counts[category]["total"] += 1 + category_counts[category]['total'] += 1 # Print results -print("錯誤的題目 QID:", incorrect_qids) -print(f"總正確題數: {correct_count} / {len(ground_truths)}") +print('錯誤的題目 QID:', incorrect_qids) +print(f'總正確題數: {correct_count} / {len(ground_truths)}') for category, counts in category_counts.items(): print(f"類別 {category}: {counts['correct']} / {counts['total']}") diff --git a/src/tools/get_best_alpha.py b/src/tools/get_best_alpha.py index e6b7527..b2dec99 100644 --- a/src/tools/get_best_alpha.py +++ b/src/tools/get_best_alpha.py @@ -1,14 +1,15 @@ -import requests import json from collections import defaultdict +import requests + # Load questions from the JSON file -with open('data/questions_example.json', 'r', encoding='utf-8') as file: +with open('data/questions_example.json', encoding='utf-8') as file: questions = json.load(file)['questions'] # Load ground truth data -with open('data/ground_truths_example.json', 'r', encoding='utf-8') as f: - ground_truths = json.load(f)["ground_truths"] +with open('data/ground_truths_example.json', encoding='utf-8') as f: + ground_truths = json.load(f)['ground_truths'] # Dictionary to hold the best alpha and accuracy best_alpha = 0.0 @@ -16,60 +17,57 @@ # Loop through alpha values from 0.0 to 1.0 for alpha in [round(x * 0.1, 1) for x in range(11)]: - output_data = {"answers": []} # Reset output format with "answers" array + output_data = {'answers': []} # Reset output format with "answers" array - url = "http://127.0.0.1:5000/api/chat" + url = 'http://127.0.0.1:5000/api/chat' # Send each question to the API with the current alpha for question in questions: # Add the alpha key to the question payload - question_with_alpha = {**question, "alpha": alpha} + question_with_alpha = {**question, 'alpha': alpha} # Send POST request response = requests.post(url, json=question_with_alpha) if response.status_code == 200: response_json = response.json() - qid = question.get("qid") - retrieve = response_json.get("retrieve") + qid = question.get('qid') + retrieve = response_json.get('retrieve') # Append formatted result to the answers array - output_data["answers"].append({ - "qid": qid, - "retrieve": retrieve - }) + output_data['answers'].append({'qid': qid, 'retrieve': retrieve}) else: - print(f"請求失敗,狀態碼: {response.status_code},Alpha 值: {alpha}") - + print(f'請求失敗,狀態碼: {response.status_code},Alpha 值: {alpha}') + # Save predictions for the current alpha pred_file = f'data/pred_retrieve_alpha_{alpha}.json' with open(pred_file, 'w', encoding='utf-8') as output_file: json.dump(output_data, output_file, ensure_ascii=False, indent=4) # Load predictions for comparison - pred_dict = {item["qid"]: item["retrieve"] for item in output_data["answers"]} + pred_dict = {item['qid']: item['retrieve'] for item in output_data['answers']} # Initialize counters and data structures for accuracy calculation correct_count = 0 - category_counts = defaultdict(lambda: {"correct": 0, "total": 0}) + category_counts = defaultdict(lambda: {'correct': 0, 'total': 0}) # Compare predictions to ground truth for ground in ground_truths: - qid = ground["qid"] - category = ground["category"] - correct_retrieve = ground["retrieve"] + qid = ground['qid'] + category = ground['category'] + correct_retrieve = ground['retrieve'] predicted_retrieve = pred_dict.get(qid) if predicted_retrieve == correct_retrieve: correct_count += 1 - category_counts[category]["correct"] += 1 + category_counts[category]['correct'] += 1 - category_counts[category]["total"] += 1 + category_counts[category]['total'] += 1 # Calculate accuracy for the current alpha accuracy = correct_count / len(ground_truths) - print("Corrrect count: ", correct_count) - print(f"Alpha: {alpha}, 正確率: {accuracy:.2%}") + print('Corrrect count: ', correct_count) + print(f'Alpha: {alpha}, 正確率: {accuracy:.2%}') # Track the best alpha and accuracy if accuracy > best_accuracy: @@ -77,4 +75,4 @@ best_accuracy = accuracy # Output the best alpha and accuracy -print(f"最佳 Alpha 值: {best_alpha}, 準確率: {best_accuracy:.2%}") +print(f'最佳 Alpha 值: {best_alpha}, 準確率: {best_accuracy:.2%}') diff --git a/src/utils/ckip.py b/src/utils/ckip.py index df2f0b8..23d192e 100644 --- a/src/utils/ckip.py +++ b/src/utils/ckip.py @@ -1,17 +1,16 @@ -from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger +from ckip_transformers.nlp import CkipPosTagger, CkipWordSegmenter - -ws_driver = CkipWordSegmenter(model="albert-base") -pos_driver = CkipPosTagger(model="albert-base") +ws_driver = CkipWordSegmenter(model='albert-base') +pos_driver = CkipPosTagger(model='albert-base') def clean(sentence_ws, sentence_pos): short_sentence = [] - stop_pos = set(["Nep", "Nh", "Nb"]) + stop_pos = set(['Nep', 'Nh', 'Nb']) for word_ws, word_pos in zip(sentence_ws, sentence_pos): - is_N_or_V = word_pos.startswith("V") or word_pos.startswith("N") + is_n_or_v = word_pos.startswith('V') or word_pos.startswith('N') is_not_stop_pos = word_pos not in stop_pos is_not_one_charactor = not (len(word_ws) == 1) - if is_N_or_V and is_not_stop_pos and is_not_one_charactor: - short_sentence.append(f"{word_ws}") - return " ".join(short_sentence) + if is_n_or_v and is_not_stop_pos and is_not_one_charactor: + short_sentence.append(f'{word_ws}') + return ' '.join(short_sentence) diff --git a/src/utils/weaviatexreranker.py b/src/utils/weaviatexreranker.py index f600343..4400670 100644 --- a/src/utils/weaviatexreranker.py +++ b/src/utils/weaviatexreranker.py @@ -1,8 +1,10 @@ import os + +import voyageai import weaviate from langchain.embeddings import OpenAIEmbeddings + import utils.config_log as config_log -import voyageai # 載入設定檔案和日誌設定 config, logger, CONFIG_PATH = config_log.setup_config_and_logging() @@ -16,6 +18,7 @@ # 設定 OpenAI API 金鑰 os.environ['OPENAI_API_KEY'] = config.get('OpenAI', 'api_key') + class WeaviateSemanticSearch: def __init__(self, classnm): self.url = wea_url @@ -27,9 +30,7 @@ def hybrid_search(self, query, source, num, alpha): query_vector = self.embeddings.embed_query(query) vector_str = ','.join(map(str, query_vector)) - where_conditions = ' '.join([ - f'{{path: ["pid"], operator: Equal, valueText: "{pid}"}}' for pid in source - ]) + where_conditions = ' '.join([f'{{path: ["pid"], operator: Equal, valueText: "{pid}"}}' for pid in source]) gql_query = f""" {{ @@ -63,7 +64,7 @@ def hybrid_search(self, query, source, num, alpha): def rerank_with_voyage(query, documents, pids, api_key): vo = voyageai.Client(api_key=api_key) - reranking = vo.rerank(query, documents, model="rerank-2", top_k=1) + reranking = vo.rerank(query, documents, model='rerank-2', top_k=1) top_result = reranking.results[0] # 根據內容找到相對應的 pid @@ -72,17 +73,17 @@ def rerank_with_voyage(query, documents, pids, api_key): def search_do(question, category, source, alpha): - if category == "finance": - vdb_named = "Financedev" - elif category == "insurance": - vdb_named = "Insurancedev" + if category == 'finance': + vdb_named = 'Financedev' + elif category == 'insurance': + vdb_named = 'Insurancedev' else: - vdb_named = "Faqdev" + vdb_named = 'Faqdev' searcher = WeaviateSemanticSearch(vdb_named) # 從 Weaviate 取得前 100 筆結果 top_100_results = searcher.hybrid_search(question, source, 100, alpha=alpha) - + # 準備文件和 pid 列表供 rerank 使用 documents = [result['content'] for result in top_100_results] pids = [result['pid'] for result in top_100_results] @@ -90,8 +91,8 @@ def search_do(question, category, source, alpha): # 使用 VoyageAI 重新排序,並取得排名最高的 pid top_reranked_result = rerank_with_voyage(question, documents, pids, voyage_api_key) - print("最相關文件的 PID:") + print('最相關文件的 PID:') print(f"PID: {top_reranked_result['pid']}") print(f"相關性分數: {top_reranked_result['relevance_score']}") - + return top_reranked_result['pid']