refactor: for aicup review structure
1 parent b44ae8d · commit db84d4c
Showing 32 changed files with 188 additions and 201 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file.
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -0,0 +1,117 @@
import json
import os

from pdfminer.high_level import extract_text
from pdfminer.pdfpage import PDFPage

# File paths
FAQ_FILEPATH = 'reference/faq/pid_map_content.json'
FINANCE_FOLDER_PATH = 'reference/finance'
INSURANCE_FOLDER_PATH = 'reference/insurance'
OCR_FOLDER_PATH = 'dataset/output_text/競賽資料集/reference'


def check_text(file_path, category):
    """Process the FAQ file and return a list of formatted entries."""
    formatted_data = []
    with open(file_path, encoding='utf-8') as faq_file:
        loaded_faq = json.load(faq_file)

    for qid, questions in loaded_faq.items():
        for question_item in questions:
            formatted_entry = {
                'category': category,
                'qid': qid,
                'content': {'question': question_item['question'], 'answers': question_item['answers']},
            }
            formatted_data.append(formatted_entry)
            print(formatted_entry)
    return formatted_data


def check_pdf_with_table(folder_path, category):
    """Process the PDF files in a folder; return formatted entries and a list of files that still need OCR."""
    formatted_data = []
    need_to_ocr = []

    file_list = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    sorted_file_list = sorted(file_list, key=lambda x: int(os.path.splitext(x)[0]))

    for filename in sorted_file_list:
        filename_without_ext, _ = os.path.splitext(filename)
        file_path = os.path.join(folder_path, filename)
        all_text = ''

        try:
            # Get the total number of pages in the PDF
            with open(file_path, 'rb') as f:
                total_pages = len(list(PDFPage.get_pages(f)))
        except Exception as e:
            print(f'Could not get the page count of {file_path}: {e}')
            need_to_ocr.append([category, filename, 'all pages'])
            continue

        for page_number in range(total_pages):
            ocr_file_path = os.path.join(
                OCR_FOLDER_PATH, category, f'{filename_without_ext}.pdf_page_{page_number + 1}.txt'
            )

            try:
                # Prefer the OCR-generated text file for this page
                with open(ocr_file_path, encoding='utf-8') as ocr_file:
                    content = ocr_file.read()
            except FileNotFoundError:
                # No OCR file yet: extract this page's text from the PDF
                try:
                    content = extract_text(file_path, page_numbers=[page_number]) or ''
                except Exception as e:
                    print(f'Error extracting page {page_number + 1} of {file_path}: {e}')
                    content = ''

            if content:
                # signal_character_lines = sum(
                #     1 for line in content.split("\n") if len(line.strip()) == 1
                # )
                all_text += content + '\n\n'

                # if signal_character_lines >= 40:
                #     need_to_ocr.append([category, filename, f"page{page_number + 1}"])
            else:
                need_to_ocr.append([category, filename, f'page{page_number + 1}'])

        formatted_entry = {'category': category, 'qid': filename_without_ext, 'content': all_text.strip()}
        formatted_data.append(formatted_entry)
        print(formatted_entry)

    return formatted_data, need_to_ocr


if __name__ == '__main__':
    # Aggregated formatted data and list of files that need OCR
    total_formatted_data = []
    total_need_to_ocr = []

    # Process the FAQ file
    faq_data = check_text(FAQ_FILEPATH, 'faq')
    total_formatted_data.extend(faq_data)

    # Process the finance PDFs
    finance_data, finance_need_to_ocr = check_pdf_with_table(FINANCE_FOLDER_PATH, 'finance')
    total_formatted_data.extend(finance_data)
    total_need_to_ocr.extend(finance_need_to_ocr)

    # Process the insurance PDFs
    insurance_data, insurance_need_to_ocr = check_pdf_with_table(INSURANCE_FOLDER_PATH, 'insurance')
    total_formatted_data.extend(insurance_data)
    total_need_to_ocr.extend(insurance_need_to_ocr)

    # Write the formatted data to formatted_reference_ocr_pdfminer.json
    with open('dataset/formatted_reference_ocr_pdfminer.json', 'w', encoding='utf-8') as formatted_file:
        json.dump(total_formatted_data, formatted_file, ensure_ascii=False, indent=4)

    # Write the list of files that still need OCR to need_to_ocr_again_pdfminer.txt
    with open('dataset/need_to_ocr_again_pdfminer.txt', 'w', encoding='utf-8') as ocr_file:
        json.dump(total_need_to_ocr, ocr_file, ensure_ascii=False, indent=4)

    print(f'Number of files that need OCR: {len(total_need_to_ocr)}')
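The script above writes two outputs: dataset/formatted_reference_ocr_pdfminer.json, a JSON list of {category, qid, content} entries, and dataset/need_to_ocr_again_pdfminer.txt, a JSON list of [category, filename, page] markers. A minimal sketch for sanity-checking those outputs after a run, assuming it is executed from the repository root:

import json
from collections import Counter

with open('dataset/formatted_reference_ocr_pdfminer.json', encoding='utf-8') as f:
    entries = json.load(f)
with open('dataset/need_to_ocr_again_pdfminer.txt', encoding='utf-8') as f:
    still_need_ocr = json.load(f)

# How many entries were produced per category, and how many came out empty
print(Counter(entry['category'] for entry in entries))
print('empty documents:', sum(1 for entry in entries if not entry['content']))
print('pages/files still needing OCR:', len(still_need_ocr))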
File renamed without changes.
@@ -0,0 +1,69 @@
import os
import zipfile

import pytesseract
from pdf2image import convert_from_bytes

# Configure the Tesseract path if necessary
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


def ocr_cond_in_floder(zip_path, folder):
    folder_path = f'{folder}/'
    file_ratios = []

    with zipfile.ZipFile(zip_path, 'r') as zipf:
        # Select the files whose content can only be captured with OCR,
        # based on their compression ratio and original size
        for zip_info in zipf.infolist():
            if zip_info.filename.startswith(folder_path) and not zip_info.is_dir():
                original_size = zip_info.file_size  # Uncompressed size
                compressed_size = zip_info.compress_size  # Compressed size

                # Avoid division by zero for empty files
                if original_size > 0:
                    compression_ratio = 1 - (compressed_size / original_size)
                else:
                    compression_ratio = float('inf')  # Assign an infinite ratio to empty files

                # Since we decided to OCR all the files, the filter below is commented out
                # if (compression_ratio >= 0.1)
                #         or (compression_ratio >= 0.05
                #             and (original_size > 750*1024 or original_size < 170*1024))
                #         or (original_size > 1500*1024) or (original_size < 120*1024):
                file_ratios.append((zip_info.filename, compression_ratio))

        # Sort by compression ratio in ascending order
        file_ratios.sort(key=lambda x: x[1], reverse=False)

        for file_name, _ in file_ratios:
            with zipf.open(file_name) as pdf_file:
                pdf_bytes = pdf_file.read()

            # Specify the path to the Poppler binaries if needed
            poppler_path = r'C:\Program Files\poppler-24.08.0\Library\bin'

            # Convert the PDF bytes to images
            images = convert_from_bytes(pdf_bytes, dpi=300, poppler_path=poppler_path)

            # Create the output directory
            output_dir = 'output_text'
            os.makedirs(output_dir, exist_ok=True)

            # Perform OCR on each page and save the text
            for i, image in enumerate(images):
                text = pytesseract.image_to_string(image, lang='chi_tra')  # Specify the language for OCR
                # Create the subdirectory structure within 'output_text'
                output_file_path = os.path.join(output_dir, f'{file_name}_page_{i + 1}.txt')
                os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
                with open(output_file_path, 'w', encoding='utf-8') as f:
                    f.write(text)
            print(f'OCR completed for {file_name}')

    # return file_ratios


# Usage
zip_path = 'datazip.zip'
ocr_cond_in_floder(zip_path, '競賽資料集/reference/finance')
ocr_cond_in_floder(zip_path, '競賽資料集/reference/insurance')
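Because the OCR step depends on the chi_tra (Traditional Chinese) traineddata being installed for Tesseract, a short pre-flight check can save a wasted run. This is only a sketch, and it assumes a pytesseract version recent enough to provide get_languages:

import pytesseract

# Same Tesseract path as configured in the script above
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# List the language packs Tesseract can see and confirm Traditional Chinese is available
available = pytesseract.get_languages(config='')
print(available)
if 'chi_tra' not in available:
    raise SystemExit('chi_tra traineddata is not installed; OCR with lang="chi_tra" will fail')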
File renamed without changes.
This file was deleted.
Empty file.
This file was deleted.
This file was deleted.
Binary file not shown.
File renamed without changes.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.