Skip to content

Commit

Permalink
refactor: for aicup review structure
Browse files Browse the repository at this point in the history
  • Loading branch information
JustinHsu1019 committed Nov 13, 2024
1 parent b44ae8d commit db84d4c
Show file tree
Hide file tree
Showing 32 changed files with 188 additions and 201 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file added Model/utils/README.md
Empty file.
Empty file added Model/utils/__init__.py
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
117 changes: 117 additions & 0 deletions Preprocess/data_process/merge_with_ocr_pdfminer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import json
import os

from pdfminer.high_level import extract_text
from pdfminer.pdfpage import PDFPage

# Input/output locations, relative to the working directory.
FAQ_FILEPATH = 'reference/faq/pid_map_content.json'  # FAQ id -> question list mapping
FINANCE_FOLDER_PATH = 'reference/finance'  # finance PDFs
INSURANCE_FOLDER_PATH = 'reference/insurance'  # insurance PDFs
OCR_FOLDER_PATH = 'dataset/output_text/競賽資料集/reference'  # pre-generated per-page OCR text files


def check_text(file_path, category):
    """Load a FAQ JSON mapping and return its entries as formatted records.

    Each record is {'category', 'qid', 'content': {'question', 'answers'}}.
    Every record is also printed as it is produced.
    """
    with open(file_path, encoding='utf-8') as faq_file:
        faq_mapping = json.load(faq_file)

    records = []
    for question_id, question_items in faq_mapping.items():
        for item in question_items:
            record = {
                'category': category,
                'qid': question_id,
                'content': {'question': item['question'], 'answers': item['answers']},
            }
            records.append(record)
            print(record)
    return records


def check_pdf_with_table(folder_path, category):
    """Extract text from every PDF in *folder_path*.

    For each page, a pre-generated OCR text file under OCR_FOLDER_PATH is
    preferred; when none exists, the page text is extracted with pdfminer.
    Pages that yield no text at all are flagged for another OCR pass.

    Returns:
        tuple: (formatted_data, need_to_ocr) where formatted_data is a list of
        {'category', 'qid', 'content'} dicts and need_to_ocr is a list of
        [category, filename, page-or-'all pages'] entries.
    """
    formatted_data = []
    need_to_ocr = []

    # Process PDFs in numeric filename order (files are named like "1.pdf").
    file_list = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    sorted_file_list = sorted(file_list, key=lambda x: int(os.path.splitext(x)[0]))

    for filename in sorted_file_list:
        filename_without_ext, _ = os.path.splitext(filename)
        file_path = os.path.join(folder_path, filename)
        all_text = ''

        try:
            # Count pages so OCR lookup / extraction can be done per page.
            with open(file_path, 'rb') as f:
                total_pages = len(list(PDFPage.get_pages(f)))
        except Exception as e:
            print(f'无法获取文件 {file_path} 的页数:{e}')
            need_to_ocr.append([category, filename, 'all pages'])
            continue

        for page_number in range(total_pages):
            ocr_file_path = os.path.join(
                OCR_FOLDER_PATH, category, f'{filename_without_ext}.pdf_page_{page_number + 1}.txt'
            )

            try:
                # Prefer the pre-generated OCR text for this page.
                with open(ocr_file_path, encoding='utf-8') as ocr_file:
                    content = ocr_file.read()
            except FileNotFoundError:
                # No OCR file: extract this page's text from the PDF itself.
                try:
                    content = extract_text(file_path, page_numbers=[page_number]) or ''
                except Exception as e:
                    print(f'提取文件 {file_path}{page_number + 1} 页时出错:{e}')
                    content = ''

            if content:
                all_text += content + '\n\n'
            else:
                # No text obtained for this page: flag it for another OCR pass.
                need_to_ocr.append([category, filename, f'page{page_number + 1}'])

        formatted_entry = {'category': category, 'qid': filename_without_ext, 'content': all_text.strip()}
        formatted_data.append(formatted_entry)
        print(formatted_entry)

    return formatted_data, need_to_ocr


if __name__ == '__main__':
    # Accumulators across all categories: formatted entries and the list of
    # files/pages that still need an OCR pass.
    all_entries = []
    pending_ocr = []

    # FAQ entries come from a single JSON mapping.
    all_entries.extend(check_text(FAQ_FILEPATH, 'faq'))

    # Finance and insurance entries come from PDF folders.
    for folder, category in ((FINANCE_FOLDER_PATH, 'finance'), (INSURANCE_FOLDER_PATH, 'insurance')):
        entries, missing = check_pdf_with_table(folder, category)
        all_entries.extend(entries)
        pending_ocr.extend(missing)

    # Persist the merged, formatted reference corpus.
    with open('dataset/formatted_reference_ocr_pdfminer.json', 'w', encoding='utf-8') as formatted_file:
        json.dump(all_entries, formatted_file, ensure_ascii=False, indent=4)

    # Persist the list of files/pages that need OCR again.
    with open('dataset/need_to_ocr_again_pdfminer.txt', 'w', encoding='utf-8') as ocr_file:
        json.dump(pending_ocr, ocr_file, ensure_ascii=False, indent=4)

    print(f'需要 OCR 的文件数量: {len(pending_ocr)}')
File renamed without changes.
69 changes: 69 additions & 0 deletions Preprocess/data_process/read_pdf_ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import os
import zipfile

import pytesseract
from pdf2image import convert_from_bytes

# Configure Tesseract path if necessary
# NOTE(review): hard-coded Windows install location — adjust (or drop) on other platforms.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


def ocr_cond_in_floder(zip_path, folder):
    """OCR every PDF stored under *folder* inside the zip archive at *zip_path*.

    Each PDF's pages are rendered to images with pdf2image, OCRed with
    Tesseract (traditional Chinese), and each page's text is written to
    'output_text/<member name>_page_<n>.txt'. Returns None.
    """
    folder_path = f'{folder}/'
    file_ratios = []

    with zipfile.ZipFile(zip_path, 'r') as zipf:
        # Record each member's compression ratio and name. The ratio was
        # originally used to select files whose content can only be captured
        # with OCR; now every file under the folder is OCRed.
        for zip_info in zipf.infolist():
            if zip_info.filename.startswith(folder_path) and not zip_info.is_dir():
                original_size = zip_info.file_size  # uncompressed size
                compressed_size = zip_info.compress_size  # compressed size

                # Guard on the divisor (original_size); the previous check of
                # compressed_size still allowed a ZeroDivisionError for empty
                # members stored with a nonzero compressed size.
                if original_size > 0:
                    compression_ratio = 1 - (compressed_size / original_size)
                else:
                    compression_ratio = float('inf')  # empty files: treat as maximal ratio

                file_ratios.append((zip_info.filename, compression_ratio))

        # Sort by compression ratio in ascending order (reverse=False).
        file_ratios.sort(key=lambda x: x[1], reverse=False)

        # Loop-invariant setup, hoisted out of the per-file loop.
        # Path to the Poppler binaries, needed by pdf2image on Windows.
        poppler_path = r'C:\Program Files\poppler-24.08.0\Library\bin'
        output_dir = 'output_text'
        os.makedirs(output_dir, exist_ok=True)

        for file_name, _ in file_ratios:
            with zipf.open(file_name) as pdf_file:
                pdf_bytes = pdf_file.read()

            # Convert the PDF bytes to one image per page.
            images = convert_from_bytes(pdf_bytes, dpi=300, poppler_path=poppler_path)

            # Perform OCR on each page and save the text.
            for i, image in enumerate(images):
                text = pytesseract.image_to_string(image, lang='chi_tra')  # traditional Chinese
                output_file_path = os.path.join(output_dir, f'{file_name}_page_{i + 1}.txt')
                # Member names may contain subdirectories; mirror that layout.
                os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
                with open(output_file_path, 'w', encoding='utf-8') as f:
                    f.write(text)
            print(f'OCR completed for {file_name}')


# Entry point: OCR both reference folders from the competition data archive.
# Guarded so importing this module does not trigger the (slow) OCR run.
if __name__ == '__main__':
    zip_path = 'datazip.zip'
    ocr_cond_in_floder(zip_path, '競賽資料集/reference/finance')
    ocr_cond_in_floder(zip_path, '競賽資料集/reference/insurance')
File renamed without changes.
8 changes: 0 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,4 @@
# AI CUP 2024 玉山人工智慧公開挑戰賽-RAG與LLM在金融問答的應用
**High-Accuracy RAG Retriever Template**

## Rankings

- Overall Ranking: 38th out of 487 teams (~7.8%)
- Leaderboard: 38th out of 222

![AI Cup Result](img/aicup_result.png)

## Development Mode
To set up the development environment, follow these steps:
Expand Down
3 changes: 0 additions & 3 deletions data/README.md

This file was deleted.

Empty file added docker/README.md
Empty file.
24 changes: 0 additions & 24 deletions docker/dockerfile

This file was deleted.

5 changes: 0 additions & 5 deletions docker/entrypoint.sh

This file was deleted.

Binary file removed img/aicup_result.png
Binary file not shown.
File renamed without changes.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
Python==3.12.0
Flask==2.3.2
Flask_Cors==4.0.0
keyboard==0.13.5
Expand All @@ -13,4 +14,4 @@ flask_restx==1.3.0
python-dateutil
redis==5.0.8
flask-httpauth==4.8.0
voyageai
voyageai==0.3.1
1 change: 0 additions & 1 deletion src/batch/README.md

This file was deleted.

1 change: 0 additions & 1 deletion src/batch/time.txt

This file was deleted.

2 changes: 0 additions & 2 deletions src/data/read_pdf_ocr.py

This file was deleted.

25 changes: 0 additions & 25 deletions src/tools/monitor.py

This file was deleted.

53 changes: 0 additions & 53 deletions src/utils/ai/call_ai.py

This file was deleted.

23 changes: 0 additions & 23 deletions src/utils/ai/gemini_tem.py

This file was deleted.

39 changes: 0 additions & 39 deletions src/utils/ai/gpt_tem.py

This file was deleted.

Loading

0 comments on commit db84d4c

Please sign in to comment.