refactor: for aicup review structure
1 parent b44ae8d · commit db84d4c
Showing 32 changed files with 188 additions and 201 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file.
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -0,0 +1,117 @@
import json
import os

from pdfminer.high_level import extract_text
from pdfminer.pdfpage import PDFPage

# File paths
FAQ_FILEPATH = 'reference/faq/pid_map_content.json'
FINANCE_FOLDER_PATH = 'reference/finance'
INSURANCE_FOLDER_PATH = 'reference/insurance'
OCR_FOLDER_PATH = 'dataset/output_text/競賽資料集/reference'


def check_text(file_path, category):
    """Process the FAQ file and return a list of formatted entries."""
    formatted_data = []
    with open(file_path, encoding='utf-8') as faq_file:
        loaded_faq = json.load(faq_file)

    for qid, questions in loaded_faq.items():
        for question_item in questions:
            formatted_entry = {
                'category': category,
                'qid': qid,
                'content': {'question': question_item['question'], 'answers': question_item['answers']},
            }
            formatted_data.append(formatted_entry)
            print(formatted_entry)
    return formatted_data


def check_pdf_with_table(folder_path, category):
    """Process the PDF files in a folder; return formatted entries and a list of files that still need OCR."""
    formatted_data = []
    need_to_ocr = []

    file_list = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    sorted_file_list = sorted(file_list, key=lambda x: int(os.path.splitext(x)[0]))

    for filename in sorted_file_list:
        filename_without_ext, _ = os.path.splitext(filename)
        file_path = os.path.join(folder_path, filename)
        all_text = ''

        try:
            # Get the total number of pages in the PDF
            with open(file_path, 'rb') as f:
                total_pages = len(list(PDFPage.get_pages(f)))
        except Exception as e:
            print(f'Could not get the page count of {file_path}: {e}')
            need_to_ocr.append([category, filename, 'all pages'])
            continue

        for page_number in range(total_pages):
            ocr_file_path = os.path.join(
                OCR_FOLDER_PATH, category, f'{filename_without_ext}.pdf_page_{page_number + 1}.txt'
            )

            try:
                # Prefer the OCR-generated text file for this page
                with open(ocr_file_path, encoding='utf-8') as ocr_file:
                    content = ocr_file.read()
            except FileNotFoundError:
                # No OCR file yet: extract this page's text from the PDF
                try:
                    content = extract_text(file_path, page_numbers=[page_number]) or ''
                except Exception as e:
                    print(f'Error extracting page {page_number + 1} of {file_path}: {e}')
                    content = ''

            if content:
                # signal_character_lines = sum(
                #     1 for line in content.split("\n") if len(line.strip()) == 1
                # )
                all_text += content + '\n\n'

                # if signal_character_lines >= 40:
                #     need_to_ocr.append([category, filename, f"page{page_number + 1}"])
            else:
                need_to_ocr.append([category, filename, f'page{page_number + 1}'])

        formatted_entry = {'category': category, 'qid': filename_without_ext, 'content': all_text.strip()}
        formatted_data.append(formatted_entry)
        print(formatted_entry)

    return formatted_data, need_to_ocr


if __name__ == '__main__':
    # Aggregated formatted data and list of files that need OCR
    total_formatted_data = []
    total_need_to_ocr = []

    # Process the FAQ file
    faq_data = check_text(FAQ_FILEPATH, 'faq')
    total_formatted_data.extend(faq_data)

    # Process the finance PDFs
    finance_data, finance_need_to_ocr = check_pdf_with_table(FINANCE_FOLDER_PATH, 'finance')
    total_formatted_data.extend(finance_data)
    total_need_to_ocr.extend(finance_need_to_ocr)

    # Process the insurance PDFs
    insurance_data, insurance_need_to_ocr = check_pdf_with_table(INSURANCE_FOLDER_PATH, 'insurance')
    total_formatted_data.extend(insurance_data)
    total_need_to_ocr.extend(insurance_need_to_ocr)

    # Write the formatted data to formatted_reference_ocr_pdfminer.json
    with open('dataset/formatted_reference_ocr_pdfminer.json', 'w', encoding='utf-8') as formatted_file:
        json.dump(total_formatted_data, formatted_file, ensure_ascii=False, indent=4)

    # Write the list of files that still need OCR to need_to_ocr_again_pdfminer.txt
    with open('dataset/need_to_ocr_again_pdfminer.txt', 'w', encoding='utf-8') as ocr_file:
        json.dump(total_need_to_ocr, ocr_file, ensure_ascii=False, indent=4)

    print(f'Number of files that need OCR: {len(total_need_to_ocr)}')
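The script above writes two outputs: dataset/formatted_reference_ocr_pdfminer.json, a JSON list of {category, qid, content} entries, and dataset/need_to_ocr_again_pdfminer.txt, a JSON list of [category, filename, page] markers. A minimal sketch for sanity-checking those outputs after a run, assuming it is executed from the repository root:

import json
from collections import Counter

with open('dataset/formatted_reference_ocr_pdfminer.json', encoding='utf-8') as f:
    entries = json.load(f)
with open('dataset/need_to_ocr_again_pdfminer.txt', encoding='utf-8') as f:
    still_need_ocr = json.load(f)

# How many entries were produced per category, and how many came out empty
print(Counter(entry['category'] for entry in entries))
print('empty documents:', sum(1 for entry in entries if not entry['content']))
print('pages/files still needing OCR:', len(still_need_ocr))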
File renamed without changes.
@@ -0,0 +1,69 @@
import os
import zipfile

import pytesseract
from pdf2image import convert_from_bytes

# Configure the Tesseract path if necessary
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


def ocr_cond_in_floder(zip_path, folder):
    folder_path = f'{folder}/'
    file_ratios = []

    with zipfile.ZipFile(zip_path, 'r') as zipf:
        # Select the files whose content can only be captured with OCR,
        # based on their compression ratio and original size
        for zip_info in zipf.infolist():
            if zip_info.filename.startswith(folder_path) and not zip_info.is_dir():
                original_size = zip_info.file_size  # Uncompressed size
                compressed_size = zip_info.compress_size  # Compressed size

                # Avoid division by zero for empty files
                if original_size > 0:
                    compression_ratio = 1 - (compressed_size / original_size)
                else:
                    compression_ratio = float('inf')  # Assign an infinite ratio to empty files

                # Since we decided to OCR all the files, the filter below is commented out
                # if (compression_ratio >= 0.1)
                #         or (compression_ratio >= 0.05
                #             and (original_size > 750*1024 or original_size < 170*1024))
                #         or (original_size > 1500*1024) or (original_size < 120*1024):
                file_ratios.append((zip_info.filename, compression_ratio))

        # Sort by compression ratio in ascending order
        file_ratios.sort(key=lambda x: x[1], reverse=False)

        for file_name, _ in file_ratios:
            with zipf.open(file_name) as pdf_file:
                pdf_bytes = pdf_file.read()

            # Specify the path to the Poppler binaries if needed
            poppler_path = r'C:\Program Files\poppler-24.08.0\Library\bin'

            # Convert the PDF bytes to images
            images = convert_from_bytes(pdf_bytes, dpi=300, poppler_path=poppler_path)

            # Create the output directory
            output_dir = 'output_text'
            os.makedirs(output_dir, exist_ok=True)

            # Perform OCR on each page and save the text
            for i, image in enumerate(images):
                text = pytesseract.image_to_string(image, lang='chi_tra')  # Specify the language for OCR
                # Create the subdirectory structure within 'output_text'
                output_file_path = os.path.join(output_dir, f'{file_name}_page_{i + 1}.txt')
                os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
                with open(output_file_path, 'w', encoding='utf-8') as f:
                    f.write(text)
            print(f'OCR completed for {file_name}')

    # return file_ratios


# Usage
zip_path = 'datazip.zip'
ocr_cond_in_floder(zip_path, '競賽資料集/reference/finance')
ocr_cond_in_floder(zip_path, '競賽資料集/reference/insurance')
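Because the OCR step depends on the chi_tra (Traditional Chinese) traineddata being installed for Tesseract, a short pre-flight check can save a wasted run. This is only a sketch, and it assumes a pytesseract version recent enough to provide get_languages:

import pytesseract

# Same Tesseract path as configured in the script above
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# List the language packs Tesseract can see and confirm Traditional Chinese is available
available = pytesseract.get_languages(config='')
print(available)
if 'chi_tra' not in available:
    raise SystemExit('chi_tra traineddata is not installed; OCR with lang="chi_tra" will fail')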
File renamed without changes.
This file was deleted.
Empty file.
This file was deleted.
This file was deleted.
Binary file not shown.
File renamed without changes.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.