From a0391db2292659235a83dec3e078ce300ed6a21e Mon Sep 17 00:00:00 2001 From: "justin.hsu" Date: Thu, 14 Nov 2024 10:27:51 +0800 Subject: [PATCH] add preprocess --- Preprocess/data_process/README.md | 206 +++++++++++++++++- .../data_process/conbine_readpdf_result.py | 2 +- Preprocess/data_process/data_preprocess.py | 167 ++++++++++++++ .../data_process/merge_with_ocr_pdfminer.py | 117 ---------- Preprocess/data_process/read_pdf_ocr.py | 69 ------ README.md | 46 +++- requirements.txt | 2 + 7 files changed, 414 insertions(+), 195 deletions(-) create mode 100644 Preprocess/data_process/data_preprocess.py delete mode 100644 Preprocess/data_process/merge_with_ocr_pdfminer.py delete mode 100644 Preprocess/data_process/read_pdf_ocr.py diff --git a/Preprocess/data_process/README.md b/Preprocess/data_process/README.md index f065c58..bbdafb9 100644 --- a/Preprocess/data_process/README.md +++ b/Preprocess/data_process/README.md @@ -1,2 +1,204 @@ -# 此資料夾為資料預處理的程式碼 -OCR & PDF 文字直接讀取 +--- +title: 資料前處理使用指南 +此資料夾為資料預處理的程式碼 +- OCR & PDF 文字直接讀取 + +--- + + +# 資料前處理使用指南 + +## 簡介 + +此程式碼包含用於讀取與處理 Reference 檔案夾中 FAQ(JSON)文件和 Finance 與 Insurance(PDF)文本文件的 Python 程式碼。程式碼的主要功能包括: + +- 先從 ZIP 壓縮檔案中提取指定資料夾內的 PDF 文件,再將每一頁轉換為圖像,並使用 Tesseract 進行 OCR 識別以提取文本內容。將提取的文本內容保存為 `.txt` 文件,按類別分類儲存。 +- 再讀取 FAQ JSON 文件和 OCR 生成的文本文件,將所有資料格式化並合併為一個統一的 JSON 文件,便於後續的檢索與處理。 + +## 運行環境和套件 + +### Python 套件 + +- `pytesseract` +- `pdf2image` +- `zipfile`(標準函式庫) +- `json`(標準函式庫) +- `os`(標準函式庫) + +### 外部套件 + +- **Tesseract-OCR**:用於 OCR 識別。 + - 下載地址:[Tesseract OCR](https://github.com/tesseract-ocr/tesseract) + - 安裝路徑示例:`C:\Program Files\Tesseract-OCR\tesseract.exe` +- **Poppler**:用於 PDF 轉圖片。 + - 下載地址:[Poppler for Windows](http://blog.alivate.com.au/poppler-windows/) + - 安裝路徑示例:`C:\Program Files\poppler-24.08.0\Library\bin` + +## 安裝 + +### 1. 複製或下載專案 + +如果您尚未獲取專案代碼,請複製或下載到本地: + +```bash +git clone https://github.com/yourusername/your-repo-name.git +cd your-repo-name +``` + + +### 2. 
安裝外部套件 + +- **Tesseract-OCR**: + - 下載並安裝 Tesseract-OCR。 + - 安裝完成後,記下安裝路徑(如 `C:\Program Files\Tesseract-OCR\tesseract.exe`)。 + +- **Poppler**: + - 下載並安裝 Poppler。 + - 安裝完成後,記下 `poppler_path`(如 `C:\Program Files\poppler-24.08.0\Library\bin`)。 + +### 3. 安裝 Python 套件 + +安裝所需的 Python 套件: + +```bash +pip install pytesseract==0.3.13 +pip install pdf2image==1.17.0 +``` + +## 配置 + +在程式碼中配置 Tesseract 和 Poppler 的路徑: + +```python +# Configure Tesseract path if necessary (update this path as needed) +pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' + +# Specify the path to the Poppler binaries +poppler_path = r"C:\Program Files\poppler-24.08.0\Library\bin" +``` + +確保將上述路徑替換為本地實際安裝的路徑。 + +## 使用說明 + +### 1. 準備資料 + +確保您的 ZIP 文件包含以下資料夾和文件: + +- `競賽資料集/reference/faq/pid_map_content.json` +- `競賽資料集/reference/finance/*.pdf` +- `競賽資料集/reference/insurance/*.pdf` + +### 2. 運行 OCR 提取 + +運行以下命令進行 OCR 處理: + +```bash +python data_preprocess.py +``` + +程式碼將執行以下步驟: + +1. 從指定的 ZIP 文件中提取 Finance 和 Insurance 的 PDF 文件。 +2. 將每個 PDF 文件的每一頁轉換為圖像。 +3. 使用 Tesseract 進行 OCR 識別,提取文本內容。 +4. 將提取的文本保存為 `.txt` 文件,按類別儲存在 `dataset/output_text/finance/` 和 `dataset/output_text/insurance/` 目錄下。 + +### 3. 資料格式化 + +程式碼會繼續執行以下步驟: + +1. 讀取 FAQ 文件 `pid_map_content.json`,提取問題和答案。 +2. 讀取 OCR 生成的文本文件,按 PDF 文件和頁碼順序合併文本內容。 +3. 將所有資料格式化並合併為一個 JSON 文件 `dataset/formatted_reference_ocr.json`。 + +### 4. 查看輸出 + +- **OCR 輸出文本文件**: + - Finance 文本文件保存在 `dataset/output_text/finance/`。 + - Insurance 文本文件保存在 `dataset/output_text/insurance/`。 + +- **合併後的 JSON 文件**: + - `dataset/formatted_reference_ocr.json` 包含了所有格式化後的 FAQ、Finance 與 Insurance 資料。 + +## 文件結構 + +``` +project/ +├── dataset/ +│ ├── output_text/ +│ │ └── 競賽資料集/ +│ │ └── reference/ +│ │ ├── finance/ +│ │ │ ├── 0.pdf_page_1.txt +│ │ │ ├── 1.pdf_page_1.txt +│ │ │ ├── 1.pdf_page_2.txt +│ │ │ └── ... +│ │ └── insurance/ +│ │ ├── 1.pdf_page_1.txt +│ │ ├── 1.pdf_page_2.txt +│ │ └── ... 
+│ └── formatted_reference_ocr.json +├── datazip.zip/ +│ └── 競賽資料集/ +│ └── reference/ +│ ├── faq/ +│ │ └── pid_map_content.json +│ ├── finance/ +│ │ ├── 0.pdf +│ │ ├── 1.pdf +│ │ └── ... +│ └── insurance/ +│ ├── 1.pdf +│ ├── 2.pdf +│ └── ... +├── data_preprocess.py +└── README.md +``` + +## 範例輸出 + +生成的 `formatted_reference_ocr.json` 文件結構示例: + +```json +[ + { + "category": "faq", + "qid": "0", + "content": { + "question": "什麼是跨境手機掃碼支付?", + "answers": [ + "允許大陸消費者可以用手機支付寶App在台灣實體商店購買商品或服務" + ] + } + },// 其他 FAQ 資料條目... + { + "category": "finance", + "qid": "0", + "content": "註 1U ﹕ 本 雄 團 於 民 國 111] 年 第 1 季 投 賁 成 立 寶 元 智 造 公 司 , 由 本 集 圖 持\n有 100% 股 權 , 另 於 民 國 111 年 第 3 季 及 112 年 第 1 季 未 依 持 股 比..." + },// 其他 Finance 資料條目... + { + "category": "insurance", + "qid": "1", + "content": "延 期 間 內 發 生 第 十 六 條 或 第 十 七 條 本 公 司 應 負 係 險 貫 任 之 事 故 時 , 其 約 定 之 係 險 金 計 算 方 式 將 不 適 用 , 本 公\n..." + },// 其他 Insurance 資料條目... +] +``` + +## 注意事項 + +- **編碼**:確保所有文本文件均使用 UTF-8 編碼,以支持中文字符,避免出現亂碼。 +- **路徑配置**: + - 請根據您本地的安裝路徑,更新程式碼中的 `tesseract_cmd` 和 `poppler_path` 變數。 +- **文件命名**: + - OCR 文本文件必須遵循 `{文件名}.pdf_page_{頁碼}.txt` 的命名規則,以確保程式碼能夠正確讀取並合併各頁內容。 +- **套件安裝**: + - 確保已正確安裝並配置 Tesseract-OCR 和 Poppler,否則程式碼將無法正常運行。 + +## 許可證 + +本專案採用 [MIT 許可證](LICENSE)。您可以自由地使用、修改和分發本專案。 + +--- + +**感謝您的使用!** \ No newline at end of file diff --git a/Preprocess/data_process/conbine_readpdf_result.py b/Preprocess/data_process/conbine_readpdf_result.py index a021ced..7976b17 100644 --- a/Preprocess/data_process/conbine_readpdf_result.py +++ b/Preprocess/data_process/conbine_readpdf_result.py @@ -4,7 +4,7 @@ with open('data/aicup_noocr.json', encoding='utf-8') as file: noocr_data = json.load(file) -with open('data/aicup_ref.json', encoding='utf-8') as file: +with open('data/formatted_reference_ocr.json', encoding='utf-8') as file: ref_data = json.load(file) # 建立 ref_data 的 dictionary,並檢查 content 是否為字串,再去除空格 diff --git a/Preprocess/data_process/data_preprocess.py b/Preprocess/data_process/data_preprocess.py new 
file mode 100644 index 0000000..dc68cf0 --- /dev/null +++ b/Preprocess/data_process/data_preprocess.py @@ -0,0 +1,167 @@ +import zipfile +import pytesseract +from pdf2image import convert_from_bytes +import os +import json + +# Configure Tesseract path if necessary (update this path as needed) +pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' + +def ocr_in_folder(zip_path, folder, output_dir): + """ + Extracts PDF files from a ZIP archive, performs OCR, and saves the output text. + + Args: + zip_path (str): The path to the ZIP file containing the documents. + folder (str): The folder path inside the ZIP to search for PDF files. + output_dir (str): The directory to save the OCR output text files. + + Returns: + None + """ + folder_path = f"{folder}/" + + with zipfile.ZipFile(zip_path, 'r') as zipf: + for zip_info in zipf.infolist(): + if zip_info.filename.startswith(folder_path) and not zip_info.is_dir(): + with zipf.open(zip_info.filename) as pdf_file: + pdf_bytes = pdf_file.read() + + # Specify the path to the Poppler binaries if needed + poppler_path = r"C:\Program Files\poppler-24.08.0\Library\bin" + + # Convert the PDF bytes to images + images = convert_from_bytes(pdf_bytes, dpi=300, poppler_path=poppler_path) + + os.makedirs(output_dir, exist_ok=True) + + # Extract only the base filename (e.g., "file1.pdf" instead of the full path) + base_filename = os.path.basename(zip_info.filename) + + # Perform OCR on each page and save the text + for i, image in enumerate(images): + text = pytesseract.image_to_string(image, lang="chi_tra") + output_file_path = os.path.join(output_dir, f'{base_filename}_page_{i + 1}.txt') + os.makedirs(os.path.dirname(output_file_path), exist_ok=True) + with open(output_file_path, 'w', encoding='utf-8') as f: + f.write(text) + print(f"OCR completed for {base_filename}") + +# OCR extraction paths +zip_path = 'datazip.zip' +ocr_in_folder(zip_path, "競賽資料集/reference/insurance", 
'dataset/output_text/insurance') +ocr_in_folder(zip_path, "競賽資料集/reference/finance", 'dataset/output_text/finance') + +# FAQ and OCR JSON processing +import json +import os + +# File paths +FAQ_FILEPATH = 'datazip/競賽資料集/reference/faq/pid_map_content.json' +FINANCE_OCR_FOLDER_PATH = 'dataset/output_text/finance' +INSURANCE_OCR_FOLDER_PATH = 'dataset/output_text/insurance' + + +def check_text(file_path, category): + """ + Reads a JSON FAQ file, processes it, and returns formatted data. + + Args: + file_path (str): Path to the FAQ JSON file. + category (str): Category label for the FAQ data. + + Returns: + list: A list of dictionaries containing formatted FAQ data. + """ + formatted_data = [] + with open(file_path, "r", encoding="utf-8") as faq_file: + loaded_faq = json.load(faq_file) + + for qid, questions in loaded_faq.items(): + for question_item in questions: + formatted_entry = { + "category": category, + "qid": qid, + "content": { + "question": question_item["question"], + "answers": question_item["answers"] + } + } + formatted_data.append(formatted_entry) + print(formatted_entry) + return formatted_data + + +def read_ocr_files(ocr_folder_path, category): + """ + Reads text files generated from OCR, consolidates them, and returns formatted data. + + Args: + ocr_folder_path (str): Path to the folder containing OCR text files. + category (str): Category label for the OCR data. + + Returns: + list: A list of dictionaries containing consolidated OCR data. 
+ """ + formatted_data = [] + + # Capture the name of file + file_basenames = set() + for filename in os.listdir(ocr_folder_path): + if filename.endswith('.txt'): + basename = filename.split('.pdf_page_')[0] + file_basenames.add(basename) + + for basename in sorted(file_basenames, key=lambda x: int(x)): + all_text = "" + page_files = [] + + for filename in os.listdir(ocr_folder_path): + if filename.startswith(basename) and filename.endswith('.txt'): + page_files.append(filename) + + page_files = sorted(page_files, key=lambda x: int(x.split('.pdf_page_')[1].split('.txt')[0])) + + for page_file in page_files: + ocr_file_path = os.path.join(ocr_folder_path, page_file) + with open(ocr_file_path, "r", encoding="utf-8") as ocr_file: + content = ocr_file.read() + all_text += content + "\n\n" + + formatted_entry = { + "category": category, + "qid": basename, + "content": all_text.strip() + } + formatted_data.append(formatted_entry) + print(formatted_entry) + + return formatted_data + + +if __name__ == "__main__": + """ + Main entry point of the script. Processes FAQ, finance, and insurance OCR data, + consolidates them, and saves the result to a JSON file. 
+ """ + total_formatted_data = [] + + # handle faq + faq_data = check_text(FAQ_FILEPATH, "faq") + total_formatted_data.extend(faq_data) + + # read finance ocr + finance_data = read_ocr_files(FINANCE_OCR_FOLDER_PATH, "finance") + total_formatted_data.extend(finance_data) + + # read insurance ocr + insurance_data = read_ocr_files(INSURANCE_OCR_FOLDER_PATH, "insurance") + total_formatted_data.extend(insurance_data) + + # store the data after cleaning in formatted_reference_ocr.json + output_json_path = "data/formatted_reference_ocr.json" + # os.makedirs(os.path.dirname(output_json_path), exist_ok=True) + with open(output_json_path, "w", encoding="utf-8") as formatted_file: + json.dump(total_formatted_data, formatted_file, ensure_ascii=False, indent=4) + + print("The process is finished and the result is saved in dataset/formatted_reference_ocr.json") diff --git a/Preprocess/data_process/merge_with_ocr_pdfminer.py b/Preprocess/data_process/merge_with_ocr_pdfminer.py deleted file mode 100644 index f81d837..0000000 --- a/Preprocess/data_process/merge_with_ocr_pdfminer.py +++ /dev/null @@ -1,117 +0,0 @@ -import json -import os - -from pdfminer.high_level import extract_text - -# 文件路径 -FAQ_FILEPATH = 'reference/faq/pid_map_content.json' -FINANCE_FOLDER_PATH = 'reference/finance' -INSURANCE_FOLDER_PATH = 'reference/insurance' -OCR_FOLDER_PATH = 'dataset/output_text/競賽資料集/reference' - - -def check_text(file_path, category): - """处理 FAQ 文件,返回格式化的数据列表。""" - formatted_data = [] - with open(file_path, encoding='utf-8') as faq_file: - loaded_faq = json.load(faq_file) - - for qid, questions in loaded_faq.items(): - for question_item in questions: - formatted_entry = { - 'category': category, - 'qid': qid, - 'content': {'question': question_item['question'], 'answers': question_item['answers']}, - } - formatted_data.append(formatted_entry) - print(formatted_entry) - return formatted_data - - -def check_pdf_with_table(folder_path, category): - """处理 PDF 文件,返回格式化的数据列表和需要 OCR 
的文件列表。""" - formatted_data = [] - need_to_ocr = [] - - file_list = [f for f in os.listdir(folder_path) if f.endswith('.pdf')] - sorted_file_list = sorted(file_list, key=lambda x: int(os.path.splitext(x)[0])) - - for filename in sorted_file_list: - filename_without_ext, _ = os.path.splitext(filename) - file_path = os.path.join(folder_path, filename) - all_text = '' - - try: - # 获取 PDF 的总页数 - from pdfminer.pdfpage import PDFPage - - with open(file_path, 'rb') as f: - total_pages = len(list(PDFPage.get_pages(f))) - except Exception as e: - print(f'无法获取文件 {file_path} 的页数:{e}') - need_to_ocr.append([category, filename, 'all pages']) - continue - - for page_number in range(total_pages): - ocr_file_path = os.path.join( - OCR_FOLDER_PATH, category, f'{filename_without_ext}.pdf_page_{page_number + 1}.txt' - ) - - try: - # 尝试读取 OCR 生成的文本文件 - with open(ocr_file_path, encoding='utf-8') as ocr_file: - content = ocr_file.read() - except FileNotFoundError: - # 如果没有 OCR 文件,则从 PDF 中提取该页的文本 - try: - content = extract_text(file_path, page_numbers=[page_number]) or '' - except Exception as e: - print(f'提取文件 {file_path} 第 {page_number + 1} 页时出错:{e}') - content = '' - - if content: - # signal_character_lines = sum( - # 1 for line in content.split("\n") if len(line.strip()) == 1 - # ) - all_text += content + '\n\n' - - # if signal_character_lines >= 40: - # need_to_ocr.append([category, filename, f"page{page_number + 1}"]) - else: - need_to_ocr.append([category, filename, f'page{page_number + 1}']) - - formatted_entry = {'category': category, 'qid': filename_without_ext, 'content': all_text.strip()} - formatted_data.append(formatted_entry) - print(formatted_entry) - - return formatted_data, need_to_ocr - - -if __name__ == '__main__': - # 总的格式化数据列表和需要 OCR 的文件列表 - total_formatted_data = [] - total_need_to_ocr = [] - - # 处理 FAQ 文件 - faq_data = check_text(FAQ_FILEPATH, 'faq') - total_formatted_data.extend(faq_data) - - # 处理金融类 PDF 文件 - finance_data, finance_need_to_ocr = 
check_pdf_with_table(FINANCE_FOLDER_PATH, 'finance') - total_formatted_data.extend(finance_data) - total_need_to_ocr.extend(finance_need_to_ocr) - - # 处理保险类 PDF 文件 - insurance_data, insurance_need_to_ocr = check_pdf_with_table(INSURANCE_FOLDER_PATH, 'insurance') - total_formatted_data.extend(insurance_data) - total_need_to_ocr.extend(insurance_need_to_ocr) - - # 将整理好的数据存入 formatted_reference_ocr.json - with open('dataset/formatted_reference_ocr_pdfminer.json', 'w', encoding='utf-8') as formatted_file: - json.dump(total_formatted_data, formatted_file, ensure_ascii=False, indent=4) - - # 将需要 OCR 的文件列表存入 need_to_ocr.txt - with open('dataset/need_to_ocr_again_pdfminer.txt', 'w', encoding='utf-8') as ocr_file: - json.dump(total_need_to_ocr, ocr_file, ensure_ascii=False, indent=4) - - print(f'需要 OCR 的文件数量: {len(total_need_to_ocr)}') diff --git a/Preprocess/data_process/read_pdf_ocr.py b/Preprocess/data_process/read_pdf_ocr.py deleted file mode 100644 index 866f9bc..0000000 --- a/Preprocess/data_process/read_pdf_ocr.py +++ /dev/null @@ -1,69 +0,0 @@ -import os -import zipfile - -import pytesseract -from pdf2image import convert_from_bytes - -# Configure Tesseract path if necessary -pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' - - -def ocr_cond_in_floder(zip_path, folder): - folder_path = f'{folder}/' - file_ratios = [] - - with zipfile.ZipFile(zip_path, 'r') as zipf: - # Select the files that the content can only be captued with OCR - # by finding its compression ratio and original size - for zip_info in zipf.infolist(): - if zip_info.filename.startswith(folder_path) and not zip_info.is_dir(): - original_size = zip_info.file_size # Uncompressed size - compressed_size = zip_info.compress_size # Compressed size - - # Avoid division by zero for empty files - if compressed_size > 0: - compression_ratio = 1 - (compressed_size / original_size) - else: - compression_ratio = float('inf') # Assign infinite ratio for empty files - - # 
Since we decide to OCR all the files, the code below is marked down - # if (compression_ratio >= 0.1) - # or (compression_ratio >= 0.05 - # and (original_size > 750*1024 or original_size < 170*1024)) - # or (original_size > 1500*1024) or (original_size < 120*1024): - file_ratios.append((zip_info.filename, compression_ratio)) - - # Sort by compression ratio in descending order - file_ratios.sort(key=lambda x: x[1], reverse=False) - - for file_name, _ in file_ratios: - with zipf.open(file_name) as pdf_file: - pdf_bytes = pdf_file.read() - - # Specify the path to the Poppler binaries if needed - poppler_path = r'C:\Program Files\poppler-24.08.0\Library\bin' - - # Convert the PDF bytes to images - images = convert_from_bytes(pdf_bytes, dpi=300, poppler_path=poppler_path) - - # Create output directory - output_dir = 'output_text' - os.makedirs(output_dir, exist_ok=True) - - # Perform OCR on each page and save the text - for i, image in enumerate(images): - text = pytesseract.image_to_string(image, lang='chi_tra') # Specify the language for OCR - # Create subdirectory structure within 'output_text' - output_file_path = os.path.join(output_dir, f'{file_name}_page_{i + 1}.txt') - os.makedirs(os.path.dirname(output_file_path), exist_ok=True) - with open(os.path.join(output_dir, f'{file_name}_page_{i + 1}.txt'), 'w', encoding='utf-8') as f: - f.write(text) - print(f'OCR completed for {file_name}') - - # return file_ratios - - -# Usage -zip_path = 'datazip.zip' -ocr_cond_in_floder(zip_path, '競賽資料集/reference/finance') -ocr_cond_in_floder(zip_path, '競賽資料集/reference/insurance') diff --git a/README.md b/README.md index a9a252e..6314cb3 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ ``` ## Setup Environment -- **OS:** MacOS, Linux 為主, Windows 需安裝 WSL2 等來模擬出 Linux 環境 +- **OS:** 除了 Data processing 使用 Windows, 其他以 MacOS, Linux 為主, Windows 需安裝 WSL2 等來模擬出 Linux 環境 To set up the development environment, follow these steps: @@ -55,17 +55,23 @@ To set up the development 
environment, follow these steps: source aicup_venv/bin/activate ``` -2. Install the required dependencies: +2. git clone our repo: + ``` + git clone https://github.com/JustinHsu1019/AICUP2024-RAG-LLM.git + cd AICUP2024-RAG-LLM + ``` + +3. Install the required dependencies: ``` pip install -r requirements.txt ``` -3. Copy the configuration example and create your own config file: +4. Copy the configuration example and create your own config file: ``` cp config_example.ini config.ini ``` -4. Manually add your `secret key` to the `config.ini`: +5. Manually add your `secret key` to the `config.ini`: - [OpenAI] 的 api_key 可以在 openai 官網註冊取得 - [VoyageAI] 的 api_key 可以在 voyageai 官網註冊取得 @@ -87,9 +93,37 @@ To set up the development environment, follow these steps: docker-compose up -d ``` -9. Data preprocessing: +9. Data preprocessing (這一階段因不同組員處理原因,OS 環境為 Windows): + - **Tesseract-OCR**: + - 下載並安裝 Tesseract-OCR。 + - 安裝完成後,記下安裝路徑(如 `C:\Program Files\Tesseract-OCR\tesseract.exe`)。 + + - **Poppler**: + - 下載並安裝 Poppler。 + - 安裝完成後,記下 `poppler_path`(如 `C:\Program Files\poppler-24.08.0\Library\bin`)。 + +在程式碼中配置 Tesseract 和 Poppler 的路徑: + +```python +# Configure Tesseract path if necessary (update this path as needed) +pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' + +# Specify the path to the Poppler binaries +poppler_path = r"C:\Program Files\poppler-24.08.0\Library\bin" +``` + +確保將上述路徑替換為本地實際安裝的路徑。 + +確保您的 ZIP 文件包含以下資料夾和文件 (下載官方 dataset 後): + +- `競賽資料集/reference/faq/pid_map_content.json` +- `競賽資料集/reference/finance/*.pdf` +- `競賽資料集/reference/insurance/*.pdf` + ``` - (TODO: 等 data 那邊處理好) + python3 Preprocess/data_process/data_preprocess.py + python3 Preprocess/data_process/read_pdf_noocr.py + python3 Preprocess/data_process/conbine_readpdf_result.py ``` 10.
Data insert to weaviate: diff --git a/requirements.txt b/requirements.txt index 3cdbe00..c332b34 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,5 @@ python-dateutil==2.9.0.post0 redis==5.0.8 flask-httpauth==4.8.0 voyageai==0.3.1 +pytesseract==0.3.13 +pdf2image==1.17.0