From 7e39f3d0491c10ed61e01bae0519fcd85591b5ac Mon Sep 17 00:00:00 2001 From: Menghuan1918 Date: Tue, 26 Nov 2024 13:43:22 +0800 Subject: [PATCH 1/4] [filetool] Add a html table converter --- pyproject.toml | 22 ++++++++++-- src/pdfdeal/FileTools/html2md.py | 57 ++++++++++++++++++++++++++++++++ src/pdfdeal/file_tools.py | 2 ++ 3 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 src/pdfdeal/FileTools/html2md.py diff --git a/pyproject.toml b/pyproject.toml index cb77a48..3e75967 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,8 +13,26 @@ classifiers = [ dependencies = ["httpx[http2]>=0.23.1, <1", "pypdf"] [project.optional-dependencies] -rag = ["emoji", "Pillow", "reportlab", "oss2", "boto3", "minio"] -dev = ["pytest", "emoji", "Pillow", "reportlab", "oss2", "boto3", "minio"] +tools = ["emoji", "Pillow", "reportlab", "beautifulsoup4"] +rag = [ + "emoji", + "Pillow", + "reportlab", + "oss2", + "boto3", + "minio", + "beautifulsoup4", +] +dev = [ + "pytest", + "emoji", + "Pillow", + "reportlab", + "oss2", + "boto3", + "minio", + "beautifulsoup4", +] [project.urls] Issues = "https://github.com/Menghuan1918/pdfdeal/issues" diff --git a/src/pdfdeal/FileTools/html2md.py b/src/pdfdeal/FileTools/html2md.py new file mode 100644 index 0000000..513cf75 --- /dev/null +++ b/src/pdfdeal/FileTools/html2md.py @@ -0,0 +1,57 @@ +from bs4 import BeautifulSoup + + +def html_table_to_md(text: str) -> str: + """Convert HTML tables to Markdown tables in the given text. 
+ + Args: + text (str): Text containing HTML tables + + Returns: + str: Text with HTML tables converted to Markdown format + """ + soup = BeautifulSoup(text, "html.parser") + tables = soup.find_all("table") + + for table in tables: + md_table = [] + max_cols = 0 + + # Get all rows + rows = table.find_all("tr") + if not rows: + continue + + for row in rows: + cols = 0 + for cell in row.find_all(["td", "th"]): + colspan = int(cell.get("colspan", 1)) + cols += colspan + max_cols = max(max_cols, cols) + + for row in rows: + row_data = [] + cells = row.find_all(["td", "th"]) + + col_count = 0 + for cell in cells: + content = cell.get_text().strip() + colspan = int(cell.get("colspan", 1)) + for _ in range(colspan): + row_data.append(content) + col_count += 1 + + while col_count < max_cols: + row_data.append("") + col_count += 1 + + md_table.append("| " + " | ".join(row_data) + " |") + + if len(md_table) == 1: + md_table.append("| " + " | ".join(["---"] * max_cols) + " |") + + # Replace the HTML table with markdown table + md_table_str = "\n".join(md_table) + table.replace_with(md_table_str) + + return str(soup) diff --git a/src/pdfdeal/file_tools.py b/src/pdfdeal/file_tools.py index 8a7c9ac..2ac72fd 100644 --- a/src/pdfdeal/file_tools.py +++ b/src/pdfdeal/file_tools.py @@ -6,6 +6,7 @@ unzips, ) from .FileTools.extract_img import md_replace_imgs, mds_replace_imgs +from .FileTools.html2md import html_table_to_md __all__ = [ @@ -16,4 +17,5 @@ "mds_replace_imgs", "auto_split_md", "auto_split_mds", + "html_table_to_md", ] From f842140ab42076561ddbffc937c3e0c08bd69660 Mon Sep 17 00:00:00 2001 From: Menghuan1918 Date: Tue, 26 Nov 2024 13:54:40 +0800 Subject: [PATCH 2/4] [filetools] Add path_style --- pyproject.toml | 2 +- src/pdfdeal/FileTools/extract_img.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3e75967..3c1a951 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = 
"pdfdeal" -version = "0.4.9" +version = "0.4.10" authors = [{ name = "Menghuan1918", email = "menghuan@menghuan1918.com" }] description = "A python wrapper for the Doc2X API and comes with native texts processing (to improve texts recall in RAG)." readme = "README.md" diff --git a/src/pdfdeal/FileTools/extract_img.py b/src/pdfdeal/FileTools/extract_img.py index 6dfdcb1..9f4beea 100644 --- a/src/pdfdeal/FileTools/extract_img.py +++ b/src/pdfdeal/FileTools/extract_img.py @@ -2,6 +2,7 @@ from typing import Tuple, Callable import httpx import os +import hashlib from ..Doc2X.Exception import nomal_retry import concurrent.futures import logging @@ -59,6 +60,7 @@ def md_replace_imgs( outputpath: str = "", relative: bool = False, threads: int = 5, + path_style: bool = False, ) -> bool: """Replace the image links in the markdown file (cdn links -> local file). @@ -69,6 +71,7 @@ def md_replace_imgs( outputpath (str, optional): The output path to save the images, if not set, will create a folder named as same as the markdown file name and add `_img`. **⚠️Only works when `replace` is "local".** relative (bool, optional): The output path to save the images with relative path. Defaults to False. **⚠️Only works when `replace` is "local".** threads (int, optional): The number of threads to download the images. Defaults to 5. + path_style (bool, optional): Whether to use path style when uploading to OSS. If True, the path will be /{filename}/{md5}.{extension}. Defaults to False. Returns: bool: If all images are downloaded successfully, return True, else return False. 
@@ -158,7 +161,13 @@ def upload_task(i, img_path, replace): if os.path.isabs(img_path) is False: img_path = os.path.join(os.path.dirname(mdfile), img_path) try: - remote_file_name = f"{os.path.splitext(os.path.basename(mdfile))[0]}_{os.path.basename(img_path)}" + if path_style: + with open(img_path, 'rb') as f: + file_md5 = hashlib.md5(f.read()).hexdigest() + file_ext = os.path.splitext(img_path)[1] + remote_file_name = f"{os.path.splitext(os.path.basename(mdfile))[0]}/{file_md5}{file_ext}" + else: + remote_file_name = f"{os.path.splitext(os.path.basename(mdfile))[0]}_{os.path.basename(img_path)}" new_url, flag = replace(img_path, remote_file_name) if flag: img_url = f"![{os.path.splitext(os.path.basename(mdfile))[0]}](<{new_url}>)\n" @@ -217,6 +226,7 @@ def mds_replace_imgs( skip: str = None, threads: int = 2, down_load_threads: int = 3, + path_style: bool = False, ) -> Tuple[list, list, bool]: """Replace the image links in the markdown file (cdn links -> local file). @@ -228,6 +238,7 @@ def mds_replace_imgs( skip (str, optional): The URL start with this string will be skipped. Defaults to None. For example, "https://menghuan1918.github.io/pdfdeal-docs". threads (int, optional): The number of threads to download the images. Defaults to 2. down_load_threads (int, optional): The number of threads to download the images in one md file. Defaults to 3. + path_style (bool, optional): Whether to use path style when uploading to OSS. If True, the path will be /{filename}/{md5}.{extension}. Defaults to False. 
Returns: Tuple[list, list, bool]: @@ -260,6 +271,7 @@ def process_mdfile(mdfile, replace, outputpath, relative): relative=relative, skip=skip, threads=down_load_threads, + path_style=path_style, ) return mdfile, None except Exception as e: From 6223669f2d4b2363ee834f26edb52782f9dcbd13 Mon Sep 17 00:00:00 2001 From: Menghuan1918 Date: Tue, 26 Nov 2024 14:18:36 +0800 Subject: [PATCH 3/4] [upload] Support picgo and some bug fix --- src/pdfdeal/FileTools/Img/PicGO.py | 56 ++++++++++++++++++++++++++++ src/pdfdeal/FileTools/extract_img.py | 23 ++++++------ 2 files changed, 68 insertions(+), 11 deletions(-) create mode 100644 src/pdfdeal/FileTools/Img/PicGO.py diff --git a/src/pdfdeal/FileTools/Img/PicGO.py b/src/pdfdeal/FileTools/Img/PicGO.py new file mode 100644 index 0000000..62ff2bc --- /dev/null +++ b/src/pdfdeal/FileTools/Img/PicGO.py @@ -0,0 +1,56 @@ +import httpx +import logging + + +class PicGO_Settings: + def __init__(self, endpoint="http://127.0.0.1:36677"): + """Initialize the PicGO client. + + Args: + endpoint (str): The endpoint for PicGO API. Defaults to "http://127.0.0.1:36677". + """ + self.endpoint = endpoint + + def upload_file(self, local_file_path, remote_file_path=None): + """Upload a file to PicGO. + + Args: + local_file_path (str): The path of the local file to upload. + remote_file_path (str): Not used in PicGO, kept for interface consistency. + + Returns: + tuple: A tuple containing the URL of the uploaded file and a boolean indicating whether the upload was successful. 
+ """ + try: + data = {"list": [local_file_path]} + response = httpx.post( + f"{self.endpoint}/upload", + json=data, + headers={"Content-Type": "application/json"}, + ) + result = response.json() + + if result.get("success"): + return result["result"][0], True + else: + logging.error( + f"Failed to upload file: {local_file_path}, PicGO response: {result}" + ) + return result, False + + except Exception as e: + logging.exception(f"Error uploading file: {local_file_path}, {str(e)}") + return str(e), False + + +def PicGO(endpoint="http://127.0.0.1:36677") -> callable: + """Initialize the PicGO client and return a callable function to upload files. + + Args: + endpoint (str): The endpoint for PicGO API. Defaults to "http://127.0.0.1:36677". + + Returns: + callable: The upload_file method of the PicGO client. + """ + picgo_uploader = PicGO_Settings(endpoint=endpoint) + return picgo_uploader.upload_file diff --git a/src/pdfdeal/FileTools/extract_img.py b/src/pdfdeal/FileTools/extract_img.py index 9f4beea..ff4b9fc 100644 --- a/src/pdfdeal/FileTools/extract_img.py +++ b/src/pdfdeal/FileTools/extract_img.py @@ -118,7 +118,7 @@ def download_image(i, imgurl, outputpath, relative, mdfile): return (imglist[i], f"![{imgurl}](<{savepath}>)\n") except Exception as e: logging.warning( - f"Error to download the image: {imgurl}, continue to download the next image:\n {e}" + f"Error to download the image: {imgurl}, keep original url:\n {e}" ) return None @@ -140,13 +140,12 @@ def download_image(i, imgurl, outputpath, relative, mdfile): if result: replacements.append(result) - flag = True for old, new in replacements: content = content.replace(old, new) if len(replacements) < len(imglist): logging.info( - "Some images may not be downloaded successfully. Please check the log." + "Some images were not downloaded successfully. Original URLs have been kept." 
) flag = False @@ -162,7 +161,7 @@ def upload_task(i, img_path, replace): img_path = os.path.join(os.path.dirname(mdfile), img_path) try: if path_style: - with open(img_path, 'rb') as f: + with open(img_path, "rb") as f: file_md5 = hashlib.md5(f.read()).hexdigest() file_ext = os.path.splitext(img_path)[1] remote_file_name = f"{os.path.splitext(os.path.basename(mdfile))[0]}/{file_md5}{file_ext}" @@ -174,14 +173,14 @@ def upload_task(i, img_path, replace): return img_url, True, i else: logging.error( - f"Error to upload the image: {img_path}, {new_url}, continue to upload the next image." + f"Error to upload the image: {img_path}, {new_url}, keeping original path." ) return new_url, False, i except Exception: logging.exception( - f"=====\nError to upload the image: {img_path}, Continue to upload the next image:" + f"=====\nError to upload the image: {img_path}, keeping original path:" ) - return new_url, False, i + return None, False, i with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: futures = [ @@ -196,18 +195,20 @@ def upload_task(i, img_path, replace): pass else: logging.warning( - f"=====\nError to upload the image: {imgpath[i]}, {new_url}, continue to upload the next image." + f"=====\nError to upload the image: {imgpath[i]}, keeping original path." 
) flag = False if no_outputppath_flag: - for img in imgpath: + for i, img in enumerate(imgpath): try: - os.remove(img) + if futures[i].result()[1]: + os.remove(img) except Exception: pass try: - os.rmdir(outputpath) + if not os.listdir(outputpath): + os.rmdir(outputpath) except Exception as e: logging.error(f"\nError to remove the folder: {outputpath}, {e}") From ae8755ec4b631939df394ad169abc1209cbdee8f Mon Sep 17 00:00:00 2001 From: Menghuan1918 Date: Tue, 26 Nov 2024 14:36:54 +0800 Subject: [PATCH 4/4] [README] Update --- README.md | 12 ++++++++++++ README_CN.md | 11 +++++++++++ 2 files changed, 23 insertions(+) diff --git a/README.md b/README.md index c7a76e2..121f368 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,18 @@ Use various OCR or PDF recognition tools to identify images and add them to the After conversion and pre-processing of PDF using Doc2X, you can achieve better recognition rates when used with knowledge base applications such as [graphrag](https://github.com/microsoft/graphrag), [Dify](https://github.com/langgenius/dify), and [FastGPT](https://github.com/labring/FastGPT). +### Markdown Document Processing Features + +`pdfdeal` also provides a series of powerful tools to handle Markdown documents: + +- **Convert HTML tables to Markdown format**: Allows conversion of HTML formatted tables to Markdown format for easy use in Markdown documents. +- **Upload images to remote storage services**: Supports uploading local or online images in Markdown documents to remote storage services to ensure image persistence and accessibility. +- **Convert online images to local images**: Allows downloading and converting online images in Markdown documents to local images for offline use. +- **Document splitting and separator addition**: Supports splitting Markdown documents by headings or adding separators within documents for better organization and management. 
+ +For detailed feature introduction and usage, please refer to the [documentation link](https://menghuan1918.github.io/pdfdeal-docs/guide/Tools/). + + ## Cases ### graphrag diff --git a/README_CN.md b/README_CN.md index e25f58a..d67c24a 100644 --- a/README_CN.md +++ b/README_CN.md @@ -40,6 +40,17 @@ 对 PDF 使用 Doc2X 转换并预处理后,与知识库应用程序(例如[graphrag](https://github.com/microsoft/graphrag),[Dify](https://github.com/langgenius/dify),[FastGPT](https://github.com/labring/FastGPT)),可以显著提升召回率。 +### Markdown 文档处理功能 + +`pdfdeal` 也提供了一系列强大的工具来处理 Markdown 文档: + +- **HTML 表格转换为 Markdown 格式**:可以将 HTML 格式的表格转换为 Markdown 格式,方便在 Markdown 文档中使用。 +- **图片上传到远端储存服务**:支持将 Markdown 文档中的本地或在线图片上传到远端储存服务,确保图片的持久性和可访问性。 +- **在线图片转换为本地图片**:可以将 Markdown 文档中的在线图片下载并转换为本地图片,便于离线使用。 +- **文档拆分与分隔符添加**:支持按照标题拆分 Markdown 文档或在文档中添加分隔符,以便于文档的组织和管理。 + +详细功能介绍和使用方法请参见[文档链接](https://menghuan1918.github.io/pdfdeal-docs/zh/guide/Tools/)。 + ## 案例 ### graphrag