NoEdgeAI · Menghuan1918 · Nov 26, 2024 · Nov 26, 2024 · Nov 26, 2024 · Nov 26, 2024
diff --git a/README.md b/README.md
@@ -39,6 +39,18 @@ Use various OCR or PDF recognition tools to identify images and add them to the
 
 After conversion and pre-processing of PDF using Doc2X, you can achieve better recognition rates when used with knowledge base applications such as [graphrag](https://github.com/microsoft/graphrag), [Dify](https://github.com/langgenius/dify), and [FastGPT](https://github.com/labring/FastGPT).
 
+### Markdown Document Processing Features
+
+`pdfdeal` also provides a series of powerful tools to handle Markdown documents:
+
+- **Convert HTML tables to Markdown format**: Converts HTML-formatted tables into Markdown tables so they can be used directly in Markdown documents.
+- **Upload images to remote storage services**: Uploads local or online images referenced in Markdown documents to a remote storage service, keeping the images persistent and accessible.
+- **Convert online images to local images**: Downloads online images referenced in Markdown documents and points the links at the local copies for offline use.
+- **Document splitting and separator addition**: Splits Markdown documents by headings or adds separators within documents for better organization and management.
+
+For a detailed introduction to these features and how to use them, see the [documentation](https://menghuan1918.github.io/pdfdeal-docs/guide/Tools/).
+
+
 ## Cases
 
 ### graphrag

diff --git a/README_CN.md b/README_CN.md
@@ -40,6 +40,17 @@
 
 对 PDF 使用 Doc2X 转换并预处理后，与知识库应用程序（例如[graphrag](https://github.com/microsoft/graphrag)，[Dify](https://github.com/langgenius/dify)，[FastGPT](https://github.com/labring/FastGPT)），可以显著提升召回率。
 
+### Markdown 文档处理功能
+
+`pdfdeal` 也提供了一系列强大的工具来处理 Markdown 文档：
+
+- **HTML 表格转换为 Markdown 格式**：可以将 HTML 格式的表格转换为 Markdown 格式，方便在 Markdown 文档中使用。
+- **图片上传到远端储存服务**：支持将 Markdown 文档中的本地或在线图片上传到远端储存服务，确保图片的持久性和可访问性。
+- **在线图片转换为本地图片**：可以将 Markdown 文档中的在线图片下载并转换为本地图片，便于离线使用。
+- **文档拆分与分隔符添加**：支持按照标题拆分 Markdown 文档或在文档中添加分隔符，以便于文档的组织和管理。
+
+详细功能介绍和使用方法请参见[文档链接](https://menghuan1918.github.io/pdfdeal-docs/zh/guide/Tools/)。
+
 ## 案例
 
 ### graphrag

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "pdfdeal"
-version = "0.4.9"
+version = "0.4.10"
 authors = [{ name = "Menghuan1918", email = "[email protected]" }]
 description = "A python wrapper for the Doc2X API and comes with native texts processing (to improve texts recall in RAG)."
 readme = "README.md"
@@ -13,8 +13,26 @@ classifiers = [
 dependencies = ["httpx[http2]>=0.23.1, <1", "pypdf"]
 
 [project.optional-dependencies]
-rag = ["emoji", "Pillow", "reportlab", "oss2", "boto3", "minio"]
-dev = ["pytest", "emoji", "Pillow", "reportlab", "oss2", "boto3", "minio"]
+tools = ["emoji", "Pillow", "reportlab", "beautifulsoup4"]
+rag = [
+    "emoji",
+    "Pillow",
+    "reportlab",
+    "oss2",
+    "boto3",
+    "minio",
+    "beautifulsoup4",
+]
+dev = [
+    "pytest",
+    "emoji",
+    "Pillow",
+    "reportlab",
+    "oss2",
+    "boto3",
+    "minio",
+    "beautifulsoup4",
+]
 
 [project.urls]
 Issues = "https://github.com/Menghuan1918/pdfdeal/issues"

diff --git a/src/pdfdeal/FileTools/Img/PicGO.py b/src/pdfdeal/FileTools/Img/PicGO.py
@@ -0,0 +1,56 @@
+import httpx
+import logging
+
+
+class PicGO_Settings:
+    """Small client for the HTTP upload endpoint of a PicGO server."""
+
+    def __init__(self, endpoint="http://127.0.0.1:36677"):
+        """Initialize the PicGO client.
+
+        Args:
+            endpoint (str): The endpoint for PicGO API. Defaults to "http://127.0.0.1:36677".
+        """
+        self.endpoint = endpoint
+
+    def upload_file(self, local_file_path, remote_file_path=None):
+        """Upload a file to PicGO.
+
+        Only the file *path* is sent in the request body, not the file
+        contents, so the path is made absolute first: a relative path would
+        otherwise be resolved by the server process instead of the caller.
+
+        Args:
+            local_file_path (str): The path of the local file to upload.
+            remote_file_path (str): Not used in PicGO, kept for interface consistency.
+
+        Returns:
+            tuple: ``(url, True)`` on success. On failure, ``(response, False)``
+                when PicGO answered without ``success`` set, or
+                ``(error message, False)`` when the request itself raised.
+        """
+        # Imported here so the method does not rely on the module's import block.
+        import os
+
+        try:
+            absolute_path = os.path.abspath(local_file_path)
+            response = httpx.post(
+                # Tolerate an endpoint configured with a trailing slash.
+                f"{self.endpoint.rstrip('/')}/upload",
+                json={"list": [absolute_path]},
+                headers={"Content-Type": "application/json"},
+            )
+            result = response.json()
+
+            if result.get("success"):
+                return result["result"][0], True
+
+            logging.error(
+                f"Failed to upload file: {local_file_path}, PicGO response: {result}"
+            )
+            return result, False
+
+        except Exception as e:
+            # Best-effort contract: callers get (message, False) and never an exception.
+            logging.exception(f"Error uploading file: {local_file_path}, {str(e)}")
+            return str(e), False
+
+
+def PicGO(endpoint="http://127.0.0.1:36677") -> callable:
+    """Create a PicGO client for ``endpoint`` and return its upload function.
+
+    Args:
+        endpoint (str): The endpoint for PicGO API. Defaults to "http://127.0.0.1:36677".
+
+    Returns:
+        callable: The bound ``upload_file`` method of a new ``PicGO_Settings`` instance.
+    """
+    return PicGO_Settings(endpoint=endpoint).upload_file
diff --git a/src/pdfdeal/FileTools/extract_img.py b/src/pdfdeal/FileTools/extract_img.py
@@ -2,6 +2,7 @@
 from typing import Tuple, Callable
 import httpx
 import os
+import hashlib
 from ..Doc2X.Exception import nomal_retry
 import concurrent.futures
 import logging
@@ -59,6 +60,7 @@ def md_replace_imgs(
     outputpath: str = "",
     relative: bool = False,
     threads: int = 5,
+    path_style: bool = False,
 ) -> bool:
     """Replace the image links in the markdown file (cdn links -> local file).
 
@@ -69,6 +71,7 @@ def md_replace_imgs(
         outputpath (str, optional): The output path to save the images, if not set, will create a folder named as same as the markdown file name and add `_img`. **⚠️Only works when `replace` is "local".**
         relative (bool, optional): The output path to save the images with relative path. Defaults to False. **⚠️Only works when `replace` is "local".**
         threads (int, optional): The number of threads to download the images. Defaults to 5.
+        path_style (bool, optional): Whether to use path style when uploading to OSS. If True, the path will be /{filename}/{md5}.{extension}. Defaults to False.
 
     Returns:
         bool: If all images are downloaded successfully, return True, else return False.
@@ -115,7 +118,7 @@ def download_image(i, imgurl, outputpath, relative, mdfile):
                 return (imglist[i], f"![{imgurl}](<{savepath}>)\n")
         except Exception as e:
             logging.warning(
-                f"Error to download the image: {imgurl}, continue to download the next image:\n {e}"
+                f"Error to download the image: {imgurl}, keep original url:\n {e}"
             )
             return None
 
@@ -137,13 +140,12 @@ def download_image(i, imgurl, outputpath, relative, mdfile):
             if result:
                 replacements.append(result)
 
-    flag = True
     for old, new in replacements:
         content = content.replace(old, new)
 
     if len(replacements) < len(imglist):
         logging.info(
-            "Some images may not be downloaded successfully. Please check the log."
+            "Some images were not downloaded successfully. Original URLs have been kept."
         )
         flag = False
 
@@ -158,21 +160,27 @@ def upload_task(i, img_path, replace):
             if os.path.isabs(img_path) is False:
                 img_path = os.path.join(os.path.dirname(mdfile), img_path)
             try:
-                remote_file_name = f"{os.path.splitext(os.path.basename(mdfile))[0]}_{os.path.basename(img_path)}"
+                if path_style:
+                    with open(img_path, "rb") as f:
+                        file_md5 = hashlib.md5(f.read()).hexdigest()
+                    file_ext = os.path.splitext(img_path)[1]
+                    remote_file_name = f"{os.path.splitext(os.path.basename(mdfile))[0]}/{file_md5}{file_ext}"
+                else:
+                    remote_file_name = f"{os.path.splitext(os.path.basename(mdfile))[0]}_{os.path.basename(img_path)}"
                 new_url, flag = replace(img_path, remote_file_name)
                 if flag:
                     img_url = f"![{os.path.splitext(os.path.basename(mdfile))[0]}](<{new_url}>)\n"
                     return img_url, True, i
                 else:
                     logging.error(
-                        f"Error to upload the image: {img_path}, {new_url}, continue to upload the next image."
+                        f"Error to upload the image: {img_path}, {new_url}, keeping original path."
                     )
                     return new_url, False, i
             except Exception:
                 logging.exception(
-                    f"=====\nError to upload the image: {img_path}, Continue to upload the next image:"
+                    f"=====\nError to upload the image: {img_path}, keeping original path:"
                 )
-                return new_url, False, i
+                return None, False, i
 
         with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
             futures = [
@@ -187,18 +195,20 @@ def upload_task(i, img_path, replace):
                     pass
                 else:
                     logging.warning(
-                        f"=====\nError to upload the image: {imgpath[i]}, {new_url}, continue to upload the next image."
+                        f"=====\nError to upload the image: {imgpath[i]}, keeping original path."
                     )
                     flag = False
 
         if no_outputppath_flag:
-            for img in imgpath:
+            for i, img in enumerate(imgpath):
                 try:
-                    os.remove(img)
+                    if futures[i].result()[1]:
+                        os.remove(img)
                 except Exception:
                     pass
             try:
-                os.rmdir(outputpath)
+                if not os.listdir(outputpath):
+                    os.rmdir(outputpath)
             except Exception as e:
                 logging.error(f"\nError to remove the folder: {outputpath}, {e}")
 
@@ -217,6 +227,7 @@ def mds_replace_imgs(
     skip: str = None,
     threads: int = 2,
     down_load_threads: int = 3,
+    path_style: bool = False,
 ) -> Tuple[list, list, bool]:
     """Replace the image links in the markdown file (cdn links -> local file).
 
@@ -228,6 +239,7 @@ def mds_replace_imgs(
         skip (str, optional): The URL start with this string will be skipped. Defaults to None. For example, "https://menghuan1918.github.io/pdfdeal-docs".
         threads (int, optional): The number of threads to download the images. Defaults to 2.
         down_load_threads (int, optional): The number of threads to download the images in one md file. Defaults to 3.
+        path_style (bool, optional): Whether to use path style when uploading to OSS. If True, the path will be /{filename}/{md5}.{extension}. Defaults to False.
 
     Returns:
         Tuple[list, list, bool]:
@@ -260,6 +272,7 @@ def process_mdfile(mdfile, replace, outputpath, relative):
                 relative=relative,
                 skip=skip,
                 threads=down_load_threads,
+                path_style=path_style,
             )
             return mdfile, None
         except Exception as e:

diff --git a/src/pdfdeal/FileTools/html2md.py b/src/pdfdeal/FileTools/html2md.py
@@ -0,0 +1,57 @@
+from bs4 import BeautifulSoup
+
+
+def _colspan(cell) -> int:
+    """Return the cell's ``colspan`` as a positive int, treating bad values as 1."""
+    try:
+        return max(1, int(cell.get("colspan", 1)))
+    except (TypeError, ValueError):
+        return 1
+
+
+def _table_to_md(fragment: str) -> str:
+    """Convert one ``<table>...</table>`` fragment to a Markdown table.
+
+    The fragment is returned unchanged when it contains no ``<tr>`` rows.
+    """
+    # Imported here so the helper does not rely on the module's import block.
+    import html
+
+    rows = BeautifulSoup(fragment, "html.parser").find_all("tr")
+    if not rows:
+        return fragment
+
+    expanded = []
+    for row in rows:
+        cells = []
+        for cell in row.find_all(["td", "th"]):
+            # A Markdown row must stay on one line and "|" delimits cells, so
+            # collapse whitespace (including newlines) and escape pipes.
+            content = " ".join(cell.get_text().split()).replace("|", "\\|")
+            # Markdown has no column spans: repeat the text in every spanned column.
+            cells.extend([content] * _colspan(cell))
+        expanded.append(cells)
+
+    width = max(len(cells) for cells in expanded)
+    lines = [
+        # Pad short rows so every row has the same number of columns.
+        "| " + " | ".join(cells + [""] * (width - len(cells))) + " |"
+        for cells in expanded
+    ]
+    # The first row is used as the header, so the separator follows it.
+    lines.insert(1, "| " + " | ".join(["---"] * width) + " |")
+    # Escape "&", "<" and ">" in the generated table so cell text cannot be
+    # read as HTML when the Markdown is rendered.
+    return html.escape("\n".join(lines), quote=False)
+
+
+def html_table_to_md(text: str) -> str:
+    """Convert HTML tables to Markdown tables in the given text.
+
+    Only balanced ``<table>...</table>`` spans are parsed and replaced; all
+    other text is copied through byte-for-byte. Parsing and re-serialising the
+    whole document would rewrite unrelated Markdown (entity-escaping ``&`` and
+    ``>``, treating ``<...>`` links as tags). Nested tables are converted as
+    part of their outermost table. A ``<table>`` without a matching closing
+    tag is left untouched.
+
+    Args:
+        text (str): Text containing HTML tables
+
+    Returns:
+        str: Text with HTML tables converted to Markdown format
+    """
+    # Imported here so the function does not rely on the module's import block.
+    import re
+
+    table_tag = re.compile(r"<(/?)table\b[^>]*>", re.IGNORECASE)
+
+    pieces = []
+    copied_upto = 0  # end of the text already emitted into ``pieces``
+    depth = 0  # current <table> nesting level
+    span_start = 0  # start offset of the outermost open <table>
+
+    for tag in table_tag.finditer(text):
+        is_closing = bool(tag.group(1))
+        if not is_closing:
+            if depth == 0:
+                span_start = tag.start()
+            depth += 1
+        elif depth:
+            depth -= 1
+            if depth == 0:
+                pieces.append(text[copied_upto:span_start])
+                pieces.append(_table_to_md(text[span_start : tag.end()]))
+                copied_upto = tag.end()
+
+    pieces.append(text[copied_upto:])
+    return "".join(pieces)
diff --git a/src/pdfdeal/file_tools.py b/src/pdfdeal/file_tools.py
@@ -6,6 +6,7 @@
     unzips,
 )
 from .FileTools.extract_img import md_replace_imgs, mds_replace_imgs
+from .FileTools.html2md import html_table_to_md
 
 
 __all__ = [
@@ -16,4 +17,5 @@
     "mds_replace_imgs",
     "auto_split_md",
     "auto_split_mds",
+    "html_table_to_md",
 ]