Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support to use Pathlib #93

Merged
merged 9 commits into from
Dec 20, 2024
11 changes: 8 additions & 3 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import zipfile
from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union
from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings

Expand Down Expand Up @@ -1286,11 +1287,11 @@ def __init__(
self.register_page_converter(ZipConverter())

def convert(
self, source: Union[str, requests.Response], **kwargs: Any
self, source: Union[str, requests.Response, Path], **kwargs: Any
SigireddyBalasai marked this conversation as resolved.
Show resolved Hide resolved
) -> DocumentConverterResult: # TODO: deal with kwargs
"""
Args:
- source: can be a string representing a path or url, or a requests.response object
- source: can be a local path (given either as a string or as a pathlib.Path object), a url string, or a requests.Response object
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
"""

Expand All @@ -1307,10 +1308,14 @@ def convert(
# Request response
elif isinstance(source, requests.Response):
return self.convert_response(source, **kwargs)
elif isinstance(source, Path):
gagb marked this conversation as resolved.
Show resolved Hide resolved
return self.convert_local(source, **kwargs)

def convert_local(
self, path: str, **kwargs: Any
self, path: Union[str, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs
if isinstance(path, Path):
path = str(path)
# Prepare a list of extensions to try (in order of priority)
ext = kwargs.get("file_extension")
extensions = [ext] if ext is not None else []
Expand Down
Loading