Skip to content

Commit

Permalink
Add Ollama integration for image descriptions
Browse files Browse the repository at this point in the history
---

For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/microsoft/markitdown?shareId=XXXX-XXXX-XXXX-XXXX).
  • Loading branch information
JoyRushMedia committed Jan 3, 2025
1 parent 125e206 commit e2470fc
Show file tree
Hide file tree
Showing 10 changed files with 232 additions and 20 deletions.
27 changes: 8 additions & 19 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -1,32 +1,21 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
{
"name": "Existing Dockerfile",
"build": {
// Sets the run context to one level up instead of the .devcontainer folder.
"context": "..",
// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
"dockerfile": "../Dockerfile",
"args": {
"INSTALL_GIT": "true"
}
},

// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
"features": {
"ghcr.io/devcontainers-extra/features/hatch:2": {}
"ghcr.io/devcontainers-extra/features/hatch:2": {},
"ghcr.io/devcontainers/features/python:1": {
"version": "3.10"
},
"ghcr.io/devcontainers/features/node:1": {
"version": "16"
},
"ghcr.io/devcontainers/features/ollama:1": {}
},

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Uncomment the next line to run commands after the container is created.
// "postCreateCommand": "cat /etc/os-release",

// Configure tool-specific properties.
// "customizations": {},

// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
"remoteUser": "root"
}
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg \
&& rm -rf /var/lib/apt/lists/*

RUN pip install markitdown
RUN pip install markitdown ollama

# Default USERID and GROUPID
ARG USERID=10000
Expand Down
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,18 @@ result = md.convert("example.jpg")
print(result.text_content)
```

To generate image descriptions with Ollama, pass an `ollama_client` when constructing `MarkItDown`:

```python
from markitdown import MarkItDown
from ollama import Ollama

client = Ollama(api_key="your-api-key")
md = MarkItDown(ollama_client=client)
result = md.convert("example.jpg")
print(result.text_content)
```

### Docker

```sh
Expand Down
54 changes: 54 additions & 0 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -1076,6 +1076,54 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None
return response.choices[0].message.content


class OllamaConverter(DocumentConverter):
    """
    Converts images to markdown via description using Ollama API.

    Recognised keyword arguments (all read from ``**kwargs`` of ``convert``):

    * ``file_extension`` -- extension of the file being converted; only
      ``.jpg``, ``.jpeg`` and ``.png`` are handled.
    * ``ollama_client`` -- client object used to describe the image. When it
      is missing the converter declines the file.
    * ``ollama_prompt`` -- optional prompt; a default caption prompt is used
      when it is ``None`` or blank.
    * ``ollama_model`` -- optional model name, only used when the client is
      queried through ``generate`` (see ``_get_ollama_description``).
    """

    # Model requested through ``client.generate`` when the caller passes no
    # ``ollama_model``. NOTE(review): assumes a vision-capable model with this
    # name is available on the Ollama server -- confirm for your deployment.
    DEFAULT_MODEL = "llava"

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """Describe the image at ``local_path`` and wrap the text in markdown.

        Returns ``None`` when the file is not a supported image or when no
        ``ollama_client`` was supplied; otherwise a ``DocumentConverterResult``
        whose text is a ``# Description:`` section.
        """
        # Bail if not an image
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in (".jpg", ".jpeg", ".png"):
            return None

        # Without a client there is nothing to describe. Decline instead of
        # returning an empty result, so the caller does not treat the image
        # as successfully converted to an empty document.
        ollama_client = kwargs.get("ollama_client")
        if ollama_client is None:
            return None

        description = self._get_ollama_description(
            local_path,
            extension,
            ollama_client,
            prompt=kwargs.get("ollama_prompt"),
            model=kwargs.get("ollama_model"),
        )

        # Guard against a client that returns no text at all.
        md_content = "\n# Description:\n" + (description or "").strip() + "\n"

        return DocumentConverterResult(
            title=None,
            text_content=md_content,
        )

    def _get_ollama_description(
        self, local_path, extension, client, prompt=None, model=None
    ):
        """Ask ``client`` for a textual description of the image file.

        If the client exposes ``describe_image`` it is called with a base64
        data URI and the prompt, and ``response["description"]`` is returned.
        Otherwise the client is expected to offer a ``generate`` method
        accepting ``model``, ``prompt`` and ``images`` (the shape of the
        official ``ollama`` Python client), and ``response["response"]`` is
        returned.
        """
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."

        with open(local_path, "rb") as image_file:
            image_bytes = image_file.read()

        if hasattr(client, "describe_image"):
            # Custom client path: send the image as a data URI.
            content_type, _ = mimetypes.guess_type("_dummy" + extension)
            if content_type is None:
                content_type = "image/jpeg"
            image_base64 = base64.b64encode(image_bytes).decode("utf-8")
            data_uri = f"data:{content_type};base64,{image_base64}"

            response = client.describe_image(data_uri, prompt)
            return response["description"]

        # Official client path: raw image bytes are passed via ``images``.
        response = client.generate(
            model=model or self.DEFAULT_MODEL,
            prompt=prompt,
            images=[image_bytes],
        )
        return response["response"]


class ZipConverter(DocumentConverter):
"""Converts ZIP files to markdown by extracting and converting all contained files.
Expand Down Expand Up @@ -1223,6 +1271,7 @@ def __init__(
llm_client: Optional[Any] = None,
llm_model: Optional[str] = None,
style_map: Optional[str] = None,
ollama_client: Optional[Any] = None,
# Deprecated
mlm_client: Optional[Any] = None,
mlm_model: Optional[str] = None,
Expand Down Expand Up @@ -1264,6 +1313,7 @@ def __init__(
self._llm_client = llm_client
self._llm_model = llm_model
self._style_map = style_map
self._ollama_client = ollama_client

self._page_converters: List[DocumentConverter] = []

Expand All @@ -1285,6 +1335,7 @@ def __init__(
self.register_page_converter(IpynbConverter())
self.register_page_converter(PdfConverter())
self.register_page_converter(ZipConverter())
self.register_page_converter(OllamaConverter())

def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any
Expand Down Expand Up @@ -1445,6 +1496,9 @@ def _convert(
if "llm_model" not in _kwargs and self._llm_model is not None:
_kwargs["llm_model"] = self._llm_model

if "ollama_client" not in _kwargs and self._ollama_client is not None:
_kwargs["ollama_client"] = self._ollama_client

# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters

Expand Down
27 changes: 27 additions & 0 deletions tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@
# Skip exiftool tests if not installed
skip_exiftool = shutil.which("exiftool") is None

# Skip Ollama tests when the `ollama` package is missing or when no
# OLLAMA_API_KEY is present in the environment.
try:
    import ollama
except ModuleNotFoundError:
    skip_ollama = True
else:
    skip_ollama = not os.environ.get("OLLAMA_API_KEY")

TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")

JPG_TEST_EXIFTOOL = {
Expand Down Expand Up @@ -130,6 +137,11 @@
"5bda1dd6",
]

# Substrings that test_markitdown_ollama expects to find in the text produced
# for the Ollama test image.
OLLAMA_TEST_STRINGS = [
    "detailed caption",
    "image",
]


# --- Helper Functions ---
def validate_strings(result, expected_strings, exclude_strings=None):
Expand Down Expand Up @@ -300,10 +312,25 @@ def test_markitdown_llm() -> None:
assert test_string in result.text_content.lower()


@pytest.mark.skipif(
    skip_ollama,
    reason="do not run ollama tests without a key",
)
def test_markitdown_ollama() -> None:
    """Convert the Ollama test image and check the description text.

    The comparison is case-insensitive because the description is free-form
    model output (e.g. "Image" vs "image").
    """
    # NOTE(review): confirm that the installed `ollama` package actually
    # exposes `Ollama(api_key=...)`; adjust the constructor if it does not.
    client = ollama.Ollama(api_key=os.environ.get("OLLAMA_API_KEY"))
    markitdown = MarkItDown(ollama_client=client)

    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_ollama.jpg"))

    description = result.text_content.lower()
    for test_string in OLLAMA_TEST_STRINGS:
        assert test_string in description


if __name__ == "__main__":
    # Runs this file's tests from the command line.
    test_markitdown_remote()
    test_markitdown_local()
    test_markitdown_exiftool()
    test_markitdown_deprecation()
    test_markitdown_llm()
    # A direct call bypasses the pytest skipif marker, so honour the skip
    # flag here: when the `ollama` import failed the test body would
    # otherwise raise NameError.
    if not skip_ollama:
        test_markitdown_ollama()
37 changes: 37 additions & 0 deletions web-ui/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"name": "markitdown-web-ui",
"version": "1.0.0",
"description": "Web-based UI for MarkItDown",
"main": "src/App.js",
"scripts": {
"start": "react-scripts start",
"build": "react-scripts build",
"test": "react-scripts test",
"eject": "react-scripts eject"
},
"dependencies": {
"react": "^17.0.2",
"react-dom": "^17.0.2",
"react-scripts": "4.0.3",
"axios": "^0.21.1",
"react-markdown": "^7.0.0"
},
"eslintConfig": {
"extends": [
"react-app",
"react-app/jest"
]
},
"browserslist": {
"production": [
">0.2%",
"not dead",
"not op_mini all"
],
"development": [
"last 1 chrome version",
"last 1 firefox version",
"last 1 safari version"
]
}
}
42 changes: 42 additions & 0 deletions web-ui/src/App.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import React, { useState } from 'react';
import FileUpload from './components/FileUpload';
import MarkdownPreview from './components/MarkdownPreview';
import DownloadButton from './components/DownloadButton';
import axios from 'axios';

function App() {
const [markdownContent, setMarkdownContent] = useState('');
const [fileName, setFileName] = useState('');

const handleFileUpload = async (file) => {
const formData = new FormData();
formData.append('file', file);

try {
const response = await axios.post('/api/convert', formData, {
headers: {
'Content-Type': 'multipart/form-data',
},
});
setMarkdownContent(response.data.markdown);
setFileName(file.name);
} catch (error) {
console.error('Error uploading file:', error);
}
};

return (
<div className="App">
<header className="App-header">
<h1>MarkItDown Web UI</h1>
</header>
<main>
<FileUpload onFileUpload={handleFileUpload} />
<MarkdownPreview content={markdownContent} />
<DownloadButton content={markdownContent} fileName={fileName} />
</main>
</div>
);
}

export default App;
21 changes: 21 additions & 0 deletions web-ui/src/components/DownloadButton.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import React from 'react';

function DownloadButton({ content, fileName }) {
const handleDownload = () => {
const element = document.createElement('a');
const file = new Blob([content], { type: 'text/markdown' });
element.href = URL.createObjectURL(file);
element.download = fileName.replace(/\.[^/.]+$/, "") + ".md";
document.body.appendChild(element);
element.click();
document.body.removeChild(element);
};

return (
<button onClick={handleDownload}>
Download Markdown
</button>
);
}

export default DownloadButton;
18 changes: 18 additions & 0 deletions web-ui/src/components/FileUpload.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import React from 'react';

function FileUpload({ onFileUpload }) {
const handleFileChange = (event) => {
const file = event.target.files[0];
if (file) {
onFileUpload(file);
}
};

return (
<div className="file-upload">
<input type="file" onChange={handleFileChange} />
</div>
);
}

export default FileUpload;
12 changes: 12 additions & 0 deletions web-ui/src/components/MarkdownPreview.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import React from 'react';
import ReactMarkdown from 'react-markdown';

function MarkdownPreview({ content }) {
return (
<div className="markdown-preview">
<ReactMarkdown>{content}</ReactMarkdown>
</div>
);
}

export default MarkdownPreview;

0 comments on commit e2470fc

Please sign in to comment.