Add Ollama integration for image descriptions

--- For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/microsoft/markitdown?shareId=XXXX-XXXX-XXXX-XXXX).
microsoft · Jan 3, 2025 · e2470fc · e2470fc
1 parent 125e206
commit e2470fc
Show file tree

Hide file tree

Showing 10 changed files with 232 additions and 20 deletions.
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -1,32 +1,21 @@
-// For format details, see https://aka.ms/devcontainer.json. For config options, see the
-// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
 {
 	"name": "Existing Dockerfile",
 	"build": {
-		// Sets the run context to one level up instead of the .devcontainer folder.
 		"context": "..",
-		// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
 		"dockerfile": "../Dockerfile",
 		"args": {
 			"INSTALL_GIT": "true"
 		}
 	},
-
-	// Features to add to the dev container. More info: https://containers.dev/features.
-	// "features": {},
 	"features": {
-		"ghcr.io/devcontainers-extra/features/hatch:2": {}
+		"ghcr.io/devcontainers-extra/features/hatch:2": {},
+		"ghcr.io/devcontainers/features/python:1": {
+			"version": "3.10"
+		},
+		"ghcr.io/devcontainers/features/node:1": {
+			"version": "16"
+		},
+		"ghcr.io/devcontainers/features/ollama:1": {}
 	},
-
-	// Use 'forwardPorts' to make a list of ports inside the container available locally.
-	// "forwardPorts": [],
-
-	// Uncomment the next line to run commands after the container is created.
-	// "postCreateCommand": "cat /etc/os-release",
-
-	// Configure tool-specific properties.
-	// "customizations": {},
-
-	// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
 	"remoteUser": "root"
 }
diff --git a/Dockerfile b/Dockerfile
@@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     ffmpeg \
     && rm -rf /var/lib/apt/lists/*
 
-RUN pip install markitdown
+RUN pip install markitdown ollama
 
 # Default USERID and GROUPID
 ARG USERID=10000

diff --git a/README.md b/README.md
@@ -66,6 +66,18 @@ result = md.convert("example.jpg")
 print(result.text_content)
 ```
 
+To use Ollama for image descriptions, pass an Ollama client as `ollama_client`:
+
+```python
+from markitdown import MarkItDown
+from ollama import Client
+
+client = Client(host="http://localhost:11434")
+md = MarkItDown(ollama_client=client)
+result = md.convert("example.jpg")
+print(result.text_content)
+```
+
 ### Docker
 
 ```sh

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
@@ -1076,6 +1076,54 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None
         return response.choices[0].message.content
 
 
+class OllamaConverter(DocumentConverter):
+    """
+    Converts images to markdown via a description generated by an Ollama client.
+
+    The converter only acts on ``.jpg``, ``.jpeg`` and ``.png`` files, and only
+    when an ``ollama_client`` keyword argument is supplied. In every other case
+    it returns ``None`` so that another converter can handle the file.
+
+    Recognized keyword arguments:
+        file_extension: extension of the file being converted (e.g. ``".png"``).
+        ollama_client: client object used to describe the image.
+        ollama_prompt: optional prompt; a default caption prompt is used if
+            it is missing or blank.
+        ollama_model: optional model name, used only when the client is
+            driven through ``generate`` (defaults to ``DEFAULT_MODEL``).
+    """
+
+    # Vision-capable model requested when the caller passes no ``ollama_model``.
+    DEFAULT_MODEL = "llava"
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not an image
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
+            return None
+
+        # Without a client there is nothing to contribute. Returning None
+        # (rather than an empty result) lets other converters handle the image.
+        ollama_client = kwargs.get("ollama_client")
+        if ollama_client is None:
+            return None
+
+        description = self._get_ollama_description(
+            local_path,
+            extension,
+            ollama_client,
+            prompt=kwargs.get("ollama_prompt"),
+            model=kwargs.get("ollama_model"),
+        )
+
+        return DocumentConverterResult(
+            title=None,
+            text_content="\n# Description:\n" + description.strip() + "\n",
+        )
+
+    def _get_ollama_description(
+        self, local_path, extension, client, prompt=None, model=None
+    ):
+        """
+        Ask ``client`` to describe the image stored at ``local_path``.
+
+        If the client exposes a ``describe_image(data_uri, prompt)`` method it
+        is used and the ``"description"`` entry of its result is returned.
+        Otherwise the client is assumed to follow the ``ollama.Client``
+        interface and ``generate`` is called with the base64-encoded image;
+        the ``"response"`` entry of its result is returned.
+        """
+        if prompt is None or prompt.strip() == "":
+            prompt = "Write a detailed caption for this image."
+
+        with open(local_path, "rb") as image_file:
+            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
+
+        # Custom clients: keep the original data-URI based call.
+        describe_image = getattr(client, "describe_image", None)
+        if callable(describe_image):
+            content_type, _ = mimetypes.guess_type("_dummy" + extension)
+            if content_type is None:
+                content_type = "image/jpeg"
+            data_uri = f"data:{content_type};base64,{image_base64}"
+            return describe_image(data_uri, prompt)["description"]
+
+        # ollama.Client: images are passed as base64 strings, not data URIs.
+        response = client.generate(
+            model=model or self.DEFAULT_MODEL,
+            prompt=prompt,
+            images=[image_base64],
+        )
+        return response["response"]
+
+
 class ZipConverter(DocumentConverter):
     """Converts ZIP files to markdown by extracting and converting all contained files.
 
@@ -1223,6 +1271,7 @@ def __init__(
         llm_client: Optional[Any] = None,
         llm_model: Optional[str] = None,
         style_map: Optional[str] = None,
+        ollama_client: Optional[Any] = None,
         # Deprecated
         mlm_client: Optional[Any] = None,
         mlm_model: Optional[str] = None,
@@ -1264,6 +1313,7 @@ def __init__(
         self._llm_client = llm_client
         self._llm_model = llm_model
         self._style_map = style_map
+        self._ollama_client = ollama_client
 
         self._page_converters: List[DocumentConverter] = []
 
@@ -1285,6 +1335,7 @@ def __init__(
         self.register_page_converter(IpynbConverter())
         self.register_page_converter(PdfConverter())
         self.register_page_converter(ZipConverter())
+        self.register_page_converter(OllamaConverter())
 
     def convert(
         self, source: Union[str, requests.Response, Path], **kwargs: Any
@@ -1445,6 +1496,9 @@ def _convert(
                 if "llm_model" not in _kwargs and self._llm_model is not None:
                     _kwargs["llm_model"] = self._llm_model
 
+                if "ollama_client" not in _kwargs and self._ollama_client is not None:
+                    _kwargs["ollama_client"] = self._ollama_client
+
                 # Add the list of converters for nested processing
                 _kwargs["_parent_converters"] = self._page_converters
 

diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
@@ -25,6 +25,13 @@
 # Skip exiftool tests if not installed
 skip_exiftool = shutil.which("exiftool") is None
 
+# Skip Ollama tests when the package is missing or no API key is configured
+try:
+    import ollama
+except ModuleNotFoundError:
+    skip_ollama = True
+else:
+    skip_ollama = not os.environ.get("OLLAMA_API_KEY")
+
 TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
 
 JPG_TEST_EXIFTOOL = {
@@ -130,6 +137,11 @@
     "5bda1dd6",
 ]
 
+OLLAMA_TEST_STRINGS = [
+    "detailed caption",
+    "image",
+]
+
 
 # --- Helper Functions ---
 def validate_strings(result, expected_strings, exclude_strings=None):
@@ -300,10 +312,25 @@ def test_markitdown_llm() -> None:
         assert test_string in result.text_content.lower()
 
 
+@pytest.mark.skipif(
+    skip_ollama,
+    reason="do not run ollama tests without a key",
+)
+def test_markitdown_ollama() -> None:
+    """Describe a test image through an Ollama client and check the output."""
+    # The ollama package exposes `Client`; the key is sent as a bearer token.
+    client = ollama.Client(
+        headers={"Authorization": f"Bearer {os.environ.get('OLLAMA_API_KEY')}"}
+    )
+    markitdown = MarkItDown(ollama_client=client)
+
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_ollama.jpg"))
+
+    # Compare case-insensitively, as test_markitdown_llm does.
+    text_content = result.text_content.lower()
+    for test_string in OLLAMA_TEST_STRINGS:
+        assert test_string in text_content
+
+
 if __name__ == "__main__":
     """Runs this file's tests from the command line."""
     test_markitdown_remote()
     test_markitdown_local()
     test_markitdown_exiftool()
     test_markitdown_deprecation()
     test_markitdown_llm()
+    test_markitdown_ollama()
diff --git a/web-ui/package.json b/web-ui/package.json
@@ -0,0 +1,37 @@
+{
+  "name": "markitdown-web-ui",
+  "version": "1.0.0",
+  "description": "Web-based UI for MarkItDown",
+  "main": "src/App.js",
+  "scripts": {
+    "start": "react-scripts start",
+    "build": "react-scripts build",
+    "test": "react-scripts test",
+    "eject": "react-scripts eject"
+  },
+  "dependencies": {
+    "react": "^17.0.2",
+    "react-dom": "^17.0.2",
+    "react-scripts": "4.0.3",
+    "axios": "^0.21.1",
+    "react-markdown": "^7.0.0"
+  },
+  "eslintConfig": {
+    "extends": [
+      "react-app",
+      "react-app/jest"
+    ]
+  },
+  "browserslist": {
+    "production": [
+      ">0.2%",
+      "not dead",
+      "not op_mini all"
+    ],
+    "development": [
+      "last 1 chrome version",
+      "last 1 firefox version",
+      "last 1 safari version"
+    ]
+  }
+}
diff --git a/web-ui/src/App.js b/web-ui/src/App.js
@@ -0,0 +1,42 @@
+import React, { useState } from 'react';
+import FileUpload from './components/FileUpload';
+import MarkdownPreview from './components/MarkdownPreview';
+import DownloadButton from './components/DownloadButton';
+import axios from 'axios';
+
+/**
+ * Root component: uploads a file to the conversion endpoint, previews the
+ * returned markdown and offers it for download.
+ */
+function App() {
+  const [markdownContent, setMarkdownContent] = useState('');
+  const [fileName, setFileName] = useState('');
+
+  // POST the chosen file and store the markdown from the response.
+  const handleFileUpload = async (file) => {
+    const payload = new FormData();
+    payload.append('file', file);
+    const requestConfig = {
+      headers: {
+        'Content-Type': 'multipart/form-data',
+      },
+    };
+
+    try {
+      const { data } = await axios.post('/api/convert', payload, requestConfig);
+      setMarkdownContent(data.markdown);
+      setFileName(file.name);
+    } catch (error) {
+      // Failures are only logged; the previous preview stays in place.
+      console.error('Error uploading file:', error);
+    }
+  };
+
+  return (
+    <div className="App">
+      <header className="App-header">
+        <h1>MarkItDown Web UI</h1>
+      </header>
+      <main>
+        <FileUpload onFileUpload={handleFileUpload} />
+        <MarkdownPreview content={markdownContent} />
+        <DownloadButton content={markdownContent} fileName={fileName} />
+      </main>
+    </div>
+  );
+}
+
+export default App;
diff --git a/web-ui/src/components/DownloadButton.js b/web-ui/src/components/DownloadButton.js
@@ -0,0 +1,21 @@
+import React from 'react';
+
+/**
+ * Button that saves `content` as a markdown file.
+ *
+ * The download name is `fileName` with its last extension replaced by `.md`;
+ * when no usable name is available, `document.md` is used.
+ */
+function DownloadButton({ content, fileName }) {
+  const handleDownload = () => {
+    const file = new Blob([content], { type: 'text/markdown' });
+    const url = URL.createObjectURL(file);
+    // Fall back to a generic base name when no file name is available.
+    const baseName = (fileName || '').replace(/\.[^/.]+$/, "") || 'document';
+
+    const element = document.createElement('a');
+    element.href = url;
+    element.download = baseName + ".md";
+    document.body.appendChild(element);
+    element.click();
+    document.body.removeChild(element);
+
+    // Release the blob URL once the click has been dispatched, so the blob
+    // is not kept alive for the lifetime of the page.
+    setTimeout(() => URL.revokeObjectURL(url), 0);
+  };
+
+  return (
+    <button onClick={handleDownload}>
+      Download Markdown
+    </button>
+  );
+}
+
+export default DownloadButton;
diff --git a/web-ui/src/components/FileUpload.js b/web-ui/src/components/FileUpload.js
@@ -0,0 +1,18 @@
+import React from 'react';
+
+/**
+ * File input that passes the first selected file to `onFileUpload`.
+ */
+function FileUpload({ onFileUpload }) {
+  const handleFileChange = (event) => {
+    const selected = event.target.files[0];
+    // Nothing was chosen (e.g. the dialog was cancelled): do nothing.
+    if (!selected) {
+      return;
+    }
+    onFileUpload(selected);
+  };
+
+  return (
+    <div className="file-upload">
+      <input type="file" onChange={handleFileChange} />
+    </div>
+  );
+}
+
+export default FileUpload;
diff --git a/web-ui/src/components/MarkdownPreview.js b/web-ui/src/components/MarkdownPreview.js
@@ -0,0 +1,12 @@
+import React from 'react';
+import ReactMarkdown from 'react-markdown';
+
+/**
+ * Renders the markdown string `content` with ReactMarkdown.
+ */
+function MarkdownPreview(props) {
+  const { content } = props;
+
+  return (
+    <div className="markdown-preview">
+      <ReactMarkdown>{content}</ReactMarkdown>
+    </div>
+  );
+}
+
+export default MarkdownPreview;