Merge pull request #2 from HelgeSverre/feature/support-vision-api

[WIP] Feature/support vision api
HelgeSverre · Dec 7, 2023 · 88f147c · 88f147c
2 parents 3140c87 + 8990558
commit 88f147c
Show file tree

Hide file tree

Showing 13 changed files with 407 additions and 18 deletions.
diff --git a/README.md b/README.md
@@ -15,7 +15,8 @@ Laravel application.
 - Includes a flexible Field Extractor that can extract any arbitrary data without writing custom logic.
 - Can return a regular array or a [Spatie/data](https://spatie.be/docs/laravel-data/v3/introduction) object.
 - Integrates with [Textract](https://aws.amazon.com/textract/) for OCR functionality.
-- Uses [JSON Mode](https://platform.openai.com/docs/guides/text-generation/json-mode) from the latest GPT-3.5 and GPT-4 models.
+- Uses [JSON Mode](https://platform.openai.com/docs/guides/text-generation/json-mode) from the latest GPT-3.5 and GPT-4
+  models.
 
 ## Example
 
@@ -79,6 +80,9 @@ php artisan vendor:publish --provider="OpenAI\Laravel\ServiceProvider"
 
 ```dotenv
 OPENAI_API_KEY="your-key-here"
+
+# Optional: Set request timeout (default: 30s).
+OPENAI_REQUEST_TIMEOUT=60
 ```
 
 ## Usage
@@ -151,6 +155,75 @@ $data = Extractor::fields($sample,
 );
 ```
 
+## Using GPT-4-Vision with Extractor
+
+**Note**: This feature is still a work in progress (WIP).
+
+The `Extractor` package also integrates with OpenAI's new Vision API, leveraging the powerful `gpt-4-vision-preview`
+model to extract
+structured data from images. This feature enables you to analyze and interpret visual content with ease, whether it's
+reading text from images, extracting data from charts, or understanding complex visual scenarios.
+
+### How to Use OpenAI's Vision API with ImageContent
+
+To use the Vision features in `Extractor`, you need to provide an image as input. This can be done in a few different
+ways:
+
+1. **Using a File Path**: Load an image from a file path.
+2. **Using Raw Image Data**: Use the raw data of an image, for example, from an uploaded file.
+3. **Using an Image URL**: Load an image directly from a URL.
+
+Here's how you can use each method:
+
+#### Using a File Path
+
+```php
+use HelgeSverre\Extractor\Text\ImageContent;
+
+$imagePath = __DIR__ . '/../samples/sample-image.jpg';
+$imageContent = ImageContent::file($imagePath);
+```
+
+#### Using Raw Image Data
+
+```php
+use HelgeSverre\Extractor\Text\ImageContent;
+
+$rawImageData = file_get_contents(__DIR__ . '/../samples/sample-image.jpg');
+$imageContent = ImageContent::raw($rawImageData);
+```
+
+#### Using an Image URL
+
+```php
+use HelgeSverre\Extractor\Text\ImageContent;
+
+$imageUrl = 'https://example.com/sample-image.jpg';
+$imageContent = ImageContent::url($imageUrl);
+```
+
+### Extracting Data from Images with OpenAI's Vision API
+
+After preparing your `ImageContent` object, you can pass it to the `Extractor::fields` method to extract structured data
+using OpenAI's Vision API. For example:
+
+```php
+use HelgeSverre\Extractor\Engine;
+use HelgeSverre\Extractor\Facades\Extractor;
+use HelgeSverre\Extractor\Text\ImageContent;
+
+$imageContent = ImageContent::file(__DIR__ . '/../samples/product-catalog.jpg');
+
+$data = Extractor::fields(
+    $imageContent,
+    fields: [
+        'productName',
+        'price',
+        'description',
+    ],
+    model: Engine::GPT_4_VISION,
+);
+```
+
 ## Creating Custom Extractors
 
 Custom extractors in Extractor allow for tailored data extraction to meet specific needs. Here's how you can create and

diff --git a/composer.json b/composer.json
@@ -29,7 +29,8 @@
         "smalot/pdfparser": "*",
         "spatie/laravel-data": "^3.9",
         "spatie/laravel-package-tools": "^1.14.0",
-        "symfony/dom-crawler": "^6.3"
+        "symfony/dom-crawler": "^7.0.0",
+        "brandembassy/file-type-detector": "^2.3.1"
     },
     "require-dev": {
         "laravel/pint": "^1.0",

diff --git a/resources/prompts/fields-vision.blade.php b/resources/prompts/fields-vision.blade.php
@@ -0,0 +1,9 @@
+You need to carry out data extraction from the provided image and transform it into a structured JSON format.
+The data points you are required to extract include:
+
+{{-- @formatter:off --}}
+{{ $fieldList }}
+
+In a situation where there are no suitable values for any of the above information, kindly set the value as null in your response.
+
+The output should be a JSON object under the key of "{{ $outputKey }}".
diff --git a/src/Engine.php b/src/Engine.php
@@ -3,7 +3,9 @@
 namespace HelgeSverre\Extractor;
 
 use HelgeSverre\Extractor\Extraction\Extractor;
+use HelgeSverre\Extractor\Text\ImageContent;
 use HelgeSverre\Extractor\Text\TextContent;
+use InvalidArgumentException;
 use OpenAI\Laravel\Facades\OpenAI;
 use OpenAI\Responses\Chat\CreateResponse as ChatResponse;
 use OpenAI\Responses\Completions\CreateResponse as CompletionResponse;
@@ -13,6 +15,8 @@ class Engine
     // New
     const GPT_4_1106_PREVIEW = 'gpt-4-1106-preview';
 
+    const GPT_4_VISION = 'gpt-4-vision-preview';
+
     const GPT_3_TURBO_1106 = 'gpt-3.5-turbo-1106';
 
     // GPT-4
@@ -53,7 +57,34 @@ public function run(
             ]),
 
             // New json mode models.
-            $this->supportsJsonMode($model) => OpenAI::chat()->create([
+            $this->isVisionModel($model) => OpenAI::chat()->create([
+                'model' => $model,
+                'max_tokens' => $maxTokens,
+                'temperature' => $temperature,
+                'messages' => [
+                    [
+                        'role' => 'user',
+                        'content' => [
+                            [
+                                'type' => 'text',
+                                'text' => $prompt,
+                            ],
+                            [
+                                'type' => 'image_url',
+                                'image_url' => [
+                                    'url' => match (true) {
+                                        $input instanceof ImageContent && $input->isUrl() => $input->content(),
+                                        $input instanceof ImageContent && $input->isBase64able() => $input->toBase64Url(),
+                                        default => throw new InvalidArgumentException('TODO: replace this exception message')
+                                    },
+                                ],
+                            ],
+                        ],
+                    ],
+                ],
+            ]),
+
+            $this->isJsonModeCompatibleModel($model) => OpenAI::chat()->create([
                 'model' => $model,
                 'max_tokens' => $maxTokens,
                 'temperature' => $temperature,
@@ -81,20 +112,27 @@ public function run(
         return $extractor->process($text);
     }
 
+    /**
+     * Determine whether the given model identifier is one of the vision models.
+     *
+     * @param  string  $model  Model identifier to check.
+     * @return bool True when $model exactly matches an entry in the list below.
+     */
+    public function isVisionModel(string $model): bool
+    {
+        // Strict comparison so identifiers must match exactly, with no type juggling.
+        return in_array($model, [
+            self::GPT_4_VISION,
+        ], true);
+    }
+
     public function isCompletionModel(string $model): bool
     {
         return in_array($model, [
-            'gpt-3.5-turbo-instruct',
-            'text-davinci-003',
-            'text-davinci-002',
+            self::GPT_3_TURBO_INSTRUCT,
+            self::TEXT_DAVINCI_003,
+            self::TEXT_DAVINCI_002,
         ]);
     }
 
-    public function supportsJsonMode(string $model): bool
+    public function isJsonModeCompatibleModel(string $model): bool
     {
         return in_array($model, [
-            'gpt-4-1106-preview',
-            'gpt-3.5-turbo-1106',
+            self::GPT_4_1106_PREVIEW,
+            self::GPT_3_TURBO_1106,
         ]);
     }
 

diff --git a/src/Extraction/Builtins/Fields.php b/src/Extraction/Builtins/Fields.php
@@ -3,6 +3,8 @@
 namespace HelgeSverre\Extractor\Extraction\Builtins;
 
 use HelgeSverre\Extractor\Extraction\Extractor;
+use HelgeSverre\Extractor\Text\ImageContent;
+use Illuminate\Contracts\View\View;
 
 class Fields extends Extractor
 {
@@ -37,8 +39,12 @@ public function prepareInput(array $input): array
         return $input;
     }
 
-    public function viewName(): string
+    public function view($input): View
     {
-        return 'extractor::fields';
+        if ($input['input'] instanceof ImageContent) {
+            return view('extractor::fields-vision', $input);
+        }
+
+        return view('extractor::fields', $input);
     }
 }
diff --git a/src/Extraction/Concerns/DecodesResponse.php b/src/Extraction/Concerns/DecodesResponse.php
@@ -5,6 +5,7 @@
 use HelgeSverre\Extractor\Exceptions\InvalidJsonReturnedError;
 use HelgeSverre\Extractor\Extraction\Extractor;
 use Illuminate\Support\Arr;
+use Illuminate\Support\Str;
 
 /**
  * @mixin Extractor
@@ -21,6 +22,28 @@ public function expectedOutputKey(): string
         return 'output';
     }
 
+    /**
+     * Pull a JSON document out of a raw model response.
+     *
+     * The response is returned untouched when it is already valid JSON. Otherwise, the
+     * text following a Markdown fence tagged "json" (matched case-insensitively, so
+     * "json", "JSON" and "Json" are all accepted) is returned when it is valid JSON.
+     * When neither applies, the original response is returned unchanged so the caller
+     * can decide how to handle the decode failure.
+     *
+     * @param  mixed  $response  Raw response from the model; it is passed to json_validate(), so a string is expected.
+     * @return string|null The best JSON candidate found, or the original response.
+     */
+    public function extractJsonString($response): ?string
+    {
+        // It's already valid JSON
+        if (json_validate($response)) {
+            return $response;
+        }
+
+        // Attempt to extract the JSON from a Markdown code block. stripos() keeps the
+        // fence tag case-insensitive (JSON vs json vs Json).
+        $fenceTag = '```json';
+        $fenceStart = stripos($response, $fenceTag);
+
+        if ($fenceStart !== false) {
+            $maybeJson = substr($response, $fenceStart + strlen($fenceTag));
+
+            // Cut at the last closing fence; when the fence is never closed, keep
+            // everything after the opening tag.
+            $fenceEnd = strrpos($maybeJson, '```');
+
+            if ($fenceEnd !== false) {
+                $maybeJson = substr($maybeJson, 0, $fenceEnd);
+            }
+
+            $maybeJson = trim($maybeJson);
+
+            if (json_validate($maybeJson)) {
+                return $maybeJson;
+            }
+        }
+
+        // TODO: Attempt to recover incorrectly formatted JSON (missing comma, unclosed brace, etc.)
+
+        // TODO: Idea: optional property you can enable on extractor to attempt to "fix" the broken JSON by calling the openai model again.
+
+        return $response;
+    }
+
     public function bootDecodesResponse(): void
     {
         $this->registerPreprocessor(function ($input): mixed {
@@ -31,10 +54,11 @@ public function bootDecodesResponse(): void
 
         $this->registerProcessor(function ($response): mixed {
 
-            $decoded = json_decode($response, true);
+            $maybeJson = $this->extractJsonString($response);
+
+            $decoded = json_decode($maybeJson, true);
 
             if ($decoded === null && $this->throwsOnInvalidJsonResponse()) {
-                // TODO: Attempt recovery by looking between first/last { and }
                 throw new InvalidJsonReturnedError("Invalid JSON returned:\n$response");
             }
 

diff --git a/src/Extraction/Extractor.php b/src/Extraction/Extractor.php
@@ -88,7 +88,7 @@ public function name(): string
         return Str::slug(class_basename(get_class($this)));
     }
 
-    public function preprocess(TextContent|string $input): string
+    public function preprocess(TextContent|string $input): mixed
     {
         foreach (Arr::pluck($this->preprocessors, 'callback') as $preprocessor) {
             $input = $preprocessor($input, $this);

diff --git a/src/ExtractorManager.php b/src/ExtractorManager.php
@@ -6,6 +6,7 @@
 use HelgeSverre\Extractor\Extraction\Builtins\Fields;
 use HelgeSverre\Extractor\Extraction\Builtins\Simple;
 use HelgeSverre\Extractor\Extraction\Extractor;
+use HelgeSverre\Extractor\Text\ImageContent;
 use HelgeSverre\Extractor\Text\TextContent;
 
 class ExtractorManager
@@ -25,7 +26,7 @@ public function extend(string $name, callable $callback): void
     public function extract(
         string|Extractor $nameOrClass,
         TextContent|string $input,
-        array $config = null,
+        ?array $config = null,
         string $model = 'gpt-3.5-turbo-1106',
         int $maxTokens = 2000,
         float $temperature = 0.1,
@@ -48,7 +49,7 @@ public function extract(
     public function view(
         string $view,
         TextContent|string $input,
-        array $config = null,
+        ?array $config = null,
         string $model = 'gpt-3.5-turbo-1106',
         int $maxTokens = 2000,
         float $temperature = 0.1,
@@ -67,9 +68,9 @@ public function view(
     }
 
     public function fields(
-        TextContent|string $input,
+        ImageContent|TextContent|string $input,
         array $fields,
-        array $config = null,
+        ?array $config = null,
         string $model = 'gpt-3.5-turbo-1106',
         int $maxTokens = 2000,
         float $temperature = 0.1,