image understanding OCR
Kav-K committed Apr 24, 2023
1 parent 44f9b6a commit 1925f62
Showing 8 changed files with 110 additions and 49 deletions.
4 changes: 2 additions & 2 deletions conversation_starter_pretext.txt
@@ -33,9 +33,9 @@ Human: I'm making a discord bot <|endofstatement|>

There can be an arbitrary number of newlines between chat entries. <username> can be any name; pay attention to who's talking. The text "<|endofstatement|>" is used to separate chat entries and make it easier for you to understand the context.

- Sometimes, users will upload images during a conversation, when that happens, you will already have an understanding of what that image is, you will know what the image is denoted by "Image Info-Caption" and you will have an answer to what the user asked alongside the image denoted by "Image Info-QA".
+ Sometimes users will upload images during a conversation. When that happens, you will already have an understanding of what that image is: a caption for the image is denoted by "Image Info-Caption", an answer to what the user asked alongside the image is denoted by "Image Info-QA", and Optical Character Recognition of the image is denoted by "Image Info-OCR".
For example:
- Human: Image Info-Caption: a landscape with a river and trees\nImage Info-QA: yes\nWhat is this image? Is it cartoony? <|endofstatement|>
+ Human: Image Info-Caption: a landscape with a river and trees\nImage Info-QA: yes\nImage Info-OCR: \nWhat is this image? Is it cartoony? <|endofstatement|>
<yourname>: This image is a landscape with a river and trees. It does look cartoony! <|endofstatement|>
...

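For illustration, here is a minimal Python sketch of how one of these image-annotated entries might be assembled as a string. It mirrors the f-string added to `services/text_service.py` further down; the caption/QA/OCR values are placeholders, not real model output:

```python
# Sketch only: building an image-annotated chat entry in the pretext format.
caption = "a landscape with a river and trees"
qa_answer = "yes"
ocr_text = ""  # empty when the image contains no readable text

entry = (
    f"Human: Image Info-Caption: {caption}\n"
    f"Image Info-QA: {qa_answer}\n"
    f"Image Info-OCR: {ocr_text}\n"
    "What is this image? Is it cartoony? <|endofstatement|>"
)
print(entry)
```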
4 changes: 2 additions & 2 deletions conversation_starter_pretext_minimal.txt
@@ -4,9 +4,9 @@ The conversations are in this format, there can be an arbitrary amount of newlin
<username>: [MESSAGE 1] <|endofstatement|>
<yourname>: [RESPONSE TO MESSAGE 1] <|endofstatement|>

- Sometimes, users will upload images during a conversation, when that happens, you will already have an understanding of what that image is, you will know what the image is denoted by "Image Info-Caption" and you will have an answer to what the user asked alongside the image denoted by "Image Info-QA".
+ Sometimes users will upload images during a conversation. When that happens, you will already have an understanding of what that image is: a caption for the image is denoted by "Image Info-Caption", an answer to what the user asked alongside the image is denoted by "Image Info-QA", and Optical Character Recognition of the image is denoted by "Image Info-OCR".
For example:
- Human: Image Info-Caption: a landscape with a river and trees\nImage Info-QA: yes\nWhat is this image? Is it cartoony? <|endofstatement|>
+ Human: Image Info-Caption: a landscape with a river and trees\nImage Info-QA: yes\nImage Info-OCR: \nWhat is this image? Is it cartoony? <|endofstatement|>
<yourname>: This image is a landscape with a river and trees. It does look cartoony! <|endofstatement|>
...

4 changes: 3 additions & 1 deletion detailed_guides/MULTI-MODALITY.md
@@ -4,4 +4,6 @@ This bot simulates GPT-4 multimodality by using a collection of services to obta

For this functionality to work, you need a Replicate account and a corresponding Replicate API token. You can sign up and get an API key at https://replicate.com/pricing. After getting the key, set `REPLICATE_API_KEY` in your environment file.

The cost to run Replicate for image understanding is roughly $0.0032 per second, and each image takes 0.5-1.0 seconds on average, so expect about $0.0016-$0.0032 per image. This is a small cost, but it adds up over time, so it's not recommended to release this feature to the public unless you're comfortable with the spend or have billing limits set on your Replicate account.

As a second part of multi-modality, the bot will also run OCR on uploaded images. For this to work, you need the 'Google Cloud Vision' API enabled, and your API key placed in the `GOOGLE_SEARCH_API_KEY` field of your `.env` file.
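As a quick sanity check before turning these features on, a small script along these lines (hypothetical, not part of the repo) could verify that both keys are present:

```python
# Hypothetical pre-flight check for the env vars this guide mentions.
import os

for var in ("REPLICATE_API_KEY", "GOOGLE_SEARCH_API_KEY"):
    if not os.getenv(var):
        print(f"{var} is not set; the corresponding image feature will be unavailable.")
```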
2 changes: 1 addition & 1 deletion gpt3discord.py
@@ -33,7 +33,7 @@
from models.openai_model import Model


__version__ = "11.4.6"
__version__ = "11.5.0"


PID_FILE = Path("bot.pid")
90 changes: 90 additions & 0 deletions models/image_understanding_model.py
@@ -0,0 +1,90 @@
import base64
import json
import os

import aiohttp

from services.environment_service import EnvService
import replicate


class ImageUnderstandingModel:
    def __init__(self):
        # Try to get the replicate API key from the environment
        self.replicate_key = EnvService.get_replicate_api_key()
        # Set the environment REPLICATE_API_TOKEN to the replicate API key
        if self.replicate_key:
            os.environ["REPLICATE_API_TOKEN"] = self.replicate_key
            self.key_set = True
        else:
            self.key_set = False

        self.google_cloud_project_id = EnvService.get_google_cloud_project_id()
        self.google_cloud_api_key = EnvService.get_google_search_api_key()

    def get_is_usable(self):
        return self.key_set

    def ask_image_question(self, prompt, filepath):
        # Visual question answering via BLIP-2 on Replicate
        output = replicate.run(
            "andreasjansson/blip-2:4b32258c42e9efd4288bb9910bc532a69727f9acd26aa08e175713a0a857a608",
            input={"image": open(filepath, "rb"), "question": prompt},
        )
        return output

    def get_image_caption(self, filepath):
        # Plain captioning via BLIP-2 on Replicate
        output = replicate.run(
            "andreasjansson/blip-2:4b32258c42e9efd4288bb9910bc532a69727f9acd26aa08e175713a0a857a608",
            input={"image": open(filepath, "rb"), "caption": True},
        )
        return output

    def get_image_stylistic_caption(self, filepath):
        # Stylistic caption via CLIP Interrogator on Replicate
        output = replicate.run(
            "pharmapsychotic/clip-interrogator:a4a8bafd6089e1716b06057c42b19378250d008b80fe87caa5cd36d40c1eda90",
            input={"image": open(filepath, "rb")},
        )
        return output

    async def do_image_ocr(self, filepath):
        if not self.google_cloud_api_key:
            return "None"

        # Read the image file and encode it in base64 format
        with open(filepath, 'rb') as image_file:
            encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

        # Prepare the JSON payload
        payload = {
            "requests": [
                {
                    "image": {"content": encoded_image},
                    "features": [{"type": "TEXT_DETECTION"}],
                }
            ]
        }

        header = {
            "Content-Type": "application/json; charset=utf-8",
        }

        url = f"https://vision.googleapis.com/v1/images:annotate?key={self.google_cloud_api_key}"

        # Send the async request
        async with aiohttp.ClientSession() as session:
            async with session.post(
                url, headers=header, data=json.dumps(payload)
            ) as response:
                result = await response.json()

                if response.status == 200:
                    # Get fullTextAnnotation
                    full_text_annotation = result.get('responses', [])[0].get(
                        'fullTextAnnotation'
                    )

                    if full_text_annotation:
                        extracted_text = full_text_annotation.get('text')

                        # Return the extracted text
                        return extracted_text
                    else:
                        return ""
                else:
                    raise Exception(
                        f"Google Cloud Vision API returned an error. "
                        f"Status code: {response.status}, Error: {result}"
                    )
40 changes: 0 additions & 40 deletions models/replicate_model.py

This file was deleted.

8 changes: 8 additions & 0 deletions services/environment_service.py
@@ -533,3 +533,11 @@ def get_max_deep_compose_price():
            return deep_compose_price
        except Exception:
            return 3.00
+
+    @staticmethod
+    def get_google_cloud_project_id():
+        try:
+            google_cloud_project_id = os.getenv("GOOGLE_CLOUD_PROJECT_ID")
+            return google_cloud_project_id
+        except Exception:
+            return None
7 changes: 4 additions & 3 deletions services/text_service.py
@@ -12,7 +12,7 @@
import unidecode

from models.embed_statics_model import EmbedStatics
- from models.replicate_model import ImageUnderstandingModel
+ from models.image_understanding_model import ImageUnderstandingModel
from services.deletion_service import Deletion
from models.openai_model import Model, Override, Models
from models.user_model import EmbeddedConversationItem, RedoUser
@@ -756,7 +756,7 @@ async def process_conversation_message(
) as temp_file:
    await file.save(temp_file.name)
    try:
-       image_caption, image_qa = await asyncio.gather(
+       image_caption, image_qa, image_ocr = await asyncio.gather(
            asyncio.to_thread(
                image_understanding_model.get_image_caption,
                temp_file.name,
@@ -766,9 +766,10 @@
                prompt,
                temp_file.name,
            ),
+           image_understanding_model.do_image_ocr(temp_file.name)
        )
        prompt = (
-           f"Image Info-Caption: {image_caption}\nImage Info-QA: {image_qa}\n"
+           f"Image Info-Caption: {image_caption}\nImage Info-QA: {image_qa}\nImage Info-OCR: {image_ocr}\n"
            + prompt
        )
        try:
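The pattern in this hunk, wrapping the blocking `replicate.run`-backed methods in `asyncio.to_thread` and gathering them with the native `do_image_ocr` coroutine, can be reduced to a standalone sketch; the stand-in functions below are hypothetical:

```python
# Standalone sketch of the gather pattern: blocking work runs in worker threads
# while the async OCR-style coroutine runs on the event loop, awaited together.
import asyncio
import time


def blocking_caption():
    time.sleep(0.1)  # stand-in for a synchronous replicate.run call
    return "caption"


def blocking_qa():
    time.sleep(0.1)  # stand-in for the second synchronous replicate.run call
    return "qa"


async def async_ocr():
    await asyncio.sleep(0.1)  # stand-in for the aiohttp Vision request
    return "ocr"


async def main():
    caption, qa, ocr = await asyncio.gather(
        asyncio.to_thread(blocking_caption),
        asyncio.to_thread(blocking_qa),
        async_ocr(),
    )
    print(caption, qa, ocr)


asyncio.run(main())
```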
