diff --git a/conversation_starter_pretext.txt b/conversation_starter_pretext.txt
index 8b15483f..caf144bf 100644
--- a/conversation_starter_pretext.txt
+++ b/conversation_starter_pretext.txt
@@ -33,9 +33,9 @@ Human: I'm making a discord bot <|endofstatement|>
 There can be an arbitrary amount of newlines between chat entries. can be any name, pay attention to who's talking. The text "<|endofstatement|>" is used to separate chat entries and make it easier for you to understand the context.
-Sometimes, users will upload images during a conversation, when that happens, you will already have an understanding of what that image is, you will know what the image is denoted by "Image Info-Caption" and you will have an answer to what the user asked alongside the image denoted by "Image Info-QA".
+Sometimes, users will upload images during a conversation, when that happens, you will already have an understanding of what that image is, you will know what the image is denoted by "Image Info-Caption" and you will have an answer to what the user asked alongside the image denoted by "Image Info-QA". Optical Character Recognition of the image will be denoted by "Image Info-OCR"
 For example:
-Human: Image Info-Caption: a landscape with a river and trees\nImage Info-QA: yes\nWhat is this image? Is it cartoony? <|endofstatement|>
+Human: Image Info-Caption: a landscape with a river and trees\nImage Info-QA: yes\nImage Info-OCR: \nWhat is this image? Is it cartoony? <|endofstatement|>
 : This image is a landscape with a river and trees. It does look cartoony! <|endofstatement|>
 ...
diff --git a/conversation_starter_pretext_minimal.txt b/conversation_starter_pretext_minimal.txt
index 265b9810..71e42bdc 100644
--- a/conversation_starter_pretext_minimal.txt
+++ b/conversation_starter_pretext_minimal.txt
@@ -4,9 +4,9 @@ The conversations are in this format, there can be an arbitrary amount of newlin
 : [MESSAGE 1] <|endofstatement|>
 : [RESPONSE TO MESSAGE 1] <|endofstatement|>
-Sometimes, users will upload images during a conversation, when that happens, you will already have an understanding of what that image is, you will know what the image is denoted by "Image Info-Caption" and you will have an answer to what the user asked alongside the image denoted by "Image Info-QA".
+Sometimes, users will upload images during a conversation, when that happens, you will already have an understanding of what that image is, you will know what the image is denoted by "Image Info-Caption" and you will have an answer to what the user asked alongside the image denoted by "Image Info-QA". Optical Character Recognition of the image will be denoted by "Image Info-OCR"
 For example:
-Human: Image Info-Caption: a landscape with a river and trees\nImage Info-QA: yes\nWhat is this image? Is it cartoony? <|endofstatement|>
+Human: Image Info-Caption: a landscape with a river and trees\nImage Info-QA: yes\nImage Info-OCR: \nWhat is this image? Is it cartoony? <|endofstatement|>
 : This image is a landscape with a river and trees. It does look cartoony! <|endofstatement|>
 ...
diff --git a/detailed_guides/MULTI-MODALITY.md b/detailed_guides/MULTI-MODALITY.md
index 087a9e06..5dcf26b3 100644
--- a/detailed_guides/MULTI-MODALITY.md
+++ b/detailed_guides/MULTI-MODALITY.md
@@ -4,4 +4,6 @@ This bot simulates GPT-4 multimodality by using a collection of services to obta
 For this functionality to work, you need a replicate account, and a corresponding replicate api token. You can sign up and get an api key at https://replicate.com/pricing.
 After getting the key, set `REPLICATE_API_KEY` in your environment file.
 
-The cost to run replicate for image understanding is roughly $0.0032 per second, it will take on average 0.5-1.0 seconds per image. This is a small cost, but it will add up over time, so it's not recommended to release this feature to the public unless you're comfortable with it or have billing limits set on your replicate account.
\ No newline at end of file
+The cost to run replicate for image understanding is roughly $0.0032 per second, it will take on average 0.5-1.0 seconds per image. This is a small cost, but it will add up over time, so it's not recommended to release this feature to the public unless you're comfortable with it or have billing limits set on your replicate account.
+
+As a second part of multi-modality, the bot will do OCR on uploaded images. You need to have the 'Google Cloud Vision' API enabled, and put your API key in the `GOOGLE_SEARCH_API_KEY` field in your `.env` file for this to work.
\ No newline at end of file
diff --git a/gpt3discord.py b/gpt3discord.py
index c221d2f2..9393f755 100644
--- a/gpt3discord.py
+++ b/gpt3discord.py
@@ -33,7 +33,7 @@ from models.openai_model import Model
 
-__version__ = "11.4.6"
+__version__ = "11.5.0"
 
 PID_FILE = Path("bot.pid")
diff --git a/models/image_understanding_model.py b/models/image_understanding_model.py
new file mode 100644
index 00000000..be668498
--- /dev/null
+++ b/models/image_understanding_model.py
@@ -0,0 +1,90 @@
+import base64
+import json
+import os
+
+import aiohttp
+
+from services.environment_service import EnvService
+import replicate
+
+
+class ImageUnderstandingModel:
+    def __init__(self):
+        # Try to get the replicate API key from the environment
+        self.replicate_key = EnvService.get_replicate_api_key()
+        # Set the environment REPLICATE_API_TOKEN to the replicate API key
+        if self.replicate_key:
+            os.environ["REPLICATE_API_TOKEN"] = self.replicate_key
+            self.key_set = True
+        else:
+            self.key_set = False
+
+        self.google_cloud_project_id = EnvService.get_google_cloud_project_id()
+        self.google_cloud_api_key = EnvService.get_google_search_api_key()
+
+    def get_is_usable(self):
+        return self.key_set
+
+    def ask_image_question(self, prompt, filepath):
+        output = replicate.run(
+            "andreasjansson/blip-2:4b32258c42e9efd4288bb9910bc532a69727f9acd26aa08e175713a0a857a608",
+            input={"image": open(filepath, "rb"), "question": prompt},
+        )
+        return output
+
+    def get_image_caption(self, filepath):
+        output = replicate.run(
+            "andreasjansson/blip-2:4b32258c42e9efd4288bb9910bc532a69727f9acd26aa08e175713a0a857a608",
+            input={"image": open(filepath, "rb"), "caption": True},
+        )
+        return output
+
+    def get_image_stylistic_caption(self, filepath):
+        output = replicate.run(
+            "pharmapsychotic/clip-interrogator:a4a8bafd6089e1716b06057c42b19378250d008b80fe87caa5cd36d40c1eda90",
+            input={"image": open(filepath, "rb")},
+        )
+        return output
+
+    async def do_image_ocr(self, filepath):
+        # Read the image file and encode it in base64 format
+        if not self.google_cloud_api_key:
+            return "None"
+        with open(filepath, 'rb') as image_file:
+            encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
+
+        # Prepare the JSON payload
+        payload = {
+            "requests": [
+                {
+                    "image": {"content": encoded_image},
+                    "features": [{"type": "TEXT_DETECTION"}]
+                }
+            ]
+        }
+
+        header = {
+            "Content-Type": "application/json; charset=utf-8",
+        }
+
+        url = f"https://vision.googleapis.com/v1/images:annotate?key={self.google_cloud_api_key}"
+
+        # Send the async request
+        async with aiohttp.ClientSession() as session:
+            async with session.post(url, headers=header, data=json.dumps(payload)) as response:
+                result = await response.json()
+
+                if response.status == 200:
+                    # Get fullTextAnnotation
+                    full_text_annotation = result.get('responses', [])[0].get('fullTextAnnotation')
+
+                    if full_text_annotation:
+                        extracted_text = full_text_annotation.get('text')
+
+                        # Return the extracted text
+                        return extracted_text
+                    else:
+                        return ""
+                else:
+                    raise Exception(
+                        f"Google Cloud Vision API returned an error. Status code: {response.status}, Error: {result}")
diff --git a/models/replicate_model.py b/models/replicate_model.py
deleted file mode 100644
index 1ec883da..00000000
--- a/models/replicate_model.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import os
-
-from services.environment_service import EnvService
-import replicate
-
-
-class ImageUnderstandingModel:
-    def __init__(self):
-        # Try to get the replicate API key from the environment
-        self.replicate_key = EnvService.get_replicate_api_key()
-        # Set the environment REPLICATE_API_TOKEN to the replicate API key
-        if self.replicate_key:
-            os.environ["REPLICATE_API_TOKEN"] = self.replicate_key
-            self.key_set = True
-        else:
-            self.key_set = False
-
-    def get_is_usable(self):
-        return self.key_set
-
-    def ask_image_question(self, prompt, filepath):
-        output = replicate.run(
-            "andreasjansson/blip-2:4b32258c42e9efd4288bb9910bc532a69727f9acd26aa08e175713a0a857a608",
-            input={"image": open(filepath, "rb"), "question": prompt},
-        )
-        return output
-
-    def get_image_caption(self, filepath):
-        output = replicate.run(
-            "andreasjansson/blip-2:4b32258c42e9efd4288bb9910bc532a69727f9acd26aa08e175713a0a857a608",
-            input={"image": open(filepath, "rb"), "caption": True},
-        )
-        return output
-
-    def get_image_stylistic_caption(self, filepath):
-        output = replicate.run(
-            "pharmapsychotic/clip-interrogator:a4a8bafd6089e1716b06057c42b19378250d008b80fe87caa5cd36d40c1eda90",
-            input={"image": open(filepath, "rb")},
-        )
-        return output
diff --git a/services/environment_service.py b/services/environment_service.py
index ba80b677..90ee3add 100644
--- a/services/environment_service.py
+++ b/services/environment_service.py
@@ -533,3 +533,11 @@ def get_max_deep_compose_price():
             return deep_compose_price
         except Exception:
             return 3.00
+
+    @staticmethod
+    def get_google_cloud_project_id():
+        try:
+            google_cloud_project_id = os.getenv("GOOGLE_CLOUD_PROJECT_ID")
+            return google_cloud_project_id
+        except Exception:
+            return None
diff --git a/services/text_service.py b/services/text_service.py
index 278581c2..1712c27e 100644
--- a/services/text_service.py
+++ b/services/text_service.py
@@ -12,7 +12,7 @@ import unidecode
 from models.embed_statics_model import EmbedStatics
-from models.replicate_model import ImageUnderstandingModel
+from models.image_understanding_model import ImageUnderstandingModel
 from services.deletion_service import Deletion
 from models.openai_model import Model, Override, Models
 from models.user_model import EmbeddedConversationItem, RedoUser
@@ -756,7 +756,7 @@ async def process_conversation_message(
                     ) as temp_file:
                         await file.save(temp_file.name)
                         try:
-                            image_caption, image_qa = await asyncio.gather(
+                            image_caption, image_qa, image_ocr = await asyncio.gather(
                                 asyncio.to_thread(
                                     image_understanding_model.get_image_caption,
                                     temp_file.name,
@@ -766,9 +766,10 @@ async def process_conversation_message(
                                     prompt,
                                     temp_file.name,
                                 ),
+                                image_understanding_model.do_image_ocr(temp_file.name)
                             )
                             prompt = (
-                                f"Image Info-Caption: {image_caption}\nImage Info-QA: {image_qa}\n"
+                                f"Image Info-Caption: {image_caption}\nImage Info-QA: {image_qa}\nImage Info-OCR: {image_ocr}\n"
                                 + prompt
                             )
                             try:
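For reviewer orientation, a minimal standalone sketch of how the new ImageUnderstandingModel is exercised by the conversation flow changed in services/text_service.py above. The helper name describe_image, the sample.png path, and the question string are illustrative only and not part of this diff; REPLICATE_API_KEY and GOOGLE_SEARCH_API_KEY are assumed to be set in the environment.

import asyncio

from models.image_understanding_model import ImageUnderstandingModel


async def describe_image(filepath: str, question: str) -> str:
    model = ImageUnderstandingModel()

    # Caption and QA are blocking replicate calls, so they run in worker threads;
    # the new do_image_ocr coroutine is awaited alongside them, mirroring text_service.py.
    caption, answer, ocr_text = await asyncio.gather(
        asyncio.to_thread(model.get_image_caption, filepath),
        asyncio.to_thread(model.ask_image_question, question, filepath),
        model.do_image_ocr(filepath),
    )
    return (
        f"Image Info-Caption: {caption}\n"
        f"Image Info-QA: {answer}\n"
        f"Image Info-OCR: {ocr_text}\n"
    )


if __name__ == "__main__":
    # "sample.png" is a placeholder path; if GOOGLE_SEARCH_API_KEY is unset,
    # do_image_ocr returns the string "None" rather than raising.
    print(asyncio.run(describe_image("sample.png", "What is in this image?")))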