feat: replicate demo integration #13

Open · wants to merge 1 commit into base: main
17 changes: 17 additions & 0 deletions .dockerignore
@@ -0,0 +1,17 @@
# The .dockerignore file excludes files from the container build process.
#
# https://docs.docker.com/engine/reference/builder/#dockerignore-file

# Exclude Git files
.git
.github
.gitignore

# Exclude Python cache files
__pycache__
.mypy_cache
.pytest_cache
.ruff_cache

# Exclude Python virtual environment
/venv
4 changes: 3 additions & 1 deletion README.md
@@ -17,7 +17,9 @@
<a href="https://huggingface.co/deepseek-ai" target="_blank">
<img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-DeepSeek%20AI-ffc107?color=ffc107&logoColor=white" />
</a>

<a href="https://replicate.com/lucataco/deepseek-vl-7b-base" target="_blank_">
<img src="https://replicate.com/lucataco/deepseek-vl-7b-base/badge" alt="Replicate"/>
</a>
</div>


19 changes: 19 additions & 0 deletions cog.yaml
@@ -0,0 +1,19 @@
# Configuration for Cog ⚙️
# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md

build:
  gpu: true
  python_version: "3.9"
  python_packages:
    - "accelerate==0.27.2"
    - "attrdict==2.0.1"
    - "einops==0.7.0"
    - "sentencepiece==0.2.0"
    - "torch==2.0.1"
    - "torchvision==0.15.2"
    - "transformers>=4.38.2"
    - "timm>=0.9.16"
    - "hf_transfer==0.1.6"

# predict.py defines how predictions are run on your model
predict: "predict.py:Predictor"
82 changes: 82 additions & 0 deletions predict.py
Review comment (Collaborator):
@lucataco How should this code be used?

Reply (Author):
You can use Cog to run the DeepSeek-VL model:

cog predict -i image=@images/monday.jpg -i prompt="Describe this image"

This PR is the same as our PR to your other model, DeepSeek-Math.
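Once the model is published on Replicate, it could presumably also be called programmatically; a minimal sketch, assuming the official replicate Python client and the lucataco/deepseek-vl-7b-base model name that the README badge links to:

# Sketch only (not part of this PR): calling the hosted model with the Replicate Python client.
# Assumes `pip install replicate` and REPLICATE_API_TOKEN set in the environment.
import replicate

output = replicate.run(
    "lucataco/deepseek-vl-7b-base",  # model name taken from the README badge
    input={
        "image": open("images/monday.jpg", "rb"),  # hypothetical local sample image
        "prompt": "Describe this image",
        "max_new_tokens": 512,
    },
)

# The predictor streams text chunks, so iterate and print them as they arrive.
for chunk in output:
    print(chunk, end="", flush=True)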

@@ -0,0 +1,82 @@
# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md

from cog import BasePredictor, Input, Path, ConcatenateIterator
import os
import torch
from threading import Thread
from deepseek_vl.utils.io import load_pil_images
from transformers import AutoModelForCausalLM, TextIteratorStreamer
from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM

# Enable faster download speed
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
MODEL_NAME = "deepseek-ai/deepseek-vl-7b-base"
CACHE_DIR = "checkpoints"


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        self.vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
            MODEL_NAME,
            cache_dir=CACHE_DIR
        )
        self.tokenizer = self.vl_chat_processor.tokenizer
        vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.bfloat16,
            cache_dir=CACHE_DIR
        )
        self.vl_gpt = vl_gpt.to('cuda')

    @torch.inference_mode()
    def predict(
        self,
        image: Path = Input(description="Input image"),
        prompt: str = Input(description="Input prompt", default="Describe this image"),
        max_new_tokens: int = Input(description="Maximum number of tokens to generate", default=512)
    ) -> ConcatenateIterator[str]:
        """Run a single prediction on the model"""
        conversation = [
            {
                "role": "User",
                "content": "<image_placeholder>" + prompt,
                "images": [str(image)]
            },
            {
                "role": "Assistant",
                "content": ""
            }
        ]

        # Load images and prepare model inputs
        pil_images = load_pil_images(conversation)
        prepare_inputs = self.vl_chat_processor(
            conversations=conversation,
            images=pil_images,
            force_batchify=True
        ).to('cuda')

        # Stream decoded tokens as they are generated
        streamer = TextIteratorStreamer(
            self.tokenizer, skip_prompt=True, skip_special_tokens=True
        )

        # Run generation in a background thread so tokens can be yielded as they arrive
        thread = Thread(
            target=self.vl_gpt.language_model.generate,
            kwargs={
                "inputs_embeds": self.vl_gpt.prepare_inputs_embeds(**prepare_inputs),
                "attention_mask": prepare_inputs.attention_mask,
                "pad_token_id": self.tokenizer.eos_token_id,
                "bos_token_id": self.tokenizer.bos_token_id,
                "eos_token_id": self.tokenizer.eos_token_id,
                "max_new_tokens": max_new_tokens,
                "do_sample": False,
                "use_cache": True,
                "streamer": streamer,
            },
        )
        thread.start()
        for new_token in streamer:
            yield new_token
        thread.join()
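Outside of Cog, the predictor could presumably also be exercised directly as a quick local smoke test; a rough sketch, assuming a CUDA GPU, the cog and deepseek_vl packages installed, and a hypothetical sample image at images/monday.jpg:

# Rough local smoke test (not part of this PR): drives Predictor directly.
# Assumptions: CUDA GPU available, cog and deepseek_vl installed, and a
# hypothetical sample image at images/monday.jpg. Cog normally constructs
# and calls Predictor itself when serving predictions.
from predict import Predictor

predictor = Predictor()
predictor.setup()  # downloads and loads deepseek-ai/deepseek-vl-7b-base onto the GPU

# predict() is a generator that streams decoded text chunks
for chunk in predictor.predict(
    image="images/monday.jpg",
    prompt="Describe this image",
    max_new_tokens=512,
):
    print(chunk, end="", flush=True)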