Skip to content

Commit

Permalink
remove "qwen_vl_utils" dependency
Browse files Browse the repository at this point in the history
  • Loading branch information
ZX-ModelCloud committed Dec 20, 2024
1 parent c31e767 commit 977ed38
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 14 deletions.
28 changes: 21 additions & 7 deletions gptqmodel/models/definitions/qwen2_vl.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
from typing import Dict
from typing import Dict, Optional
from PIL import Image

from transformers import AutoModelForVision2Seq, Qwen2VLProcessor

from ..base import BaseGPTQModel
from ...utils.calibration import batched
from ...utils.image import fetch_image, extract_vision_info
from ...utils.model import MODALITY


class Qwen2VLGPTQ(BaseGPTQModel):
require_pkgs_version = ["qwen_vl_utils>=0.0.8"]

loader = AutoModelForVision2Seq

base_modules = ["model.embed_tokens", "model.norm"]
Expand Down Expand Up @@ -58,6 +58,22 @@ class Qwen2VLGPTQ(BaseGPTQModel):
}
}

@staticmethod
def process_vision_info(
conversations: list[dict] | list[list[dict]],
) -> Optional[list[Image.Image]]:
vision_infos = extract_vision_info(conversations)
# Read images
image_inputs = []
for vision_info in vision_infos:
if "image" in vision_info or "image_url" in vision_info:
image_inputs.append(fetch_image(vision_info))
else:
raise ValueError("image, image_url should in content.")
if len(image_inputs) == 0:
image_inputs = None
return image_inputs

    def preprocess_dataset(self, sample: Dict) -> Dict:
        # Per-sample hook invoked by ``batched`` while building the calibration
        # set; Qwen2-VL chat samples are already in the expected message
        # format, so this is an identity pass-through.
        return sample

Expand All @@ -66,19 +82,17 @@ def prepare_dataset(
calibration_dataset,
batch_size: int = 1,
tokenizer=None, ):
from qwen_vl_utils import process_vision_info

processor = Qwen2VLProcessor.from_pretrained(self.model_id_or_path)
calib_data = []
for batch in batched(calibration_dataset, batch_size, process_func=self.preprocess_dataset):
text = processor.apply_chat_template(
batch, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(batch)
image_inputs = self.process_vision_info(batch)
inputs = processor(
text=text,
images=image_inputs,
videos=video_inputs,
videos=None,
padding=True,
return_tensors="pt",
)
Expand Down
21 changes: 20 additions & 1 deletion gptqmodel/utils/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,25 @@
import requests
import base64


def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[dict]:
vision_infos = []
if isinstance(conversations[0], dict):
conversations = [conversations]
for conversation in conversations:
for message in conversation:
if isinstance(message["content"], list):
for ele in message["content"]:
if (
"image" in ele
or "image_url" in ele
or "video" in ele
or ele["type"] in ("image", "image_url", "video")
):
vision_infos.append(ele)
return vision_infos


def fetch_image(ele: dict[str, str | Image.Image]) -> Image.Image:
if "image" in ele:
image = ele["image"]
Expand All @@ -24,4 +43,4 @@ def fetch_image(ele: dict[str, str | Image.Image]) -> Image.Image:
image_obj = Image.open(image)
if image_obj is None:
raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
return image_obj
return image_obj
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ packaging>=24.2
device-smi==0.3.3
sentencepiece>=0.2.0
protobuf>=5.29.1
pillow>=10.4.0
9 changes: 3 additions & 6 deletions tests/models/test_qwen2_vl.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@


from gptqmodel.models.definitions.qwen2_vl import Qwen2VLGPTQ
from model_test import ModelTest


Expand Down Expand Up @@ -34,13 +33,11 @@ def test_qwen2_vl(self):
messages, tokenize=False, add_generation_prompt=True
)

from qwen_vl_utils import process_vision_info

image_inputs, video_inputs = process_vision_info(messages)
image_inputs, video_inputs = Qwen2VLGPTQ.process_vision_info(messages)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
videos=None,
padding=True,
return_tensors="pt",
)
Expand Down

0 comments on commit 977ed38

Please sign in to comment.