MM Eval tests #1887

Closed
SalmanMohammadi wants to merge 25 commits into main from mm_tests

Changes from 10 commits

Commits
5cb9140
mm eval tests
SalmanMohammadi Oct 23, 2024
63ba175
mm eval tests
SalmanMohammadi Oct 23, 2024
0331778
Merge branch 'main' into mm_tests
SalmanMohammadi Nov 7, 2024
578aa48
adding test values
SalmanMohammadi Nov 8, 2024
f0a94d7
reverting changes
SalmanMohammadi Nov 8, 2024
df3402c
Merge branch 'main' into mm_tests
SalmanMohammadi Nov 8, 2024
60bccc6
whoops
SalmanMohammadi Nov 8, 2024
6681749
whoops 2
SalmanMohammadi Nov 8, 2024
d214f52
tidy tidy tidy tidy fresh clean
SalmanMohammadi Nov 8, 2024
e3155a1
what is this rounding nonesense?
SalmanMohammadi Nov 8, 2024
7add9af
fixing values
SalmanMohammadi Nov 9, 2024
c3246c0
fixing parameterize
SalmanMohammadi Nov 9, 2024
e3f8178
just put it on teh gpu?
SalmanMohammadi Nov 11, 2024
acd6763
Merge branch 'mm_tests' of github.com:SalmanMohammadi/torchtune into …
SalmanMohammadi Nov 11, 2024
ed3f02e
what a silly billy I am oh boy
SalmanMohammadi Nov 12, 2024
8de3350
is it a python version thing?
SalmanMohammadi Nov 12, 2024
3424c32
it is NOT. BACK TO THE CPU
SalmanMohammadi Nov 12, 2024
abca4d1
back to gpu.. it's a max_seq_len thing??
SalmanMohammadi Nov 12, 2024
5ab8f83
that didn't work...
SalmanMohammadi Nov 12, 2024
19c029e
this is a terrible experience for me
SalmanMohammadi Nov 12, 2024
a691a08
stg if this doesn't work
SalmanMohammadi Nov 12, 2024
e7018fa
Merge branch 'main' into mm_tests
SalmanMohammadi Nov 12, 2024
3bb57fa
I don't even know at this point
SalmanMohammadi Nov 12, 2024
76ff0fd
OKAY this should work right?
SalmanMohammadi Nov 12, 2024
24e24b5
????
SalmanMohammadi Nov 12, 2024
3 changes: 3 additions & 0 deletions tests/cache_artifacts.sh
@@ -18,6 +18,9 @@ SMALL_MODEL_URLS=(
"https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-hf-03082024.pt"
"https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-tune-llama3-05052024.pt"
"https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-hf-reward-07122024.pt"
"https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-meta-vision-10172024.pt"
"https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-hf-vision-10172024.pt"

)
FULL_MODEL_URL=("s3://pytorch-multimodal/llama2-7b-torchtune.pt")
TOKENIZER_URLS=(
131 changes: 114 additions & 17 deletions tests/recipes/test_eleuther_eval.py
@@ -13,7 +13,12 @@
import pytest

from tests.common import TUNE_PATH
from tests.recipes.utils import llama2_test_config, write_hf_ckpt_config
from tests.recipes.utils import (
llama2_test_config,
llama3_2_vision_test_config,
write_hf_ckpt_config,
write_hf_vision_ckpt_config,
)
from tests.test_utils import CKPT_MODEL_PATHS


@@ -26,6 +31,30 @@ class TestEleutherEval:
("truthfulqa_mc2", 0.4, 4),
],
)
@pytest.fixture
def hide_correct_version_number(self, monkeypatch):
import importlib.metadata

import_orig = importlib.metadata.version

def mocked_import(name, *args, **kwargs):
if name == "lm-eval":
return "0.4.4" # Hardcode wrong version number
return import_orig(name, *args, **kwargs)

monkeypatch.setattr(importlib.metadata, "version", mocked_import)

@pytest.fixture
def expected_vision_acc(self):
return {
"Science": 0.2,
"Biology": 0.4,
"Chemistry": 0.0,
"Geography": 0.2,
"Math": 0.0,
"Physics": 0.2,
}

@pytest.mark.integration_test
def test_torchtune_checkpoint_eval_results(
self, caplog, monkeypatch, tmpdir, eval_name, expected_acc, bsz
@@ -74,22 +103,9 @@ def test_torchtune_checkpoint_eval_results(
acc_result = float(search_results.group(1))
assert math.isclose(acc_result, expected_acc, abs_tol=0.05)

@pytest.fixture
def hide_correct_version_number(self, monkeypatch):
import importlib.metadata

import_orig = importlib.metadata.version

def mocked_import(name, *args, **kwargs):
if name == "lm-eval":
return "0.4.4" # Hardcode wrong version number
return import_orig(name, *args, **kwargs)

monkeypatch.setattr(importlib.metadata, "version", mocked_import)

@pytest.mark.integration_test
@pytest.mark.usefixtures("hide_correct_version_number")
def test_eval_recipe_errors_without_lm_eval(self, capsys, monkeypatch, tmpdir):
def test_eval_recipe_errors_without_lm_eval(self, monkeypatch, tmpdir):
ckpt = "llama2_tune"
ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
ckpt_dir = ckpt_path.parent
@@ -123,7 +139,7 @@ def test_eval_recipe_errors_without_lm_eval(self, capsys, monkeypatch, tmpdir):

@pytest.mark.integration_test
def test_eval_recipe_errors_with_quantization_hf_checkpointer(
self, capsys, monkeypatch, tmpdir
self, monkeypatch, tmpdir
):
ckpt = "llama2_hf"
ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
@@ -162,7 +178,7 @@ def test_eval_recipe_errors_with_quantization_hf_checkpointer(
runpy.run_path(TUNE_PATH, run_name="__main__")

@pytest.mark.integration_test
def test_eval_recipe_errors_with_qat_quantizer(self, capsys, monkeypatch, tmpdir):
def test_eval_recipe_errors_with_qat_quantizer(self, monkeypatch, tmpdir):
ckpt = "llama2_tune"
ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
ckpt_dir = ckpt_path.parent
@@ -194,3 +210,84 @@ def test_eval_recipe_errors_with_qat_quantizer(self, capsys, monkeypatch, tmpdir
match="QAT quantizers should only be used during quantization aware training",
):
runpy.run_path(TUNE_PATH, run_name="__main__")

@pytest.mark.integration_test
def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc):
ckpt = "llama3_2_vision_meta"
ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
ckpt_dir = ckpt_path.parent

cmd = f"""
tune run eleuther_eval \
--config llama3_2_vision/11B_evaluation \
output_dir={tmpdir} \
checkpointer=torchtune.training.FullModelMetaCheckpointer \
checkpointer.checkpoint_dir='{ckpt_dir}' \
checkpointer.checkpoint_files=[{ckpt_path}] \
~checkpointer.checkpoint_files.filename_format \
~checkpointer.checkpoint_files.max_filename \
checkpointer.output_dir={tmpdir} \
checkpointer.model_type=LLAMA3_VISION \
tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \
tokenizer.prompt_template=null \
limit=5 \
dtype=bf16 \
device=cpu \
""".split()

model_config = llama3_2_vision_test_config()
cmd = cmd + model_config

monkeypatch.setattr(sys, "argv", cmd)
with pytest.raises(SystemExit, match=""):
runpy.run_path(TUNE_PATH, run_name="__main__")

out = caplog.text

pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)"

matches = re.findall(pattern, out, re.MULTILINE)
for task_name, _, accuracy in matches:
assert math.isclose(float(accuracy), expected_vision_acc[task_name])

@pytest.mark.integration_test
def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc):
ckpt = "llama3_2_vision_hf"
ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
ckpt_dir = ckpt_path.parent

# Config file needed for model conversion.
write_hf_vision_ckpt_config(ckpt_dir)

cmd = f"""
tune run eleuther_eval \
--config llama3_2_vision/11B_evaluation \
output_dir={tmpdir} \
checkpointer=torchtune.training.FullModelHFCheckpointer \
checkpointer.checkpoint_dir='{ckpt_dir}' \
checkpointer.checkpoint_files=[{ckpt_path}]\
~checkpointer.checkpoint_files.filename_format \
~checkpointer.checkpoint_files.max_filename \
checkpointer.output_dir={tmpdir} \
checkpointer.model_type=LLAMA3_VISION \
tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \
tokenizer.prompt_template=null \
limit=5 \
dtype=bf16 \
device=cpu \
""".split()

model_config = llama3_2_vision_test_config()
cmd = cmd + model_config

monkeypatch.setattr(sys, "argv", cmd)
with pytest.raises(SystemExit, match=""):
runpy.run_path(TUNE_PATH, run_name="__main__")

out = caplog.text

pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)"

matches = re.findall(pattern, out, re.MULTILINE)
for task_name, _, accuracy in matches:
assert math.isclose(float(accuracy), expected_vision_acc[task_name])
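
Both vision tests pull per-topic accuracy out of the results table that lm-eval logs and the test captures via caplog. A minimal sketch of how that regex behaves, run against a hand-written row in the assumed shape of lm-eval's markdown table (not output captured from a real run):

```python
import math
import re

# A hand-written row in the assumed shape of lm-eval's markdown results
# table; illustrative only, not captured from a real evaluation run.
sample_output = "| - Science   |      0|none  |     0|acc   |↑  |0.2000|±  |0.0816|\n"

pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)"

# Each match is (task_name, version, accuracy); the tests then compare the
# parsed accuracy against the expected_vision_acc fixture with math.isclose.
for task_name, _, accuracy in re.findall(pattern, sample_output, re.MULTILINE):
    print(task_name, float(accuracy))  # -> Science 0.2
    assert math.isclose(float(accuracy), 0.2)
```
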
73 changes: 73 additions & 0 deletions tests/recipes/utils.py
@@ -128,6 +128,58 @@ def llama3_test_config() -> List[str]:
]


def llama3_2_vision_test_config() -> List[str]:
return [
"model=tests.recipes.utils.dummy_vision_model",
"tokenizer._component_=torchtune.models.llama3_2_vision._transform.Llama3VisionTransform",
"tokenizer.patch_size=9",
"tokenizer.max_num_tiles=2",
"tokenizer.tile_size=18",
"tokenizer.max_seq_len=4096",
]


def dummy_vision_model():
Contributor: we have this in tests/torchtune/modules/test_common_utils.py, opportunity to unify?

Collaborator (Author): It made the most sense to me to define the fixture where it was being used - do you have strong opinions here?

from torchtune.models.llama3_2_vision._component_builders import (
llama3_2_vision_decoder,
llama3_2_vision_encoder,
)
from torchtune.modules.model_fusion import DeepFusionModel

vision_encoder = llama3_2_vision_encoder(
clip_embed_dim=128,
clip_num_layers=4,
num_heads=4,
tile_size=18,
patch_size=9,
max_num_tiles=2,
in_channels=3,
clip_hidden_states=[0, 1],
num_layers_projection=2,
decoder_embed_dim=128,
)
vision_decoder = llama3_2_vision_decoder(
vocab_size=128256,
num_layers=4,
fusion_interval=2,
num_special_tokens=2,
num_heads=8,
num_kv_heads=4,
embed_dim=128,
max_seq_len=4096,
encoder_max_seq_len=4096,
)

model = DeepFusionModel(
encoder=vision_encoder,
decoder=vision_decoder,
encoder_trainable=False,
decoder_trainable=False,
fusion_trainable=False,
)
return model


def lora_llama2_test_config(
lora_attn_modules,
apply_lora_to_mlp: bool = False,
@@ -199,6 +251,27 @@ def write_hf_ckpt_config(ckpt_dir: str):
json.dump(config, f)


def write_hf_vision_ckpt_config(ckpt_dir: str):
config = {
"text_config": {
"num_attention_heads": 8,
"num_key_value_heads": 4,
"hidden_size": 128,
"vocab_size": 128256,
"cross_attention_layers": [1, 4],
},
"vision_config": {
"hidden_size": 128,
"image_size": 18,
"max_num_tiles": 2,
"supported_aspect_ratios": [[1, 1], [1, 2], [2, 1]],
},
}
config_file = Path.joinpath(Path(ckpt_dir), "config.json")
with config_file.open("w") as f:
json.dump(config, f)


MODEL_TEST_CONFIGS = {
"llama2": llama2_test_config(),
"llama3": llama3_test_config(),
2 changes: 2 additions & 0 deletions tests/test_utils.py
@@ -33,6 +33,8 @@
"llama2_hf": "/tmp/test-artifacts/small-ckpt-hf-03082024.pt",
"llama2_reward_hf": "/tmp/test-artifacts/small-ckpt-hf-reward-07122024.pt",
"llama3_tune": "/tmp/test-artifacts/small-ckpt-tune-llama3-05052024.pt",
"llama3_2_vision_hf": "/tmp/test-artifacts/small-ckpt-hf-vision-10172024.pt",
"llama3_2_vision_meta": "/tmp/test-artifacts/small-ckpt-meta-vision-10172024.pt",
"llama2_7b": "/tmp/test-artifacts/llama2-7b-torchtune.pt",
}

1 change: 1 addition & 0 deletions torchtune/models/llama3_2_vision/_component_builders.py
@@ -170,6 +170,7 @@ def llama3_2_vision_decoder(
by :func:`~torchtune.modules.KVCache`.
encoder_max_seq_len (int): maximum sequence length the encoder will be run with, as used
by :func:`~torchtune.modules.KVCache`.
rope_base (int): base for the rotary positional embeddings. Default: 500_000
Contributor: How did our linter not pick this up...

intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified,
this is computed using :func:`~torchtune.modules.scale_hidden_dim_for_mlp`.

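
For context on the newly documented default: a minimal sketch of the standard RoPE frequency schedule that a rope_base value feeds into. Illustrative only, under the usual RoPE formulation; the function name and signature here are assumptions, not the internals of torchtune's rotary embedding module.

```python
import torch

# Standard RoPE schedule: inv_freq_i = rope_base ** (-2i / head_dim).
# Larger rope_base values slow the rotation of low-frequency dimensions,
# which is what lets long-context models stretch their positional range.
def rope_inv_freq(head_dim: int = 64, rope_base: int = 500_000) -> torch.Tensor:
    exponents = torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim
    return 1.0 / (rope_base ** exponents)

print(rope_inv_freq()[:4])  # fastest-rotating frequencies come first
```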