MM Eval tests #1887

Closed
SalmanMohammadi wants to merge 25 commits into main from mm_tests

Changes from 10 commits

Commits
5cb9140
mm eval tests
SalmanMohammadi Oct 23, 2024
63ba175
mm eval tests
SalmanMohammadi Oct 23, 2024
0331778
Merge branch 'main' into mm_tests
SalmanMohammadi Nov 7, 2024
578aa48
adding test values
SalmanMohammadi Nov 8, 2024
f0a94d7
reverting changes
SalmanMohammadi Nov 8, 2024
df3402c
Merge branch 'main' into mm_tests
SalmanMohammadi Nov 8, 2024
60bccc6
whoops
SalmanMohammadi Nov 8, 2024
6681749
whoops 2
SalmanMohammadi Nov 8, 2024
d214f52
tidy tidy tidy tidy fresh clean
SalmanMohammadi Nov 8, 2024
e3155a1
what is this rounding nonesense?
SalmanMohammadi Nov 8, 2024
7add9af
fixing values
SalmanMohammadi Nov 9, 2024
c3246c0
fixing parameterize
SalmanMohammadi Nov 9, 2024
e3f8178
just put it on teh gpu?
SalmanMohammadi Nov 11, 2024
acd6763
Merge branch 'mm_tests' of github.com:SalmanMohammadi/torchtune into …
SalmanMohammadi Nov 11, 2024
ed3f02e
what a silly billy I am oh boy
SalmanMohammadi Nov 12, 2024
8de3350
is it a python version thing?
SalmanMohammadi Nov 12, 2024
3424c32
it is NOT. BACK TO THE CPU
SalmanMohammadi Nov 12, 2024
abca4d1
back to gpu.. it's a max_seq_len thing??
SalmanMohammadi Nov 12, 2024
5ab8f83
that didn't work...
SalmanMohammadi Nov 12, 2024
19c029e
this is a terrible experience for me
SalmanMohammadi Nov 12, 2024
a691a08
stg if this doesn't work
SalmanMohammadi Nov 12, 2024
e7018fa
Merge branch 'main' into mm_tests
SalmanMohammadi Nov 12, 2024
3bb57fa
I don't even know at this point
SalmanMohammadi Nov 12, 2024
76ff0fd
OKAY this should work right?
SalmanMohammadi Nov 12, 2024
24e24b5
????
SalmanMohammadi Nov 12, 2024
3 changes: 3 additions & 0 deletions tests/cache_artifacts.sh
@@ -18,6 +18,9 @@ SMALL_MODEL_URLS=(
"https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-hf-03082024.pt"
"https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-tune-llama3-05052024.pt"
"https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-hf-reward-07122024.pt"
"https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-meta-vision-10172024.pt"
"https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-hf-vision-10172024.pt"

)
FULL_MODEL_URL=("s3://pytorch-multimodal/llama2-7b-torchtune.pt")
TOKENIZER_URLS=(
131 changes: 114 additions & 17 deletions tests/recipes/test_eleuther_eval.py
@@ -13,7 +13,12 @@
import pytest

from tests.common import TUNE_PATH
from tests.recipes.utils import llama2_test_config, write_hf_ckpt_config
from tests.recipes.utils import (
llama2_test_config,
llama3_2_vision_test_config,
write_hf_ckpt_config,
write_hf_vision_ckpt_config,
)
from tests.test_utils import CKPT_MODEL_PATHS


@@ -26,6 +31,30 @@ class TestEleutherEval:
("truthfulqa_mc2", 0.4, 4),
],
)
@pytest.fixture
def hide_correct_version_number(self, monkeypatch):
import importlib.metadata

import_orig = importlib.metadata.version

def mocked_import(name, *args, **kwargs):
if name == "lm-eval":
return "0.4.4" # Hardcode wrong version number
return import_orig(name, *args, **kwargs)

monkeypatch.setattr(importlib.metadata, "version", mocked_import)

@pytest.fixture
def expected_vision_acc(self):
return {
"Science": 0.2,
"Biology": 0.4,
"Chemistry": 0.0,
"Geography": 0.2,
"Math": 0.0,
"Physics": 0.2,
}

@pytest.mark.integration_test
def test_torchtune_checkpoint_eval_results(
self, caplog, monkeypatch, tmpdir, eval_name, expected_acc, bsz
@@ -74,22 +103,9 @@ def test_torchtune_checkpoint_eval_results(
acc_result = float(search_results.group(1))
assert math.isclose(acc_result, expected_acc, abs_tol=0.05)

@pytest.fixture
def hide_correct_version_number(self, monkeypatch):
import importlib.metadata

import_orig = importlib.metadata.version

def mocked_import(name, *args, **kwargs):
if name == "lm-eval":
return "0.4.4" # Hardcode wrong version number
return import_orig(name, *args, **kwargs)

monkeypatch.setattr(importlib.metadata, "version", mocked_import)

@pytest.mark.integration_test
@pytest.mark.usefixtures("hide_correct_version_number")
def test_eval_recipe_errors_without_lm_eval(self, capsys, monkeypatch, tmpdir):
def test_eval_recipe_errors_without_lm_eval(self, monkeypatch, tmpdir):
ckpt = "llama2_tune"
ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
ckpt_dir = ckpt_path.parent
@@ -123,7 +139,7 @@ def test_eval_recipe_errors_without_lm_eval(self, capsys, monkeypatch, tmpdir):

@pytest.mark.integration_test
def test_eval_recipe_errors_with_quantization_hf_checkpointer(
self, capsys, monkeypatch, tmpdir
self, monkeypatch, tmpdir
):
ckpt = "llama2_hf"
ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
@@ -162,7 +178,7 @@ def test_eval_recipe_errors_with_quantization_hf_checkpointer(
runpy.run_path(TUNE_PATH, run_name="__main__")

@pytest.mark.integration_test
def test_eval_recipe_errors_with_qat_quantizer(self, capsys, monkeypatch, tmpdir):
def test_eval_recipe_errors_with_qat_quantizer(self, monkeypatch, tmpdir):
ckpt = "llama2_tune"
ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
ckpt_dir = ckpt_path.parent
@@ -194,3 +210,84 @@ def test_eval_recipe_errors_with_qat_quantizer(self, capsys, monkeypatch, tmpdir
match="QAT quantizers should only be used during quantization aware training",
):
runpy.run_path(TUNE_PATH, run_name="__main__")

@pytest.mark.integration_test
def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc):
ckpt = "llama3_2_vision_meta"
ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
ckpt_dir = ckpt_path.parent

cmd = f"""
tune run eleuther_eval \
--config llama3_2_vision/11B_evaluation \
output_dir={tmpdir} \
checkpointer=torchtune.training.FullModelMetaCheckpointer \
checkpointer.checkpoint_dir='{ckpt_dir}' \
checkpointer.checkpoint_files=[{ckpt_path}] \
~checkpointer.checkpoint_files.filename_format \
~checkpointer.checkpoint_files.max_filename \
checkpointer.output_dir={tmpdir} \
checkpointer.model_type=LLAMA3_VISION \
tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \
tokenizer.prompt_template=null \
limit=5 \
dtype=bf16 \
device=cpu \
""".split()

model_config = llama3_2_vision_test_config()
cmd = cmd + model_config

monkeypatch.setattr(sys, "argv", cmd)
with pytest.raises(SystemExit, match=""):
runpy.run_path(TUNE_PATH, run_name="__main__")

out = caplog.text

pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)"

matches = re.findall(pattern, out, re.MULTILINE)
for task_name, _, accuracy in matches:
assert math.isclose(float(accuracy), expected_vision_acc[task_name])

@pytest.mark.integration_test
def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc):
ckpt = "llama3_2_vision_hf"
ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
ckpt_dir = ckpt_path.parent

# Config file needed for model conversion.
write_hf_vision_ckpt_config(ckpt_dir)

cmd = f"""
tune run eleuther_eval \
--config llama3_2_vision/11B_evaluation \
output_dir={tmpdir} \
checkpointer=torchtune.training.FullModelHFCheckpointer \
checkpointer.checkpoint_dir='{ckpt_dir}' \
checkpointer.checkpoint_files=[{ckpt_path}]\
~checkpointer.checkpoint_files.filename_format \
~checkpointer.checkpoint_files.max_filename \
checkpointer.output_dir={tmpdir} \
checkpointer.model_type=LLAMA3_VISION \
tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \
tokenizer.prompt_template=null \
limit=5 \
dtype=bf16 \
device=cpu \
""".split()

model_config = llama3_2_vision_test_config()
cmd = cmd + model_config

monkeypatch.setattr(sys, "argv", cmd)
with pytest.raises(SystemExit, match=""):
runpy.run_path(TUNE_PATH, run_name="__main__")

out = caplog.text

pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)"

matches = re.findall(pattern, out, re.MULTILINE)
for task_name, _, accuracy in matches:
assert math.isclose(float(accuracy), expected_vision_acc[task_name])
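
Both vision tests pull per-topic accuracy out of the results table that lm-eval logs and the test captures via caplog. A minimal sketch of how that regex behaves, run against a hand-written row in the assumed shape of lm-eval's markdown table (not output captured from a real run):

```python
import math
import re

# A hand-written row in the assumed shape of lm-eval's markdown results
# table; illustrative only, not captured from a real evaluation run.
sample_output = "| - Science   |      0|none  |     0|acc   |↑  |0.2000|±  |0.0816|\n"

pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)"

# Each match is (task_name, version, accuracy); the tests then compare the
# parsed accuracy against the expected_vision_acc fixture with math.isclose.
for task_name, _, accuracy in re.findall(pattern, sample_output, re.MULTILINE):
    print(task_name, float(accuracy))  # -> Science 0.2
    assert math.isclose(float(accuracy), 0.2)
```
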
73 changes: 73 additions & 0 deletions tests/recipes/utils.py
@@ -128,6 +128,58 @@ def llama3_test_config() -> List[str]:
]


def llama3_2_vision_test_config() -> List[str]:
return [
"model=tests.recipes.utils.dummy_vision_model",
"tokenizer._component_=torchtune.models.llama3_2_vision._transform.Llama3VisionTransform",
"tokenizer.patch_size=9",
"tokenizer.max_num_tiles=2",
"tokenizer.tile_size=18",
"tokenizer.max_seq_len=4096",
]


def dummy_vision_model():
Contributor: we have this in tests/torchtune/modules/test_common_utils.py, opportunity to unify?

Collaborator (Author): It made the most sense to me to define the fixture where it was being used - do you have strong opinions here?

from torchtune.models.llama3_2_vision._component_builders import (
llama3_2_vision_decoder,
llama3_2_vision_encoder,
)
from torchtune.modules.model_fusion import DeepFusionModel

vision_encoder = llama3_2_vision_encoder(
clip_embed_dim=128,
clip_num_layers=4,
num_heads=4,
tile_size=18,
patch_size=9,
max_num_tiles=2,
in_channels=3,
clip_hidden_states=[0, 1],
num_layers_projection=2,
decoder_embed_dim=128,
)
vision_decoder = llama3_2_vision_decoder(
vocab_size=128256,
num_layers=4,
fusion_interval=2,
num_special_tokens=2,
num_heads=8,
num_kv_heads=4,
embed_dim=128,
max_seq_len=4096,
encoder_max_seq_len=4096,
)

model = DeepFusionModel(
encoder=vision_encoder,
decoder=vision_decoder,
encoder_trainable=False,
decoder_trainable=False,
fusion_trainable=False,
)
return model


def lora_llama2_test_config(
lora_attn_modules,
apply_lora_to_mlp: bool = False,
@@ -199,6 +251,27 @@ def write_hf_ckpt_config(ckpt_dir: str):
json.dump(config, f)


def write_hf_vision_ckpt_config(ckpt_dir: str):
config = {
"text_config": {
"num_attention_heads": 8,
"num_key_value_heads": 4,
"hidden_size": 128,
"vocab_size": 128256,
"cross_attention_layers": [1, 4],
},
"vision_config": {
"hidden_size": 128,
"image_size": 18,
"max_num_tiles": 2,
"supported_aspect_ratios": [[1, 1], [1, 2], [2, 1]],
},
}
config_file = Path.joinpath(Path(ckpt_dir), "config.json")
with config_file.open("w") as f:
json.dump(config, f)


MODEL_TEST_CONFIGS = {
"llama2": llama2_test_config(),
"llama3": llama3_test_config(),
2 changes: 2 additions & 0 deletions tests/test_utils.py
@@ -33,6 +33,8 @@
"llama2_hf": "/tmp/test-artifacts/small-ckpt-hf-03082024.pt",
"llama2_reward_hf": "/tmp/test-artifacts/small-ckpt-hf-reward-07122024.pt",
"llama3_tune": "/tmp/test-artifacts/small-ckpt-tune-llama3-05052024.pt",
"llama3_2_vision_hf": "/tmp/test-artifacts/small-ckpt-hf-vision-10172024.pt",
"llama3_2_vision_meta": "/tmp/test-artifacts/small-ckpt-meta-vision-10172024.pt",
"llama2_7b": "/tmp/test-artifacts/llama2-7b-torchtune.pt",
}

1 change: 1 addition & 0 deletions torchtune/models/llama3_2_vision/_component_builders.py
@@ -170,6 +170,7 @@ def llama3_2_vision_decoder(
by :func:`~torchtune.modules.KVCache`.
encoder_max_seq_len (int): maximum sequence length the encoder will be run with, as used
by :func:`~torchtune.modules.KVCache`.
rope_base (int): base for the rotary positional embeddings. Default: 500_000
Contributor: How did our linter not pick this up...

intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified,
this is computed using :func:`~torchtune.modules.scale_hidden_dim_for_mlp`.

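
For context on the newly documented default: a minimal sketch of the standard RoPE frequency schedule that a rope_base value feeds into. Illustrative only, under the usual RoPE formulation; the function name and signature here are assumptions, not the internals of torchtune's rotary embedding module.

```python
import torch

# Standard RoPE schedule: inv_freq_i = rope_base ** (-2i / head_dim).
# Larger rope_base values slow the rotation of low-frequency dimensions,
# which is what lets long-context models stretch their positional range.
def rope_inv_freq(head_dim: int = 64, rope_base: int = 500_000) -> torch.Tensor:
    exponents = torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim
    return 1.0 / (rope_base ** exponents)

print(rope_inv_freq()[:4])  # fastest-rotating frequencies come first
```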