From d214f521d806ddbf772b9461092863a859f544d7 Mon Sep 17 00:00:00 2001
From: Salman Mohammadi <salman.mohammadi@outlook.com>
Date: Fri, 8 Nov 2024 16:09:37 +0000
Subject: [PATCH] Remove stale comments and unreachable return from recipe tests

---
 tests/recipes/test_eleuther_eval.py |  6 ------
 tests/recipes/utils.py              | 17 ++++++-----------
 2 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py
index 255634e27b..78c39e128f 100644
--- a/tests/recipes/test_eleuther_eval.py
+++ b/tests/recipes/test_eleuther_eval.py
@@ -246,10 +246,7 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc
 
         pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)"
 
-        # Find all matches in the table text
         matches = re.findall(pattern, out, re.MULTILINE)
-
-        # Print the task names and their corresponding accuracy scores
         for task_name, _, accuracy in matches:
             assert math.isclose(float(accuracy), expected_vision_acc[task_name])
 
@@ -291,9 +288,6 @@ def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc):
 
         pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)"
 
-        # Find all matches in the table text
         matches = re.findall(pattern, out, re.MULTILINE)
-
-        # Print the task names and their corresponding accuracy scores
         for task_name, _, accuracy in matches:
             assert math.isclose(float(accuracy), expected_vision_acc[task_name])
diff --git a/tests/recipes/utils.py b/tests/recipes/utils.py
index 5696a289fa..7c35eedc2a 100644
--- a/tests/recipes/utils.py
+++ b/tests/recipes/utils.py
@@ -137,11 +137,6 @@ def llama3_2_vision_test_config() -> List[str]:
         "tokenizer.tile_size=18",
         "tokenizer.max_seq_len=4096",
     ]
-    return [
-        "model._component_=torchtune.modules.model_fusion.DeepFusionModel",
-        "model.encoder._component_=torchtune.models.llama3_2_vision._component_builders.llama3_2_vision_encoder",
-        "model.encoder._component_=torchtune.models.llama3_2_vision._component_builders.llama3_2_vision_decoder",
-    ]
 
 
 def dummy_vision_model():
@@ -259,16 +254,16 @@ def write_hf_ckpt_config(ckpt_dir: str):
 def write_hf_vision_ckpt_config(ckpt_dir: str):
     config = {
         "text_config": {
-            "num_attention_heads": 8,  # Ensure this matches your expectations
-            "num_key_value_heads": 4,  # This should match your expected key
-            "hidden_size": 128,  # Corresponds to dim
+            "num_attention_heads": 8,
+            "num_key_value_heads": 4,
+            "hidden_size": 128,
             "vocab_size": 128256,
             "cross_attention_layers": [1, 4],
         },
         "vision_config": {
-            "hidden_size": 128,  # Corresponds to encoder_dim
-            "image_size": 18,  # This corresponds to tile_size
-            "max_num_tiles": 2,  # Corresponds to num_tiles
+            "hidden_size": 128,
+            "image_size": 18,
+            "max_num_tiles": 2,
             "supported_aspect_ratios": [[1, 1], [1, 2], [2, 1]],
         },
     }