[HPU] Add mark_step configurable for the decoder layer. #525

Merged
merged 6 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
vllm/model_executor/models/gpt_bigcode.py (0 additions, 6 deletions)

```diff
@@ -220,10 +220,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.make_empty_intermediate_tensors = (
             make_empty_intermediate_tensors_factory(["hidden_states"],
                                                     config.n_embd))
-        if is_hpu:
-            import os
-            self.config_hidden_layers = int(
-                os.getenv('VLLM_CONFIG_HIDDEN_LAYERS', '1'))

     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.wte(input_ids)
@@ -252,8 +248,6 @@ def forward(
             hidden_states = layer(hidden_states,
                                   kv_caches[i - self.start_layer],
                                   attn_metadata)
-            if is_hpu and i % self.config_hidden_layers == 0:
-                htorch.core.mark_step()
         if not get_pp_group().is_last_rank:
             return IntermediateTensors({"hidden_states": hidden_states})
         hidden_states = self.ln_f(hidden_states)
```
vllm/model_executor/models/llama.py (1 addition, 7 deletions)

```diff
@@ -315,11 +315,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.make_empty_intermediate_tensors = (
             make_empty_intermediate_tensors_factory(
                 ["hidden_states", "residual"], config.hidden_size))

-        if is_hpu:
-            import os
-            self.config_hidden_layers = int(
-                os.getenv('VLLM_CONFIG_HIDDEN_LAYERS', '1'))
-
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)

@@ -346,13 +341,12 @@ def forward(
         if is_hpu:
             import habana_frameworks.torch as htorch
             htorch.core.mark_step()

         for i in range(self.start_layer, self.end_layer):
             layer = self.layers[i]
             hidden_states, residual = layer(positions, hidden_states,
                                             kv_caches[i - self.start_layer],
                                             attn_metadata, residual)
-            if is_hpu and i % self.config_hidden_layers == 0:
-                htorch.core.mark_step()
+
         if not get_pp_group().is_last_rank:
             return IntermediateTensors({
                 "hidden_states": hidden_states,
```

Review comment: I noticed qwen.py (and some other model files) also added a mark_step previously; we can remove those as well.

Author: OK, I will take a look. It seems bigcode also uses this configuration parameter, but there it is not a DecoderLayer that needs the mark_step; it is something else (88 x GPTBigCodeBlock), so we will also need a configurable suffix for different models. Will check further and update.

Author: @jikunshang Please check the new batch. For bigcode, I need to run it with VLLM_CONFIG_HIDDEN_LAYERS_SUFFIX="BigCodeBlock". By the way, which model were the original code changes for?
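The suffix handling the author describes is a plain endswith check on the module class name, which is why a single hard-coded "DecoderLayer" suffix does not cover gpt_bigcode. A small illustration (class names taken from the thread and from vLLM's model code):

```python
# Why a configurable suffix is needed: the hook matcher keys on the class-name
# suffix, and gpt_bigcode's transformer blocks do not end with "DecoderLayer".
print("LlamaDecoderLayer".endswith("DecoderLayer"))   # True
print("GPTBigCodeBlock".endswith("DecoderLayer"))     # False
print("GPTBigCodeBlock".endswith("BigCodeBlock"))     # True
```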
vllm/model_executor/models/qwen2.py (0 additions, 3 deletions)

```diff
@@ -328,9 +328,6 @@ def forward(
                 attn_metadata,
                 residual,
             )
-            if current_platform.is_hpu():
-                htorch.core.mark_step()
-
         if not get_pp_group().is_last_rank:
             return IntermediateTensors({
                 "hidden_states": hidden_states,
```
vllm/worker/hpu_model_runner.py (22 additions, 8 deletions)

```diff
@@ -278,17 +278,25 @@ def flatten(in_list):
     return list(itertools.chain(*in_list))


-def modify_decoder_layer(module: torch.nn.Module, suffix="DecoderLayer"):
-    if module.__class__.__name__.endswith(suffix):
+def modify_decoder_layer(module: torch.nn.Module,
+                         suffix="DecoderLayer",
+                         n=1,
+                         counter=None):

-        def forward_hook(module, args, output):
-            htorch.core.mark_step()
-            return output
+    def forward_hook(module, args, output):
+        htorch.core.mark_step()
+        return output

-        module.register_forward_hook(forward_hook)
+    if counter is None:
+        counter = [0]

     for child_name, child_module in module.named_children():
-        modify_decoder_layer(child_module)
+        if child_module.__class__.__name__.endswith(suffix):
+            counter[0] += 1
+            if counter[0] % n == 0:
+                child_module.register_forward_hook(forward_hook)
+        else:
+            modify_decoder_layer(child_module, suffix, n, counter)


 class HpuModelAdapter:
```
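To make the new traversal concrete, here is a minimal, self-contained sketch of the same pattern on a toy model. This is plain PyTorch with a print standing in for htorch.core.mark_step(), and ToyDecoderLayer/ToyModel are made-up names for illustration only:

```python
import torch
import torch.nn as nn


def fake_mark_step():
    # Stand-in for htorch.core.mark_step(): just records that a graph break
    # would be requested at this point.
    print("mark_step")


def modify_decoder_layer(module, suffix="DecoderLayer", n=1, counter=None):
    # Same traversal as in the PR: walk the module tree, count every child
    # whose class name ends with `suffix`, and attach a forward hook to every
    # n-th matching child; recurse only into non-matching children.
    def forward_hook(module, args, output):
        fake_mark_step()
        return output

    if counter is None:
        counter = [0]
    for _, child in module.named_children():
        if child.__class__.__name__.endswith(suffix):
            counter[0] += 1
            if counter[0] % n == 0:
                child.register_forward_hook(forward_hook)
        else:
            modify_decoder_layer(child, suffix, n, counter)


class ToyDecoderLayer(nn.Module):
    def forward(self, x):
        return x + 1


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([ToyDecoderLayer() for _ in range(6)])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x


model = ToyModel()
modify_decoder_layer(model, suffix="DecoderLayer", n=2)
model(torch.zeros(1))  # prints "mark_step" after the 2nd, 4th and 6th layer
```

With n=2 only every second matching block gets a hook, so the toy forward pass triggers three mark_step calls instead of six; n=1 reproduces the previous per-layer behaviour.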
```diff
@@ -756,7 +764,13 @@ def load_model(self) -> None:
         elif not is_fake_hpu():
             self.model = self.model.to("hpu")
             htcore.mark_step()
-            modify_decoder_layer(self.model)
+
+            hidden_layer_markstep = int(
+                os.getenv('VLLM_CONFIG_HIDDEN_LAYERS', '1'))
+            hidden_layer_suffix = os.getenv('VLLM_CONFIG_HIDDEN_LAYERS_SUFFIX',
+                                            'DecodeLayer')
+            modify_decoder_layer(self.model, hidden_layer_suffix,
+                                 hidden_layer_markstep)
             torch.hpu.synchronize()

         with HabanaMemoryProfiler() as m_wrap:
```

Review comment (on hidden_layer_markstep): Maybe "hidden_layer_markstep_interval" would be a better name?

Author: Updated!

Review comment (on VLLM_CONFIG_HIDDEN_LAYERS_SUFFIX): This is a new environment variable; please add a description to README_GAUDI.md and gaudi-installation.rst.
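With this change the behaviour is selected entirely through the two environment variables read in load_model(). A hedged usage sketch (the values are illustrative; "BigCodeBlock" is the suffix the author reports using for gpt_bigcode):

```python
import os

# Illustrative settings only: request a mark_step after every 4th module whose
# class name ends with "BigCodeBlock" (gpt_bigcode), instead of the default of
# a mark_step after every module whose name ends with "DecodeLayer".
os.environ["VLLM_CONFIG_HIDDEN_LAYERS"] = "4"
os.environ["VLLM_CONFIG_HIDDEN_LAYERS_SUFFIX"] = "BigCodeBlock"

# ...then construct the vLLM engine as usual; load_model() reads these values
# when it calls modify_decoder_layer(self.model, ...).
```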