From 0789e253d1680408d0a96862a13a13da395ca204 Mon Sep 17 00:00:00 2001
From: Tomasz Pawlowski
Date: Fri, 25 Oct 2024 15:30:11 +0200
Subject: [PATCH] Enable Dynamic MoE for Mixtral on 1.19.0 (#425)

Move Dynamic MoE implementation to habana_main. It was previously
implemented for 1.18, but had to be modified as ops have been moved to
[github.com/HabanaAI/vllm-hpu-extension](https://github.com/HabanaAI/vllm-hpu-extension).

Works with bf16; uses static (legacy) mode when running with quantization.

Related PRs:
- https://github.com/HabanaAI/vllm-fork/pull/303
- https://github.com/HabanaAI/vllm-hpu-extension/pull/13
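For reviewers, the class-selection change in `layer.py` boils down to the sketch below. It only uses names that appear in the diff (`StaticFusedMOE`, `DynamicFusedMOE`, `INCConfig`); the wrapper function `select_hpu_fused_moe` is purely illustrative and not part of the patch:

```python
from vllm_hpu_extension.ops import DynamicFusedMOE, StaticFusedMOE

from vllm.model_executor.layers.quantization.inc import INCConfig


def select_hpu_fused_moe(quant_config, num_experts):
    # INC-quantized runs keep the legacy static path; bf16 runs get the
    # dynamic implementation. Mirrors the __init__ change in layer.py.
    selected = (StaticFusedMOE
                if isinstance(quant_config, INCConfig) else DynamicFusedMOE)
    return selected(num_experts)
```

Per-expert weight registration via `set_weight` on `w13_list`/`w2_list` applies only to the static path, hence the new `isinstance` guards in `_load_w13`/`_load_w2`.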
---
 requirements-hpu.txt                          |  2 +-
 vllm/model_executor/layers/fused_moe/layer.py | 22 +++++++++++++------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index 20f4dc74a3955..4019950062efe 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -8,4 +8,4 @@ pandas
 tabulate
 setuptools>=61
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@341a77f
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@341a77f
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 457450cda2ce6..8f6bdaa7ab44a 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -226,9 +226,13 @@ def __init__(
         self.num_expert_group = num_expert_group
         self.topk_group = topk_group
         self.custom_routing_function = custom_routing_function
-        if current_platform.is_hpu():
-            from vllm_hpu_extension.ops import StaticFusedMOE
-            self.hpu_static_fused_moe = StaticFusedMOE(self.num_experts)
+        if is_hpu:
+            from vllm_hpu_extension.ops import DynamicFusedMOE, StaticFusedMOE
+
+            from vllm.model_executor.layers.quantization.inc import INCConfig
+            selected_fused_moe = (StaticFusedMOE if isinstance(
+                quant_config, INCConfig) else DynamicFusedMOE)
+            self.hpu_static_fused_moe = selected_fused_moe(self.num_experts)
 
         if quant_config is None:
             self.quant_method: Optional[QuantizeMethodBase] = (
@@ -321,8 +325,10 @@ def _load_w13(self,
         expert_data.copy_(loaded_weight)
         if is_hpu:
-            self.hpu_static_fused_moe.w13_list[expert_id].set_weight(
-                orig_exp_data)
+            from vllm_hpu_extension.ops import StaticFusedMOE
+            if isinstance(self.hpu_static_fused_moe, StaticFusedMOE):
+                self.hpu_static_fused_moe.w13_list[expert_id].set_weight(
+                    orig_exp_data)
 
     def _load_w2(self,
                  expert_data: torch.Tensor,
@@ -341,8 +347,10 @@ def _load_w2(self,
         # w2, down_proj: Load into only logical weight of w2.
         expert_data.copy_(loaded_weight)
         if is_hpu:
-            self.hpu_static_fused_moe.w2_list[expert_id].set_weight(
-                expert_data)
+            from vllm_hpu_extension.ops import StaticFusedMOE
+            if isinstance(self.hpu_static_fused_moe, StaticFusedMOE):
+                self.hpu_static_fused_moe.w2_list[expert_id].set_weight(
+                    expert_data)
 
     def _load_single_value(self, param: torch.nn.Parameter,
                            loaded_weight: torch.Tensor, expert_id: int):