add pack and isp support for internlm1 and internlm2

InternLM · Aug 11, 2024 · 6bfd957 · 6bfd957
1 parent 8a7fcea
commit 6bfd957
Show file tree

Hide file tree

Showing 7 changed files with 489 additions and 40 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,53 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+-   repo: https://github.com/psf/black
+    rev: '22.8.0'
+    hooks:
+    -   id: black
+        args:
+        - --line-length=120
+-   repo: https://github.com/pycqa/isort
+    rev: '5.12.0'
+    hooks:
+    -   id: isort
+        name: isort
+        files: "\\.(py)$"
+        args:
+        - --profile=black
+-   repo: https://github.com/PyCQA/flake8
+    rev: '3.8.4'
+    hooks:
+    -   id: flake8
+        args:
+        - --ignore=F403,F405,W504,W503,E203
+        - --max-line-length=120
+-   repo: https://github.com/pre-commit/pygrep-hooks
+    rev: v1.9.0
+    hooks:
+    -   id: python-check-blanket-noqa
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+    -   id: trailing-whitespace
+    -   id: end-of-file-fixer
+    -   id: check-added-large-files
+        args: ['--maxkb=100',--enforce-all]
+    -   id: check-json
+    -   id: check-docstring-first
+    -   id: check-yaml
+    -   id: debug-statements
+    -   id: mixed-line-ending
+-   repo: https://github.com/PyCQA/pylint/
+    rev: v2.17.2
+    hooks:
+    -   id: pylint
+        name: pylint
+        entry: pylint
+        language: system
+        types: [python]
+        args:
+            [
+                '--rcfile=.pylintrc',
+                '--disable=C0114,C0415,W0212,W0235,W0238,W0621,C0103,R1735,C2801,E0402,C0412,W0719,R1728,W1514,W0718,W0105,W0707,C0209,W0703,W1203'
+            ]
diff --git a/examples/internlm/internlm2_7b/train.py b/examples/internlm/internlm2_7b/train.py
@@ -13,6 +13,7 @@
 from internlm.train import initialize_model
 from internlm.utils.common import parse_args
 
+from huggingface_model.dispatch_utils import hf_model_dispatch
 from huggingface_model.internlm.internlm2_7b.configuration_internlm2 import (
     InternLM2Config,
 )
@@ -28,7 +29,7 @@ def main(args):
     hf_config_initializer.register_module(gpc.config.model_type, InternLM2Config)
 
     # initialize model
-    model = initialize_model()
+    model = initialize_model(model_dispatch_func=hf_model_dispatch)
 
     # initialize train dataloader
     train_dl, dataset_types = build_train_loader_with_data_type()

diff --git a/examples/internlm/internlm_7b/train.py b/examples/internlm/internlm_7b/train.py
@@ -13,6 +13,7 @@
 from internlm.train import initialize_model
 from internlm.utils.common import parse_args
 
+from huggingface_model.dispatch_utils import hf_model_dispatch
 from huggingface_model.internlm.internlm_7b.configuration_internlm import InternLMConfig
 from huggingface_model.internlm.internlm_7b.modeling_internlm import InternLMForCausalLM
 
@@ -24,7 +25,7 @@ def main(args):
     hf_config_initializer.register_module(gpc.config.model_type, InternLMConfig)
 
     # initialize model
-    model = initialize_model()
+    model = initialize_model(model_dispatch_func=hf_model_dispatch)
 
     # initialize train dataloader
     train_dl, dataset_types = build_train_loader_with_data_type()

diff --git a/huggingface_model/README.md b/huggingface_model/README.md
@@ -0,0 +1,115 @@
+# Adapting HuggingFace Models for InternEvo Packed and ISP Training
+
+## Background
+
+When HuggingFace models are integrated with the InternEvo framework, we want packed training and ISP to be supported in order to:
+1. Improve GPU compute utilization (avoid wasting computation on meaningless padding tokens)
+2. Support training with long sequences (using the latest parallelism techniques from the InternEvo framework)
+
+This requires adapting the models to support:
+1. Packed training
+2. ISP (Intern Sequence Parallelism) training
+
+## Supporting Packed Training
+
+### Example for modeling_internlm.py
+
+Step 1. Obtain `cu_seqlens` and `max_seqlen` from `gpc` for the current batch.
+
+```python
+use_packed_dataset = gpc.config.data.get("use_packed_dataset", False)
+
+if use_packed_dataset:
+    assert bsz == 1, "hidden_states should be packed into bsz=1 when use_packed_dataset=True"
+    cu_seqlens = gpc.config.data[f"cu_seqlens_data_rank{gpc.get_local_rank(ParallelMode.DATA)}"]
+    max_seqlen = gpc.config.data[f"max_seqlen_data_rank{gpc.get_local_rank(ParallelMode.DATA)}"]
+```
+
+Step 2 (optional). If the model's original rotary embedding logic does not meet the requirements of packed training, use InternEvo's `apply_rotary_emb` instead.
+Otherwise, keep the original logic and skip this step.
+
+```python
+if use_packed_dataset:
+    cos, sin = self.rotary_emb(value_states, max_seqlen)
+    cos = cos[position_ids].squeeze(0)
+    sin = sin[position_ids].squeeze(0)
+    assert sin.shape == cos.shape, "cos and sin must have the same shape"
+    _, rotary_dim = cos.shape
+    rotary_dim_half = rotary_dim // 2
+    cos_half = cos[:q_len, :rotary_dim_half]
+    sin_half = sin[:q_len, :rotary_dim_half]
+    query_states = apply_rotary_emb(query_states, cos_half, sin_half)
+    key_states = apply_rotary_emb(key_states, cos_half, sin_half) 
+```
+
+Step 3. Pass `cu_seqlens` and `max_seqlen` to the flash attention varlen kernel for variable-length attention calculation.
+
+```python
+if use_packed_dataset:
+    attn_output = isp_flash_attn_varlen_func(
+        query_states,
+        key_states,
+        value_states,
+        cu_seqlens=cu_seqlens,
+        max_seqlen=max_seqlen,
+        causal=True,
+    )
+```
+
+
+### Example for modeling_internlm2.py
+
+Step 1. Obtain `cu_seqlens` and `max_seqlen` from `gpc` for the current batch.
+
+```python
+use_packed_dataset = gpc.config.data.get("use_packed_dataset", False)
+
+if use_packed_dataset:
+    assert bsz == 1, "hidden_states should be packed into bsz=1 when use_packed_dataset=True"
+    cu_seqlens = gpc.config.data[f"cu_seqlens_data_rank{gpc.get_local_rank(ParallelMode.DATA)}"]
+    max_seqlen = gpc.config.data[f"max_seqlen_data_rank{gpc.get_local_rank(ParallelMode.DATA)}"]
+```
+
+Step 2. Pass `cu_seqlens` and `max_seqlen` to the flash attention varlen kernel for variable-length attention calculation.
+
+```python
+if use_packed_dataset:
+    attn_output = isp_flash_attn_varlen_func(
+        query_states,
+        key_states,
+        value_states,
+        cu_seqlens=cu_seqlens,
+        max_seqlen=max_seqlen,
+        causal=True,
+    )
+```
+
+
+## Supporting ISP Training
+
+### Automatic dispatch
+
+For simplicity, you can create the model with `hf_model_dispatch` like this:
+
+```python
+model = initialize_model(model_dispatch_func=hf_model_dispatch)
+```
+
+You can also modify `huggingface_model/dispatch_utils/__init__.py` to add custom patterns for automatic dispatch.
+
+In the config, set the ISP size like this:
+
+```python
+parallel = dict(
+    zero1=dict(size=-1),
+    tensor=dict(size=2, mode="isp"),
+    pipeline=dict(size=1, interleaved_overlap=True),
+    weight=dict(size=1, overlap=False, memory_pool=True),
+)
+```
+
+- Set the `tensor` parallel `size` and use `mode="isp"` to enable ISP.
+
+### Manual code adaptation dispatch
+
+T.B.A.