diff --git a/plugins/framework/src/fms_acceleration/framework_plugin.py b/plugins/framework/src/fms_acceleration/framework_plugin.py index cf1764d5..28fecebf 100644 --- a/plugins/framework/src/fms_acceleration/framework_plugin.py +++ b/plugins/framework/src/fms_acceleration/framework_plugin.py @@ -206,7 +206,7 @@ def _check_config_and_maybe_check_values( t = list(t.keys())[0] # otherwise take the first value if t not in values: - if default is None: + if t is not None or default is None: raise AccelerationPluginConfigError( f"{self.__class__.__name__}: Value at '{key}' was '{t}'. " f"Not found in expected set '{values}'." diff --git a/plugins/fused-ops-and-kernels/.isort.cfg b/plugins/fused-ops-and-kernels/.isort.cfg index 4aa62fac..a6206218 100644 --- a/plugins/fused-ops-and-kernels/.isort.cfg +++ b/plugins/fused-ops-and-kernels/.isort.cfg @@ -10,4 +10,5 @@ known_firstparty= known_localfolder=tuning # skip code imported from unsloth -skip_glob=**/unsloth*/** +skip_glob=**/unsloth*/**, + **/liger*/** diff --git a/plugins/fused-ops-and-kernels/.pylintrc b/plugins/fused-ops-and-kernels/.pylintrc index 31cb902c..cfe9aeb7 100644 --- a/plugins/fused-ops-and-kernels/.pylintrc +++ b/plugins/fused-ops-and-kernels/.pylintrc @@ -53,7 +53,9 @@ ignore=CVS,protobufs # format. Because '\\' represents the directory delimiter on Windows systems, # it can't be used as an escape character. # NOTE: do not lint code imported from unsloth -ignore-paths=.*fused_ops/unsloth_lora.*,.*kernels/unsloth* +ignore-paths=.*fused_ops/unsloth_lora.*, + .*fused_ops/liger_ce.*, + .*kernels/unsloth*, # Files or directories matching the regular expression patterns are skipped. # The regex matches against base names, not paths. The default value ignores diff --git a/plugins/fused-ops-and-kernels/README.md b/plugins/fused-ops-and-kernels/README.md index 0d66a357..0331c550 100644 --- a/plugins/fused-ops-and-kernels/README.md +++ b/plugins/fused-ops-and-kernels/README.md @@ -79,10 +79,23 @@ It is realtively easy by following an existing template, in what follows we use ) ``` +### Running Liger Kernel Benchmarks + +Using the [scenarios-liger.yaml](../../scripts/benchmarks/scenarios-liger.yaml), this runs full fine tuning, LoRA PEFT, AutoGPTQ LoRA PEFT, and bits-and-bytes LoRA PEFT, first with the triton kernels (Fast RMS, RoPE, CrossEnt) as a baseline, and then with the Liger kernel for LigerFusedLinearCrossEntropy together with Fast RMS and RoPE, to compare results. It only runs against mistral and llama models. + +The benchmarks were run separately for each `num_gpu` entry; they could be run together in a single command, but running them separately is more efficient. + +```sh +tox -e run-benches -- 1 "4 8 16 32" benchmark_outputs_1 scenarios-liger.yaml none +tox -e run-benches -- 2 "8 16 32 64" benchmark_outputs_2 scenarios-liger.yaml none +tox -e run-benches -- 4 "16 32 64 128" benchmark_outputs_3 scenarios-liger.yaml none +``` + + ## Known Issues - MixedPrecision `--fp16` or `--bf16` should be used with `fast_lora`. - `fast_lora` has issues with FSDP V1 with the `peft` style of FSDP wrapping. * This is because the adapter's forward functions are bypassed in the fused ops. * For AutoGPTQ/QLoRA this is addressed by distributing the adapters using DDP so they will be unsharded in time for the fused ops. -- `fast_rope_embeddings` does not work with position_ids. Currently `position_ids` are ignored and could give wrong results. \ No newline at end of file +- `fast_rope_embeddings` does not work with `position_ids`; it seems that HF has deprecated passing these ids into the rope embedding methods. \ No newline at end of file
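For context on what the liger comparison above measures: the `fused_ce_liger` path fuses the LM-head matmul with the cross-entropy loss and chunks over the token dimension, so the full `BT x V` logits tensor is never materialized at once. A minimal plain-PyTorch sketch of the chunking idea (hypothetical shapes and helper names; the vendored kernel later in this diff additionally computes gradients during the forward pass):

```python
import torch
import torch.nn.functional as F

def naive_lm_loss(hidden, weight, target):
    # reference: materializes the full (BT, V) logits tensor at once
    logits = (hidden @ weight.t()).float()
    return F.cross_entropy(logits, target, ignore_index=-100)

def chunked_lm_loss(hidden, weight, target, chunk_size=1024, ignore_index=-100):
    # same loss, but only a (chunk_size, V) logits slice is alive at any time
    BT = hidden.shape[0]
    total = hidden.new_zeros((), dtype=torch.float32)
    n_non_ignore = (target != ignore_index).sum().clamp(min=1)
    for s in range(0, BT, chunk_size):
        h, t = hidden[s : s + chunk_size], target[s : s + chunk_size]
        logits = (h @ weight.t()).float()
        total = total + F.cross_entropy(
            logits, t, ignore_index=ignore_index, reduction="sum"
        )
    return total / n_non_ignore

hidden = torch.randn(4096, 256)
weight = torch.randn(32000, 256)  # (V, H), like an lm_head weight
target = torch.randint(0, 32000, (4096,))
assert torch.allclose(
    naive_lm_loss(hidden, weight, target),
    chunked_lm_loss(hidden, weight, target),
    atol=1e-4,
)
```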
diff --git a/plugins/fused-ops-and-kernels/configs/fast_kernels.yaml b/plugins/fused-ops-and-kernels/configs/fast_kernels.yaml index 476daa91..45f0051e 100644 --- a/plugins/fused-ops-and-kernels/configs/fast_kernels.yaml +++ b/plugins/fused-ops-and-kernels/configs/fast_kernels.yaml @@ -22,4 +22,4 @@ training: fast_rms_layernorm: True # fast RoPE embedding triton kernels - fast_rope_embeddings: True \ No newline at end of file + fast_rope_embeddings: True diff --git a/plugins/fused-ops-and-kernels/configs/fast_kernels_liger.yaml b/plugins/fused-ops-and-kernels/configs/fast_kernels_liger.yaml new file mode 100644 index 00000000..a154b95b --- /dev/null +++ b/plugins/fused-ops-and-kernels/configs/fast_kernels_liger.yaml @@ -0,0 +1,25 @@ +training: + + fused_ops_and_kernels: + + # if under training stanza, then putting + # base_layer and fused_lora will be a misnomer + # - this should be in peft.quantized + # However, if it is specified, it will still + # be read. This is useful in use cases where + # the yaml is system generated and not shown + # to a user. + + # activate various unsloth optimizations + # there are two versions of the plugin + # - the FastKernel version supports individual kernels + # - the FastQuantized version is all-or-nothing + + # fast loss triton kernels + fast_loss: fused_ce_liger + + # fast rms norm triton kernels + fast_rms_layernorm: True + + # fast RoPE embedding triton kernels + fast_rope_embeddings: True
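The `fast_loss: fused_ce_liger` value above is validated and then dispatched through `FILTER_MAP` by the `FastKernelsAccelerationPlugin` (see the `framework_plugin_fast_kernels.py` hunk further down). A self-contained sketch of that flow, where `check_value` and `resolve_filter` are hypothetical stand-ins for `_check_config_and_maybe_check_values` and the augmentation loop:

```python
FILTER_MAP = {
    "fused_lora": {"qkvo", "mlp"},
    "fast_loss": {True: "cross-ent", "fused_ce_liger": "fused-lce"},
    "fast_rms_layernorm": "rms",
    "fast_rope_embeddings": "rope",
}

def check_value(t, values, default=None):
    # mirrors the framework_plugin.py change at the top of this diff:
    # an explicitly-set value outside the expected set always raises, even
    # when a default exists; an unset value falls back to the default
    if t not in values:
        if t is not None or default is None:
            raise ValueError(f"value '{t}' not in expected set {values}")
        t = default
    return t

def resolve_filter(key, value):
    ts = FILTER_MAP[key]
    if isinstance(ts, dict) and value in ts:
        ts = ts[value]  # e.g. fast_loss="fused_ce_liger" -> "fused-lce"
    if isinstance(ts, str):
        ts = {ts}
    return ts

v = check_value("fused_ce_liger", values=[False, True, "fused_ce_liger"], default=False)
print(resolve_filter("fast_loss", v))     # {'fused-lce'} -> liger fused linear CE rules
print(resolve_filter("fast_loss", True))  # {'cross-ent'} -> unsloth cross-entropy rules
```

So `True` keeps the existing unsloth cross-entropy patch, while `fused_ce_liger` selects the `fused-lce` model-patch rules that this diff registers for llama, mistral, and granite.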
diff --git a/plugins/fused-ops-and-kernels/configs/fast_quantized_peft_liger.yaml b/plugins/fused-ops-and-kernels/configs/fast_quantized_peft_liger.yaml new file mode 100644 index 00000000..c6655d34 --- /dev/null +++ b/plugins/fused-ops-and-kernels/configs/fast_quantized_peft_liger.yaml @@ -0,0 +1,30 @@ +# PEFT-related acceleration +peft: + + # quantization-related acceleration + # e.g., kernels for quantized base weights + quantization: + + fused_ops_and_kernels: + + # load unsloth optimizations for these 4bit base layer weights. + # currently only support "auto_gptq" and "bitsandbytes" + base_layer: auto_gptq + + # activate various unsloth optimizations + # there are two versions of the plugin + # - the FastKernel version supports individual kernels + # - the FastQuantized version is all-or-nothing + + + # fused kernels for lora linear layers + fused_lora: True + + # fast loss triton kernels + fast_loss: fused_ce_liger + + # fast rms norm triton kernels + fast_rms_layernorm: True + + # fast RoPE embedding triton kernels + fast_rope_embeddings: True diff --git a/plugins/fused-ops-and-kernels/pyproject.toml b/plugins/fused-ops-and-kernels/pyproject.toml index d9acec60..516b2756 100644 --- a/plugins/fused-ops-and-kernels/pyproject.toml +++ b/plugins/fused-ops-and-kernels/pyproject.toml @@ -29,3 +29,11 @@ only-include = ["src/fms_acceleration_foak"] [tool.hatch.build.targets.wheel.sources] "src" = "" + +[tool.black] +force-exclude = ''' +/( +.*unsloth.* +| .*liger.* +)/ +''' diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py index 0bf35fbb..7948a98c 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py @@ -23,8 +23,8 @@ import torch # Local -from .utils import lora_adapters_switch_ddp_from_fsdp from .models.utils import filter_mp_rules +from .utils import lora_adapters_switch_ddp_from_fsdp # consider rewriting register_foak_model_patch_rules into something @@ -73,7 +73,10 @@ def register_foak_model_patch_rules( # maybe this we should define envvars FILTER_MAP = { "fused_lora": {"qkvo", "mlp"}, - "fast_loss": "cross-ent", + "fast_loss": { + True: "cross-ent", + "fused_ce_liger": "fused-lce", + }, "fast_rms_layernorm": "rms", "fast_rope_embeddings": "rope", } @@ -109,19 +112,19 @@ def __init__(self, configurations: Dict[str, Dict]): key="base_layer", values=["auto_gptq", "bitsandbytes"], default="auto_gptq" ) self.configurations["fused_lora"] = self._check_config_and_maybe_check_values( - key="fused_lora", values=[False, True], default=True + key="fused_lora", values=[False, True], default=False ) self.configurations["fast_loss"] = self._check_config_and_maybe_check_values( - key="fast_loss", values=[False, True], default=True + key="fast_loss", values=[False, True, "fused_ce_liger"], default=False ) self.configurations["fast_rms_layernorm"] = ( self._check_config_and_maybe_check_values( - key="fast_rms_layernorm", values=[False, True], default=True + key="fast_rms_layernorm", values=[False, True], default=False ) ) self.configurations["fast_rope_embeddings"] = ( self._check_config_and_maybe_check_values( - key="fast_rope_embeddings", values=[False, True], default=True + key="fast_rope_embeddings", values=[False, True], default=False ) ) @@ -162,6 +165,8 @@ def augmentation( if k in FILTER_MAP and k not in omitted: ts = FILTER_MAP[k] + if isinstance(ts, dict) and v in ts: + ts = ts[v] if isinstance(ts, str): ts = {ts} diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/liger_ce/__init__.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/liger_ce/__init__.py new file mode 100644 index 00000000..3a6da048 --- /dev/null +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/liger_ce/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 Byron Hsu & Linkedin team. All rights reserved.
+# +# BSD 2-CLAUSE LICENSE +# Copyright 2024 LinkedIn Corporation +# All Rights Reserved. +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from .fused_linear_cross_entropy_loss import lce_forward \ No newline at end of file diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/liger_ce/cross_entropy.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/liger_ce/cross_entropy.py new file mode 100644 index 00000000..bbd5a05f --- /dev/null +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/liger_ce/cross_entropy.py @@ -0,0 +1,222 @@ +# Copyright 2024 Byron Hsu & Linkedin team. All rights reserved. +# +# BSD 2-CLAUSE LICENSE +# Copyright 2024 LinkedIn Corporation +# All Rights Reserved. +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
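+# NOTE: the code below is imported from the Liger-Kernel project; it is deliberately +# excluded from isort / pylint / black in this plugin (see the .isort.cfg, .pylintrc, +# and pyproject.toml [tool.black] force-exclude changes earlier in this diff).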
+ + +import torch +import triton +import triton.language as tl + +@triton.jit +def liger_cross_entropy_kernel( + X_ptr, + X_stride, + Y_ptr, + Y_stride, + loss_ptr, + loss_stride, + n_cols, + n_non_ignore, + ignore_index, + label_smoothing: tl.constexpr, + reduction: tl.constexpr, # set it as constexpr since reduction is always known at compile time + BLOCK_SIZE: tl.constexpr, +): + """ + This kernel computes both cross entropy loss and the gradient of the input. + We only consider hard label + mean reduction for now. Please refer to https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html for the math. + + Parameters: + X_ptr: Pointer to input tensor. + X_stride (int): The stride of the input tensor. + Y_ptr: Pointer to target tensor. + Y_stride (int): The stride of the target tensor. + loss_ptr: Pointer to tensor to store the loss. + loss_stride (int): The stride of the loss tensor. + n_cols (int): The number of columns in the input tensor. + n_non_ignore (int): The number of non-ignored elements in the batch. + ignore_index (int): The index to ignore in the target. + label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing. + reduction (str): The string for the reduction to apply + BLOCK_SIZE (int): The block size for Triton operations. + """ + + # https://github.com/triton-lang/triton/issues/1058 + # If B*T*V is too large, program_id * stride will overflow out of int32, so we convert to int64 + program_id = tl.program_id(0).to(tl.int64) + + # 1. Load Y_ptr first because if the target is ignore_index, we can return right away + Y_ptr += program_id * Y_stride + y = tl.load(Y_ptr) + + # 2. locate the start index + X_ptr += program_id * X_stride + + if y == ignore_index: + # set all X_ptr as 0 + for i in range(0, n_cols, BLOCK_SIZE): + X_offsets = i + tl.arange(0, BLOCK_SIZE) + tl.store(X_ptr + X_offsets, 0.0, mask=X_offsets < n_cols) + return + + loss_ptr += program_id * loss_stride + + # Online softmax: 2 loads + 1 store (compared with 3 loads + 1 store for the safe softmax) + # Refer to Algorithm 3 in the paper: https://arxiv.org/pdf/1805.02867 + + # 3. [Online softmax] first pass: find max + sum + m = float("-inf") # m is the max value. use the notation from the paper + d = 0.0 # d is the sum. use the notation from the paper + ori_X_y = tl.load( + X_ptr + y + ) # we need to store the original value of X_y for the loss calculation + + # Label smoothing is a general case of normal cross entropy + # See the full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issue-2503665310 + scaled_x_sum = 0.0 + eps = label_smoothing / n_cols + + for i in range(0, n_cols, BLOCK_SIZE): + X_offsets = i + tl.arange(0, BLOCK_SIZE) + X_block = tl.load( + X_ptr + X_offsets, mask=X_offsets < n_cols, other=float("-inf") + ) + block_max = tl.max(X_block) + if label_smoothing > 0: + # scale X beforehand to avoid overflow + scaled_x_sum += tl.sum(tl.where(X_offsets < n_cols, -eps * X_block, 0.0)) + m_new = tl.maximum(m, block_max) + d = d * tl.exp(m - m_new) + tl.sum(tl.exp(X_block - m_new)) + m = m_new + + # 4. 
[Online Softmax] Second pass: compute gradients + # For 'mean' reduction, gradients are normalized by number of non-ignored elements (N) + # dx_y = (softmax(x_y) - 1) / N + # dx_i = softmax(x_i) / N, i != y + # For label smoothing: + # dx_i = (softmax(x_y) - label_smoothing / V) / N, V = n_cols, i != y + # dx_y = (softmax(x_y) - label_smoothing / V - (1 - label_smoothing)) / N + # = dx_i - (1 - label_smoothing) / N + # + # For 'sum' reduction, no normalization is applied: + # dx_y = softmax(x_y) - 1 + # dx_i = softmax(x_i), for i ≠ y + # For label smoothing: + # dx_i = (softmax(x_y) - label_smoothing / V), V = n_cols, i != y + # dx_y = (softmax(x_y) - label_smoothing / V - (1 - label_smoothing)) + # = dx_i - (1 - label_smoothing) + + for i in range(0, n_cols, BLOCK_SIZE): + X_offsets = i + tl.arange(0, BLOCK_SIZE) + X_block = tl.load( + X_ptr + X_offsets, mask=X_offsets < n_cols, other=float("-inf") + ) + if reduction == "mean": + X_block = (tl.exp(X_block - m) / d - eps) / (n_non_ignore) + else: + X_block = tl.exp(X_block - m) / d - eps + + tl.store(X_ptr + X_offsets, X_block, mask=X_offsets < n_cols) + + # We need tl.debug_barrier() to ensure the new result of X_ptr is written as mentioned in + # https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/ops/cross_entropy.py#L34 + tl.debug_barrier() + + # 5. Calculate the loss + + # loss = log (softmax(X_y)) = log ((e ^ (X_y - max(X)) / sum(e ^ (X - max(X)))) + # = (X_y - max(X)) - log(sum(e ^ (X - max(X)))) + # sum(e ^ (X - max(X))) must >= 1 because the max term is e ^ 0 = 1 + # So we can safely calculate log (softmax(X_y)) without overflow + loss = -(ori_X_y - m - tl.log(d)) + + # Orginal loss = H(q, p), with label smoothing regularization = H(q', p) and (label_smoothing / V) = eps + # H(q', p) = (1 - label_smoothing) * H(q, p) + label_smoothing * H(u, p) + # = (1 - label_smoothing) * H(q, p) + eps * sum(logsoftmax(x_i)) + # By using m (global max of xi) and d (sum of e^(xi-m)), we can simplify as: + # = (1 - label_smoothing) * H(q, p) + (-sum(x_i * eps) + label_smoothing * (m + logd)) + # Refer to H(q', p) in section 7 of the paper: https://arxiv.org/pdf/1512.00567 + # pytorch: https://github.com/pytorch/pytorch/blob/2981534f54d49fa3a9755c9b0855e7929c2527f0/aten/src/ATen/native/LossNLL.cpp#L516 + # See full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issuecomment-2333753087 + if label_smoothing > 0: + smooth_loss = scaled_x_sum + label_smoothing * (m + tl.log(d)) + loss = loss * (1 - label_smoothing) + smooth_loss + + # Normalize the loss by the number of non-ignored elements if reduction is "mean" + if reduction == "mean": + loss = loss / n_non_ignore + + # 6. 
Specially handle the i==y case where `dx_y = (softmax(x_y) - (1 - label_smoothing) / N` + X_y = tl.load(X_ptr + y) + if reduction == "mean": + X_y += -(1 - label_smoothing) / (n_non_ignore) + else: + X_y += -(1 - label_smoothing) + + tl.store(loss_ptr, loss) + tl.store(X_ptr + y, X_y) + + +# The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19 +# However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling +# The optimal maximum block size depends on your hardware, your kernel, and your dtype +MAX_FUSED_SIZE = 65536 // 2 # the best size we found by manually tuning + + +@triton.jit +def element_mul_kernel( + X_ptr, + X_stride, + grad_output_ptr, + n_cols, + BLOCK_SIZE: tl.constexpr, +): + """ + This function multiplies each element of the tensor pointed by X_ptr with the value pointed by grad_output_ptr. + The multiplication is performed in-place on the tensor pointed by X_ptr. + + Parameters: + X_ptr: Pointer to the input tensor. + X_stride (int): The stride of the input tensor. + grad_output_ptr: Pointer to the gradient output value. + n_cols (int): The number of columns in the input tensor. + BLOCK_SIZE (int): The block size for Triton operations. + """ + + # Get the program ID and convert it to int64 to avoid overflow + program_id = tl.program_id(0).to(tl.int64) + + # Locate the start index + X_ptr += program_id * X_stride + + # Load the gradient output value + grad_output = tl.load(grad_output_ptr) + + # Perform the element-wise multiplication + for i in range(0, n_cols, BLOCK_SIZE): + X_offsets = i + tl.arange(0, BLOCK_SIZE) + X_block = tl.load(X_ptr + X_offsets, mask=X_offsets < n_cols) + tl.store(X_ptr + X_offsets, X_block * grad_output, mask=X_offsets < n_cols) + diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/liger_ce/fused_linear_cross_entropy_loss.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/liger_ce/fused_linear_cross_entropy_loss.py new file mode 100644 index 00000000..edc655f6 --- /dev/null +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/liger_ce/fused_linear_cross_entropy_loss.py @@ -0,0 +1,417 @@ +# Copyright 2024 Byron Hsu & Linkedin team. All rights reserved. +# +# BSD 2-CLAUSE LICENSE +# Copyright 2024 LinkedIn Corporation +# All Rights Reserved. +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from typing import List, Optional, Tuple, Union +import torch +import triton +import torch.nn.functional as F +from torch.nn import CrossEntropyLoss +from transformers.modeling_outputs import CausalLMOutputWithPast + +from .cross_entropy import ( + element_mul_kernel, + liger_cross_entropy_kernel, +) + +# The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19 +# However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling +# The optimal maximum block size depends on your hardware, your kernel, and your dtype +MAX_FUSED_SIZE = 65536 // 2 + + +def fused_linear_cross_entropy_forward( + _input, + weight, + target, + bias=None, + ignore_index=-100, + label_smoothing=0.0, + reduction="mean", +): + dtype = ( + torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else _input.dtype + ) + device = _input.device + + # inputs have shape: BT x H + # materialized activations will have shape: BT x V + # the increase in memory = BT x V + # reduction can be achieved by partitioning the number of tokens BT into smaller chunks. + # for ex: if we were to achieve the same memory consumption as BT x H, then the chunk size should be: + # inc_factor = (V+H-1)//H, chunk_size = (BT + inc_factor - 1)//inc_factor + # for ex: BT = 4096*4, V = 32000, H = 4096 ==> inc_factor = 8, chunk_size = 2048 + BT, H = _input.shape + V = weight.shape[0] + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V)) + + inc_factor = triton.cdiv(V, H) # (V + H - 1) // H + chunk_size = triton.next_power_of_2( + triton.cdiv(BT, inc_factor) + ) # (BT + inc_factor - 1) // inc_factor + num_chunks = triton.cdiv(BT, chunk_size) # (BT + chunk_size - 1) // chunk_size + + grad_weight = ( + torch.zeros_like(weight, device=device) if weight.requires_grad else None + ) + grad_input = torch.zeros_like(_input, device=device) + grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None + # we use fp32 for loss accumulator + loss_1d = torch.zeros(BT, dtype=torch.float32, device=device) + + # NOTE: skip .item() here to avoid CUDA synchronization + total_n_non_ignore = (target != ignore_index).sum() + + for chunk_id in range(num_chunks): + start_idx = chunk_id * chunk_size + end_idx = min((chunk_id + 1) * chunk_size, BT) + _input_chunk = _input[start_idx:end_idx] # chunk_size x H + + # when doing matmul, use the original precision + logits_chunk = _input_chunk @ weight.t() # chunk_size x V + if bias is not None: + logits_chunk = logits_chunk + bias + target_chunk = target[start_idx:end_idx] # chunk_size, + + n_rows = logits_chunk.shape[0] + + # unreduced loss + loss_1d_slice = loss_1d[start_idx:end_idx] # chunk_size, + n_non_ignore = (target_chunk != ignore_index).sum().item() + + # when doing CE, use the upcasted precision + logits_chunk = logits_chunk.float() + + # ensure _input and target are contiguous + logits_chunk = 
logits_chunk.contiguous() + target_chunk = target_chunk.contiguous() + + # Here we calculate the gradient of logits_chunk in place so we can save memory. + liger_cross_entropy_kernel[(n_rows,)]( + X_ptr=logits_chunk, + X_stride=logits_chunk.stride(-2), + Y_ptr=target_chunk, + Y_stride=target_chunk.stride(-1), # always 1 + loss_ptr=loss_1d_slice, + loss_stride=loss_1d_slice.stride(-1), # always 1 + n_cols=V, + n_non_ignore=n_non_ignore, + ignore_index=ignore_index, + label_smoothing=label_smoothing, + reduction=reduction, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=32, + ) + + # gradient of logits_chunk is computed in-place by the above triton kernel. + # Following HuggingFace model source code, we do the forward and backward + # w.r.t. logits in fp32 for numerical stability especially as the num classes (vocab size) is huge. + # (reference: https://github.com/huggingface/transformers/blob/v4.42.4/src/transformers/models/llama/modeling_llama.py#L1194) + # Propagating to lm_head's backward, we'll switch back to the original dtype. + logits_chunk = logits_chunk.to(dtype) + + # gradient of logits_chunk is computed in-place by the above triton kernel and is of shape: chunk_size x V + # thus grad_input[start_idx: end_idx] should be of shape: chunk_size x H + # additionally, since we are chunking the inputs, observe that the loss and gradients are calculated only + # on `n_non_ignore` tokens. However, the gradient of the input should be calculated for all tokens. + # Thus, we need an additional scaling factor of (n_non_ignore/total_n_non_ignore) to scale the gradients. + + if reduction == "mean": + alpha = n_non_ignore / total_n_non_ignore if total_n_non_ignore > 0 else 0.0 + else: + alpha = 1.0 + + loss_1d[start_idx:end_idx] = loss_1d_slice * alpha + grad_logits_chunk = logits_chunk * alpha # chunk_size x V + + grad_input[start_idx:end_idx] = grad_logits_chunk @ weight + + if grad_weight is not None: + torch.addmm( + input=grad_weight, + mat1=logits_chunk.t(), + mat2=_input_chunk, + out=grad_weight, + alpha=alpha, + beta=1.0, + ) + + if bias is not None: + torch.add( + input=grad_bias, + other=logits_chunk.sum(dim=0), + out=grad_bias, + alpha=alpha, + ) + + loss = torch.sum(loss_1d) + return loss, grad_input, grad_weight, grad_bias + + +def fused_linear_cross_entropy_backward( + grad_output, grad_input, grad_weight, grad_bias +): + # If cross entropy is the last layer, grad_output is 1.0. Skip the mul to save time + if torch.ne(grad_output, torch.tensor(1.0, device=grad_output.device)): + # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place + # for gradient storage and backward multiple times causes anomalies with PyTorch but not with Triton. 
+ BT, H = grad_input.shape + n_rows = BT + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(H)) + + element_mul_kernel[(n_rows,)]( + grad_input, + grad_input.stride(-2), + grad_output, + H, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=32, + ) + + # handle grad_weight + if grad_weight is not None: + V, H = grad_weight.shape + n_rows = V + + element_mul_kernel[(n_rows,)]( + grad_weight, + grad_weight.stride(-2), + grad_output, + H, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=32, + ) + + if grad_bias is not None: + V = grad_bias.shape[0] + n_rows = V + + element_mul_kernel[(n_rows,)]( + grad_bias, + grad_bias.stride(-1), + grad_output, + 1, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=32, + ) + return grad_input, grad_weight, grad_bias + +class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function): + @staticmethod + def forward( + ctx, + _input, + weight, + target, + bias=None, + ignore_index=-100, + label_smoothing=0.0, + reduction="mean", + ): + """ + Fusing the last linear layer with cross-entropy loss + Reference: https://github.com/mgmalek/efficient_cross_entropy + + Handle the forward and backward pass of the final linear layer via cross-entropy loss by avoiding + the materialization of the large logits tensor. Since Cross Entropy Loss is the last layer, we can + compute the gradient at the forward pass. By doing so, we don't have to store the _input and target + for the backward pass. + + _input: (B*T, H) where B is batch size, T is sequence length, H is hidden dimension. + target: (B*T) where each value is in [0, V-1] + weight: (V, H) where V is the number of classes + bias: (V) where V is the number of classes + ignore_index: the index to ignore in the target + label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing. 
+ reduction: reduction to apply + """ + loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward( + _input, weight, target, bias, ignore_index, label_smoothing, reduction + ) + # downcast to dtype and store for backward + ctx.save_for_backward( + grad_input.detach(), + grad_weight.detach() if grad_weight is not None else None, + grad_bias.detach() if bias is not None else None, + ) + return loss + + @staticmethod + def backward(ctx, grad_output): + (grad_input, grad_weight, grad_bias) = ctx.saved_tensors + grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_backward( + grad_output, grad_input, grad_weight, grad_bias + ) + return (grad_input, grad_weight, None, grad_bias, None, None, None) + +class LigerFusedLinearCrossEntropyLoss(CrossEntropyLoss): + def __init__(self, *args, **kwargs): + super(LigerFusedLinearCrossEntropyLoss, self).__init__(*args, **kwargs) + + def forward(self, lin_weight, _input, target, bias=None): + return LigerFusedLinearCrossEntropyFunction.apply( + _input, + lin_weight, + target, + bias, + self.ignore_index, + self.label_smoothing, + self.reduction, + ) + +def lce_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, +) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Copy paste llama forward but replace torch cross entropy with liger fused linear cross entropy + + + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + num_logits_to_keep (`int`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, AutoModelForCausalLM + + >>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + + loss = None + logits = None + + # patch change + if self.training and (labels is not None): + shift_hidden_states = hidden_states[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + + # flatten tokens + shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size) + shift_labels = shift_labels.view(-1) + + lce = LigerFusedLinearCrossEntropyLoss() + loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels) + + else: + if self.config.pretraining_tp > 1: + lm_head_slices = self.lm_head.weight.split( + self.vocab_size // self.config.pretraining_tp, dim=0 + ) + logits = [ + F.linear(hidden_states, lm_head_slices[i]) + for i in range(self.config.pretraining_tp) + ] + logits = torch.cat(logits, dim=-1) + else: + # TODO: differing line below in granite models compared to llama/mistral model type + # logits = logits / self.config.logits_scaling + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) + + if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) \ No newline at end of file diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/granite.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/granite.py index 5bb744c0..e4b58572 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/granite.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/granite.py @@ -26,6 +26,7 @@ from transformers import PretrainedConfig # Local +from ..fused_ops.liger_ce.fused_linear_cross_entropy_loss import lce_forward from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm from ..kernels.unsloth.rope_embedding import fast_rope_embedding @@ -51,6 +52,7 @@ def get_mp_rules(base_type: str, config: PretrainedConfig = 
None): # Third Party from transformers.models.granite.modeling_granite import ( # pylint: disable=import-outside-toplevel GraniteAttention, + GraniteForCausalLM, GraniteMLP, GraniteRMSNorm, ) @@ -130,6 +132,11 @@ def get_mp_rules(base_type: str, config: PretrainedConfig = None): "transformers.models.granite.modeling_granite", ), ), + ModelPatcherRule( + rule_id="granite-fused-lce", + trigger=ModelPatcherTrigger(check=GraniteForCausalLM), + forward=lce_forward, + ), # TODO: have a generic version of this rule # - get the module name # - check if "apply_rotary_pos_emb" exists diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/llama.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/llama.py index e5248c7d..94fab82f 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/llama.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/llama.py @@ -26,11 +26,13 @@ from transformers import PretrainedConfig from transformers.models.llama.modeling_llama import ( LlamaAttention, + LlamaForCausalLM, LlamaMLP, LlamaRMSNorm, ) # Local +from ..fused_ops.liger_ce.fused_linear_cross_entropy_loss import lce_forward from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm from ..kernels.unsloth.rope_embedding import fast_rope_embedding @@ -115,8 +117,11 @@ def get_mp_rules(base_type: str, config: PretrainedConfig = None): base_type=base_type, ), ), - # TODO: have a generic version of this rule - # - get the module_name and reload on that + ModelPatcherRule( + rule_id="llama-fused-lce", + trigger=ModelPatcherTrigger(check=LlamaForCausalLM), + forward=lce_forward, + ), ModelPatcherRule( rule_id="llama-cross-ent", import_and_maybe_reload=( diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mistral.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mistral.py index d2ee619b..64e65274 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mistral.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mistral.py @@ -26,11 +26,13 @@ from transformers import PretrainedConfig from transformers.models.mistral.modeling_mistral import ( MistralAttention, + MistralForCausalLM, MistralMLP, MistralRMSNorm, ) # Local +from ..fused_ops.liger_ce.fused_linear_cross_entropy_loss import lce_forward from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm from ..kernels.unsloth.rope_embedding import fast_rope_embedding @@ -120,6 +122,11 @@ def get_mp_rules(base_type: str, config: PretrainedConfig = None): "transformers.models.mistral.modeling_mistral", ), ), + ModelPatcherRule( + rule_id="mistral-fused-lce", + trigger=ModelPatcherTrigger(check=MistralForCausalLM), + forward=lce_forward, + ), ModelPatcherRule( rule_id="mistral-rope", import_and_maybe_reload=( diff --git a/plugins/fused-ops-and-kernels/tox.ini b/plugins/fused-ops-and-kernels/tox.ini index c3a38721..b436fbe7 100644 --- a/plugins/fused-ops-and-kernels/tox.ini +++ b/plugins/fused-ops-and-kernels/tox.ini @@ -40,9 +40,8 @@ deps = black>=22.12 isort>=5.11 commands = - # exclude the code ported from unsloth - black --exclude .*unsloth.* src - black --exclude .*unsloth.* tests + black src + black tests isort . 
[testenv:build] diff --git a/sample-configurations/CONTENTS.yaml b/sample-configurations/CONTENTS.yaml index b3b1deec..778af468 100644 --- a/sample-configurations/CONTENTS.yaml +++ b/sample-configurations/CONTENTS.yaml @@ -27,12 +27,24 @@ framework_configs: - fused-ops-and-kernels filename: accelerated-peft-autogptq-foak-sample-configuration.yaml + - shortname: accelerated-peft-autogptq-foak-liger + plugins: + - accelerated-peft + - fused-ops-and-kernels + filename: accelerated-peft-autogptq-foak-liger-sample-configuration.yaml + - shortname: accelerated-peft-bnb-foak plugins: - accelerated-peft - fused-ops-and-kernels filename: accelerated-peft-bnb-nf4-foak-sample-configuration.yaml + - shortname: accelerated-peft-bnb-foak-liger + plugins: + - accelerated-peft + - fused-ops-and-kernels + filename: accelerated-peft-bnb-nf4-foak-liger-sample-configuration.yaml + - shortname: aadp-padding-free plugins: - attention-and-distributed-packing @@ -74,6 +86,10 @@ framework_configs: - fused-ops-and-kernels filename: foak-fast-kernels-sample-configuration.yaml + - shortname: foak-fast-kernels-liger + plugins: + - fused-ops-and-kernels + filename: foak-fast-kernels-liger-sample-configuration.yaml # ------- MOE CONFIGS ---------- - shortname: moe-scattermoe-granite-ep1 plugins: @@ -138,4 +154,4 @@ framework_configs: plugins: - accelerated-moe - fused-ops-and-kernels - filename: moe-scattermoe-granite-ep8-foak-sample-configuration.yaml \ No newline at end of file + filename: moe-scattermoe-granite-ep8-foak-sample-configuration.yaml diff --git a/sample-configurations/accelerated-peft-autogptq-foak-liger-sample-configuration.yaml b/sample-configurations/accelerated-peft-autogptq-foak-liger-sample-configuration.yaml new file mode 100644 index 00000000..1126b4f8 --- /dev/null +++ b/sample-configurations/accelerated-peft-autogptq-foak-liger-sample-configuration.yaml @@ -0,0 +1,52 @@ +# FMS Acceleration Plugin Configuration. +# +# Each stanza incorporates various configurations for +# different fine-tuning / training tasks. +plugins: + # PEFT-related acceleration + peft: + + # quantization-related acceleration + # e.g., kernels for quantized base weights + quantization: + + # AutoGPTQ quantized base weights. + auto_gptq: + + # Kernel to be used for GPTQ linear layer + # NOTE: Not all kernels are suitable for PEFT training; need to use + # kernels that support autograd forward / backward. The best + # recommendation at the moment is "triton_v2". + kernel: triton_v2 + + # If true, then will already expect quantized checkpoint + # passed into TrainingArguments.model_name_or_path + from_quantized: true + + # Setting to false, will create GPTQ-LORA using the local autogptq package. + # if true, will create legacy implementation of GPTQ-LORA using external + # `auto_gptq`. Refer to README for installation instructions + use_external_lib: false + fused_ops_and_kernels: + + # load unsloth optimizations for these 4bit base layer weights.
+ # currently only support "auto_gptq" and "bitsandbytes" + base_layer: auto_gptq + + # activate various unsloth optimizations + # there are two versions of the plugin + # - the FastKernel version supports individual kernels + # - the FastQuantized version is all-or-nothing + + + # fused kernels for lora linear layers + fused_lora: true + + # fast loss triton kernels + fast_loss: fused_ce_liger + + # fast rms norm triton kernels + fast_rms_layernorm: true + + # fast RoPE embedding triton kernels + fast_rope_embeddings: true diff --git a/sample-configurations/accelerated-peft-bnb-nf4-foak-liger-sample-configuration.yaml b/sample-configurations/accelerated-peft-bnb-nf4-foak-liger-sample-configuration.yaml new file mode 100644 index 00000000..71c305ac --- /dev/null +++ b/sample-configurations/accelerated-peft-bnb-nf4-foak-liger-sample-configuration.yaml @@ -0,0 +1,47 @@ +# FMS Acceleration Plugin Configuration. +# +# Each stanza incorporates various configurations for +# different fine-tuning / training tasks. +plugins: + # PEFT-related acceleration + peft: + + # quantization-related acceleration + # e.g., kernels for quantized base weights + quantization: + + # For loading BitsAndBytes quantized layers + # to serve as 4bit base-weights for LoRA PEFT-tuning. + # NOTE: currently AutoGPTQ is not properly integrated into huggingface / + # bitsandbytes, thus recommended quant_type to be either "nf4" + # or "fp4". + # bitsandbytes: + bitsandbytes: + quant_type: nf4 + + # If True, then no get_peft_model and prepare_model_for_kbit_training + # will be called. + no_peft_model: false + fused_ops_and_kernels: + + # load unsloth optimizations for these 4bit base layer weights. + # currently only support "auto_gptq" and "bitsandbytes" + base_layer: bitsandbytes + + # activate various unsloth optimizations + # there are two versions of the plugin + # - the FastKernel version supports individual kernels + # - the FastQuantized version is all-or-nothing + + + # fused kernels for lora linear layers + fused_lora: true + + # fast loss triton kernels + fast_loss: fused_ce_liger + + # fast rms norm triton kernels + fast_rms_layernorm: true + + # fast RoPE embedding triton kernels + fast_rope_embeddings: true diff --git a/sample-configurations/foak-fast-kernels-liger-sample-configuration.yaml b/sample-configurations/foak-fast-kernels-liger-sample-configuration.yaml new file mode 100644 index 00000000..1752755f --- /dev/null +++ b/sample-configurations/foak-fast-kernels-liger-sample-configuration.yaml @@ -0,0 +1,30 @@ +# FMS Acceleration Plugin Configuration. +# +# Each stanza incorporates various configurations for +# different fine-tuning / training tasks. +plugins: + training: + + fused_ops_and_kernels: + + # if under training stanza, then putting + # base_layer and fused_lora will be a misnomer + # - this should be in peft.quantized + # However, if it is specified, it will still + # be read. This is useful in use cases where + # the yaml is system generated and not shown + # to a user.
+ + # activate various unsloth optimizations + # there are two versions of the plugin + # - the FastKernel version supports individual kernels + # - the FastQuantized version is all-or-nothing + + # fast loss triton kernels + fast_loss: fused_ce_liger + + # fast rms norm triton kernels + fast_rms_layernorm: true + + # fast RoPE embedding triton kernels + fast_rope_embeddings: true diff --git a/scripts/benchmarks/benchmark.py b/scripts/benchmarks/benchmark.py index 38fe6679..df314868 100644 --- a/scripts/benchmarks/benchmark.py +++ b/scripts/benchmarks/benchmark.py @@ -723,7 +723,7 @@ def prepare_arguments(args, benchmark_dataset: BenchmarkDataset): if ( not args.run_only_scenarios - and scenarios.slow + and scenario.slow ): # unfiltered runs omit all "slow" marked scenarios print(f"Skipping slow scenario '{_scn_name}' beacuse run_only_scenarios=None.") diff --git a/scripts/benchmarks/refs/a100_80gb_liger.csv b/scripts/benchmarks/refs/a100_80gb_liger.csv new file mode 100644 index 00000000..e43666db --- /dev/null +++ b/scripts/benchmarks/refs/a100_80gb_liger.csv @@ -0,0 +1,193 @@ +bf16,epoch,fp16,framework_config,learning_rate,lora_alpha,lora_dropout,mem_nvidia_mem_reserved,mem_peak_torch_mem_alloc_in_bytes,mem_torch_mem_alloc_in_bytes,model_name_or_path,num_gpus,peft_method,per_device_train_batch_size,r,target_modules,torch_dtype,train_loss,train_runtime,train_samples_per_second,train_steps_per_second,train_tokens_per_second +True,0.17,,foak-fast-kernels,2e-05,,,77695.0,80318097408.0,48198051840.0,meta-llama/Meta-Llama-3-8B,1,,4,,,bfloat16,0.991794786453247,496.5063,0.806,0.201,3299.857 +True,,,foak-fast-kernels,2e-05,,,77437.0,,,meta-llama/Meta-Llama-3-8B,1,,8,,,bfloat16,,,,, +True,,,foak-fast-kernels,2e-05,,,52497.0,,,meta-llama/Meta-Llama-3-8B,1,,16,,,bfloat16,,,,, +True,,,foak-fast-kernels,2e-05,,,68225.0,,,meta-llama/Meta-Llama-3-8B,1,,32,,,bfloat16,,,,, +True,0.17,,foak-fast-kernels-liger,2e-05,,,77537.0,80318097408.0,48198051840.0,meta-llama/Meta-Llama-3-8B,1,,4,,,bfloat16,0.9917966079711914,498.2461,0.803,0.201,3288.335 +True,0.34,,foak-fast-kernels-liger,2e-05,,,79881.0,80318490624.0,48198445056.0,meta-llama/Meta-Llama-3-8B,1,,8,,,bfloat16,0.9880468559265136,961.3168,0.832,0.104,3408.658 +True,,,foak-fast-kernels-liger,2e-05,,,79925.0,,,meta-llama/Meta-Llama-3-8B,1,,16,,,bfloat16,,,,, +True,,,foak-fast-kernels-liger,2e-05,,,78987.0,,,meta-llama/Meta-Llama-3-8B,1,,32,,,bfloat16,,,,, +True,0.17,,foak-fast-kernels,0.0002,16.0,0.1,50711.0,37644015616.0,16241584128.0,meta-llama/Meta-Llama-3-8B,1,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0148053932189942,427.8517,0.935,0.234,3829.365 +True,0.34,,foak-fast-kernels,0.0002,16.0,0.1,58301.0,59017447424.0,16241977344.0,meta-llama/Meta-Llama-3-8B,1,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0162366390228272,849.3577,0.942,0.118,3857.974 +True,,,foak-fast-kernels,0.0002,16.0,0.1,57695.0,,,meta-llama/Meta-Llama-3-8B,1,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,,,foak-fast-kernels,0.0002,16.0,0.1,67261.0,,,meta-llama/Meta-Llama-3-8B,1,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.17,,foak-fast-kernels-liger,0.0002,16.0,0.1,25465.0,24905936896.0,16241584128.0,meta-llama/Meta-Llama-3-8B,1,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0148304843902587,426.2687,0.938,0.235,3843.585 +True,0.34,,foak-fast-kernels-liger,0.0002,16.0,0.1,34681.0,33567698944.0,16241977344.0,meta-llama/Meta-Llama-3-8B,1,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.01575608253479,844.8273,0.947,0.118,3878.662 
+True,0.68,,foak-fast-kernels-liger,0.0002,16.0,0.1,53115.0,50891223040.0,16242763776.0,meta-llama/Meta-Llama-3-8B,1,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.014188413619995,1684.0095,0.95,0.059,3891.665 +True,,,foak-fast-kernels-liger,0.0002,16.0,0.1,79051.0,,,meta-llama/Meta-Llama-3-8B,1,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.17,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,41221.0,27259280384.0,5884111872.0,meta-llama/Meta-Llama-3-8B,1,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0356837940216064,410.0877,0.975,0.244,3995.243 +True,0.34,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,75995.0,48632712192.0,5884505088.0,meta-llama/Meta-Llama-3-8B,1,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0364081382751464,803.4691,0.996,0.124,4078.315 +True,,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,57877.0,,,meta-llama/Meta-Llama-3-8B,1,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,77591.0,,,meta-llama/Meta-Llama-3-8B,1,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.17,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,15527.0,14411297792.0,5884111872.0,meta-llama/Meta-Llama-3-8B,1,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0357236671447754,408.6648,0.979,0.245,4009.154 +True,0.34,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,24359.0,22936744960.0,5884505088.0,meta-llama/Meta-Llama-3-8B,1,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0366782855987549,798.2473,1.002,0.125,4104.994 +True,0.68,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,42409.0,39987639296.0,5885291520.0,meta-llama/Meta-Llama-3-8B,1,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0348288249969482,1582.9171,1.011,0.063,4140.204 +True,1.35,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,78505.0,74089427968.0,5886864384.0,meta-llama/Meta-Llama-3-8B,1,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0354842090606688,3150.7966,1.016,0.032,4159.964 +,0.17,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,40823.0,27289066496.0,5913897984.0,TechxGenus/Meta-Llama-3-8B-GPTQ,1,lora,4,16.0,q_proj k_proj v_proj o_proj,float16,1.0433240985870362,429.8265,0.931,0.233,3811.771 +,0.34,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,75341.0,48662498304.0,5914291200.0,TechxGenus/Meta-Llama-3-8B-GPTQ,1,lora,8,16.0,q_proj k_proj v_proj o_proj,float16,1.0526717376708985,840.1997,0.952,0.119,3900.025 +,,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,73875.0,,,TechxGenus/Meta-Llama-3-8B-GPTQ,1,lora,16,16.0,q_proj k_proj v_proj o_proj,float16,,,,, +,,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,77553.0,,,TechxGenus/Meta-Llama-3-8B-GPTQ,1,lora,32,16.0,q_proj k_proj v_proj o_proj,float16,,,,, +,0.17,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,15449.0,14441084416.0,5913897984.0,TechxGenus/Meta-Llama-3-8B-GPTQ,1,lora,4,16.0,q_proj k_proj v_proj o_proj,float16,1.0416254806518557,424.7732,0.942,0.235,3857.117 +,0.34,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,24601.0,22966531584.0,5914291200.0,TechxGenus/Meta-Llama-3-8B-GPTQ,1,lora,8,16.0,q_proj k_proj v_proj o_proj,float16,1.05357590675354,835.1457,0.958,0.12,3923.627 +,0.68,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,42905.0,40017425920.0,5915077632.0,TechxGenus/Meta-Llama-3-8B-GPTQ,1,lora,16,16.0,q_proj k_proj v_proj o_proj,float16,1.0603761863708496,1657.3815,0.965,0.06,3954.189 
+,1.35,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,78747.0,74119214592.0,5916650496.0,TechxGenus/Meta-Llama-3-8B-GPTQ,1,lora,32,16.0,q_proj k_proj v_proj o_proj,float16,1.0691538333892825,3291.3639,0.972,0.03,3982.301 +True,0.34,,foak-fast-kernels,2e-05,,,81143.0,80320219648.0,64259672576.0,meta-llama/Meta-Llama-3-8B,2,,4,,,bfloat16,0.9677656173706056,690.8416,1.158,0.145,2371.6 +True,,,foak-fast-kernels,2e-05,,,68775.0,,,meta-llama/Meta-Llama-3-8B,2,,8,,,bfloat16,,,,, +True,,,foak-fast-kernels,2e-05,,,74233.0,,,meta-llama/Meta-Llama-3-8B,2,,16,,,bfloat16,,,,, +True,,,foak-fast-kernels,2e-05,,,80729.0,,,meta-llama/Meta-Llama-3-8B,2,,32,,,bfloat16,,,,, +True,0.34,,foak-fast-kernels-liger,2e-05,,,80809.0,80320219648.0,64259672576.0,meta-llama/Meta-Llama-3-8B,2,,4,,,bfloat16,0.9677164840698242,623.5916,1.283,0.16,2627.361 +True,0.68,,foak-fast-kernels-liger,2e-05,,,81135.0,80320612864.0,64260065792.0,meta-llama/Meta-Llama-3-8B,2,,8,,,bfloat16,0.9606661891937256,1128.4084,1.418,0.089,2903.913 +True,,,foak-fast-kernels-liger,2e-05,,,79908.0,,,meta-llama/Meta-Llama-3-8B,2,,16,,,bfloat16,,,,, +True,,,foak-fast-kernels-liger,2e-05,,,80729.0,,,meta-llama/Meta-Llama-3-8B,2,,32,,,bfloat16,,,,, +True,0.34,,foak-fast-kernels,0.0002,16.0,0.1,47323.0,31606073856.0,8156781056.0,meta-llama/Meta-Llama-3-8B,2,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0190155601501465,455.9939,1.754,0.219,3593.031 +True,0.68,,foak-fast-kernels,0.0002,16.0,0.1,60685.0,52979505664.0,8157174272.0,meta-llama/Meta-Llama-3-8B,2,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0157581710815429,894.78,1.788,0.112,3662.129 +True,,,foak-fast-kernels,0.0002,16.0,0.1,72428.0,,,meta-llama/Meta-Llama-3-8B,2,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,,,foak-fast-kernels,0.0002,16.0,0.1,66893.0,,,meta-llama/Meta-Llama-3-8B,2,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.34,,foak-fast-kernels-liger,0.0002,16.0,0.1,26877.0,23383704064.0,8156781056.0,meta-llama/Meta-Llama-3-8B,2,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0187576484680176,453.693,1.763,0.22,3611.252 +True,0.68,,foak-fast-kernels-liger,0.0002,16.0,0.1,41607.0,36071997952.0,8157174272.0,meta-llama/Meta-Llama-3-8B,2,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0161932468414308,888.8348,1.8,0.113,3686.624 +True,1.35,,foak-fast-kernels-liger,0.0002,16.0,0.1,70589.0,61448585728.0,8157960704.0,meta-llama/Meta-Llama-3-8B,2,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0166991901397706,1754.3381,1.824,0.057,3735.654 +True,,,foak-fast-kernels-liger,0.0002,16.0,0.1,78800.0,,,meta-llama/Meta-Llama-3-8B,2,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.34,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,43054.0,26509831680.0,3033275904.0,meta-llama/Meta-Llama-3-8B,2,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.042497215270996,422.108,1.895,0.237,3881.471 +True,0.68,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,78267.0,47883263488.0,3033669120.0,meta-llama/Meta-Llama-3-8B,2,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0393894767761231,815.2901,1.962,0.123,4019.183 +True,,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,67822.0,,,meta-llama/Meta-Llama-3-8B,2,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,78756.0,,,meta-llama/Meta-Llama-3-8B,2,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.34,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,22164.0,17850271232.0,3033275904.0,meta-llama/Meta-Llama-3-8B,2,lora,4,16.0,q_proj k_proj 
v_proj o_proj,bfloat16,1.0428221797943116,420.5608,1.902,0.238,3895.75 +True,0.68,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,35872.0,30402250240.0,3033669120.0,meta-llama/Meta-Llama-3-8B,2,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0388935470581055,811.0503,1.973,0.123,4040.193 +True,1.35,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,63878.0,55506208256.0,3034455552.0,meta-llama/Meta-Llama-3-8B,2,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0399011611938476,1601.8839,1.998,0.062,4091.183 +True,,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,79215.0,,,meta-llama/Meta-Llama-3-8B,2,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +,0.34,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,43748.0,26526510592.0,3049954816.0,TechxGenus/Meta-Llama-3-8B-GPTQ,2,lora,4,16.0,q_proj k_proj v_proj o_proj,float16,1.072213077545166,438.1959,1.826,0.228,3738.967 +,0.68,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,79025.0,47899942400.0,3050348032.0,TechxGenus/Meta-Llama-3-8B-GPTQ,2,lora,8,16.0,q_proj k_proj v_proj o_proj,float16,1.0689481258392337,852.041,1.878,0.117,3845.824 +,,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,68244.0,,,TechxGenus/Meta-Llama-3-8B-GPTQ,2,lora,16,16.0,q_proj k_proj v_proj o_proj,float16,,,,, +,,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,78983.0,,,TechxGenus/Meta-Llama-3-8B-GPTQ,2,lora,32,16.0,q_proj k_proj v_proj o_proj,float16,,,,, +,0.34,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,22853.0,17867802624.0,3049954816.0,TechxGenus/Meta-Llama-3-8B-GPTQ,2,lora,4,16.0,q_proj k_proj v_proj o_proj,float16,1.0653237342834472,435.5815,1.837,0.23,3761.409 +,0.68,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,36549.0,30419781632.0,3050348032.0,TechxGenus/Meta-Llama-3-8B-GPTQ,2,lora,8,16.0,q_proj k_proj v_proj o_proj,float16,1.0656037616729737,847.3836,1.888,0.118,3866.962 +,1.35,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,64583.0,55523739648.0,3051134464.0,TechxGenus/Meta-Llama-3-8B-GPTQ,2,lora,16,16.0,q_proj k_proj v_proj o_proj,float16,1.0683570766448975,1677.4704,1.908,0.06,3906.835 +,,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,79665.0,,,TechxGenus/Meta-Llama-3-8B-GPTQ,2,lora,32,16.0,q_proj k_proj v_proj o_proj,float16,,,,, +True,0.68,,foak-fast-kernels,2e-05,,,66986.0,47584840192.0,32138562048.0,meta-llama/Meta-Llama-3-8B,4,,4,,,bfloat16,0.9608215522766114,511.7436,3.127,0.195,3201.603 +True,1.35,,foak-fast-kernels,2e-05,,,81081.5,68958272000.0,32138955264.0,meta-llama/Meta-Llama-3-8B,4,,8,,,bfloat16,0.9529002285003664,1447.107,2.211,0.069,2264.38 +True,,,foak-fast-kernels,2e-05,,,52173.0,,,meta-llama/Meta-Llama-3-8B,4,,16,,,bfloat16,,,,, +True,,,foak-fast-kernels,2e-05,,,71790.0,,,meta-llama/Meta-Llama-3-8B,4,,32,,,bfloat16,,,,, +True,0.68,,foak-fast-kernels-liger,2e-05,,,54707.5,40168839680.0,32138562048.0,meta-llama/Meta-Llama-3-8B,4,,4,,,bfloat16,0.9608566761016846,512.2228,3.124,0.195,3198.608 +True,1.35,,foak-fast-kernels-liger,2e-05,,,63613.0,44061352448.0,32138955264.0,meta-llama/Meta-Llama-3-8B,4,,8,,,bfloat16,0.9528849792480468,980.3948,3.264,0.102,3342.327 +True,2.7,,foak-fast-kernels-liger,2e-05,,,79252.5,60306944512.0,32139741696.0,meta-llama/Meta-Llama-3-8B,4,,16,,,bfloat16,0.9459449291229248,1914.3344,3.343,0.052,3423.435 +True,,,foak-fast-kernels-liger,2e-05,,,77731.0,,,meta-llama/Meta-Llama-3-8B,4,,32,,,bfloat16,,,,, +True,0.68,,foak-fast-kernels,0.0002,16.0,0.1,43643.0,27551078912.0,4088154624.0,meta-llama/Meta-Llama-3-8B,4,lora,4,16.0,q_proj k_proj v_proj 
o_proj,bfloat16,1.0161590099334716,456.0277,3.509,0.219,3592.764 +True,1.35,,foak-fast-kernels,0.0002,16.0,0.1,78865.0,48924510720.0,4088547840.0,meta-llama/Meta-Llama-3-8B,4,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0162527561187744,885.3585,3.614,0.113,3701.1 +True,,,foak-fast-kernels,0.0002,16.0,0.1,72808.5,,,meta-llama/Meta-Llama-3-8B,4,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,,,foak-fast-kernels,0.0002,16.0,0.1,79491.0,,,meta-llama/Meta-Llama-3-8B,4,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.68,,foak-fast-kernels-liger,0.0002,16.0,0.1,23197.0,19315503616.0,4088154624.0,meta-llama/Meta-Llama-3-8B,4,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0156009674072266,454.3664,3.521,0.22,3605.901 +True,1.35,,foak-fast-kernels-liger,0.0002,16.0,0.1,37981.0,32003797504.0,4088547840.0,meta-llama/Meta-Llama-3-8B,4,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0163630485534667,880.5812,3.634,0.114,3721.179 +True,2.7,,foak-fast-kernels-liger,0.0002,16.0,0.1,66909.0,57380385280.0,4089334272.0,meta-llama/Meta-Llama-3-8B,4,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0160735416412354,1740.1593,3.678,0.057,3766.092 +True,,,foak-fast-kernels-liger,0.0002,16.0,0.1,80867.0,,,meta-llama/Meta-Llama-3-8B,4,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.68,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,42590.5,25112976896.0,1636421120.0,meta-llama/Meta-Llama-3-8B,4,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0383064079284667,423.731,3.776,0.236,3866.604 +True,1.35,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,77795.5,46486408704.0,1636814336.0,meta-llama/Meta-Llama-3-8B,4,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.039291534423828,817.7661,3.913,0.122,4007.014 +True,,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,66703.0,,,meta-llama/Meta-Llama-3-8B,4,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,77649.0,,,meta-llama/Meta-Llama-3-8B,4,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.68,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,21677.0,16453416448.0,1636421120.0,meta-llama/Meta-Llama-3-8B,4,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0390117359161375,421.9715,3.792,0.237,3882.726 +True,1.35,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,35376.0,29005395456.0,1636814336.0,meta-llama/Meta-Llama-3-8B,4,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0401603603363037,812.9813,3.936,0.123,4030.597 +True,2.7,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,63405.0,54109353472.0,1637600768.0,meta-llama/Meta-Llama-3-8B,4,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.037994260787964,1602.714,3.993,0.062,4089.064 +True,,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,78893.5,,,meta-llama/Meta-Llama-3-8B,4,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +,0.68,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,44207.5,25122840064.0,1646284288.0,TechxGenus/Meta-Llama-3-8B-GPTQ,4,lora,4,16.0,q_proj k_proj v_proj o_proj,float16,1.0636010646820069,439.0306,3.644,0.228,3731.858 +,1.35,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,79475.5,46496271872.0,1646677504.0,TechxGenus/Meta-Llama-3-8B-GPTQ,4,lora,8,16.0,q_proj k_proj v_proj o_proj,float16,1.0646323871612549,853.2938,3.75,0.117,3840.178 +,,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,67956.0,,,TechxGenus/Meta-Llama-3-8B-GPTQ,4,lora,16,16.0,q_proj k_proj v_proj o_proj,float16,,,,, 
+,,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,78902.0,,,TechxGenus/Meta-Llama-3-8B-GPTQ,4,lora,32,16.0,q_proj k_proj v_proj o_proj,float16,,,,, +,0.68,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,23330.0,16464132096.0,1646284288.0,TechxGenus/Meta-Llama-3-8B-GPTQ,4,lora,4,16.0,q_proj k_proj v_proj o_proj,float16,1.0635689449310304,436.8828,3.662,0.229,3750.205 +,1.35,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,36974.0,29016111104.0,1646677504.0,TechxGenus/Meta-Llama-3-8B-GPTQ,4,lora,8,16.0,q_proj k_proj v_proj o_proj,float16,1.059765977859497,849.659,3.766,0.118,3856.606 +,2.7,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,65315.5,54120069120.0,1647463936.0,TechxGenus/Meta-Llama-3-8B-GPTQ,4,lora,16,16.0,q_proj k_proj v_proj o_proj,float16,1.068314094543457,1676.21,3.818,0.06,3909.773 +,,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,79377.5,,,TechxGenus/Meta-Llama-3-8B-GPTQ,4,lora,32,16.0,q_proj k_proj v_proj o_proj,float16,,,,, +True,0.15,,foak-fast-kernels,2e-05,,,76071.0,72432723456.0,43466827264.0,mistralai/Mistral-7B-v0.1,1,,4,,,bfloat16,0.838842716217041,479.0213,0.835,0.209,3420.307 +True,0.29,,foak-fast-kernels,2e-05,,,70035.0,72433116672.0,43467220480.0,mistralai/Mistral-7B-v0.1,1,,8,,,bfloat16,0.8388796520233154,932.3966,0.858,0.107,3514.384 +True,,,foak-fast-kernels,2e-05,,,79167.0,,,mistralai/Mistral-7B-v0.1,1,,16,,,bfloat16,,,,, +True,,,foak-fast-kernels,2e-05,,,73171.0,,,mistralai/Mistral-7B-v0.1,1,,32,,,bfloat16,,,,, +True,0.15,,foak-fast-kernels-liger,2e-05,,,70285.0,72432723456.0,43466827264.0,mistralai/Mistral-7B-v0.1,1,,4,,,bfloat16,0.8386862182617187,479.4765,0.834,0.209,3417.06 +True,0.29,,foak-fast-kernels-liger,2e-05,,,74829.0,72433116672.0,43467220480.0,mistralai/Mistral-7B-v0.1,1,,8,,,bfloat16,0.8387984752655029,931.6364,0.859,0.107,3517.252 +True,0.58,,foak-fast-kernels-liger,2e-05,,,79041.0,77144641024.0,43468006912.0,mistralai/Mistral-7B-v0.1,1,,16,,,bfloat16,0.8310897159576416,1837.2742,0.871,0.054,3567.023 +True,,,foak-fast-kernels-liger,2e-05,,,80539.0,,,mistralai/Mistral-7B-v0.1,1,,32,,,bfloat16,,,,, +True,0.15,,foak-fast-kernels,0.0002,16.0,0.1,27511.0,23530188288.0,14664508928.0,mistralai/Mistral-7B-v0.1,1,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8601302146911621,420.488,0.951,0.238,3896.426 +True,0.29,,foak-fast-kernels,0.0002,16.0,0.1,40271.0,32393276928.0,14664902144.0,mistralai/Mistral-7B-v0.1,1,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8624934101104736,832.3628,0.961,0.12,3936.745 +True,0.58,,foak-fast-kernels,0.0002,16.0,0.1,65793.0,50119454208.0,14665688576.0,mistralai/Mistral-7B-v0.1,1,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8582536506652833,1660.5425,0.964,0.06,3946.662 +True,,,foak-fast-kernels,0.0002,16.0,0.1,73377.0,,,mistralai/Mistral-7B-v0.1,1,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.15,,foak-fast-kernels-liger,0.0002,16.0,0.1,23833.0,23530188288.0,14664508928.0,mistralai/Mistral-7B-v0.1,1,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8599755954742432,419.5123,0.953,0.238,3905.488 +True,0.29,,foak-fast-kernels-liger,0.0002,16.0,0.1,32921.0,32393276928.0,14664902144.0,mistralai/Mistral-7B-v0.1,1,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8625451850891114,831.0694,0.963,0.12,3942.872 +True,0.58,,foak-fast-kernels-liger,0.0002,16.0,0.1,51099.0,50119454208.0,14665688576.0,mistralai/Mistral-7B-v0.1,1,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8574607944488526,1658.6224,0.965,0.06,3951.231 
+True,,,foak-fast-kernels-liger,0.0002,16.0,0.1,78031.0,,,mistralai/Mistral-7B-v0.1,1,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.15,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,18809.0,13064809472.0,4306512384.0,mistralai/Mistral-7B-v0.1,1,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8750344657897949,403.2783,0.992,0.248,4062.703 +True,0.29,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,31953.0,21823466496.0,4306905600.0,mistralai/Mistral-7B-v0.1,1,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8779727077484131,786.5424,1.017,0.127,4166.082 +True,0.58,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,59137.0,39338994688.0,4307692032.0,mistralai/Mistral-7B-v0.1,1,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8731959533691406,1559.0449,1.026,0.064,4203.599 +True,1.16,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,78209.0,74371837952.0,4309264896.0,mistralai/Mistral-7B-v0.1,1,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.872735185623169,3097.7032,1.033,0.032,4231.264 +True,0.15,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,14171.0,13035024896.0,4306512384.0,mistralai/Mistral-7B-v0.1,1,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8749488735198975,401.8581,0.995,0.249,4077.061 +True,0.29,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,23323.0,21761798656.0,4306905600.0,mistralai/Mistral-7B-v0.1,1,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8784359264373779,785.9738,1.018,0.127,4169.096 +True,0.58,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,41883.0,39215346176.0,4307692032.0,mistralai/Mistral-7B-v0.1,1,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8728489208221436,1555.8043,1.028,0.064,4212.355 +True,1.16,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,79005.0,74122441216.0,4309264896.0,mistralai/Mistral-7B-v0.1,1,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8713663196563721,3091.0715,1.035,0.032,4240.342 +,0.15,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,18553.0,13095119872.0,4336822784.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,4,16.0,q_proj k_proj v_proj o_proj,float16,0.9928509044647216,414.9265,0.964,0.241,3948.652 +,0.29,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,32337.0,21853776896.0,4337216000.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,8,16.0,q_proj k_proj v_proj o_proj,float16,0.9894316577911376,816.7654,0.979,0.122,4011.923 +,0.58,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,59649.0,39369305088.0,4338002432.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,16,16.0,q_proj k_proj v_proj o_proj,float16,0.982344207763672,1619.3601,0.988,0.062,4047.031 +,1.15,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,78475.0,74402148352.0,4339575296.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,32,16.0,q_proj k_proj v_proj o_proj,float16,0.9788006114959716,3190.2664,1.003,0.031,4108.497 +,0.15,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,14235.0,13065335808.0,4336822784.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,4,16.0,q_proj k_proj v_proj o_proj,float16,1.0111017036437988,414.9576,0.964,0.241,3948.355 +,0.29,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,23707.0,21792109568.0,4337216000.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,8,16.0,q_proj k_proj v_proj o_proj,float16,0.9891231441497804,817.2045,0.979,0.122,4009.768 +,0.58,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,42395.0,39245657088.0,4338002432.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,16,16.0,q_proj k_proj v_proj o_proj,float16,0.9979404735565186,1618.9493,0.988,0.062,4048.057 
+,1.15,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,79517.0,74152752128.0,4339575296.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,32,16.0,q_proj k_proj v_proj o_proj,float16,1.0506530570983887,3184.3913,1.005,0.031,4116.077 +True,0.29,,foak-fast-kernels,2e-05,,,75537.0,72434853888.0,57951373312.0,mistralai/Mistral-7B-v0.1,2,,4,,,bfloat16,0.8286205387115478,504.3425,1.586,0.198,3248.586 +True,0.58,,foak-fast-kernels,2e-05,,,81209.0,72435247104.0,57951766528.0,mistralai/Mistral-7B-v0.1,2,,8,,,bfloat16,0.8197736072540284,1094.8571,1.461,0.091,2992.902 +True,,,foak-fast-kernels,2e-05,,,80994.0,,,mistralai/Mistral-7B-v0.1,2,,16,,,bfloat16,,,,, +True,,,foak-fast-kernels,2e-05,,,72883.0,,,mistralai/Mistral-7B-v0.1,2,,32,,,bfloat16,,,,, +True,0.29,,foak-fast-kernels-liger,2e-05,,,78251.0,72434853888.0,57951373312.0,mistralai/Mistral-7B-v0.1,2,,4,,,bfloat16,0.8286735343933106,503.738,1.588,0.199,3252.484 +True,0.58,,foak-fast-kernels-liger,2e-05,,,79908.0,72435247104.0,57951766528.0,mistralai/Mistral-7B-v0.1,2,,8,,,bfloat16,0.8198539209365845,959.9081,1.667,0.104,3413.66 +True,,,foak-fast-kernels-liger,2e-05,,,80553.0,,,mistralai/Mistral-7B-v0.1,2,,16,,,bfloat16,,,,, +True,,,foak-fast-kernels-liger,2e-05,,,78785.0,,,mistralai/Mistral-7B-v0.1,2,,32,,,bfloat16,,,,, +True,0.29,,foak-fast-kernels,0.0002,16.0,0.1,23845.0,21219418112.0,7368243200.0,mistralai/Mistral-7B-v0.1,2,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8595081615447998,446.6717,1.791,0.224,3668.019 +True,0.58,,foak-fast-kernels,0.0002,16.0,0.1,38141.0,34109038592.0,7368636416.0,mistralai/Mistral-7B-v0.1,2,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8571683597564698,875.4589,1.828,0.114,3742.951 +True,1.16,,foak-fast-kernels,0.0002,16.0,0.1,66197.0,59888279552.0,7369422848.0,mistralai/Mistral-7B-v0.1,2,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8570511913299561,1727.5688,1.852,0.058,3793.539 +True,,,foak-fast-kernels,0.0002,16.0,0.1,80157.0,,,mistralai/Mistral-7B-v0.1,2,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.29,,foak-fast-kernels-liger,0.0002,16.0,0.1,24391.0,21219418112.0,7368243200.0,mistralai/Mistral-7B-v0.1,2,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.859750566482544,446.2936,1.793,0.224,3671.126 +True,0.58,,foak-fast-kernels-liger,0.0002,16.0,0.1,38599.0,34109038592.0,7368636416.0,mistralai/Mistral-7B-v0.1,2,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8571057224273682,874.4754,1.83,0.114,3747.161 +True,1.16,,foak-fast-kernels-liger,0.0002,16.0,0.1,67119.0,59888279552.0,7369422848.0,mistralai/Mistral-7B-v0.1,2,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8565524101257325,1726.117,1.854,0.058,3796.73 +True,,,foak-fast-kernels-liger,0.0002,16.0,0.1,78998.0,,,mistralai/Mistral-7B-v0.1,2,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.29,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,19778.0,15685985280.0,2244738048.0,mistralai/Mistral-7B-v0.1,2,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8795236587524414,412.1764,1.941,0.243,3974.997 +True,0.58,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,33700.0,28439290880.0,2245131264.0,mistralai/Mistral-7B-v0.1,2,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8764357566833496,798.0677,2.005,0.125,4105.917 +True,1.16,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,61093.0,53945902080.0,2245917696.0,mistralai/Mistral-7B-v0.1,2,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8760993480682373,1576.0592,2.03,0.063,4158.22 
+True,,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,79604.0,,,mistralai/Mistral-7B-v0.1,2,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.29,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,19963.0,15685985280.0,2244738048.0,mistralai/Mistral-7B-v0.1,2,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.878936014175415,411.7593,1.943,0.243,3979.023 +True,0.58,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,34082.0,28439290880.0,2245131264.0,mistralai/Mistral-7B-v0.1,2,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8762027359008789,797.1272,2.007,0.125,4110.762 +True,1.16,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,61730.0,53945902080.0,2245917696.0,mistralai/Mistral-7B-v0.1,2,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8754921627044677,1573.6818,2.033,0.064,4164.501 +True,,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,79707.0,,,mistralai/Mistral-7B-v0.1,2,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +,0.29,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,20581.0,15703516672.0,2261416960.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,4,16.0,q_proj k_proj v_proj o_proj,float16,1.1056310272216796,425.0725,1.882,0.235,3854.401 +,0.58,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,34556.0,28456822272.0,2261810176.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,8,16.0,q_proj k_proj v_proj o_proj,float16,0.9874585056304932,830.9632,1.925,0.12,3943.375 +,1.15,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,61815.0,53963433472.0,2262596608.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,16,16.0,q_proj k_proj v_proj o_proj,float16,0.9895050239562988,1637.2755,1.954,0.061,4002.747 +,,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,78173.0,,,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,32,16.0,q_proj k_proj v_proj o_proj,float16,,,,, +,0.29,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,20743.0,15703516672.0,2261416960.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,4,16.0,q_proj k_proj v_proj o_proj,float16,1.0293195629119871,425.1526,1.882,0.235,3853.675 +,0.58,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,34887.0,28456822272.0,2261810176.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,8,16.0,q_proj k_proj v_proj o_proj,float16,0.98430645942688,830.265,1.927,0.12,3946.692 +,1.15,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,62481.0,53963433472.0,2262596608.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,16,16.0,q_proj k_proj v_proj o_proj,float16,1.004049482345581,1635.4433,1.957,0.061,4007.232 +,,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,79547.0,,,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,32,16.0,q_proj k_proj v_proj o_proj,float16,,,,, +True,0.58,,foak-fast-kernels,2e-05,,,51099.5,36226152448.0,28984412160.0,mistralai/Mistral-7B-v0.1,4,,4,,,bfloat16,0.8197227716445923,492.7218,3.247,0.203,3325.203 +True,1.16,,foak-fast-kernels,2e-05,,,58746.5,39783938560.0,28984805376.0,mistralai/Mistral-7B-v0.1,4,,8,,,bfloat16,0.810418028831482,948.1775,3.375,0.105,3455.893 +True,2.33,,foak-fast-kernels,2e-05,,,76774.0,57299466752.0,28985591808.0,mistralai/Mistral-7B-v0.1,4,,16,,,bfloat16,0.7932196092605591,1860.555,3.44,0.054,3522.39 +True,,,foak-fast-kernels,2e-05,,,81199.0,,,mistralai/Mistral-7B-v0.1,4,,32,,,bfloat16,,,,, +True,0.58,,foak-fast-kernels-liger,2e-05,,,47698.5,36226152448.0,28984412160.0,mistralai/Mistral-7B-v0.1,4,,4,,,bfloat16,0.8197373056411743,492.5593,3.248,0.203,3326.3 +True,1.16,,foak-fast-kernels-liger,2e-05,,,55799.5,39732781056.0,28984805376.0,mistralai/Mistral-7B-v0.1,4,,8,,,bfloat16,0.8104191637039184,948.2377,3.375,0.105,3455.674 
+True,2.33,,foak-fast-kernels-liger,2e-05,,,71515.5,56381022208.0,28985591808.0,mistralai/Mistral-7B-v0.1,4,,16,,,bfloat16,0.7934608507156372,1858.6873,3.443,0.054,3525.929 +True,,,foak-fast-kernels-liger,2e-05,,,81213.0,,,mistralai/Mistral-7B-v0.1,4,,32,,,bfloat16,,,,, +True,0.58,,foak-fast-kernels,0.0002,16.0,0.1,20758.0,17544448000.0,3692847104.0,mistralai/Mistral-7B-v0.1,4,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8567444038391113,446.5088,3.583,0.224,3669.356 +True,1.16,,foak-fast-kernels,0.0002,16.0,0.1,34894.0,30434068480.0,3693240320.0,mistralai/Mistral-7B-v0.1,4,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8568990898132324,868.1502,3.686,0.115,3774.462 +True,2.33,,foak-fast-kernels,0.0002,16.0,0.1,62814.0,56213309440.0,3694026752.0,mistralai/Mistral-7B-v0.1,4,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8558178234100342,1715.4042,3.731,0.058,3820.441 +True,,,foak-fast-kernels,0.0002,16.0,0.1,80781.0,,,mistralai/Mistral-7B-v0.1,4,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.58,,foak-fast-kernels-liger,0.0002,16.0,0.1,21304.0,17544448000.0,3692847104.0,mistralai/Mistral-7B-v0.1,4,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8567856788635254,445.9824,3.588,0.224,3673.687 +True,1.16,,foak-fast-kernels-liger,0.0002,16.0,0.1,35551.0,30434068480.0,3693240320.0,mistralai/Mistral-7B-v0.1,4,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8569093990325928,867.0898,3.691,0.115,3779.078 +True,2.33,,foak-fast-kernels-liger,0.0002,16.0,0.1,63711.0,56213309440.0,3694026752.0,mistralai/Mistral-7B-v0.1,4,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8561593055725097,1712.623,3.737,0.058,3826.645 +True,,,foak-fast-kernels-liger,0.0002,16.0,0.1,81094.0,,,mistralai/Mistral-7B-v0.1,4,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.58,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,19553.5,14682360832.0,1241113600.0,mistralai/Mistral-7B-v0.1,4,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.873923568725586,414.3866,3.861,0.241,3953.796 +True,1.16,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,33586.0,27435666432.0,1241506816.0,mistralai/Mistral-7B-v0.1,4,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8746437931060791,800.1313,3.999,0.125,4095.328 +True,2.33,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,61156.0,52942277632.0,1242293248.0,mistralai/Mistral-7B-v0.1,4,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.873357572555542,1576.6662,4.059,0.063,4156.619 +True,,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,80633.0,,,mistralai/Mistral-7B-v0.1,4,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, +True,0.58,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,19793.5,14682360832.0,1241113600.0,mistralai/Mistral-7B-v0.1,4,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8740975952148438,413.8023,3.867,0.242,3959.378 +True,1.16,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,33930.0,27435666432.0,1241506816.0,mistralai/Mistral-7B-v0.1,4,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8744520854949951,799.4148,4.003,0.125,4098.999 +True,2.33,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,61638.5,52942277632.0,1242293248.0,mistralai/Mistral-7B-v0.1,4,lora,16,16.0,q_proj k_proj v_proj o_proj,bfloat16,0.8735318374633789,1575.5873,4.062,0.063,4159.465 +True,,,accelerated-peft-bnb-foak-liger,0.0002,16.0,0.1,80901.0,,,mistralai/Mistral-7B-v0.1,4,lora,32,16.0,q_proj k_proj v_proj o_proj,bfloat16,,,,, 
+,0.58,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,21609.0,14693076480.0,1250976768.0,TheBloke/Mistral-7B-v0.1-GPTQ,4,lora,4,16.0,q_proj k_proj v_proj o_proj,float16,0.9958928966522216,427.3794,3.744,0.234,3833.596 +,1.15,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,35645.0,27446382080.0,1251369984.0,TheBloke/Mistral-7B-v0.1-GPTQ,4,lora,8,16.0,q_proj k_proj v_proj o_proj,float16,1.017822380065918,830.5326,3.853,0.12,3945.42 +,2.27,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,62819.5,52952993280.0,1252156416.0,TheBloke/Mistral-7B-v0.1-GPTQ,4,lora,16,16.0,q_proj k_proj v_proj o_proj,float16,1.0091158390045163,1638.8153,3.905,0.061,3998.986 +,,True,accelerated-peft-autogptq-foak,0.0002,16.0,0.1,78291.0,,,TheBloke/Mistral-7B-v0.1-GPTQ,4,lora,32,16.0,q_proj k_proj v_proj o_proj,float16,,,,, +,0.58,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,21592.5,14693076480.0,1250976768.0,TheBloke/Mistral-7B-v0.1-GPTQ,4,lora,4,16.0,q_proj k_proj v_proj o_proj,float16,1.024254894256592,428.6955,3.732,0.233,3821.827 +,1.15,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,35828.5,27446382080.0,1251369984.0,TheBloke/Mistral-7B-v0.1-GPTQ,4,lora,8,16.0,q_proj k_proj v_proj o_proj,float16,1.007179250717163,830.4849,3.853,0.12,3945.647 +,2.27,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,63458.0,52952993280.0,1252156416.0,TheBloke/Mistral-7B-v0.1-GPTQ,4,lora,16,16.0,q_proj k_proj v_proj o_proj,float16,0.996524419784546,1636.0771,3.912,0.061,4005.679 +,,True,accelerated-peft-autogptq-foak-liger,0.0002,16.0,0.1,80537.0,,,TheBloke/Mistral-7B-v0.1-GPTQ,4,lora,32,16.0,q_proj k_proj v_proj o_proj,float16,,,,, diff --git a/scripts/benchmarks/refs/requirements_liger.txt b/scripts/benchmarks/refs/requirements_liger.txt new file mode 100644 index 00000000..fff46200 --- /dev/null +++ b/scripts/benchmarks/refs/requirements_liger.txt @@ -0,0 +1,87 @@ +accelerate==1.0.1 +aiohappyeyeballs==2.4.3 +aiohttp==3.11.0 +aiosignal==1.3.1 +async-timeout==5.0.1 +attrs==24.2.0 +bitsandbytes==0.43.3 +certifi==2024.8.30 +charset-normalizer==3.4.0 +contourpy==1.3.1 +cycler==0.12.1 +datasets==2.21.0 +dill==0.3.8 +docstring_parser==0.16 +einops==0.8.0 +filelock==3.16.1 +flash-attn==2.7.0.post2 +-e git+https://github.com/foundation-model-stack/fms-acceleration.git@d58960c46f990e3a805ce95a2d4cdee7dc831e19#egg=fms_acceleration&subdirectory=plugins/framework +-e git+https://github.com/foundation-model-stack/fms-acceleration.git@d58960c46f990e3a805ce95a2d4cdee7dc831e19#egg=fms_acceleration_aadp&subdirectory=plugins/attention-and-distributed-packing +-e git+https://github.com/foundation-model-stack/fms-acceleration.git@d58960c46f990e3a805ce95a2d4cdee7dc831e19#egg=fms_acceleration_foak&subdirectory=plugins/fused-ops-and-kernels +-e git+https://github.com/foundation-model-stack/fms-acceleration.git@d58960c46f990e3a805ce95a2d4cdee7dc831e19#egg=fms_acceleration_peft&subdirectory=plugins/accelerated-peft +fms-hf-tuning @ git+https://github.com/foundation-model-stack/fms-hf-tuning.git@398c2a8fe26d734344240555585d95e05299faa8 +fonttools==4.54.1 +frozenlist==1.5.0 +fsspec==2024.6.1 +huggingface-hub==0.26.2 +idna==3.10 +Jinja2==3.1.4 +kiwisolver==1.4.7 +llvmlite==0.43.0 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +matplotlib==3.9.2 +mdurl==0.1.2 +mpmath==1.3.0 +multidict==6.1.0 +multiprocess==0.70.16 +networkx==3.4.2 +numba==0.60.0 +numpy==1.26.4 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==9.1.0.70 
+nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.77 +nvidia-nvtx-cu12==12.1.105 +packaging==24.2 +pandas==2.2.3 +peft==0.13.2 +pillow==11.0.0 +propcache==0.2.0 +protobuf==5.28.3 +psutil==6.1.0 +pyarrow==18.0.0 +Pygments==2.18.0 +pyparsing==3.2.0 +python-dateutil==2.9.0.post0 +pytz==2024.2 +PyYAML==6.0.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +safetensors==0.4.5 +sentencepiece==0.2.0 +shtab==1.7.1 +simpleeval==0.9.13 +six==1.16.0 +sympy==1.13.3 +threadpoolctl==3.5.0 +tokenizers==0.20.3 +torch==2.4.1 +tqdm==4.67.0 +transformers==4.45.2 +triton==3.0.0 +trl==0.11.4 +typing_extensions==4.12.2 +tyro==0.8.14 +tzdata==2024.2 +urllib3==2.2.3 +xxhash==3.5.0 +yarl==1.17.1 diff --git a/scripts/benchmarks/scenarios-liger.yaml b/scripts/benchmarks/scenarios-liger.yaml new file mode 100644 index 00000000..8004441d --- /dev/null +++ b/scripts/benchmarks/scenarios-liger.yaml @@ -0,0 +1,100 @@ +# This file holds a list of scenarios that may be run. +# - to limit the run to a subset of scenarios, use the --run-only-scenarios flag. +# - Each scenario will be run against a particular acceleration framework +# config, if the framework_config: key is specified. +# * if multiple framework configs are listed, the scenario is run once per config +# - the arguments tag holds arguments to be passed to sft_trainer +# * the arguments are singular except for model_name_or_path, which can take +# multiple values. +# - So anything that is critical for the scenario MUST be specified here +# and not in the defaults, e.g. fp16 + +# This stanza will be used in the future to replace the custom processing functions in data_processing.py +# data_processing: +# dataset_name: yahma/alpaca-cleaned +# chat_template: | +# {%- for message in messages %} +# {% if message['input'] != '' %} +# Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. + +# {% else %} +# Below is an instruction that describes a task. Write a response that appropriately completes the request.
+ +# {% endif %} +# ### Instruction: +# {{ message['instruction'] }} + +# {% if message['input'] != '' %} +# ### Input: +# {{ message['input'] }} + +# {% endif %} +# ### Response: +# {{ message['output'] + eos_token }} +# {% endfor %} +# tokenize: True + + +scenarios: + - name: full-finetuning + framework_config: + - foak-fast-kernels + - foak-fast-kernels-liger + arguments: + learning_rate: 2e-5 + model_name_or_path: + - 'mistralai/Mistral-7B-v0.1' + - 'meta-llama/Meta-Llama-3-8B' + torch_dtype: bfloat16 + bf16: True + + - name: standard-peft + framework_config: + - foak-fast-kernels + - foak-fast-kernels-liger + arguments: + bf16: True + learning_rate: 2e-4 + torch_dtype: bfloat16 + peft_method: lora + r: 16 + lora_alpha: 16 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"] + model_name_or_path: + - 'mistralai/Mistral-7B-v0.1' + - 'meta-llama/Meta-Llama-3-8B' + + - name: accelerated-peft-bnb + framework_config: + - accelerated-peft-bnb-foak + - accelerated-peft-bnb-foak-liger + arguments: + bf16: True + learning_rate: 2e-4 + torch_dtype: bfloat16 + peft_method: lora + r: 16 + lora_alpha: 16 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"] + model_name_or_path: + - 'mistralai/Mistral-7B-v0.1' + - 'meta-llama/Meta-Llama-3-8B' + + - name: accelerated-peft-gptq + framework_config: + - accelerated-peft-autogptq-foak + - accelerated-peft-autogptq-foak-liger + arguments: + learning_rate: 2e-4 + fp16: True # running gptq-lora in float16 is more performant, see issue + torch_dtype: float16 # https://github.com/foundation-model-stack/fms-acceleration/issues/84 + peft_method: lora + r: 16 + lora_alpha: 16 + lora_dropout: 0.1 + target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"] + model_name_or_path: + - 'TheBloke/Mistral-7B-v0.1-GPTQ' + - 'TechxGenus/Meta-Llama-3-8B-GPTQ' diff --git a/scripts/generate_sample_configurations.py b/scripts/generate_sample_configurations.py index ff775c8e..6232dce6 100644 --- a/scripts/generate_sample_configurations.py +++ b/scripts/generate_sample_configurations.py @@ -144,10 +144,13 @@ def read_configuration(path: str) -> Dict: KEY_BNB_NF4 = "bnb-nf4" KEY_BNB_NF4_BASELINE = "baseline-bnb-nf4" KEY_AUTO_GPTQ_FOAK = "auto-gptq-foak" +KEY_AUTO_GPTQ_FOAK_LIGER = "auto-gptq-foak-liger" KEY_BNB_NF4_FOAK = "bnb-nf4-foak" +KEY_BNB_NF4_FOAK_LIGER = "bnb-nf4-foak-liger" KEY_AADP_PADDING_FREE = "aadp-padding-free" KEY_AADP_MULTIPACK = "aadp-multipack" KEY_FAST_KERNELS = "foak-fast-kernels" +KEY_FAST_KERNELS_LIGER = "foak-fast-kernels-liger" KEY_SCATTERMOE_EP1 = "moe-scattermoe-ep1" KEY_SCATTERMOE_EP2 = 'moe-scattermoe-ep2' KEY_SCATTERMOE_EP4 = 'moe-scattermoe-ep4' @@ -170,13 +173,22 @@ def read_configuration(path: str) -> Dict: "plugins/fused-ops-and-kernels/configs/fast_quantized_peft.yaml", [("peft.quantization.fused_ops_and_kernels.base_layer", "auto_gptq")], ), + KEY_AUTO_GPTQ_FOAK_LIGER: ( + "plugins/fused-ops-and-kernels/configs/fast_quantized_peft_liger.yaml", + [("peft.quantization.fused_ops_and_kernels.base_layer", "auto_gptq")], + ), KEY_BNB_NF4_FOAK: ( "plugins/fused-ops-and-kernels/configs/fast_quantized_peft.yaml", [("peft.quantization.fused_ops_and_kernels.base_layer", "bitsandbytes")], ), + KEY_BNB_NF4_FOAK_LIGER: ( + "plugins/fused-ops-and-kernels/configs/fast_quantized_peft_liger.yaml", + [("peft.quantization.fused_ops_and_kernels.base_layer", "bitsandbytes")], + ), KEY_AADP_PADDING_FREE: "plugins/attention-and-distributed-packing/configs/padding_free.yaml",
KEY_AADP_MULTIPACK: "plugins/attention-and-distributed-packing/configs/multipack.yaml", KEY_FAST_KERNELS: "plugins/fused-ops-and-kernels/configs/fast_kernels.yaml", + KEY_FAST_KERNELS_LIGER: "plugins/fused-ops-and-kernels/configs/fast_kernels_liger.yaml", KEY_SCATTERMOE_EP1: "plugins/accelerated-moe/configs/scattermoe.yaml", KEY_SCATTERMOE_EP2: ( "plugins/accelerated-moe/configs/scattermoe.yaml", @@ -203,6 +215,8 @@ def read_configuration(path: str) -> Dict: ("baseline-peft-bnb-nf4", (KEY_BNB_NF4_BASELINE,)), ("accelerated-peft-autogptq-foak", (KEY_AUTO_GPTQ, KEY_AUTO_GPTQ_FOAK)), ("accelerated-peft-bnb-nf4-foak", (KEY_BNB_NF4, KEY_BNB_NF4_FOAK)), + ("accelerated-peft-autogptq-foak-liger", (KEY_AUTO_GPTQ, KEY_AUTO_GPTQ_FOAK_LIGER)), + ("accelerated-peft-bnb-nf4-foak-liger", (KEY_BNB_NF4, KEY_BNB_NF4_FOAK_LIGER)), ("aadp-padding-free", (KEY_AADP_PADDING_FREE,)), ("accelerated-peft-autogptq-padding-free", (KEY_AADP_PADDING_FREE,KEY_AUTO_GPTQ)), ("accelerated-peft-bnb-nf4-padding-free", (KEY_AADP_PADDING_FREE,KEY_BNB_NF4)), @@ -210,6 +224,7 @@ def read_configuration(path: str) -> Dict: ("accelerated-peft-bnb-nf4-foak-padding-free", (KEY_AADP_PADDING_FREE,KEY_BNB_NF4, KEY_BNB_NF4_FOAK)), ("aadp-padding-free-multipack", (KEY_AADP_PADDING_FREE, KEY_AADP_MULTIPACK)), ("foak-fast-kernels", (KEY_FAST_KERNELS,)), + ("foak-fast-kernels-liger", (KEY_FAST_KERNELS_LIGER,)), ("moe-scattermoe-granite-ep1", (KEY_SCATTERMOE_EP1,)), ("moe-scattermoe-granite-ep1-padding-free", (KEY_AADP_PADDING_FREE, KEY_SCATTERMOE_EP1,)), ("moe-scattermoe-granite-ep1-padding-free-foak", (KEY_AADP_PADDING_FREE, KEY_FAST_KERNELS, KEY_SCATTERMOE_EP1,)), @@ -223,7 +238,6 @@ def read_configuration(path: str) -> Dict: ("moe-scattermoe-granite-ep8-foak", (KEY_FAST_KERNELS, KEY_SCATTERMOE_EP8,)), ] - # TODO: throw error if merge conflicts def merge_configs(config_contents: List[Dict]): "helper function to merge configuration contents."