diff --git a/.github/azure-gpu-test.yml b/.github/azure-gpu-test.yml new file mode 100644 index 0000000..9ed3462 --- /dev/null +++ b/.github/azure-gpu-test.yml @@ -0,0 +1,64 @@ +trigger: + branches: + include: + - "main" + - "wip" + +pr: + branches: + include: + - "main" + - "wip" + - "carmocca/*" + +jobs: + - job: testing + timeoutInMinutes: "20" + cancelTimeoutInMinutes: "2" + pool: "lit-rtx-3090" + variables: + DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) + container: + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0" + options: "--gpus=all --shm-size=8gb" + workspace: + clean: all + steps: + + - bash: | + echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" + displayName: 'set env. vars' + + - bash: | + echo $(DEVICES) + echo $CUDA_VISIBLE_DEVICES + whereis nvidia + nvidia-smi + which python && which pip + python --version + pip --version + pip list + displayName: "Image info & NVIDIA" + + - script: | + pip install -r requirements-all.txt pytest pytest-rerunfailures transformers>=4.38.0 einops protobuf + displayName: 'Install dependencies' + + - bash: | + set -e + pip list + python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" + displayName: "Env details" + + - bash: pytest -v --disable-pytest-warnings --strict-markers --color=yes + displayName: 'Ordinary tests' + env: + PL_RUN_CUDA_TESTS: "1" + timeoutInMinutes: "5" + + - bash: bash run_standalone_tests.sh + workingDirectory: tests + env: + PL_RUN_CUDA_TESTS: "1" + displayName: "Standalone tests" + timeoutInMinutes: "5" diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml new file mode 100644 index 0000000..3b0de8d --- /dev/null +++ b/.github/workflows/cpu-tests.yml @@ -0,0 +1,75 @@ +name: CPU tests + +on: + push: + branches: [main, wip] + pull_request: + branches: [main, "carmocca/*", wip] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +defaults: + run: + shell: bash + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + +jobs: + cpu-tests: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + include: + - {os: "macOS-12", python-version: "3.10"} + - {os: "ubuntu-22.04", python-version: "3.11"} + - {os: "ubuntu-22.04", python-version: "3.10"} + - {os: "ubuntu-22.04", python-version: "3.9"} + - {os: "ubuntu-20.04", python-version: "3.8"} + - {os: "windows-2022", python-version: "3.10"} + timeout-minutes: 25 + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + cache-dependency-path: | + requirements.txt + requirements-all.txt + setup.py + + - name: Install minimal dependencies + run: | + pip install -r requirements.txt + pip list + # make sure all modules are importable + modules=$( + find * -type f -name "*.py" | \ + grep -v tests | grep "/" | grep -v lm_eval | grep -v xla | grep -v prepare_slimpajama | grep -v prepare_starcoder | \ + sed 's/\.py$//' | sed 's/\//./g' | \ + sed 's/.__init__//g' | xargs -I {} echo "import {};" + ) + echo "$modules" + python -c "$modules" + + - name: Install all dependencies + run: | + pip install -r requirements-all.txt pytest pytest-rerunfailures pytest-timeout transformers>=4.38.0 einops protobuf + pip list + + - name: Run tests without the package installed + run: | + pytest -v --disable-pytest-warnings 
--strict-markers --color=yes --timeout 120 + + - name: Run tests + run: | + pip install . --no-deps + + pytest -v --disable-pytest-warnings --strict-markers --color=yes --timeout 120 diff --git a/.gitignore b/.gitignore index 5ea2657..6e6500e 100644 --- a/.gitignore +++ b/.gitignore @@ -9,8 +9,11 @@ build # data data datasets +!lit_gpt/data +!tests/data checkpoints out wandb +events.out.tfevents* -tests/original_phi_1_5.py +tests/reference_models diff --git a/README.md b/README.md index cac1d38..3e5e461 100644 --- a/README.md +++ b/README.md @@ -26,23 +26,28 @@ Hackable [implementation](lit_gpt/model.py) of state-of-the-art open-source larg Supports the following popular model checkpoints: -| Model and usage | Model size | Reference | -|--------------------------------------------------------------------------------|------------------------------------|--------------------------------------------------------------------------------------------------| -| Meta AI [Llama 2](tutorials/download_llama_2.md) | 7B, 13B, 70B | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) | -| Stability AI [FreeWilly2](tutorials/download_freewilly_2.md) (Stable Beluga 2) | 70B | [Stability AI 2023](https://stability.ai/blog/stable-beluga-large-instruction-fine-tuned-models) | -| Stability AI StableCode | 3B | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) | -| TII UAE [Falcon](tutorials/download_falcon.md) | 7B, 40B, 180B | [TII 2023](https://falconllm.tii.ae) | -| OpenLM Research [OpenLLaMA](tutorials/download_openllama.md) | 3B, 7B, 13B | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) | -| LMSYS [Vicuna](tutorials/download_vicuna.md) | 7B, 13B, 33B | [Li et al. 2023](https://lmsys.org/blog/2023-03-30-vicuna/) | -| LMSYS [LongChat](tutorials/download_longchat.md) | 7B, 13B | [LongChat Team 2023](https://lmsys.org/blog/2023-06-29-longchat/) | -| Together [RedPajama-INCITE](tutorials/download_redpajama_incite.md) | 3B, 7B | [Together 2023](https://together.ai/blog/redpajama-models-v1) | -| EleutherAI [Pythia](tutorials/download_pythia.md) | {70,160,410}M, {1,1.4,2.8,6.9,12}B | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) | -| StabilityAI [StableLM](tutorials/download_stablelm.md) | 3B, 7B | [Stability AI 2023](https://github.com/Stability-AI/StableLM) | -| Platypus | 7B, 13B, 70B | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) | -| NousResearch Nous-Hermes | 7B, 13B, 70B | [Org page](https://huggingface.co/NousResearch) | -| Meta AI [Code Llama](tutorials/download_code_llama.md) | 7B, 13B, 34B | [RoziΓ¨re et al. 2023](https://arxiv.org/abs/2308.12950) | -| Microsoft Research [phi-1.5](tutorials/download_phi15.md) | 1.3B | [Li et al. 2023](https://arxiv.org/abs/2309.05463) | -| Mistral AI [Mistral](tutorials/download_mistral.md) | 7B | [Mistral website](https://mistral.ai/) | +| Model | Model size | Reference | +|--------------------------------------------------------------------------------------|------------------------------------------|------------------------------------------------------------------------------------------------------------------------------| +| [Code Llama](tutorials/download_code_llama.md) by Meta AI | 7B, 13B, 34B, 70B | [RoziΓ¨re et al. 2023](https://arxiv.org/abs/2308.12950) | +| [Dolly](tutorials/download_dolly.md) by Databricks | 3B, 7B, 12B | [Conover et al. 
2023](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) | +| [Falcon](tutorials/download_falcon.md) by TII UAE | 7B, 40B, 180B | [TII 2023](https://falconllm.tii.ae) | +| [FreeWilly2](tutorials/download_freewilly_2.md) (Stable Beluga 2) by Stability AI | 70B | [Stability AI 2023](https://stability.ai/blog/stable-beluga-large-instruction-fine-tuned-models) | +| [Function Calling Llama 2](tutorials/download_function_calling_llama_2.md) by Trelis | 7B | [Trelis et al. 2023](https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2) | +| [Gemma](tutorials/download_gemma.md) by Google | 2B, 7B | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-report.pdf) | +| [Llama 2](tutorials/download_llama_2.md) by Meta AI | 7B, 13B, 70B | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) | +| [LongChat](tutorials/download_longchat.md) by LMSYS | 7B, 13B | [LongChat Team 2023](https://lmsys.org/blog/2023-06-29-longchat/) | +| [Mistral and Mixtral](tutorials/download_mistral.md) by Mistral AI | 7B | [Mistral website](https://mistral.ai/) | +| [Nous-Hermes](https://huggingface.co/NousResearch/Nous-Hermes-13b) by NousResearch | 7B, 13B, 70B | [Org page](https://huggingface.co/NousResearch) | +| [OpenLLaMA](tutorials/download_openllama.md) by OpenLM Research | 3B, 7B, 13B | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) | +| [Phi](tutorials/download_phi.md) by Microsoft Research | 1.3B, 2.7B | [Li et al. 2023](https://arxiv.org/abs/2309.05463) | +| [Platypus](https://huggingface.co/garage-bAInd/Platypus-30B) by Lee at el. | 7B, 13B, 70B | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) | +| [Pythia](tutorials/download_pythia.md) by EleutherAI | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) | +| [RedPajama-INCITE](tutorials/download_redpajama_incite.md) by Together | 3B, 7B | [Together 2023](https://together.ai/blog/redpajama-models-v1) | +| [StableCode](tutorials/download_stablecode.md) by Stability AI | 3B | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) | +| [StableLM](tutorials/download_stablelm.md) by Stability AI | 3B, 7B | [Stability AI 2023](https://github.com/Stability-AI/StableLM) | +| [StableLM Zephyr](tutorials/download_stablelm.md) by Stability AI | 3B | [Stability AI 2023](https://stability.ai/blog/stablecode-llm-generative-ai-coding) | +| [TinyLlama](tutorials/download_tinyllama.md) by Zhang et al. | 1.1B | [Zhang et al. 2023](https://github.com/jzhang38/TinyLlama) | +| [Vicuna](tutorials/download_vicuna.md) by LMSYS | 7B, 13B, 33B | [Li et al. 2023](https://lmsys.org/blog/2023-03-30-vicuna/) | This implementation extends on [Lit-LLaMA](https://github.com/lightning-AI/lit-llama) and [nanoGPT](https://github.com/karpathy/nanoGPT), and it's **powered by [Lightning Fabric](https://lightning.ai/docs/fabric/stable/) ⚑**. @@ -52,18 +57,12 @@ This implementation extends on [Lit-LLaMA](https://github.com/lightning-AI/lit-l **πŸ† NeurIPS 2023 Large Language Model Efficiency Challenge: 1 LLM + 1 GPU + 1 Day** -The Lit-GPT repository is the official starter kit for the [NeurIPS 2023 LLM Efficiency Challenge](https://llm-efficiency-challenge.github.io), which is a competition focused on finetuning an existing non-instruction tuned LLM for 24 hours on a single GPU. The competition has two tracks, one for the A100 and another for the 4090 GPUs. 
- -If you are interested in participating, you can learn more about the NeurIPS LLM Efficiency Challenge on the official website [here](https://llm-efficiency-challenge.github.io). Also see the [Lit-GPT NeurIPS Challenge Quickstart Guide](tutorials/neurips_challenge_quickstart.md) for helpful tips. - -**The submission deadline is Oct 25th, 2023.** +The Lit-GPT repository was the official starter kit for the [NeurIPS 2023 LLM Efficiency Challenge](https://llm-efficiency-challenge.github.io), which is a competition focused on finetuning an existing non-instruction tuned LLM for 24 hours on a single GPU. --- -   - ## Lit-GPT design principles This repository follows the main principle of **openness through clarity**. @@ -94,26 +93,12 @@ git clone https://github.com/Lightning-AI/lit-gpt cd lit-gpt ``` -Install the minimal dependencies: - -```bash -pip install -r requirements.txt -``` - -Install with all dependencies (including quantization, sentencepiece, tokenizers for Llama models, etc.): +Install with all dependencies (including CLI, quantization, tokenizers for all models, etc.): ```bash pip install -r requirements-all.txt ``` -**(Optional) install Flash Attention 2** - -```bash -MAX_JOBS=4 pip install flash-attn --no-build-isolation -``` - -You are all set! πŸŽ‰ -   ## Use the model @@ -140,7 +125,7 @@ python chat/base.py ### Run large models on smaller consumer devices -We support 4-bit quantization (as in QLoRA), (bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq, gptq.int4) and 8-bit quantization (bnb.int8) for inference by following [this guide](tutorials/quantize.md). +We support 4-bit quantization (as in QLoRA), (bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq) and 8-bit quantization (bnb.int8) for inference by following [this guide](tutorials/quantize.md).   @@ -220,7 +205,7 @@ Follow this guide to start pretraining on ## Supported datasets -Lit-GPT includes a variety of dataset preparation scripts for finetuning and pretraining. Additional information about the datasets and dataset preparation is provided in the [Preparing Datasets](https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/prepare_dataset.md) tutorial. +Lit-GPT includes a variety of dataset preparation scripts for finetuning and pretraining. Additional information about the datasets and dataset preparation is provided in the [Preparing Datasets](tutorials/prepare_dataset.md) tutorial.   @@ -263,12 +248,26 @@ Don't forget to [join our Discord](https://discord.gg/VptPCZkGNa)! - [@karpathy](https://github.com/karpathy) for [nanoGPT](https://github.com/karpathy/nanoGPT) - [@EleutherAI](https://github.com/EleutherAI) for [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) and the [Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - [@TimDettmers](https://github.com/TimDettmers) for [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) -- [@IST-DASLab](https://github.com/IST-DASLab) for [GPTQ](https://github.com/IST-DASLab/gptq) - [@Microsoft](https://github.com/microsoft) for [LoRA](https://github.com/microsoft/LoRA) - [@tridao](https://github.com/tridao) for [Flash Attention 2](https://github.com/Dao-AILab/flash-attention)   +## Citation + +If you use Lit-GPT in your research, please cite the following work: + +```bibtex +@misc{lit-gpt-2023, + author = {Lightning AI}, + title = {Lit-GPT}, + howpublished = {\url{https://github.com/Lightning-AI/lit-gpt}}, + year = {2023}, +} +``` + +  + ## License Lit-GPT is released under the [Apache 2.0](https://github.com/Lightning-AI/lit-gpt/blob/main/LICENSE) license. 
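The README changes above route all low-bit inference through bitsandbytes (the `gptq.int4` path is removed elsewhere in this patch). As a point of reference, here is a minimal sketch — not part of the patch itself — of how a `bnb.*` mode string from that list is turned into a Lightning Fabric precision plugin, mirroring the pattern the finetuning scripts later in this diff use. It assumes `bitsandbytes` is installed (covered by the `requirements-all.txt` install above) and that a CUDA device is available:

```python
# Hedged sketch, not part of this patch: map a quantization flag such as
# "bnb.nf4" onto Lightning Fabric, the same way finetune/adapter.py in this
# diff builds its `plugins` before constructing the Fabric object.
import lightning as L
import torch
from lightning.fabric.plugins import BitsandbytesPrecision

quantize = "bnb.nf4"     # one of: bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq, bnb.int8
precision = "bf16-true"  # quantization pairs with a "true" precision, never a "mixed" one

plugins = None
if quantize is not None and quantize.startswith("bnb."):
    dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision]
    plugins = BitsandbytesPrecision(quantize[4:], dtype)  # strip the "bnb." prefix
    precision = None  # the plugin now owns precision handling

fabric = L.Fabric(devices=1, precision=precision, plugins=plugins)
```

The scripts below follow the same split: the quantization mode becomes a plugin, `precision` is reset to `None`, and mixed-precision settings are rejected up front because the two cannot be combined.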
diff --git a/chat/base.py b/chat/base.py index c52ec6c..51ca893 100644 --- a/chat/base.py +++ b/chat/base.py @@ -1,6 +1,9 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import re import sys import time +from json import dumps from pathlib import Path from typing import Iterator, List, Literal, Optional, Tuple @@ -12,19 +15,15 @@ wd = Path(__file__).parent.parent.resolve() sys.path.append(str(wd)) +from generate.base import next_token from lit_gpt import GPT, Config, Tokenizer -from lit_gpt.utils import ( - check_valid_checkpoint_dir, - get_default_supported_precision, - gptq_quantization, - load_checkpoint, -) +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, load_checkpoint @torch.inference_mode() def generate( model: GPT, - idx: torch.Tensor, + prompt: torch.Tensor, max_returned_tokens: int, *, temperature: float = 1.0, @@ -35,13 +34,13 @@ def generate( Args: model: The model to use. - idx: Tensor of shape (T) with indices of the prompt sequence. + prompt: Tensor of shape (T) with indices of the prompt sequence. max_returned_tokens: The maximum number of tokens to return (given plus generated). temperature: Scales the predicted logits by 1 / temperature top_k: If specified, only sample among the tokens with the k highest probabilities stop_tokens: If specified, stop generating any more token once one of this list is generated. """ - T = idx.size(0) + T = prompt.size(0) assert max_returned_tokens > T if model.max_seq_length < max_returned_tokens - 1: # rolling the kv cache based on the `input_pos` value would be necessary. However, doing so would introduce a @@ -49,81 +48,66 @@ def generate( # not support it to avoid negatively impacting the overall speed raise NotImplementedError(f"max_seq_length {model.max_seq_length} needs to be >= {max_returned_tokens - 1}") - device = idx.device - stop_tokens = [torch.tensor(tokens, device=device) for tokens in stop_tokens] - input_pos = torch.arange(0, T, device=device) - - # buffer holds the tokens that haven't been yield yet + device = prompt.device buffer_length = max((len(tokens) for tokens in stop_tokens), default=1) - buffer = torch.full((buffer_length,), -999, device=device) # fill with non-existing token - - yield_i = -1 - # generate up to a fixed number of tokens - for t in range(max_returned_tokens - T): - # forward - logits = model(idx.view(1, -1), input_pos) - logits = logits[0, -1] / temperature - - # optionally crop the logits to only the top k options - if top_k is not None: - v, _ = torch.topk(logits, min(top_k, logits.size(-1))) - logits = torch.where(logits < v[[-1]], -float("Inf"), logits) - - probs = torch.nn.functional.softmax(logits, dim=-1) - idx = torch.multinomial(probs, num_samples=1) - - # advance - input_pos = input_pos[-1:] + 1 - - # concatenate the new generation - buffer[min(t, buffer_length - 1)] = idx - + yield_i = 0 + input_pos = torch.arange(0, T, device=device) + tokens = [] + token = prompt + for t in range(1, max_returned_tokens - T + 1): + token = next_token(model, input_pos, token.view(1, -1), temperature=temperature, top_k=top_k) + tokens.append(token) # check the stop condition - for tokens in stop_tokens: - l = len(tokens) - if torch.equal(buffer[-l:], tokens): - # stop token hit, yield any leftovers that aren't part of it - if buffer_length > l: # avoid an empty yield - yield buffer[:-l] - return + if any((l := len(st)) <= len(tokens) and all(a == b for a, b in zip(tokens[-l:], st)) for st in stop_tokens): + return # if 
the buffer is full if t - yield_i >= buffer_length: # we know this idx is not part of stop tokens, safe to yield - yield buffer[0] - # roll once to the left, as next generation will be put at the end - buffer = torch.roll(buffer, -1, 0) - yield_i += 1 + yield from tokens[yield_i:t] + yield_i = t + input_pos = input_pos[-1:].add_(1) def decode(fabric: L.Fabric, tokenizer: Tokenizer, token_stream: Iterator[torch.Tensor]) -> int: tokens_generated = 0 if tokenizer.backend == "huggingface": - for token in token_stream: - fabric.print(tokenizer.decode(token), end="", flush=True) - tokens_generated += 1 + try: + for token in token_stream: + fabric.print(tokenizer.decode(token), end="", flush=True) + tokens_generated += 1 + except KeyboardInterrupt: + # support stopping generation + return tokens_generated elif tokenizer.backend == "sentencepiece": # sentencepiece does not support decoding token-by-token because it adds spaces based on the surrounding tokens # meaning that we need to decode everything each time so_far = torch.tensor([], dtype=torch.long, device=fabric.device) decoded_so_far = "" - for token in token_stream: - so_far = torch.cat((so_far, token.view(-1))) - decoded_new = tokenizer.decode(so_far) - fabric.print(decoded_new[len(decoded_so_far) :], end="", flush=True) - decoded_so_far = decoded_new - tokens_generated += 1 + try: + for token in token_stream: + so_far = so_far.to(device=token.device) + so_far = torch.cat((so_far, token.view(-1))) + decoded_new = tokenizer.decode(so_far) + fabric.print(decoded_new[len(decoded_so_far) :], end="", flush=True) + decoded_so_far = decoded_new + tokens_generated += 1 + except KeyboardInterrupt: + # support stopping generation + return tokens_generated else: raise NotImplementedError(tokenizer.backend) return tokens_generated +@torch.inference_mode() def main( *, - top_k: int = 200, + top_k: Optional[int] = 200, temperature: float = 0.8, checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-tuned-alpha-3b"), - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"]] = None, + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, precision: Optional[str] = None, + compile: bool = False, ) -> None: """Starts a conversation with a tuned GPT model. @@ -135,9 +119,9 @@ def main( quantize: Whether to quantize the model and using which method: - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes - bnb.int8: 8-bit quantization from bitsandbytes - - gptq.int4: 4-bit quantization from GPTQ for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md precision: Indicates the Fabric precision setting to use. + compile: Whether to use compilation to speed up token generation. Will increase startup time. 
""" precision = precision or get_default_supported_precision(training=False) @@ -155,25 +139,29 @@ def main( config = Config.from_json(checkpoint_dir / "lit_config.json") - if quantize == "gptq.int4": - model_file = "lit_model_gptq.4bit.pth" - if not (checkpoint_dir / model_file).is_file(): - raise ValueError("Please run `python quantize/gptq.py` first") - else: - model_file = "lit_model.pth" - checkpoint_path = checkpoint_dir / model_file + checkpoint_path = checkpoint_dir / "lit_model.pth" fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) - with fabric.init_module(empty_init=True), gptq_quantization(quantize == "gptq.int4"): + with fabric.init_module(empty_init=True): model = GPT(config) + # enable the kv cache + model.set_kv_cache(batch_size=1) load_checkpoint(fabric, model, checkpoint_path) - model.eval() + + if compile: + torch._dynamo.config.automatic_dynamic_shapes = True + torch._inductor.config.triton.unique_kernel_names = True + torch._inductor.config.coordinate_descent_tuning = True + global next_token + next_token = torch.compile(next_token, mode="reduce-overhead", dynamic=True) + model = fabric.setup_module(model) tokenizer = Tokenizer(checkpoint_dir) system_prompt, stop_tokens = prompt_config(checkpoint_dir, tokenizer) + L.seed_everything(1234) while True: try: prompt = input(">> Prompt: ") @@ -183,30 +171,26 @@ def main( break prompt = system_prompt.format(prompt=prompt) encoded_prompt = tokenizer.encode(prompt, device=fabric.device) - - with fabric.init_tensor(): - # enable the kv cache - model.set_kv_cache(batch_size=1) - y = generate( model, encoded_prompt, model.max_seq_length, temperature=temperature, top_k=top_k, stop_tokens=stop_tokens ) fabric.print(">> Reply: ", end="") - try: - t0 = time.perf_counter() - tokens_generated = decode(fabric, tokenizer, y) - t = time.perf_counter() - t0 - fabric.print( - f"\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr - ) - except KeyboardInterrupt: - # support stopping generation - pass + t0 = time.perf_counter() + tokens_generated = decode(fabric, tokenizer, y) + t = time.perf_counter() - t0 + for block in model.transformer.h: + block.attn.kv_cache.reset_parameters() + fabric.print( + f"\nTime for inference: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec," + f" {tokens_generated} tokens", + file=sys.stderr, + ) fabric.print() def prompt_config(checkpoint_dir: Path, tokenizer: Tokenizer) -> Tuple[str, Tuple[List[int], ...]]: checkpoint_name = str(checkpoint_dir) + if re.search(r"stabilityai.*tuned-alpha", checkpoint_name): system_prompt = ( "<|SYSTEM|># StableLM Tuned (Alpha version)\n- StableLM is a helpful and harmless open-source AI language" @@ -222,6 +206,17 @@ def prompt_config(checkpoint_dir: Path, tokenizer: Tokenizer) -> Tuple[str, Tupl [tokenizer.token_to_id("<|USER|>")], ) return system_prompt, stop_tokens + + if re.search(r"stabilityai/stablelm-zephyr-3b", checkpoint_name): + system_prompt = "<|user|>\n{prompt}<|endoftext|>\n<|assistant|>\n" + stop_tokens = ([tokenizer.eos_id],) + return system_prompt, stop_tokens + + if re.search("stablecode-instruct", checkpoint_name): + system_prompt = "###Instruction\n{prompt}###Response\n" + stop_tokens = ([tokenizer.eos_id],) + return system_prompt, stop_tokens + if re.search(r"togethercomputer.*Chat", checkpoint_name): system_prompt = ": {prompt}\n:" lt, gt = tokenizer.token_to_id("<"), tokenizer.token_to_id(">:") @@ -232,6 +227,7 @@ def prompt_config(checkpoint_dir: Path, 
tokenizer: Tokenizer) -> Tuple[str, Tupl [lt, tokenizer.token_to_id("bot"), gt], ) return system_prompt, stop_tokens + if re.search(r"togethercomputer.*Instruct", checkpoint_name): system_prompt = "Q: {prompt}\nA:" colon = tokenizer.token_to_id(":") @@ -247,6 +243,7 @@ def prompt_config(checkpoint_dir: Path, tokenizer: Tokenizer) -> Tuple[str, Tupl [2756], # '\n\n\n' ) return system_prompt, stop_tokens + if re.search(r"falcon.*-instruct", checkpoint_name): # First line could be modified. AFAIK Falcon doesn't impose a specific system prompt # The instruction to not prefix its replies doesn't work always, but better than nothing @@ -260,6 +257,7 @@ def prompt_config(checkpoint_dir: Path, tokenizer: Tokenizer) -> Tuple[str, Tupl [193, tokenizer.token_to_id("User")], # 193: '\n' ) return system_prompt, stop_tokens + if re.search(r"vicuna|longchat", checkpoint_name): # https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template system_prompt = ( @@ -268,6 +266,32 @@ def prompt_config(checkpoint_dir: Path, tokenizer: Tokenizer) -> Tuple[str, Tupl ) stop_tokens = ([tokenizer.eos_id],) return system_prompt, stop_tokens + + if re.search("Llama-2-7b-chat-hf-function-calling-v2", checkpoint_name): + # Has to be before the llama config + b_func, e_func = "", "\n\n" + b_inst, e_inst = "[INST]", "[/INST]" + b_sys, e_sys = "<>\n", "\n<>\n\n" + # This is an example for how to format functions for the model + function_metadata = { + "function": "search_bing", + "description": ( + "Search the web for content on Bing. This allows users to search online/the internet/the web for" + " content." + ), + "arguments": [{"name": "query", "type": "string", "description": "The search query string"}], + } + + system_prompt = ( + "You are a helpful, respectful and honest assistant. Always answer as helpfully as" + "possible. 
Your only response should be JSON formatted functions" + ) + # replace the curly braces with double curly braces to escape them + function_list = dumps(function_metadata).replace("{", "{{").replace("}", "}}") + system_prompt = f"{b_func}{function_list.strip()}{e_func}{b_inst}{b_sys}{system_prompt.strip()}{e_sys}{'{prompt}'}{e_inst}\n\n" + stop_tokens = ([tokenizer.eos_id],) + return system_prompt, stop_tokens + if re.search("Llama-2.*-chat", checkpoint_name): b_inst, e_inst = "[INST]", "[/INST]" b_sys, e_sys = "<>\n", "\n<>\n\n" @@ -303,11 +327,6 @@ def prompt_config(checkpoint_dir: Path, tokenizer: Tokenizer) -> Tuple[str, Tupl stop_tokens = ([tokenizer.eos_id],) return system_prompt, stop_tokens - if re.search("stablecode-instruct", checkpoint_name): - system_prompt = "###Instruction\n{prompt}###Response\n" - stop_tokens = ([tokenizer.eos_id],) - return system_prompt, stop_tokens - if re.search("CodeLlama|Mistral.*Instruct", checkpoint_name): # for CodeLLama, we don't set a default system prompt, but it is supported: # https://huggingface.co/blog/codellama#conversational-instructions @@ -317,7 +336,7 @@ def prompt_config(checkpoint_dir: Path, tokenizer: Tokenizer) -> Tuple[str, Tupl stop_tokens = ([tokenizer.eos_id],) return system_prompt, stop_tokens - if re.search("phi", checkpoint_name): + if re.search("phi-1", checkpoint_name): system_prompt = "{prompt}\n\nAnswer:" stop_tokens = ( @@ -330,12 +349,32 @@ def prompt_config(checkpoint_dir: Path, tokenizer: Tokenizer) -> Tuple[str, Tupl ) return system_prompt, stop_tokens + if re.search("phi-2", checkpoint_name): + system_prompt = "Instruct:{prompt}\nOutput:" + stop_tokens = ([tokenizer.eos_id],) + return system_prompt, stop_tokens + + if re.search(r"TinyLlama.*Chat", checkpoint_name): + system_prompt = ( + "<|system|>\n" + "You are a friendly chatbot who always gives helpful, detailed, and polite answers.\n" + "<|user|>\n" + "{prompt}\n" + "<|assistant|>\n" + ) + stop_tokens = ([tokenizer.eos_id],) + return system_prompt, stop_tokens + + if re.search(r"gemma.*-it", checkpoint_name): + system_prompt = "user\n{prompt}\nmodel\n" + stop_tokens = ([tokenizer.eos_id],) + return system_prompt, stop_tokens + # default format return "{prompt}", ([tokenizer.eos_id],) if __name__ == "__main__": - from jsonargparse import CLI - torch.set_float32_matmul_precision("high") + CLI(main) diff --git a/eval/lm_eval_harness.py b/eval/lm_eval_harness.py index a55f005..b7099b4 100644 --- a/eval/lm_eval_harness.py +++ b/eval/lm_eval_harness.py @@ -1,7 +1,9 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ import json import sys from pathlib import Path -from typing import List, Literal, Optional +from typing import Dict, List, Literal, Optional import lightning as L import torch @@ -15,24 +17,20 @@ from generate.base import generate from lit_gpt import GPT, Config, Tokenizer -from lit_gpt.utils import ( - check_valid_checkpoint_dir, - get_default_supported_precision, - gptq_quantization, - load_checkpoint, -) +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, load_checkpoint class EvalHarnessBase(BaseLM): # Credits: # https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py - def __init__(self, fabric: L.Fabric, model: GPT, tokenizer: Tokenizer, batch_size: int, temperature: float): + def __init__(self, fabric: L.Fabric, model: GPT, tokenizer: Tokenizer, batch_size: int): super().__init__() self.fabric = fabric self.model = model self.tokenizer = tokenizer self.batch_size_per_gpu = batch_size - self.temperature = temperature + with fabric.init_tensor(): + model.set_kv_cache(batch_size=batch_size) @classmethod def create_from_arg_string(cls, arg_string, additional_config=None): @@ -64,45 +62,30 @@ def batch_size(self): def device(self): return self.fabric.device - def tok_encode(self, string: str): + def tok_encode(self, string: str) -> List[int]: return self.tokenizer.encode(string, bos=False, eos=False).tolist() - def tok_decode(self, tokens): + def tok_decode(self, tokens: List[int]) -> str: t = torch.tensor(tokens) return self.tokenizer.decode(t) @torch.inference_mode() def _model_call(self, inps): - """ - inps: a torch tensor of shape [batch, sequence] - the size of sequence may vary from call to call - returns: a torch tensor of shape [batch, sequence, vocab] with the - logits returned from the model - """ return self.model(inps) - def _model_generate(self, context, max_length, eos_token_id): + @torch.inference_mode() + def _model_generate(self, context, max_length, eos_token_id) -> torch.Tensor: + # this only supports batch size 1 assert context.shape[0] == 1 - out = generate( - self.model, context[0], max_length, temperature=self.temperature, top_k=None, eos_id=eos_token_id - ) - - return self.tokenizer.decode(out) + out = generate(self.model, context[0], max_length, eos_id=eos_token_id) + for block in self.model.transformer.h: + block.attn.kv_cache.reset_parameters() + return out.unsqueeze(0) @torch.inference_mode() def run_eval( - self, - eval_tasks=None, - num_fewshot=0, - bootstrap_iters=2, - description_dict=None, - use_cache=True, - name="lit-gpt", - limit=None, - ): - if eval_tasks is None: - eval_tasks = ["arc_challenge", "piqa", "hellaswag", "hendrycksTest-*"] - + self, eval_tasks: List[str], num_fewshot: int, limit: Optional[int], bootstrap_iters: int, no_cache: bool + ) -> Dict: # Returns a list containing all values of the task registry that # match at least one of the patterns import fnmatch @@ -128,29 +111,25 @@ def pattern_match(patterns, source_list): tasks.get_task_dict(eval_tasks) lm = self - if use_cache: - lm = base.CachingLM(lm, "lm_cache/" + name + ".db") + if not no_cache: + lm = base.CachingLM(lm, "lm_cache/lit-gpt.db") results = evaluator.evaluate( lm=lm, task_dict=tasks.get_task_dict(eval_tasks), - description_dict=description_dict, num_fewshot=num_fewshot, limit=limit, bootstrap_iters=bootstrap_iters, ) - - results["config"] = { - "model": self.model.config.name, - "num_fewshot": num_fewshot, - "batch_size": self.batch_size, - "device": str(self.device), - "no_cache": not use_cache, - "limit": 
limit, - "bootstrap_iters": bootstrap_iters, - "description_dict": description_dict, - } - + results["config"] = dict( + model=self.model.config.name, + batch_size=self.batch_size, + device=str(self.device), + num_fewshot=num_fewshot, + limit=limit, + bootstrap_iters=bootstrap_iters, + no_cache=no_cache, + ) return results @@ -158,13 +137,13 @@ def pattern_match(patterns, source_list): def run_eval_harness( checkpoint_dir: Path, precision: Optional[str] = None, - batch_size=1, - temperature=1.0, - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"]] = None, - eval_tasks: Optional[List[str]] = None, - num_fewshot=0, - bootstrap_iters=2, + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, + eval_tasks: List[str] = ["arc_challenge", "piqa", "hellaswag", "hendrycksTest-*"], save_filepath: Optional[Path] = None, + num_fewshot: int = 0, + limit: Optional[int] = None, + bootstrap_iters: int = 100000, + no_cache: bool = True, ): if precision is None: precision = get_default_supported_precision(training=False) @@ -179,22 +158,15 @@ def run_eval_harness( fabric = L.Fabric(devices=1, precision=precision, plugins=plugins) - checkpoint_dir = Path(checkpoint_dir) check_valid_checkpoint_dir(checkpoint_dir) tokenizer = Tokenizer(checkpoint_dir) config = Config.from_json(checkpoint_dir / "lit_config.json") - if quantize == "gptq.int4": - model_file = "lit_model_gptq.4bit.pth" - if not (checkpoint_dir / model_file).is_file(): - raise ValueError("Please run `python quantize/gptq.py` first") - else: - model_file = "lit_model.pth" - checkpoint_path = checkpoint_dir / model_file + checkpoint_path = checkpoint_dir / "lit_model.pth" print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) - with fabric.init_module(empty_init=True), gptq_quantization(quantize == "gptq.int4"): + with fabric.init_module(empty_init=True): model = GPT(config) model.eval() @@ -202,22 +174,20 @@ def run_eval_harness( load_checkpoint(fabric, model, checkpoint_path) - eval_harness = EvalHarnessBase(fabric, model, tokenizer, batch_size, temperature) + eval_harness = EvalHarnessBase(fabric, model, tokenizer, 1) - results = eval_harness.run_eval( - eval_tasks=eval_tasks, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters, use_cache=False - ) + results = eval_harness.run_eval(eval_tasks, num_fewshot, limit, bootstrap_iters, no_cache) if save_filepath is None: print(results) else: print(f"Saving results to {str(save_filepath)!r}") + save_filepath.parent.mkdir(parents=True, exist_ok=True) data = json.dumps(results) with open(save_filepath, "w") as fw: fw.write(data) if __name__ == "__main__": - from jsonargparse import CLI - torch.set_float32_matmul_precision("high") - CLI(run_eval_harness, as_positional=False) + + CLI(run_eval_harness) diff --git a/finetune/adapter.py b/finetune/adapter.py index 671a28b..1114e7d 100644 --- a/finetune/adapter.py +++ b/finetune/adapter.py @@ -1,13 +1,17 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+import dataclasses import os import sys import time from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Literal, Optional, Tuple import lightning as L import torch from lightning.fabric.loggers import CSVLogger +from lightning.fabric.plugins import BitsandbytesPrecision from lightning.fabric.strategies import FSDPStrategy +from lightning.fabric.utilities import ThroughputMonitor # support running without installing as a package wd = Path(__file__).parent.parent.resolve() @@ -15,50 +19,60 @@ from generate.base import generate from lit_gpt.adapter import GPT, Block, Config, adapter_filter, mark_only_adapter_as_trainable -from lit_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor -from lit_gpt.speed_monitor import estimate_flops, measure_flops +from lit_gpt.args import EvalArgs, IOArgs, TrainArgs from lit_gpt.tokenizer import Tokenizer from lit_gpt.utils import ( + CLI, check_valid_checkpoint_dir, chunked_cross_entropy, get_default_supported_precision, - lazy_load, + load_checkpoint, num_parameters, ) from scripts.prepare_alpaca import generate_prompt -eval_interval = 600 -save_interval = 1000 -eval_iters = 100 -eval_max_new_tokens = 100 -log_interval = 1 -devices = 1 - -# Hyperparameters -learning_rate = 3e-3 -batch_size = 64 / devices -micro_batch_size = 4 -gradient_accumulation_iters = batch_size // micro_batch_size -assert gradient_accumulation_iters > 0 -epoch_size = 50000 # train dataset size -num_epochs = 5 -max_iters = num_epochs * (epoch_size // micro_batch_size) // devices -weight_decay = 0.02 -warmup_steps = 2 * (epoch_size // micro_batch_size) // devices // gradient_accumulation_iters # 2 epochs - -hparams = {k: v for k, v in locals().items() if isinstance(v, (int, float, str)) and not k.startswith("_")} - def setup( - data_dir: Path = Path("data/alpaca"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - out_dir: Path = Path("out/adapter/alpaca"), precision: Optional[str] = None, + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8-training"]] = None, + devices: int = 1, + seed: int = 1337, + io: IOArgs = IOArgs( + train_data_dir=Path("data/alpaca"), + val_data_dir=Path("data/alpaca"), + checkpoint_dir=Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + out_dir=Path("out/adapter/alpaca"), + ), + train: TrainArgs = TrainArgs( + save_interval=1000, + log_interval=1, + global_batch_size=64, + micro_batch_size=4, + lr_warmup_steps=100, + epochs=5, + epoch_size=50000, + learning_rate=1e-3, + max_seq_length=None, + ), + eval: EvalArgs = EvalArgs(interval=600, max_new_tokens=100, max_iters=100), ) -> None: + print(locals()) precision = precision or get_default_supported_precision(training=True) - fabric_devices = devices - if fabric_devices > 1: + plugins = None + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + if devices > 1: + if quantize: + raise NotImplementedError( + "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the" + " --quantize flag." 
+ ) strategy = FSDPStrategy( auto_wrap_policy={Block}, activation_checkpointing_policy={Block}, @@ -69,155 +83,149 @@ def setup( else: strategy = "auto" - logger = CSVLogger(out_dir.parent, out_dir.name, flush_logs_every_n_steps=log_interval) - fabric = L.Fabric(devices=fabric_devices, strategy=strategy, precision=precision, loggers=logger) - fabric.print(hparams) - fabric.launch(main, data_dir, checkpoint_dir, out_dir) + logger = CSVLogger(io.out_dir.parent, io.out_dir.name, flush_logs_every_n_steps=train.log_interval) + fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) + fabric.launch(main, devices, seed, Config.from_name(name=io.checkpoint_dir.name), io, train, eval) + +def main(fabric: L.Fabric, devices: int, seed: int, config: Config, io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + validate_args(io, train, eval) -def main(fabric: L.Fabric, data_dir: Path, checkpoint_dir: Path, out_dir: Path) -> None: - check_valid_checkpoint_dir(checkpoint_dir) + steps_per_epoch = train.epoch_size // devices // train.batch_size(devices) + lr_max_steps = train.epochs * steps_per_epoch - speed_monitor = SpeedMonitor(fabric, window_size=50, time_unit="seconds") + check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + fabric.seed_everything(seed) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: - os.makedirs(out_dir, exist_ok=True) + os.makedirs(io.out_dir, exist_ok=True) - train_data = torch.load(data_dir / "train.pt") - val_data = torch.load(data_dir / "test.pt") + train_data = torch.load(io.train_data_dir / "train.pt") + val_data = torch.load(io.val_data_dir / "test.pt") - config = Config.from_name(name=checkpoint_dir.name) - checkpoint_path = checkpoint_dir / "lit_model.pth" + checkpoint_path = io.checkpoint_dir / "lit_model.pth" fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") - with fabric.init_module(empty_init=False): + with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) - checkpoint = lazy_load(checkpoint_path) - # strict=False because missing keys due to adapter weights not contained in state dict - model.load_state_dict(checkpoint, strict=False) - mark_only_adapter_as_trainable(model) fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") fabric.print(f"Number of non trainable parameters: {num_parameters(model, requires_grad=False):,}") + + model = fabric.setup_module(model) + trainable_params = [p for p in model.parameters() if p.requires_grad] + if isinstance(fabric.strategy.precision, BitsandbytesPrecision): + import bitsandbytes as bnb + + optimizer_cls = bnb.optim.PagedAdamW + else: + optimizer_cls = torch.optim.AdamW + optimizer = optimizer_cls( + trainable_params, lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) + ) + optimizer = fabric.setup_optimizers(optimizer) + scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) - optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate, weight_decay=weight_decay) - model, optimizer = fabric.setup(model, optimizer) + # strict=False because missing keys due to Adapter weights not contained in state dict + load_checkpoint(fabric, model, checkpoint_path, strict=False) fabric.seed_everything(1337 + fabric.global_rank) train_time = time.perf_counter() - train(fabric, model, optimizer, train_data, 
val_data, checkpoint_dir, out_dir, speed_monitor) + fit(fabric, model, optimizer, scheduler, train_data, val_data, devices, io, train, eval) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") # Save the final checkpoint at the end of training - save_path = out_dir / "lit_model_adapter_finetuned.pth" + save_path = io.out_dir / "lit_model_adapter_finetuned.pth" save_adapter_checkpoint(fabric, model, save_path) -def train( +def fit( fabric: L.Fabric, model: GPT, optimizer: torch.optim.Optimizer, + scheduler: torch.optim.lr_scheduler, train_data: List[Dict], val_data: List[Dict], - checkpoint_dir: Path, - out_dir: Path, - speed_monitor: SpeedMonitor, + devices: int, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, ) -> None: - tokenizer = Tokenizer(checkpoint_dir) + tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) - model.max_seq_length = longest_seq_length + model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" ) - validate(fabric, model, val_data, tokenizer) # sanity check - - with torch.device("meta"): - meta_model = GPT(model.config) - mark_only_adapter_as_trainable(meta_model) - # "estimated" is not as precise as "measured". Estimated is optimistic but widely used in the wild. - # When comparing MFU or FLOP numbers with other projects that use estimated FLOPs, - # consider passing `SpeedMonitor(flops_per_batch=estimated_flops)` instead - estimated_flops = estimate_flops(meta_model) * micro_batch_size - fabric.print(f"Estimated TFLOPs: {estimated_flops * fabric.world_size / 1e12:.2f}") - # this assumes that all samples have a fixed length equal to the longest sequence length - # which is most likely false during finetuning - x = torch.randint(0, 1, (micro_batch_size, longest_seq_length)) - measured_flops = measure_flops(meta_model, x) - fabric.print(f"Measured TFLOPs: {measured_flops * fabric.world_size / 1e12:.2f}") - del meta_model, x + validate(fabric, model, val_data, tokenizer, dataclasses.replace(eval, max_iters=2), train) # sanity check + throughput = ThroughputMonitor(fabric, window_size=50) step_count = 0 total_lengths = 0 total_t0 = time.perf_counter() - for iter_num in range(max_iters): - if step_count <= warmup_steps: - # linear warmup - lr = learning_rate * step_count / warmup_steps - for param_group in optimizer.param_groups: - param_group["lr"] = lr - + for iter_num in range(1, train.max_iters(devices) + 1): iter_t0 = time.perf_counter() - input_ids, targets = get_batch(fabric, train_data, longest_seq_ix if iter_num == 0 else None) + input_ids, targets = get_batch( + fabric, train_data, train.micro_batch_size, train.max_seq_length, longest_seq_ix if iter_num == 1 else None + ) - is_accumulating = (iter_num + 1) % gradient_accumulation_iters != 0 + is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids, lm_head_chunk_size=128) # shift the targets such that output n predicts token n+1 logits[-1] = logits[-1][..., :-1, :] loss = chunked_cross_entropy(logits, targets[..., 1:]) - fabric.backward(loss / gradient_accumulation_iters) + 
fabric.backward(loss / train.gradient_accumulation_iters(devices)) if not is_accumulating: optimizer.step() optimizer.zero_grad() + scheduler.step() step_count += 1 - t1 = time.perf_counter() - total_lengths += input_ids.size(1) - speed_monitor.on_train_batch_end( - (iter_num + 1) * micro_batch_size, - t1 - total_t0, - # this assumes that device FLOPs are the same and that all devices have the same batch size - fabric.world_size, - flops_per_batch=measured_flops, - lengths=total_lengths, - ) - if iter_num % log_interval == 0: + total_lengths += input_ids.numel() + if iter_num % train.log_interval == 0: + loss_item = loss.item() # expensive device-to-host synchronization + t1 = time.perf_counter() + throughput.update( + time=t1 - total_t0, batches=iter_num, samples=iter_num * train.micro_batch_size, lengths=total_lengths + ) + throughput.compute_and_log(step=iter_num) fabric.print( - f"iter {iter_num} step {step_count}: loss {loss.item():.4f}, iter time:" - f" {(t1 - iter_t0) * 1000:.2f}ms{' (optimizer.step)' if not is_accumulating else ''}" + f"iter {iter_num} | step {step_count}: loss {loss_item:.4f}, iter time:" + f" {(t1 - iter_t0) * 1000:.2f} ms{' (optimizer.step)' if not is_accumulating else ''}" ) - if not is_accumulating and step_count % eval_interval == 0: + if not is_accumulating and step_count % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate(fabric, model, val_data, tokenizer) + val_loss = validate(fabric, model, val_data, tokenizer, eval, train) t1 = time.perf_counter() - t0 - speed_monitor.eval_end(t1) - fabric.print(f"step {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f}ms") + fabric.print(f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms") fabric.barrier() - if not is_accumulating and step_count % save_interval == 0: - checkpoint_path = out_dir / f"iter-{iter_num:06d}-ckpt.pth" + if not is_accumulating and step_count % train.save_interval == 0: + checkpoint_path = io.out_dir / f"iter-{iter_num:06d}-ckpt.pth" save_adapter_checkpoint(fabric, model, checkpoint_path) # the adapter "kv cache" cannot be initialized under `inference_mode` @torch.no_grad() -def validate(fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer) -> torch.Tensor: +def validate( + fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer, eval: EvalArgs, train: TrainArgs +) -> torch.Tensor: fabric.print("Validating ...") model.eval() - losses = torch.zeros(eval_iters) - for k in range(eval_iters): - input_ids, targets = get_batch(fabric, val_data) + losses = torch.zeros(eval.max_iters) + for k in range(eval.max_iters): + input_ids, targets = get_batch(fabric, val_data, train.micro_batch_size, train.max_seq_length) logits = model(input_ids) losses[k] = chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:], chunk_size=0) val_loss = losses.mean() @@ -231,7 +239,9 @@ def validate(fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Toke with fabric.init_tensor(): # do not set `max_seq_length=max_returned_token` because memory is not a concern here model.set_kv_cache(batch_size=1) - output = generate(model, encoded, max_returned_tokens=len(encoded) + eval_max_new_tokens, temperature=0.8) + output = generate( + model, encoded, max_returned_tokens=len(encoded) + eval.max_new_tokens, temperature=0.8, eos_id=tokenizer.eos_id + ) model.clear_kv_cache() output = tokenizer.decode(output) fabric.print(output) @@ -241,7 +251,11 @@ def validate(fabric: L.Fabric, model: GPT, val_data: List[Dict], 
tokenizer: Toke def get_batch( - fabric: L.Fabric, data: List[Dict], longest_seq_ix: Optional[int] = None + fabric: L.Fabric, + data: List[Dict], + micro_batch_size: int, + max_seq_length: Optional[int], + longest_seq_ix: Optional[int] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: ix = torch.randint(len(data), (micro_batch_size,)) if longest_seq_ix is not None: @@ -262,6 +276,11 @@ def pad_right(x, pad_id): x = torch.stack([pad_right(x, pad_id=0) for x in input_ids]) y = torch.stack([pad_right(x, pad_id=-1) for x in labels]) + # Truncate if needed + if max_seq_length: + x = x[:, :max_seq_length] + y = y[:, :max_seq_length] + if fabric.device.type == "cuda" and x.device.type == "cpu": x, y = fabric.to_device((x.pin_memory(), y.pin_memory())) else: @@ -269,6 +288,13 @@ def pad_right(x, pad_id): return x, y +def get_lr_scheduler(optimizer, warmup_steps: int, max_steps: int): + # linear warmup followed by cosine annealing + scheduler1 = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) + scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(max_steps - warmup_steps)) + return torch.optim.lr_scheduler.SequentialLR(optimizer, [scheduler1, scheduler2], milestones=[warmup_steps]) + + def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: # find out the minimum max_seq_length required during fine-tuning (saves memory!) lengths = [len(d["input_ids"]) for d in data] @@ -282,11 +308,27 @@ def save_adapter_checkpoint(fabric: L.Fabric, model: torch.nn.Module, file_path: fabric.save(file_path, {"model": model}, filter={"model": adapter_filter}) +def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + issues = [] + unsupported = [(train, ["max_tokens", "max_norm"])] + for args, names in unsupported: + for name in names: + if getattr(args, name) is not None: + issues.append(f"{__file__} doesn't support the {name!r} argument. This is set in {args}") + required = [ + (io, ["checkpoint_dir", "train_data_dir", "val_data_dir"]), + (train, ["epoch_size", "epochs"]), + (eval, ["max_new_tokens"]), + ] + for args, names in required: + for name in names: + if getattr(args, name) is None: + issues.append(f"{__file__} requires the {name!r} argument. This is set in {args}") + if issues: + raise ValueError("\n".join(issues)) + + if __name__ == "__main__": - # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false" - # torch.backends.cuda.enable_flash_sdp(False) torch.set_float32_matmul_precision("high") - from jsonargparse import CLI - CLI(setup) diff --git a/finetune/adapter_v2.py b/finetune/adapter_v2.py index 0f547d7..34e3607 100644 --- a/finetune/adapter_v2.py +++ b/finetune/adapter_v2.py @@ -1,13 +1,17 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+import dataclasses import os import sys import time from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Literal, Optional, Tuple import lightning as L import torch from lightning.fabric.loggers import CSVLogger +from lightning.fabric.plugins import BitsandbytesPrecision from lightning.fabric.strategies import FSDPStrategy +from lightning.fabric.utilities import ThroughputMonitor # support running without installing as a package wd = Path(__file__).parent.parent.resolve() @@ -15,50 +19,60 @@ from generate.base import generate from lit_gpt.adapter_v2 import GPT, Block, Config, adapter_filter, mark_only_adapter_v2_as_trainable -from lit_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor -from lit_gpt.speed_monitor import estimate_flops, measure_flops +from lit_gpt.args import EvalArgs, IOArgs, TrainArgs from lit_gpt.tokenizer import Tokenizer from lit_gpt.utils import ( + CLI, check_valid_checkpoint_dir, chunked_cross_entropy, get_default_supported_precision, - lazy_load, + load_checkpoint, num_parameters, ) from scripts.prepare_alpaca import generate_prompt -eval_interval = 600 -save_interval = 1000 -eval_iters = 100 -eval_max_new_tokens = 100 -log_interval = 1 -devices = 1 - -# Hyperparameters -learning_rate = 3e-3 -batch_size = 128 / devices -micro_batch_size = 2 # set to 2 because this is fit into 12GB Vram -gradient_accumulation_iters = batch_size // micro_batch_size -assert gradient_accumulation_iters > 0 -epoch_size = 50000 # train dataset size -num_epochs = 5 -max_iters = num_epochs * (epoch_size // micro_batch_size) // devices -weight_decay = 0.02 -warmup_steps = 2 * (epoch_size // micro_batch_size) // devices // gradient_accumulation_iters # 2 epochs - -hparams = {k: v for k, v in locals().items() if isinstance(v, (int, float, str)) and not k.startswith("_")} - def setup( - data_dir: Path = Path("data/alpaca"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - out_dir: Path = Path("out/adapter_v2/alpaca"), precision: Optional[str] = None, + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8-training"]] = None, + devices: int = 1, + seed: int = 1337, + io: IOArgs = IOArgs( + train_data_dir=Path("data/alpaca"), + val_data_dir=Path("data/alpaca"), + checkpoint_dir=Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + out_dir=Path("out/adapter_v2/alpaca"), + ), + train: TrainArgs = TrainArgs( + save_interval=1000, + log_interval=1, + global_batch_size=128, + micro_batch_size=2, + lr_warmup_steps=100, + epochs=5, + epoch_size=50000, + learning_rate=1e-3, + max_seq_length=None, + ), + eval: EvalArgs = EvalArgs(interval=600, max_new_tokens=100, max_iters=100), ) -> None: + print(locals()) precision = precision or get_default_supported_precision(training=True) - fabric_devices = devices - if fabric_devices > 1: + plugins = None + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + if devices > 1: + if quantize: + raise NotImplementedError( + "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the" + " --quantize flag." 
+ ) strategy = FSDPStrategy( auto_wrap_policy={Block}, activation_checkpointing_policy={Block}, @@ -69,155 +83,149 @@ def setup( else: strategy = "auto" - logger = CSVLogger(out_dir.parent, out_dir.name, flush_logs_every_n_steps=log_interval) - fabric = L.Fabric(devices=fabric_devices, strategy=strategy, precision=precision, loggers=logger) - fabric.print(hparams) - fabric.launch(main, data_dir, checkpoint_dir, out_dir) + logger = CSVLogger(io.out_dir.parent, io.out_dir.name, flush_logs_every_n_steps=train.log_interval) + fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) + fabric.launch(main, devices, seed, Config.from_name(name=io.checkpoint_dir.name), io, train, eval) + +def main(fabric: L.Fabric, devices: int, seed: int, config: Config, io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + validate_args(io, train, eval) -def main(fabric: L.Fabric, data_dir: Path, checkpoint_dir: Path, out_dir: Path) -> None: - check_valid_checkpoint_dir(checkpoint_dir) + steps_per_epoch = train.epoch_size // devices // train.batch_size(devices) + lr_max_steps = train.epochs * steps_per_epoch - speed_monitor = SpeedMonitor(fabric, window_size=50, time_unit="seconds") + check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + fabric.seed_everything(seed) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: - os.makedirs(out_dir, exist_ok=True) + os.makedirs(io.out_dir, exist_ok=True) - train_data = torch.load(data_dir / "train.pt") - val_data = torch.load(data_dir / "test.pt") + train_data = torch.load(io.train_data_dir / "train.pt") + val_data = torch.load(io.val_data_dir / "test.pt") - config = Config.from_name(name=checkpoint_dir.name) - checkpoint_path = checkpoint_dir / "lit_model.pth" + checkpoint_path = io.checkpoint_dir / "lit_model.pth" fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") - with fabric.init_module(empty_init=False): + with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) - checkpoint = lazy_load(checkpoint_path) - # strict=False because missing keys due to adapter weights not contained in state dict - model.load_state_dict(checkpoint, strict=False) - mark_only_adapter_v2_as_trainable(model) fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") fabric.print(f"Number of non trainable parameters: {num_parameters(model, requires_grad=False):,}") + + model = fabric.setup_module(model) + trainable_params = [p for p in model.parameters() if p.requires_grad] + if isinstance(fabric.strategy.precision, BitsandbytesPrecision): + import bitsandbytes as bnb + + optimizer_cls = bnb.optim.PagedAdamW + else: + optimizer_cls = torch.optim.AdamW + optimizer = optimizer_cls( + trainable_params, lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) + ) + optimizer = fabric.setup_optimizers(optimizer) + scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) - optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate, weight_decay=weight_decay) - model, optimizer = fabric.setup(model, optimizer) + # strict=False because missing keys due to Adapter weights not contained in state dict + load_checkpoint(fabric, model, checkpoint_path, strict=False) fabric.seed_everything(1337 + fabric.global_rank) train_time = time.perf_counter() - train(fabric, model, optimizer, train_data, 
val_data, checkpoint_dir, out_dir, speed_monitor) + fit(fabric, model, optimizer, scheduler, train_data, val_data, devices, io, train, eval) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") # Save the final checkpoint at the end of training - save_path = out_dir / "lit_model_adapter_finetuned.pth" + save_path = io.out_dir / "lit_model_adapter_finetuned.pth" save_adapter_v2_checkpoint(fabric, model, save_path) -def train( +def fit( fabric: L.Fabric, model: GPT, optimizer: torch.optim.Optimizer, + scheduler: torch.optim.lr_scheduler, train_data: List[Dict], val_data: List[Dict], - checkpoint_dir: Path, - out_dir: Path, - speed_monitor: SpeedMonitor, + devices: int, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, ) -> None: - tokenizer = Tokenizer(checkpoint_dir) + tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) - model.max_seq_length = longest_seq_length + model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" ) - validate(fabric, model, val_data, tokenizer) # sanity check - - with torch.device("meta"): - meta_model = GPT(model.config) - mark_only_adapter_v2_as_trainable(meta_model) - # "estimated" is not as precise as "measured". Estimated is optimistic but widely used in the wild. - # When comparing MFU or FLOP numbers with other projects that use estimated FLOPs, - # consider passing `SpeedMonitor(flops_per_batch=estimated_flops)` instead - estimated_flops = estimate_flops(meta_model) * micro_batch_size - fabric.print(f"Estimated TFLOPs: {estimated_flops * fabric.world_size / 1e12:.2f}") - # this assumes that all samples have a fixed length equal to the longest sequence length - # which is most likely false during finetuning - x = torch.randint(0, 1, (micro_batch_size, longest_seq_length)) - measured_flops = measure_flops(meta_model, x) - fabric.print(f"Measured TFLOPs: {measured_flops * fabric.world_size / 1e12:.2f}") - del meta_model, x + validate(fabric, model, val_data, tokenizer, dataclasses.replace(eval, max_iters=2), train) # sanity check + throughput = ThroughputMonitor(fabric, window_size=50) step_count = 0 total_lengths = 0 total_t0 = time.perf_counter() - for iter_num in range(max_iters): - if step_count <= warmup_steps: - # linear warmup - lr = learning_rate * step_count / warmup_steps - for param_group in optimizer.param_groups: - param_group["lr"] = lr - + for iter_num in range(1, train.max_iters(devices) + 1): iter_t0 = time.perf_counter() - input_ids, targets = get_batch(fabric, train_data, longest_seq_ix if iter_num == 0 else None) + input_ids, targets = get_batch( + fabric, train_data, train.micro_batch_size, train.max_seq_length, longest_seq_ix if iter_num == 1 else None + ) - is_accumulating = (iter_num + 1) % gradient_accumulation_iters != 0 + is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids, lm_head_chunk_size=128) # shift the targets such that output n predicts token n+1 logits[-1] = logits[-1][..., :-1, :] loss = chunked_cross_entropy(logits, targets[..., 1:]) - fabric.backward(loss / gradient_accumulation_iters) + 
fabric.backward(loss / train.gradient_accumulation_iters(devices)) if not is_accumulating: optimizer.step() optimizer.zero_grad() + scheduler.step() step_count += 1 - t1 = time.perf_counter() - total_lengths += input_ids.size(1) - speed_monitor.on_train_batch_end( - (iter_num + 1) * micro_batch_size, - t1 - total_t0, - # this assumes that device FLOPs are the same and that all devices have the same batch size - fabric.world_size, - flops_per_batch=measured_flops, - lengths=total_lengths, - ) - if iter_num % log_interval == 0: + total_lengths += input_ids.numel() + if iter_num % train.log_interval == 0: + loss_item = loss.item() # expensive device-to-host synchronization + t1 = time.perf_counter() + throughput.update( + time=t1 - total_t0, batches=iter_num, samples=iter_num * train.micro_batch_size, lengths=total_lengths + ) + throughput.compute_and_log(step=iter_num) fabric.print( - f"iter {iter_num} step {step_count}: loss {loss.item():.4f}, iter time:" - f" {(t1 - iter_t0) * 1000:.2f}ms{' (optimizer.step)' if not is_accumulating else ''}" + f"iter {iter_num} | step {step_count}: loss {loss_item:.4f}, iter time:" + f" {(t1 - iter_t0) * 1000:.2f} ms{' (optimizer.step)' if not is_accumulating else ''}" ) - if not is_accumulating and step_count % eval_interval == 0: + if not is_accumulating and step_count % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate(fabric, model, val_data, tokenizer) + val_loss = validate(fabric, model, val_data, tokenizer, eval, train) t1 = time.perf_counter() - t0 - speed_monitor.eval_end(t1) - fabric.print(f"step {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f}ms") + fabric.print(f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms") fabric.barrier() - if not is_accumulating and step_count % save_interval == 0: - checkpoint_path = out_dir / f"iter-{iter_num:06d}-ckpt.pth" + if not is_accumulating and step_count % train.save_interval == 0: + checkpoint_path = io.out_dir / f"iter-{iter_num:06d}-ckpt.pth" save_adapter_v2_checkpoint(fabric, model, checkpoint_path) # the adapter "kv cache" cannot be initialized under `inference_mode` @torch.no_grad() -def validate(fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer) -> torch.Tensor: +def validate( + fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer, eval: EvalArgs, train: TrainArgs +) -> torch.Tensor: fabric.print("Validating ...") model.eval() - losses = torch.zeros(eval_iters) - for k in range(eval_iters): - input_ids, targets = get_batch(fabric, val_data) + losses = torch.zeros(eval.max_iters) + for k in range(eval.max_iters): + input_ids, targets = get_batch(fabric, val_data, train.micro_batch_size, train.max_seq_length) logits = model(input_ids) losses[k] = chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:], chunk_size=0) val_loss = losses.mean() @@ -231,7 +239,9 @@ def validate(fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Toke with fabric.init_tensor(): # do not set `max_seq_length=max_returned_token` because memory is not a concern here model.set_kv_cache(batch_size=1) - output = generate(model, encoded, max_returned_tokens=len(encoded) + eval_max_new_tokens, temperature=0.8) + output = generate( + model, encoded, max_returned_tokens=len(encoded) + eval.max_new_tokens, temperature=0.8, eos_id=tokenizer.eos_id + ) model.clear_kv_cache() output = tokenizer.decode(output) fabric.print(output) @@ -241,7 +251,11 @@ def validate(fabric: L.Fabric, model: GPT, val_data: List[Dict], 
tokenizer: Toke def get_batch( - fabric: L.Fabric, data: List[Dict], longest_seq_ix: Optional[int] = None + fabric: L.Fabric, + data: List[Dict], + micro_batch_size: int, + max_seq_length: Optional[int], + longest_seq_ix: Optional[int] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: ix = torch.randint(len(data), (micro_batch_size,)) if longest_seq_ix is not None: @@ -262,6 +276,11 @@ def pad_right(x, pad_id): x = torch.stack([pad_right(x, pad_id=0) for x in input_ids]) y = torch.stack([pad_right(x, pad_id=-1) for x in labels]) + # Truncate if needed + if max_seq_length: + x = x[:, :max_seq_length] + y = y[:, :max_seq_length] + if fabric.device.type == "cuda" and x.device.type == "cpu": x, y = fabric.to_device((x.pin_memory(), y.pin_memory())) else: @@ -269,6 +288,13 @@ def pad_right(x, pad_id): return x, y +def get_lr_scheduler(optimizer, warmup_steps: int, max_steps: int): + # linear warmup followed by cosine annealing + scheduler1 = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) + scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(max_steps - warmup_steps)) + return torch.optim.lr_scheduler.SequentialLR(optimizer, [scheduler1, scheduler2], milestones=[warmup_steps]) + + def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: # find out the minimum max_seq_length required during fine-tuning (saves memory!) lengths = [len(d["input_ids"]) for d in data] @@ -282,11 +308,27 @@ def save_adapter_v2_checkpoint(fabric: L.Fabric, model: torch.nn.Module, file_pa fabric.save(file_path, {"model": model}, filter={"model": adapter_filter}) +def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + issues = [] + unsupported = [(train, ["max_tokens", "max_norm"])] + for args, names in unsupported: + for name in names: + if getattr(args, name) is not None: + issues.append(f"{__file__} doesn't support the {name!r} argument. This is set in {args}") + required = [ + (io, ["checkpoint_dir", "train_data_dir", "val_data_dir"]), + (train, ["epoch_size", "epochs"]), + (eval, ["max_new_tokens"]), + ] + for args, names in required: + for name in names: + if getattr(args, name) is None: + issues.append(f"{__file__} requires the {name!r} argument. This is set in {args}") + if issues: + raise ValueError("\n".join(issues)) + + if __name__ == "__main__": - # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false" - # torch.backends.cuda.enable_flash_sdp(False) torch.set_float32_matmul_precision("high") - from jsonargparse import CLI - CLI(setup) diff --git a/finetune/full.py b/finetune/full.py index 820e2d8..470fbb9 100644 --- a/finetune/full.py +++ b/finetune/full.py @@ -1,24 +1,28 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
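The get_lr_scheduler helper added above (and repeated in finetune/full.py and finetune/lora.py below) chains a linear warmup into cosine annealing through SequentialLR, which is why the old manual warmup branch inside the training loop goes away. A minimal standalone sketch of the resulting schedule, with made-up warmup_steps/max_steps values and a dummy parameter (not part of this change):

    import torch

    warmup_steps, max_steps, base_lr = 100, 1000, 3e-4
    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = torch.optim.AdamW([param], lr=base_lr)
    # same composition as get_lr_scheduler: linear ramp, then cosine decay after the milestone
    warmup = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps)
    cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_steps - warmup_steps)
    scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, [warmup, cosine], milestones=[warmup_steps])

    for step in range(1, max_steps + 1):
        optimizer.step()      # as in fit(), the scheduler only advances together with optimizer steps
        scheduler.step()
        if step in (1, warmup_steps, max_steps):
            # climbs from ~0 to base_lr during warmup, then decays back towards 0 by max_steps
            print(f"step {step}: lr {scheduler.get_last_lr()[0]:.2e}")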
+import dataclasses +import math import os import sys import time from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import lightning as L import torch from lightning.fabric.loggers import CSVLogger from lightning.fabric.strategies import FSDPStrategy +from torchmetrics import RunningMean # support running without installing as a package wd = Path(__file__).parent.parent.resolve() sys.path.append(str(wd)) from generate.base import generate +from lit_gpt.args import EvalArgs, IOArgs, TrainArgs from lit_gpt.model import GPT, Block, Config -from lit_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor -from lit_gpt.speed_monitor import estimate_flops, measure_flops from lit_gpt.tokenizer import Tokenizer from lit_gpt.utils import ( + CLI, check_valid_checkpoint_dir, chunked_cross_entropy, get_default_supported_precision, @@ -27,38 +31,35 @@ ) from scripts.prepare_alpaca import generate_prompt -eval_interval = 600 -save_interval = 1000 -eval_iters = 100 -eval_max_new_tokens = 100 -log_interval = 1 -devices = 1 - -# Hyperparameters -learning_rate = 3e-3 -batch_size = 64 / devices -micro_batch_size = 1 -gradient_accumulation_iters = batch_size // micro_batch_size -assert gradient_accumulation_iters > 0 -epoch_size = 50000 # train dataset size -num_epochs = 5 -max_iters = num_epochs * (epoch_size // micro_batch_size) // devices -weight_decay = 0.02 -warmup_steps = 2 * (epoch_size // micro_batch_size) // devices // gradient_accumulation_iters # 2 epochs - -hparams = {k: v for k, v in locals().items() if isinstance(v, (int, float, str)) and not k.startswith("_")} - def setup( - data_dir: Path = Path("data/alpaca"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - out_dir: Path = Path("out/full/alpaca"), precision: Optional[str] = None, + devices: int = 1, + resume: Union[bool, Path] = False, + seed: int = 1337, + io: IOArgs = IOArgs( + train_data_dir=Path("data/alpaca"), + val_data_dir=Path("data/alpaca"), + checkpoint_dir=Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + out_dir=Path("out/full/alpaca"), + ), + train: TrainArgs = TrainArgs( + save_interval=1000, + log_interval=1, + global_batch_size=64, + micro_batch_size=1, + lr_warmup_steps=100, + epochs=5, + epoch_size=50000, + learning_rate=3e-3, + max_seq_length=None, + ), + eval: EvalArgs = EvalArgs(interval=600, max_new_tokens=100, max_iters=100), ) -> None: + print(locals()) precision = precision or get_default_supported_precision(training=True) - fabric_devices = devices - if fabric_devices > 1: + if devices > 1: strategy = FSDPStrategy( auto_wrap_policy={Block}, activation_checkpointing_policy={Block}, @@ -69,148 +70,184 @@ def setup( else: strategy = "auto" - logger = CSVLogger(out_dir.parent, out_dir.name, flush_logs_every_n_steps=log_interval) - fabric = L.Fabric(devices=fabric_devices, strategy=strategy, precision=precision, loggers=logger) - fabric.print(hparams) - fabric.launch(main, data_dir, checkpoint_dir, out_dir) + logger = CSVLogger(io.out_dir.parent, io.out_dir.name, flush_logs_every_n_steps=train.log_interval) + fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger) + fabric.launch(main, devices, resume, seed, Config.from_name(name=io.checkpoint_dir.name), io, train, eval) -def main(fabric: L.Fabric, data_dir: Path, checkpoint_dir: Path, out_dir: Path) -> None: - check_valid_checkpoint_dir(checkpoint_dir) +def main( + fabric: L.Fabric, + devices: int, + resume: 
Union[bool, Path], + seed: int, + config: Config, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, +) -> None: + validate_args(io, train, eval) + + steps_per_epoch = train.epoch_size // devices // train.batch_size(devices) + lr_max_steps = train.epochs * steps_per_epoch - speed_monitor = SpeedMonitor(fabric, window_size=50, time_unit="seconds") + check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + fabric.seed_everything(seed) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: - os.makedirs(out_dir, exist_ok=True) + os.makedirs(io.out_dir, exist_ok=True) - train_data = torch.load(data_dir / "train.pt") - val_data = torch.load(data_dir / "test.pt") + train_data = torch.load(io.train_data_dir / "train.pt") + val_data = torch.load(io.val_data_dir / "test.pt") - config = Config.from_name(name=checkpoint_dir.name) - checkpoint_path = checkpoint_dir / "lit_model.pth" + checkpoint_path = io.checkpoint_dir / "lit_model.pth" fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) fabric.print(f"Number of trainable parameters: {num_parameters(model, requires_grad=True):,}") - model = fabric.setup_module(model) - optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay) + model = fabric.setup(model) + optimizer = torch.optim.AdamW( + model.parameters(), lr=train.learning_rate, weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) + ) optimizer = fabric.setup_optimizers(optimizer) - - load_checkpoint(fabric, model, checkpoint_path) + scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) + state = {"model": model, "optimizer": optimizer, "scheduler": scheduler, "iter_num": 0, "step_count": 0} + + if resume is True: + resume = max(io.out_dir.glob("*.pth"), key=(lambda p: int(p.name.split("-")[1]))) + if resume: + fabric.print(f"Resuming training from {resume}") + fabric.load(resume, state) + else: + load_checkpoint(fabric, state["model"], checkpoint_path) fabric.seed_everything(1337 + fabric.global_rank) train_time = time.perf_counter() - train(fabric, model, optimizer, train_data, val_data, checkpoint_dir, out_dir, speed_monitor) + fit(fabric, state, train_data, val_data, devices, resume, io, train, eval) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") # Save the final checkpoint at the end of training - save_path = out_dir / "lit_model_finetuned.pth" - save_checkpoint(fabric, model, save_path) + fabric.save(io.out_dir / "lit_model_finetuned.pth", {"model": state["model"]}) -def train( +def fit( fabric: L.Fabric, - model: GPT, - optimizer: torch.optim.Optimizer, + state: Dict, train_data: List[Dict], val_data: List[Dict], - checkpoint_dir: Path, - out_dir: Path, - speed_monitor: SpeedMonitor, + devices: int, + resume: Union[bool, Path], + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, ) -> None: - tokenizer = Tokenizer(checkpoint_dir) + model = state["model"] + optimizer = state["optimizer"] + scheduler = state["scheduler"] + tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) - model.max_seq_length = longest_seq_length + model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) 
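With the new resume flag above, resume=True resolves to the newest checkpoint in out_dir and fabric.load restores everything tracked in the state dict in place. A hypothetical sketch of that round trip; the directory name, file pattern and step-parsing regex here are assumptions for illustration, not code from this diff:

    import re
    from pathlib import Path
    from typing import Optional

    def latest_checkpoint(out_dir: Path) -> Optional[Path]:
        # pick the file with the highest numeric step in its name, e.g. "step-000123.pth"
        def step_of(path: Path) -> int:
            match = re.search(r"(\d+)", path.stem)
            return int(match.group(1)) if match else -1
        candidates = sorted(out_dir.glob("step-*.pth"), key=step_of)
        return candidates[-1] if candidates else None

    # usage, assuming fabric, model, optimizer and scheduler were set up as in main():
    # state = {"model": model, "optimizer": optimizer, "scheduler": scheduler, "iter_num": 0, "step_count": 0}
    # checkpoint = latest_checkpoint(Path("out/full/alpaca"))
    # if checkpoint is not None:
    #     fabric.load(checkpoint, state)  # restores weights, optimizer state and both counters in place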
fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" ) - validate(fabric, model, val_data, tokenizer) # sanity check - - with torch.device("meta"): - meta_model = GPT(model.config) - # "estimated" is not as precise as "measured". Estimated is optimistic but widely used in the wild. - # When comparing MFU or FLOP numbers with other projects that use estimated FLOPs, - # consider passing `SpeedMonitor(flops_per_batch=estimated_flops)` instead - estimated_flops = estimate_flops(meta_model) * micro_batch_size - fabric.print(f"Estimated TFLOPs: {estimated_flops * fabric.world_size / 1e12:.2f}") - # this assumes that all samples have a fixed length equal to the longest sequence length - # which is most likely false during finetuning - x = torch.randint(0, 1, (micro_batch_size, longest_seq_length)) - measured_flops = measure_flops(meta_model, x) - fabric.print(f"Measured TFLOPs: {measured_flops * fabric.world_size / 1e12:.2f}") - del meta_model, x - - step_count = 0 - total_lengths = 0 - total_t0 = time.perf_counter() - - for iter_num in range(max_iters): - if step_count <= warmup_steps: - # linear warmup - lr = learning_rate * step_count / warmup_steps - for param_group in optimizer.param_groups: - param_group["lr"] = lr + validate(fabric, model, val_data, tokenizer, dataclasses.replace(eval, max_iters=2), train) # sanity check + initial_iter = state["iter_num"] + + # resume data loader state by fast-forwarding through all seen batches + if resume: + resume_t0 = time.perf_counter() + for resume_iter in range(initial_iter): + get_batch(fabric, train_data, train.micro_batch_size, train.max_seq_length) + if resume_iter % 1000 == 0: + fabric.print(f"Resuming dataset: {resume_iter} / {initial_iter}") + fabric.barrier() + fabric.print( + f"Resuming data loader finished. Took {time.perf_counter() - resume_t0:.1f} seconds to reach iteration" f" {initial_iter}."
+ ) + running_loss = RunningMean(window=train.gradient_accumulation_iters(devices), sync_on_compute=False).to( + fabric.device + ) + fabric.barrier() + + for state["iter_num"] in range(state["iter_num"] + 1, train.max_iters(devices) + 1): iter_t0 = time.perf_counter() - input_ids, targets = get_batch(fabric, train_data, longest_seq_ix if iter_num == 0 else None) + input_ids, targets = get_batch( + fabric, + train_data, + train.micro_batch_size, + train.max_seq_length, + longest_seq_ix if state["iter_num"] == 1 else None, + ) - is_accumulating = (iter_num + 1) % gradient_accumulation_iters != 0 + is_accumulating = state["iter_num"] % train.gradient_accumulation_iters(devices) != 0 with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids) # shift the targets such that output n predicts token n+1 - loss = chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:], chunk_size=0) - fabric.backward(loss / gradient_accumulation_iters) + loss = chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:]) + fabric.backward(loss / train.gradient_accumulation_iters(devices)) + + running_loss.update(loss.detach()) if not is_accumulating: optimizer.step() optimizer.zero_grad() - step_count += 1 - - t1 = time.perf_counter() - total_lengths += input_ids.size(1) - speed_monitor.on_train_batch_end( - (iter_num + 1) * micro_batch_size, - t1 - total_t0, - # this assumes that device FLOPs are the same and that all devices have the same batch size - fabric.world_size, - flops_per_batch=measured_flops, - lengths=total_lengths, - ) - if iter_num % log_interval == 0: + scheduler.step() + state["step_count"] += 1 + + if state["iter_num"] % train.log_interval == 0: + loss = running_loss.compute().item() # expensive device-to-host synchronization + t1 = time.perf_counter() + metrics = { + "loss": loss, + "iter": state["iter_num"], + "step": state["step_count"], + "iter_time": t1 - iter_t0, + "tokens": state["iter_num"] * train.micro_batch_size * model.config.block_size, + "total_tokens": ( + state["iter_num"] * train.micro_batch_size * model.config.block_size * fabric.world_size + ), + # TODO: log learning rate + } fabric.print( - f"iter {iter_num} step {step_count}: loss {loss.item():.4f}, iter time:" - f" {(t1 - iter_t0) * 1000:.2f}ms{' (optimizer.step)' if not is_accumulating else ''}" + f"iter {metrics['iter']} | step {metrics['step']}: loss {metrics['loss']:.4f}, iter time:" + f" {metrics['iter_time'] * 1000:.2f} ms{' (optimizer.step)' if not is_accumulating else ''}" ) + fabric.log_dict(metrics, step=state["iter_num"]) - if not is_accumulating and step_count % eval_interval == 0: + if not is_accumulating and state["step_count"] % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate(fabric, model, val_data, tokenizer) + val_loss = validate(fabric, model, val_data, tokenizer, eval, train) t1 = time.perf_counter() - t0 - speed_monitor.eval_end(t1) - fabric.print(f"step {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f}ms") + fabric.print(f"iter {state['iter_num']}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms") + metrics = {"val_loss": val_loss, "val_ppl": math.exp(val_loss)} + fabric.log_dict(metrics, step=state["iter_num"]) fabric.barrier() - if not is_accumulating and step_count % save_interval == 0: - checkpoint_path = out_dir / f"iter-{iter_num:06d}-ckpt.pth" - save_checkpoint(fabric, model, checkpoint_path) + if not is_accumulating and state["step_count"] % train.save_interval == 0: + checkpoint_path = io.out_dir / 
f"step-{state['step_count']:06d}.pth" + fabric.print(f"Saving checkpoint to {str(checkpoint_path)!r}") + fabric.save(checkpoint_path, state) -@torch.inference_mode() -def validate(fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer) -> torch.Tensor: +# FSDP has issues with `inference_mode` +@torch.no_grad() +def validate( + fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer, eval: EvalArgs, train: TrainArgs +) -> torch.Tensor: fabric.print("Validating ...") model.eval() - losses = torch.zeros(eval_iters) - for k in range(eval_iters): - input_ids, targets = get_batch(fabric, val_data) + losses = torch.zeros(eval.max_iters) + for k in range(eval.max_iters): + input_ids, targets = get_batch(fabric, val_data, train.micro_batch_size, train.max_seq_length) logits = model(input_ids) losses[k] = chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:], chunk_size=0) val_loss = losses.mean() @@ -224,7 +261,9 @@ def validate(fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Toke with fabric.init_tensor(): # do not set `max_seq_length=max_returned_token` because memory is not a concern here model.set_kv_cache(batch_size=1) - output = generate(model, encoded, max_returned_tokens=len(encoded) + eval_max_new_tokens, temperature=0.8) + output = generate( + model, encoded, max_returned_tokens=len(encoded) + eval.max_new_tokens, temperature=0.8, eos_id=tokenizer.eos_id + ) model.clear_kv_cache() output = tokenizer.decode(output) fabric.print(output) @@ -234,7 +273,11 @@ def validate(fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Toke def get_batch( - fabric: L.Fabric, data: List[Dict], longest_seq_ix: Optional[int] = None + fabric: L.Fabric, + data: List[Dict], + micro_batch_size: int, + max_seq_length: Optional[int], + longest_seq_ix: Optional[int] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: ix = torch.randint(len(data), (micro_batch_size,)) if longest_seq_ix is not None: @@ -255,6 +298,11 @@ def pad_right(x, pad_id): x = torch.stack([pad_right(x, pad_id=0) for x in input_ids]) y = torch.stack([pad_right(x, pad_id=-1) for x in labels]) + # Truncate if needed + if max_seq_length: + x = x[:, :max_seq_length] + y = y[:, :max_seq_length] + if fabric.device.type == "cuda" and x.device.type == "cpu": x, y = fabric.to_device((x.pin_memory(), y.pin_memory())) else: @@ -262,6 +310,13 @@ def pad_right(x, pad_id): return x, y +def get_lr_scheduler(optimizer, warmup_steps: int, max_steps: int): + # linear warmup followed by cosine annealing + scheduler1 = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) + scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(max_steps - warmup_steps)) + return torch.optim.lr_scheduler.SequentialLR(optimizer, [scheduler1, scheduler2], milestones=[warmup_steps]) + + def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: # find out the minimum max_seq_length required during fine-tuning (saves memory!) 
lengths = [len(d["input_ids"]) for d in data] @@ -270,16 +325,27 @@ def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: return longest_seq_length, longest_seq_ix -def save_checkpoint(fabric: L.Fabric, model: torch.nn.Module, file_path: Path) -> None: - fabric.print(f"Saving weights to {str(file_path)!r}") - fabric.save(file_path, {"model": model}) +def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + issues = [] + unsupported = [(train, ["max_tokens", "max_norm"])] + for args, names in unsupported: + for name in names: + if getattr(args, name) is not None: + issues.append(f"{__file__} doesn't support the {name!r} argument. This is set in {args}") + required = [ + (io, ["checkpoint_dir", "train_data_dir", "val_data_dir"]), + (train, ["epoch_size", "epochs"]), + (eval, ["max_new_tokens"]), + ] + for args, names in required: + for name in names: + if getattr(args, name) is None: + issues.append(f"{__file__} requires the {name!r} argument. This is set in {args}") + if issues: + raise ValueError("\n".join(issues)) if __name__ == "__main__": - # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false" - # torch.backends.cuda.enable_flash_sdp(False) torch.set_float32_matmul_precision("high") - from jsonargparse import CLI - CLI(setup) diff --git a/finetune/lora.py b/finetune/lora.py index d278f46..6f71b05 100644 --- a/finetune/lora.py +++ b/finetune/lora.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +import dataclasses import os import sys import time @@ -9,17 +11,18 @@ from lightning.fabric.loggers import CSVLogger from lightning.fabric.plugins import BitsandbytesPrecision from lightning.fabric.strategies import FSDPStrategy +from lightning.fabric.utilities import ThroughputMonitor # support running without installing as a package wd = Path(__file__).parent.parent.resolve() sys.path.append(str(wd)) from generate.base import generate +from lit_gpt.args import EvalArgs, IOArgs, TrainArgs from lit_gpt.lora import GPT, Block, Config, lora_filter, mark_only_lora_as_trainable -from lit_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor -from lit_gpt.speed_monitor import estimate_flops, measure_flops from lit_gpt.tokenizer import Tokenizer from lit_gpt.utils import ( + CLI, check_valid_checkpoint_dir, chunked_cross_entropy, get_default_supported_precision, @@ -28,6 +31,7 @@ ) from scripts.prepare_alpaca import generate_prompt +# eval_interval = 100 save_interval = 100 eval_iters = 100 @@ -56,15 +60,42 @@ warmup_steps = 100 hparams = {k: v for k, v in locals().items() if isinstance(v, (int, float, str)) and not k.startswith("_")} - +# def setup( - data_dir: Path = Path("data/alpaca"), - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - out_dir: Path = Path("out/lora/alpaca"), precision: Optional[str] = None, quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8-training"]] = None, + devices: int = 1, + seed: int = 1337, + lora_r: int = 8, + lora_alpha: int = 16, + lora_dropout: float = 0.05, + lora_query: bool = True, + lora_key: bool = False, + lora_value: bool = True, + lora_projection: bool = False, + lora_mlp: bool = False, + lora_head: bool = False, + io: IOArgs = IOArgs( + train_data_dir=Path("data/alpaca"), + val_data_dir=Path("data/alpaca"), + checkpoint_dir=Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + out_dir=Path("out/lora/alpaca"), + ), + train: TrainArgs = TrainArgs( + save_interval=1000, 
+ log_interval=1, + global_batch_size=128, + micro_batch_size=4, + lr_warmup_steps=100, + epochs=5, + epoch_size=50000, + learning_rate=3e-4, + max_seq_length=None, + ), + eval: EvalArgs = EvalArgs(interval=100, max_new_tokens=100, max_iters=100), ) -> None: + print(locals()) precision = precision or get_default_supported_precision(training=True) plugins = None @@ -91,40 +122,51 @@ def setup( else: strategy = "auto" - logger = CSVLogger(out_dir.parent, out_dir.name, flush_logs_every_n_steps=log_interval) + logger = CSVLogger(io.out_dir.parent, io.out_dir.name, flush_logs_every_n_steps=train.log_interval) fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger, plugins=plugins) + + if not any((lora_query, lora_key, lora_value, lora_projection, lora_mlp, lora_head)): + fabric.print("Warning: all LoRA layers are disabled!") fabric.print(hparams) - fabric.launch(main, data_dir, checkpoint_dir, out_dir, quantize) + fabric.launch( + main, + devices, + seed, + Config.from_name( + name=io.checkpoint_dir.name, + r=lora_r, + alpha=lora_alpha, + dropout=lora_dropout, + to_query=lora_query, + to_key=lora_key, + to_value=lora_value, + to_projection=lora_projection, + to_mlp=lora_mlp, + to_head=lora_head, + ), + io, + train, + eval, + ) + +def main(fabric: L.Fabric, devices: int, seed: int, config: Config, io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + validate_args(io, train, eval) -def main(fabric: L.Fabric, data_dir: Path, checkpoint_dir: Path, out_dir: Path, quantize: Optional[str] = None) -> None: - check_valid_checkpoint_dir(checkpoint_dir) + steps_per_epoch = train.epoch_size // devices // train.batch_size(devices) + lr_max_steps = train.epochs * steps_per_epoch - speed_monitor = SpeedMonitor(fabric, window_size=50, time_unit="seconds") + check_valid_checkpoint_dir(io.checkpoint_dir) - fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + fabric.seed_everything(seed) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: - os.makedirs(out_dir, exist_ok=True) + os.makedirs(io.out_dir, exist_ok=True) - train_data = torch.load(data_dir / "train.pt") - val_data = torch.load(data_dir / "test.pt") + train_data = torch.load(io.train_data_dir / "train.pt") + val_data = torch.load(io.val_data_dir / "test.pt") - if not any((lora_query, lora_key, lora_value, lora_projection, lora_mlp, lora_head)): - fabric.print("Warning: all LoRA layers are disabled!") - config = Config.from_name( - name=checkpoint_dir.name, - r=lora_r, - alpha=lora_alpha, - dropout=lora_dropout, - to_query=lora_query, - to_key=lora_key, - to_value=lora_value, - to_projection=lora_projection, - to_mlp=lora_mlp, - to_head=lora_head, - ) - checkpoint_path = checkpoint_dir / "lit_model.pth" + checkpoint_path = io.checkpoint_dir / "lit_model.pth" fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") with fabric.init_module(empty_init=(devices > 1)): model = GPT(config) @@ -139,11 +181,14 @@ def main(fabric: L.Fabric, data_dir: Path, checkpoint_dir: Path, out_dir: Path, if isinstance(fabric.strategy.precision, BitsandbytesPrecision): import bitsandbytes as bnb - optimizer = bnb.optim.PagedAdamW(trainable_params, lr=learning_rate, weight_decay=weight_decay) + optimizer_cls = bnb.optim.PagedAdamW else: - optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate, weight_decay=weight_decay) + optimizer_cls = torch.optim.AdamW + optimizer = optimizer_cls( + trainable_params, lr=train.learning_rate, 
weight_decay=train.weight_decay, betas=(train.beta1, train.beta2) + ) optimizer = fabric.setup_optimizers(optimizer) - scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_iters // batch_size) + scheduler = get_lr_scheduler(optimizer, warmup_steps=train.lr_warmup_steps, max_steps=lr_max_steps) # strict=False because missing keys due to LoRA weights not contained in state dict load_checkpoint(fabric, model, checkpoint_path, strict=False) @@ -151,120 +196,103 @@ def main(fabric: L.Fabric, data_dir: Path, checkpoint_dir: Path, out_dir: Path, fabric.seed_everything(1337 + fabric.global_rank) train_time = time.perf_counter() - train(fabric, model, optimizer, scheduler, train_data, val_data, checkpoint_dir, out_dir, speed_monitor) + fit(fabric, model, optimizer, scheduler, train_data, val_data, devices, io, train, eval) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") # Save the final LoRA checkpoint at the end of training - save_path = out_dir / "lit_model_lora_finetuned.pth" + save_path = io.out_dir / "lit_model_lora_finetuned.pth" save_lora_checkpoint(fabric, model, save_path) -def train( +def fit( fabric: L.Fabric, model: GPT, optimizer: torch.optim.Optimizer, scheduler: torch.optim.lr_scheduler, train_data: List[Dict], val_data: List[Dict], - checkpoint_dir: Path, - out_dir: Path, - speed_monitor: SpeedMonitor, + devices: int, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, ) -> None: - tokenizer = Tokenizer(checkpoint_dir) + tokenizer = Tokenizer(io.checkpoint_dir) longest_seq_length, longest_seq_ix = get_longest_seq_length(train_data) # The existing code model.max_seq_length = longest_seq_length # sets the maximum length based on the training data, which seem to less. Hence setting it to a hardcoded number. model.max_seq_length = 500 + # model.max_seq_length = min(longest_seq_length, train.max_seq_length or float("inf")) + fabric.print( f"The longest sequence length in the train data is {longest_seq_length}, the model's maximum sequence length is" f" {model.max_seq_length} and context length is {model.config.block_size}" ) - validate(fabric, model, val_data, tokenizer) # sanity check - - with torch.device("meta"): - meta_model = GPT(model.config) - mark_only_lora_as_trainable(meta_model) - # "estimated" is not as precise as "measured". Estimated is optimistic but widely used in the wild. 
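The training loop below accumulates gradients over train.gradient_accumulation_iters(devices) micro-batches and only synchronizes gradients on the iterations that actually step the optimizer. A stripped-down sketch of that pattern with Fabric; the model, data and sizes are placeholders rather than anything from this diff:

    import lightning as L
    import torch
    import torch.nn.functional as F

    fabric = L.Fabric(accelerator="cpu", devices=1)
    fabric.launch()

    model = torch.nn.Linear(8, 2)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    model, optimizer = fabric.setup(model, optimizer)

    accumulation_iters = 4
    for iter_num in range(1, 17):
        x, y = torch.randn(2, 8), torch.randint(0, 2, (2,))
        is_accumulating = iter_num % accumulation_iters != 0
        # on accumulation iterations, skip the distributed gradient sync; grads still add up locally
        with fabric.no_backward_sync(model, enabled=is_accumulating):
            loss = F.cross_entropy(model(x), y)
            fabric.backward(loss / accumulation_iters)  # scale so the summed gradients match one large batch
        if not is_accumulating:
            optimizer.step()
            optimizer.zero_grad()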
- # When comparing MFU or FLOP numbers with other projects that use estimated FLOPs, - # consider passing `SpeedMonitor(flops_per_batch=estimated_flops)` instead - estimated_flops = estimate_flops(meta_model) * micro_batch_size - fabric.print(f"Estimated TFLOPs: {estimated_flops * fabric.world_size / 1e12:.2f}") - # this assumes that all samples have a fixed length equal to the longest sequence length - # which is most likely false during finetuning - x = torch.randint(0, 1, (micro_batch_size, longest_seq_length)) - measured_flops = measure_flops(meta_model, x) - fabric.print(f"Measured TFLOPs: {measured_flops * fabric.world_size / 1e12:.2f}") - del meta_model, x + validate(fabric, model, val_data, tokenizer, dataclasses.replace(eval, max_iters=2), train) # sanity check + throughput = ThroughputMonitor(fabric, window_size=50) step_count = 0 total_lengths = 0 total_t0 = time.perf_counter() - for iter_num in range(max_iters): - if step_count <= warmup_steps: - # linear warmup - lr = learning_rate * step_count / warmup_steps - for param_group in optimizer.param_groups: - param_group["lr"] = lr - + for iter_num in range(1, train.max_iters(devices) + 1): iter_t0 = time.perf_counter() - input_ids, targets = get_batch(fabric, train_data, longest_seq_ix if iter_num == 0 else None) + input_ids, targets = get_batch( + fabric, train_data, train.micro_batch_size, train.max_seq_length, longest_seq_ix if iter_num == 1 else None + ) - is_accumulating = (iter_num + 1) % gradient_accumulation_iters != 0 + is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids, lm_head_chunk_size=128) # shift the targets such that output n predicts token n+1 logits[-1] = logits[-1][..., :-1, :] loss = chunked_cross_entropy(logits, targets[..., 1:]) - fabric.backward(loss / gradient_accumulation_iters) + fabric.backward(loss / train.gradient_accumulation_iters(devices)) if not is_accumulating: optimizer.step() optimizer.zero_grad() - if step_count > warmup_steps: - scheduler.step() + scheduler.step() step_count += 1 - t1 = time.perf_counter() - total_lengths += input_ids.size(1) - speed_monitor.on_train_batch_end( - (iter_num + 1) * micro_batch_size, - t1 - total_t0, - # this assumes that device FLOPs are the same and that all devices have the same batch size - fabric.world_size, - flops_per_batch=measured_flops, - lengths=total_lengths, - ) - if iter_num % log_interval == 0: + total_lengths += input_ids.numel() + if iter_num % train.log_interval == 0: + loss_item = loss.item() # expensive device-to-host synchronization + t1 = time.perf_counter() + throughput.update( + time=t1 - total_t0, batches=iter_num, samples=iter_num * train.micro_batch_size, lengths=total_lengths + ) + throughput.compute_and_log(step=iter_num) fabric.print( - f"iter {iter_num} step {step_count}: loss {loss.item():.4f}, iter time:" - f" {(t1 - iter_t0) * 1000:.2f}ms{' (optimizer.step)' if not is_accumulating else ''}" + f"iter {iter_num} | step {step_count}: loss {loss_item:.4f}, iter time:" + f" {(t1 - iter_t0) * 1000:.2f} ms{' (optimizer.step)' if not is_accumulating else ''}" ) - if not is_accumulating and step_count % eval_interval == 0: + if not is_accumulating and step_count % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate(fabric, model, val_data, tokenizer) + val_loss = validate(fabric, model, val_data, tokenizer, eval, train) t1 = time.perf_counter() - t0 - speed_monitor.eval_end(t1) - fabric.print(f"step {iter_num}: 
val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f}ms") + fabric.print(f"iter {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f} ms") fabric.barrier() - if not is_accumulating and step_count % save_interval == 0: - checkpoint_path = out_dir / f"iter-{iter_num:06d}-ckpt.pth" + if not is_accumulating and step_count % train.save_interval == 0: + checkpoint_path = io.out_dir / f"iter-{iter_num:06d}-ckpt.pth" save_lora_checkpoint(fabric, model, checkpoint_path) -@torch.inference_mode() -def validate(fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer) -> torch.Tensor: +# FSDP has issues with `inference_mode` +@torch.no_grad() +def validate( + fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Tokenizer, eval: EvalArgs, train: TrainArgs +) -> torch.Tensor: fabric.print("Validating ...") model.eval() - losses = torch.zeros(eval_iters) - for k in range(eval_iters): - input_ids, targets = get_batch(fabric, val_data) + losses = torch.zeros(eval.max_iters) + for k in range(eval.max_iters): + input_ids, targets = get_batch(fabric, val_data, train.micro_batch_size, train.max_seq_length) logits = model(input_ids) losses[k] = chunked_cross_entropy(logits[..., :-1, :], targets[..., 1:], chunk_size=0) val_loss = losses.mean() @@ -278,7 +306,9 @@ def validate(fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Toke with fabric.init_tensor(): # do not set `max_seq_length=max_returned_token` because memory is not a concern here model.set_kv_cache(batch_size=1) - output = generate(model, encoded, max_returned_tokens=len(encoded) + eval_max_new_tokens, temperature=0.8) + output = generate( + model, encoded, max_returned_tokens=len(encoded) + eval.max_new_tokens, temperature=0.8, eos_id=tokenizer.eos_id + ) model.clear_kv_cache() output = tokenizer.decode(output) fabric.print(output) @@ -288,7 +318,11 @@ def validate(fabric: L.Fabric, model: GPT, val_data: List[Dict], tokenizer: Toke def get_batch( - fabric: L.Fabric, data: List[Dict], longest_seq_ix: Optional[int] = None + fabric: L.Fabric, + data: List[Dict], + micro_batch_size: int, + max_seq_length: Optional[int], + longest_seq_ix: Optional[int] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: ix = torch.randint(len(data), (micro_batch_size,)) if longest_seq_ix is not None: @@ -309,6 +343,11 @@ def pad_right(x, pad_id): x = torch.stack([pad_right(x, pad_id=0) for x in input_ids]) y = torch.stack([pad_right(x, pad_id=-1) for x in labels]) + # Truncate if needed + if max_seq_length: + x = x[:, :max_seq_length] + y = y[:, :max_seq_length] + if fabric.device.type == "cuda" and x.device.type == "cpu": x, y = fabric.to_device((x.pin_memory(), y.pin_memory())) else: @@ -316,6 +355,13 @@ def pad_right(x, pad_id): return x, y +def get_lr_scheduler(optimizer, warmup_steps: int, max_steps: int): + # linear warmup followed by cosine annealing + scheduler1 = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: step / warmup_steps) + scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(max_steps - warmup_steps)) + return torch.optim.lr_scheduler.SequentialLR(optimizer, [scheduler1, scheduler2], milestones=[warmup_steps]) + + def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: # find out the minimum max_seq_length required during fine-tuning (saves memory!) 
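save_lora_checkpoint shown below (like save_adapter_v2_checkpoint earlier in this diff) passes a filter to fabric.save so that only the LoRA tensors end up in the checkpoint file. A rough sketch of the idea with a stand-in predicate instead of the real lora_filter, and a plain Linear as a placeholder model:

    import lightning as L
    import torch

    def only_lora(name: str, value: torch.Tensor) -> bool:
        # keep only the entries whose state-dict key marks them as LoRA weights
        return "lora_" in name

    fabric = L.Fabric(accelerator="cpu", devices=1)
    fabric.launch()
    model = torch.nn.Linear(4, 4)  # placeholder; the real call passes the LoRA-augmented GPT
    fabric.save("lora-only.pth", {"model": model}, filter={"model": only_lora})
    # only the keys accepted by the predicate are written, which keeps the file small
    # (here nothing matches; with a LoRA model only the adapter weights would be saved)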
lengths = [len(d["input_ids"]) for d in data] @@ -329,11 +375,27 @@ def save_lora_checkpoint(fabric: L.Fabric, model: torch.nn.Module, file_path: Pa fabric.save(file_path, {"model": model}, filter={"model": lora_filter}) +def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + issues = [] + unsupported = [(train, ["max_tokens", "max_norm"])] + for args, names in unsupported: + for name in names: + if getattr(args, name) is not None: + issues.append(f"{__file__} doesn't support the {name!r} argument. This is set in {args}") + required = [ + (io, ["checkpoint_dir", "train_data_dir", "val_data_dir"]), + (train, ["epoch_size", "epochs"]), + (eval, ["max_new_tokens"]), + ] + for args, names in required: + for name in names: + if getattr(args, name) is None: + issues.append(f"{__file__} requires the {name!r} argument. This is set in {args}") + if issues: + raise ValueError("\n".join(issues)) + + if __name__ == "__main__": - # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false" - # torch.backends.cuda.enable_flash_sdp(False) torch.set_float32_matmul_precision("high") - from jsonargparse import CLI - CLI(setup) diff --git a/generate/adapter.py b/generate/adapter.py index ed76d2b..15e5df5 100644 --- a/generate/adapter.py +++ b/generate/adapter.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import sys import time from pathlib import Path @@ -6,7 +8,6 @@ import lightning as L import torch from lightning.fabric.plugins import BitsandbytesPrecision -from lightning.fabric.strategies import FSDPStrategy # support running without installing as a package wd = Path(__file__).parent.parent.resolve() @@ -14,22 +15,20 @@ from generate.base import generate from lit_gpt import Tokenizer -from lit_gpt.adapter import GPT, Block, Config -from lit_gpt.utils import check_valid_checkpoint_dir, get_default_supported_precision, gptq_quantization, lazy_load +from lit_gpt.adapter import GPT, Config +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load from scripts.prepare_alpaca import generate_prompt def main( - prompt: str = "What food do lamas eat?", + prompt: str = "What food do llamas eat?", input: str = "", adapter_path: Path = Path("out/adapter/alpaca/lit_model_adapter_finetuned.pth"), checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"]] = None, + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, max_new_tokens: int = 100, - top_k: int = 200, + top_k: Optional[int] = 200, temperature: float = 0.8, - strategy: str = "auto", - devices: int = 1, precision: Optional[str] = None, ) -> None: """Generates a response based on a given instruction and an optional input. @@ -45,57 +44,50 @@ def main( quantize: Whether to quantize the model and using which method: - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes - bnb.int8: 8-bit quantization from bitsandbytes - - gptq.int4: 4-bit quantization from GPTQ for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md max_new_tokens: The number of generation steps to take. top_k: The number of top most probable tokens to consider in the sampling process. temperature: A value controlling the randomness of the sampling process. Higher values result in more random samples. 
- strategy: Indicates the Fabric strategy setting to use. - devices: How many devices to use. precision: Indicates the Fabric precision setting to use. """ precision = precision or get_default_supported_precision(training=False) plugins = None - if quantize is not None: - if devices > 1: - raise NotImplementedError( - "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the" - " --quantize flag." - ) - if quantize.startswith("bnb."): - if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") - dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] - plugins = BitsandbytesPrecision(quantize[4:], dtype) - precision = None - - if strategy == "fsdp": - strategy = FSDPStrategy(auto_wrap_policy={Block}, cpu_offload=False) - - fabric = L.Fabric(devices=devices, precision=precision, strategy=strategy, plugins=plugins) + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + fabric = L.Fabric(devices=1, precision=precision, plugins=plugins) fabric.launch() check_valid_checkpoint_dir(checkpoint_dir) config = Config.from_json(checkpoint_dir / "lit_config.json") - if quantize is not None and devices > 1: - raise NotImplementedError - if quantize == "gptq.int4": - model_file = "lit_model_gptq.4bit.pth" - if not (checkpoint_dir / model_file).is_file(): - raise ValueError("Please run `python quantize/gptq.py` first") - else: - model_file = "lit_model.pth" - checkpoint_path = checkpoint_dir / model_file + checkpoint_path = checkpoint_dir / "lit_model.pth" + + tokenizer = Tokenizer(checkpoint_dir) + sample = {"instruction": prompt, "input": input} + prompt = generate_prompt(sample) + encoded = tokenizer.encode(prompt, device=fabric.device) + prompt_length = encoded.size(0) + max_returned_tokens = prompt_length + max_new_tokens fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) t0 = time.perf_counter() - with fabric.init_module(empty_init=True), gptq_quantization(quantize == "gptq.int4"): + with fabric.init_module(empty_init=True): model = GPT(config) fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + with fabric.init_tensor(): + # set the max_seq_length to limit the memory usage to what we need + model.max_seq_length = max_returned_tokens + # enable the kv cache + model.set_kv_cache(batch_size=1) + model.eval() t0 = time.perf_counter() checkpoint = lazy_load(checkpoint_path) @@ -104,22 +96,9 @@ def main( model.load_state_dict(checkpoint) fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) - model.eval() model = fabric.setup(model) - tokenizer = Tokenizer(checkpoint_dir) - sample = {"instruction": prompt, "input": input} - prompt = generate_prompt(sample) - encoded = tokenizer.encode(prompt, device=fabric.device) - prompt_length = encoded.size(0) - max_returned_tokens = prompt_length + max_new_tokens - - with fabric.init_tensor(): - # set the max_seq_length to limit the memory usage to what we need - model.max_seq_length = max_returned_tokens - # enable the kv cache - model.set_kv_cache(batch_size=1) - + L.seed_everything(1234) t0 = 
time.perf_counter() y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) t = time.perf_counter() - t0 @@ -135,7 +114,6 @@ def main( if __name__ == "__main__": - from jsonargparse import CLI - torch.set_float32_matmul_precision("high") + CLI(main) diff --git a/generate/adapter_v2.py b/generate/adapter_v2.py index 4b15184..c799a0e 100644 --- a/generate/adapter_v2.py +++ b/generate/adapter_v2.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import sys import time from pathlib import Path @@ -6,7 +8,6 @@ import lightning as L import torch from lightning.fabric.plugins import BitsandbytesPrecision -from lightning.fabric.strategies import FSDPStrategy # support running without installing as a package wd = Path(__file__).parent.parent.resolve() @@ -14,22 +15,20 @@ from generate.base import generate from lit_gpt import Tokenizer -from lit_gpt.adapter_v2 import GPT, Block, Config -from lit_gpt.utils import check_valid_checkpoint_dir, get_default_supported_precision, gptq_quantization, lazy_load +from lit_gpt.adapter_v2 import GPT, Config +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load from scripts.prepare_alpaca import generate_prompt def main( - prompt: str = "What food do lamas eat?", + prompt: str = "What food do llamas eat?", input: str = "", adapter_path: Path = Path("out/adapter_v2/alpaca/lit_model_adapter_finetuned.pth"), checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"]] = None, + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, max_new_tokens: int = 100, - top_k: int = 200, + top_k: Optional[int] = 200, temperature: float = 0.8, - strategy: str = "auto", - devices: int = 1, precision: Optional[str] = None, ) -> None: """Generates a response based on a given instruction and an optional input. @@ -45,57 +44,50 @@ def main( quantize: Whether to quantize the model and using which method: - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes - bnb.int8: 8-bit quantization from bitsandbytes - - gptq.int4: 4-bit quantization from GPTQ for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md max_new_tokens: The number of generation steps to take. top_k: The number of top most probable tokens to consider in the sampling process. temperature: A value controlling the randomness of the sampling process. Higher values result in more random samples. - strategy: Indicates the Fabric strategy setting to use. - devices: How many devices to use. precision: Indicates the Fabric precision setting to use. """ precision = precision or get_default_supported_precision(training=False) plugins = None - if quantize is not None: - if devices > 1: - raise NotImplementedError( - "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the" - " --quantize flag." 
- ) - if quantize.startswith("bnb."): - if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") - dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] - plugins = BitsandbytesPrecision(quantize[4:], dtype) - precision = None - - if strategy == "fsdp": - strategy = FSDPStrategy(auto_wrap_policy={Block}, cpu_offload=False) - - fabric = L.Fabric(devices=devices, precision=precision, strategy=strategy, plugins=plugins) + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + fabric = L.Fabric(devices=1, precision=precision, plugins=plugins) fabric.launch() check_valid_checkpoint_dir(checkpoint_dir) config = Config.from_json(checkpoint_dir / "lit_config.json") - if quantize is not None and devices > 1: - raise NotImplementedError - if quantize == "gptq.int4": - model_file = "lit_model_gptq.4bit.pth" - if not (checkpoint_dir / model_file).is_file(): - raise ValueError("Please run `python quantize/gptq.py` first") - else: - model_file = "lit_model.pth" - checkpoint_path = checkpoint_dir / model_file + checkpoint_path = checkpoint_dir / "lit_model.pth" + + tokenizer = Tokenizer(checkpoint_dir) + sample = {"instruction": prompt, "input": input} + prompt = generate_prompt(sample) + encoded = tokenizer.encode(prompt, device=fabric.device) + prompt_length = encoded.size(0) + max_returned_tokens = prompt_length + max_new_tokens fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) t0 = time.perf_counter() - with fabric.init_module(empty_init=True), gptq_quantization(quantize == "gptq.int4"): + with fabric.init_module(empty_init=True): model = GPT(config) fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + with fabric.init_tensor(): + # set the max_seq_length to limit the memory usage to what we need + model.max_seq_length = max_returned_tokens + # enable the kv cache + model.set_kv_cache(batch_size=1) + model.eval() t0 = time.perf_counter() checkpoint = lazy_load(checkpoint_path) @@ -104,22 +96,9 @@ def main( model.load_state_dict(checkpoint) fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) - model.eval() model = fabric.setup(model) - tokenizer = Tokenizer(checkpoint_dir) - sample = {"instruction": prompt, "input": input} - prompt = generate_prompt(sample) - encoded = tokenizer.encode(prompt, device=fabric.device) - prompt_length = encoded.size(0) - max_returned_tokens = prompt_length + max_new_tokens - - with fabric.init_tensor(): - # set the max_seq_length to limit the memory usage to what we need - model.max_seq_length = max_returned_tokens - # enable the kv cache - model.set_kv_cache(batch_size=1) - + L.seed_everything(1234) t0 = time.perf_counter() y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) t = time.perf_counter() - t0 @@ -135,7 +114,6 @@ def main( if __name__ == "__main__": - from jsonargparse import CLI - torch.set_float32_matmul_precision("high") + CLI(main) diff --git a/generate/base.py b/generate/base.py index de22f2a..3cf7571 100644 --- a/generate/base.py +++ b/generate/base.py @@ -1,31 +1,56 @@ +# Copyright 
Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import sys import time from pathlib import Path -from typing import Literal, Optional +from typing import Any, Literal, Optional import lightning as L import torch +import torch._dynamo.config +import torch._inductor.config from lightning.fabric.plugins import BitsandbytesPrecision -from lightning.fabric.strategies import FSDPStrategy # support running without installing as a package wd = Path(__file__).parent.parent.resolve() sys.path.append(str(wd)) from lit_gpt import GPT, Config, Tokenizer -from lit_gpt.model import Block -from lit_gpt.utils import ( - check_valid_checkpoint_dir, - get_default_supported_precision, - gptq_quantization, - load_checkpoint, -) +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, load_checkpoint + + +def multinomial_num_samples_1(probs: torch.Tensor) -> torch.Tensor: + if torch._dynamo.is_compiling(): + # Faster alternative to `torch.multinomial(probs, num_samples=1)` that is also CUDAGraph friendly + distribution = torch.empty_like(probs).exponential_(1) + return torch.argmax(probs / distribution, dim=-1, keepdim=True) + return torch.multinomial(probs, num_samples=1) + + +def sample(logits: torch.Tensor, temperature: float = 1.0, top_k: Optional[int] = None) -> torch.Tensor: + logits = logits[0, -1] + # optionally crop the logits to only the top k options + if top_k is not None: + v, i = torch.topk(logits, min(top_k, logits.size(-1))) + # do not use `torch.where` as in nanogpt because it will repeat top-k collisions + logits = torch.full_like(logits, float("-inf")).scatter_(-1, i, v) + # optionally scale the logits and sample from a probability distribution + if temperature > 0.0: + probs = torch.nn.functional.softmax(logits / temperature, dim=-1) + return multinomial_num_samples_1(probs) + return torch.argmax(logits, dim=-1, keepdim=True) + + +def next_token(model: GPT, input_pos: torch.Tensor, x: torch.Tensor, **kwargs: Any) -> torch.Tensor: + logits = model(x, input_pos) + next = sample(logits, **kwargs) + return next.to(dtype=x.dtype) @torch.inference_mode() def generate( model: GPT, - idx: torch.Tensor, + prompt: torch.Tensor, max_returned_tokens: int, *, temperature: float = 1.0, @@ -38,13 +63,13 @@ def generate( Args: model: The model to use. - idx: Tensor of shape (T) with indices of the prompt sequence. + prompt: Tensor of shape (T) with indices of the prompt sequence. max_returned_tokens: The maximum number of tokens to return (given plus generated). temperature: Scales the predicted logits by 1 / temperature. top_k: If specified, only sample among the tokens with the k highest probabilities. eos_id: If specified, stop generating any more token once the token is triggered. """ - T = idx.size(0) + T = prompt.size(0) assert max_returned_tokens > T if model.max_seq_length < max_returned_tokens - 1: # rolling the kv cache based on the `input_pos` value would be necessary. 
However, doing so would introduce a @@ -52,54 +77,34 @@ def generate( # not support it to avoid negatively impacting the overall speed raise NotImplementedError(f"max_seq_length {model.max_seq_length} needs to be >= {max_returned_tokens - 1}") - device, dtype = idx.device, idx.dtype - # create an empty tensor of the expected final shape and fill in the current tokens - empty = torch.empty(max_returned_tokens, dtype=dtype, device=device) - empty[:T] = idx - idx = empty - input_pos = torch.arange(0, T, device=device) - - # generate up to a fixed number of tokens - for _ in range(max_returned_tokens - T): - x = idx.index_select(0, input_pos).view(1, -1) - - # forward - logits = model(x, input_pos) - logits = logits[0, -1] / temperature - - # optionally crop the logits to only the top k options - if top_k is not None: - v, _ = torch.topk(logits, min(top_k, logits.size(-1))) - logits = torch.where(logits < v[[-1]], -float("Inf"), logits) - - probs = torch.nn.functional.softmax(logits, dim=-1) - idx_next = torch.multinomial(probs, num_samples=1).to(dtype=dtype) - - # advance - input_pos = input_pos[-1:] + 1 - - # concatenate the new generation - idx = idx.index_copy(0, input_pos, idx_next) - - # if token is triggered, return the output (stop generation) - if idx_next == eos_id: - return idx[:input_pos] # include the EOS token - - return idx + device = prompt.device + tokens = [prompt] + input_pos = torch.tensor([T], device=device) + token = next_token( + model, torch.arange(0, T, device=device), prompt.view(1, -1), temperature=temperature, top_k=top_k + ).clone() + tokens.append(token) + for _ in range(2, max_returned_tokens - T + 1): + token = next_token(model, input_pos, token.view(1, -1), temperature=temperature, top_k=top_k).clone() + tokens.append(token) + if token == eos_id: + break + input_pos = input_pos.add_(1) + return torch.cat(tokens) +@torch.inference_mode() def main( - prompt: str = "Hello, my name is", + prompt: str = "What food do llamas eat?", *, num_samples: int = 1, max_new_tokens: int = 50, - top_k: int = 200, + top_k: Optional[int] = 200, temperature: float = 0.8, checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"]] = None, - strategy: str = "auto", - devices: int = 1, + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, precision: Optional[str] = None, + compile: bool = False, ) -> None: """Generates text samples based on a pre-trained model and tokenizer. @@ -114,78 +119,65 @@ def main( quantize: Whether to quantize the model and using which method: - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes - bnb.int8: 8-bit quantization from bitsandbytes - - gptq.int4: 4-bit quantization from GPTQ for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md - strategy: Indicates the Fabric strategy setting to use. - devices: How many devices to use. precision: Indicates the Fabric precision setting to use. + compile: Whether to compile the model. """ precision = precision or get_default_supported_precision(training=False) plugins = None - if quantize is not None: - if devices > 1: - raise NotImplementedError( - "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the" - " --quantize flag." 
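For reference, a minimal standalone sketch (not part of the patch) of what the new `sample` and `multinomial_num_samples_1` helpers in generate/base.py do: top-k masking followed by temperature-scaled softmax, and the exponential-race trick that stands in for `torch.multinomial(probs, num_samples=1)` when compiling, as noted in the comment above. The logits, temperature and sample count below are illustrative, not taken from any checkpoint.

    import torch

    torch.manual_seed(0)

    logits = torch.tensor([2.0, 1.0, 0.5, -1.0])
    temperature, top_k = 0.8, 3

    # top-k masking as in `sample()`: keep the k largest logits, drop the rest to -inf
    v, i = torch.topk(logits, min(top_k, logits.size(-1)))
    masked = torch.full_like(logits, float("-inf")).scatter_(-1, i, v)
    probs = torch.nn.functional.softmax(masked / temperature, dim=-1)

    n = 200_000
    # reference categorical sampling
    ref = torch.multinomial(probs, num_samples=n, replacement=True)
    # exponential race used by `multinomial_num_samples_1`: argmax(p_i / E_i) with E_i ~ Exp(1)
    noise = torch.empty(n, probs.size(-1)).exponential_(1)
    race = torch.argmax(probs / noise, dim=-1)

    print(torch.bincount(ref, minlength=4) / n)   # empirical frequencies, reference sampler
    print(torch.bincount(race, minlength=4) / n)  # should closely match the line above

Both samplers draw index i with probability probs[i]; the race formulation is the CUDAGraph-friendly alternative that the in-code comment refers to.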
- ) - if quantize.startswith("bnb."): - if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") - dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] - plugins = BitsandbytesPrecision(quantize[4:], dtype) - precision = None - - if strategy == "fsdp": - strategy = FSDPStrategy(auto_wrap_policy={Block}, cpu_offload=False) - - fabric = L.Fabric(devices=devices, precision=precision, strategy=strategy, plugins=plugins) - fabric.launch() + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + fabric = L.Fabric(devices=1, precision=precision, plugins=plugins) check_valid_checkpoint_dir(checkpoint_dir) config = Config.from_json(checkpoint_dir / "lit_config.json") - if quantize == "gptq.int4": - model_file = "lit_model_gptq.4bit.pth" - if not (checkpoint_dir / model_file).is_file(): - raise ValueError("Please run `python quantize/gptq.py` first") - else: - model_file = "lit_model.pth" - checkpoint_path = checkpoint_dir / model_file + checkpoint_path = checkpoint_dir / "lit_model.pth" + + tokenizer = Tokenizer(checkpoint_dir) + encoded = tokenizer.encode(prompt, device=fabric.device) + prompt_length = encoded.size(0) + max_returned_tokens = prompt_length + max_new_tokens fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) t0 = time.perf_counter() - with fabric.init_module(empty_init=True), gptq_quantization(quantize == "gptq.int4"): + with fabric.init_module(empty_init=True): model = GPT(config) fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) - + with fabric.init_tensor(): + # set the max_seq_length to limit the memory usage to what we need + model.max_seq_length = max_returned_tokens + # enable the kv cache + model.set_kv_cache(batch_size=1) model.eval() + + if compile: + torch._dynamo.config.automatic_dynamic_shapes = True + torch._inductor.config.triton.unique_kernel_names = True + torch._inductor.config.coordinate_descent_tuning = True + global next_token + next_token = torch.compile(next_token, mode="reduce-overhead") + model = fabric.setup_module(model) t0 = time.perf_counter() load_checkpoint(fabric, model, checkpoint_path) fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) - tokenizer = Tokenizer(checkpoint_dir) - encoded = tokenizer.encode(prompt, device=fabric.device) - prompt_length = encoded.size(0) - max_returned_tokens = prompt_length + max_new_tokens - - with fabric.init_tensor(): - # set the max_seq_length to limit the memory usage to what we need - model.max_seq_length = max_returned_tokens - L.seed_everything(1234) for i in range(num_samples): - with fabric.init_tensor(): - # enable the kv cache - model.set_kv_cache(batch_size=1) - t0 = time.perf_counter() - y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k) + y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) t = time.perf_counter() - t0 - + for block in model.transformer.h: + block.attn.kv_cache.reset_parameters() fabric.print(tokenizer.decode(y)) tokens_generated = y.size(0) - prompt_length fabric.print( @@ 
-196,7 +188,6 @@ def main( if __name__ == "__main__": - from jsonargparse import CLI - torch.set_float32_matmul_precision("high") + CLI(main) diff --git a/generate/full.py b/generate/full.py index 8041920..ca1554e 100644 --- a/generate/full.py +++ b/generate/full.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import sys import time from pathlib import Path @@ -6,7 +8,6 @@ import lightning as L import torch from lightning.fabric.plugins import BitsandbytesPrecision -from lightning.fabric.strategies import FSDPStrategy # support running without installing as a package wd = Path(__file__).parent.parent.resolve() @@ -14,27 +15,19 @@ from generate.base import generate from lit_gpt import GPT, Config, Tokenizer -from lit_gpt.model import Block -from lit_gpt.utils import ( - check_valid_checkpoint_dir, - get_default_supported_precision, - gptq_quantization, - load_checkpoint, -) +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, load_checkpoint from scripts.prepare_alpaca import generate_prompt def main( - prompt: str = "What food do lamas eat?", + prompt: str = "What food do llamas eat?", input: str = "", finetuned_path: Path = Path("out/full/alpaca/lit_model_finetuned.pth"), checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"]] = None, + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, max_new_tokens: int = 100, - top_k: int = 200, + top_k: Optional[int] = 200, temperature: float = 0.8, - strategy: str = "auto", - devices: int = 1, precision: Optional[str] = None, ) -> None: """Generates a response based on a given instruction and an optional input. @@ -50,59 +43,32 @@ def main( quantize: Whether to quantize the model and using which method: - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes - bnb.int8: 8-bit quantization from bitsandbytes - - gptq.int4: 4-bit quantization from GPTQ for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md max_new_tokens: The number of generation steps to take. top_k: The number of top most probable tokens to consider in the sampling process. temperature: A value controlling the randomness of the sampling process. Higher values result in more random samples. - strategy: Indicates the Fabric strategy setting to use. - devices: How many devices to use. precision: Indicates the Fabric precision setting to use. """ precision = precision or get_default_supported_precision(training=False) plugins = None - if quantize is not None: - if devices > 1: - raise NotImplementedError( - "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the" - " --quantize flag." 
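All of these scripts now compute `max_returned_tokens = prompt_length + max_new_tokens` up front and cap `model.max_seq_length` to it before calling `set_kv_cache`. A back-of-envelope sketch of why that matters for memory, assuming a dense per-layer cache of shape (batch, kv_heads, seq, head_size) for both K and V; the shapes below are illustrative (roughly a Llama-2-7B-like geometry) and are not read from this patch.

    # rough KV-cache footprint as a function of the sequence length it is sized for
    def kv_cache_bytes(n_layer: int, n_kv_heads: int, head_size: int, max_seq_length: int,
                       batch_size: int = 1, bytes_per_elem: int = 2) -> int:
        per_layer = 2 * batch_size * n_kv_heads * max_seq_length * head_size * bytes_per_elem  # K and V
        return n_layer * per_layer

    full = kv_cache_bytes(n_layer=32, n_kv_heads=32, head_size=128, max_seq_length=4096)
    trimmed = kv_cache_bytes(n_layer=32, n_kv_heads=32, head_size=128, max_seq_length=256)
    print(f"{full / 2**30:.2f} GiB vs {trimmed / 2**30:.2f} GiB")  # 2.00 GiB vs 0.12 GiB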
- ) - if quantize.startswith("bnb."): - if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") - dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] - plugins = BitsandbytesPrecision(quantize[4:], dtype) - precision = None - - if strategy == "fsdp": - strategy = FSDPStrategy(auto_wrap_policy={Block}, cpu_offload=False) - - fabric = L.Fabric(devices=devices, precision=precision, strategy=strategy, plugins=plugins) + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + fabric = L.Fabric(devices=1, precision=precision, plugins=plugins) fabric.launch() check_valid_checkpoint_dir(checkpoint_dir) config = Config.from_json(checkpoint_dir / "lit_config.json") - if quantize is not None and devices > 1: - raise NotImplementedError checkpoint_path = finetuned_path - fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) - t0 = time.perf_counter() - with fabric.init_module(empty_init=True), gptq_quantization(quantize == "gptq.int4"): - model = GPT(config) - fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) - - model.eval() - model = fabric.setup(model) - - t0 = time.perf_counter() - load_checkpoint(fabric, model, checkpoint_path) - fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) - tokenizer = Tokenizer(checkpoint_dir) sample = {"instruction": prompt, "input": input} prompt = generate_prompt(sample) @@ -110,12 +76,25 @@ def main( prompt_length = encoded.size(0) max_returned_tokens = prompt_length + max_new_tokens + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + t0 = time.perf_counter() + with fabric.init_module(empty_init=True): + model = GPT(config) + fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) with fabric.init_tensor(): # set the max_seq_length to limit the memory usage to what we need model.max_seq_length = max_returned_tokens # enable the kv cache model.set_kv_cache(batch_size=1) + model.eval() + + model = fabric.setup(model) + t0 = time.perf_counter() + load_checkpoint(fabric, model, checkpoint_path) + fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + L.seed_everything(1234) t0 = time.perf_counter() y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) t = time.perf_counter() - t0 @@ -131,7 +110,6 @@ def main( if __name__ == "__main__": - from jsonargparse import CLI - torch.set_float32_matmul_precision("high") + CLI(main) diff --git a/generate/lora.py b/generate/lora.py index e095d2c..006b75b 100644 --- a/generate/lora.py +++ b/generate/lora.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
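The `__main__` blocks now call `CLI(main)` imported from `lit_gpt.utils` instead of importing it from jsonargparse inline. Assuming that helper behaves like a thin wrapper around `jsonargparse.CLI` (its implementation is not shown in this diff), the entry-point pattern looks like the sketch below; `demo.py` and its parameters are illustrative stand-ins for the generate scripts' `main`.

    from pathlib import Path
    from typing import Optional

    import torch
    from jsonargparse import CLI


    def main(
        prompt: str = "What food do llamas eat?",
        *,
        max_new_tokens: int = 50,
        top_k: Optional[int] = 200,
        checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"),
    ) -> None:
        """Toy stand-in for a generate script's `main`."""
        print(prompt, max_new_tokens, top_k, checkpoint_dir)


    if __name__ == "__main__":
        torch.set_float32_matmul_precision("high")
        CLI(main)

    # Example invocation (flag names come from the parameter names):
    #   python demo.py --prompt "Hello" --max_new_tokens 20 --top_k 50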
+ import sys import time from pathlib import Path @@ -6,7 +8,6 @@ import lightning as L import torch from lightning.fabric.plugins import BitsandbytesPrecision -from lightning.fabric.strategies import FSDPStrategy # support running without installing as a package wd = Path(__file__).parent.parent.resolve() @@ -14,33 +15,30 @@ from generate.base import generate from lit_gpt import Tokenizer -from lit_gpt.lora import GPT, Block, Config, merge_lora_weights -from lit_gpt.utils import check_valid_checkpoint_dir, get_default_supported_precision, gptq_quantization, lazy_load +from lit_gpt.lora import GPT, Config, merge_lora_weights +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load from scripts.prepare_alpaca import generate_prompt -lora_r = 8 -lora_alpha = 16 -lora_dropout = 0.05 -lora_query = True -lora_key = False -lora_value = True -lora_projection = False -lora_mlp = False -lora_head = False - def main( - prompt: str = "What food do lamas eat?", + prompt: str = "What food do llamas eat?", input: str = "", lora_path: Path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth"), checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8", "gptq.int4"]] = None, + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8"]] = None, max_new_tokens: int = 100, - top_k: int = 200, + top_k: Optional[int] = 200, temperature: float = 0.8, - strategy: str = "auto", - devices: int = 1, precision: Optional[str] = None, + lora_r: int = 8, + lora_alpha: int = 16, + lora_dropout: float = 0.05, + lora_query: bool = True, + lora_key: bool = False, + lora_value: bool = True, + lora_projection: bool = False, + lora_mlp: bool = False, + lora_head: bool = False, ) -> None: """Generates a response based on a given instruction and an optional input. This script will only work with checkpoints from the instruction-tuned GPT-LoRA model. @@ -55,36 +53,24 @@ def main( quantize: Whether to quantize the model and using which method: - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes - bnb.int8: 8-bit quantization from bitsandbytes - - gptq.int4: 4-bit quantization from GPTQ for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md max_new_tokens: The number of generation steps to take. top_k: The number of top most probable tokens to consider in the sampling process. temperature: A value controlling the randomness of the sampling process. Higher values result in more random samples. - strategy: Indicates the Fabric strategy setting to use. - devices: How many devices to use. precision: Indicates the Fabric precision setting to use. """ precision = precision or get_default_supported_precision(training=False) plugins = None - if quantize is not None: - if devices > 1: - raise NotImplementedError( - "Quantization is currently not supported for multi-GPU training. Please set devices=1 when using the" - " --quantize flag." 
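generate/lora.py now takes the LoRA hyperparameters (`lora_r`, `lora_alpha`, and so on) as CLI arguments and still calls `merge_lora_weights(model)` before setup. A small numeric sketch of what merging means under the common LoRA convention W' = W + (alpha / r) * B @ A; the exact scaling inside `lit_gpt.lora` is not shown here, so treat the formula as an assumption.

    import torch

    torch.manual_seed(0)
    in_features, out_features, r, alpha = 16, 8, 4, 16

    W = torch.randn(out_features, in_features)  # frozen base weight
    A = torch.randn(r, in_features) * 0.01      # LoRA "down" projection (trainable)
    B = torch.zeros(out_features, r)            # LoRA "up" projection (zero-initialized)
    B.normal_(0, 0.01)                          # pretend it was trained

    x = torch.randn(2, in_features)

    # adapter applied at runtime (unmerged): base output plus scaled low-rank update
    y_unmerged = x @ W.T + (alpha / r) * (x @ A.T @ B.T)

    # merged weight gives the same output with a single matmul
    W_merged = W + (alpha / r) * (B @ A)
    y_merged = x @ W_merged.T

    print(torch.allclose(y_unmerged, y_merged, atol=1e-6))  # True

Merging folds the adapter into the base weight once, so inference afterwards costs the same as the un-adapted model.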
- ) - if quantize.startswith("bnb."): - if "mixed" in precision: - raise ValueError("Quantization and mixed precision is not supported.") - dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] - plugins = BitsandbytesPrecision(quantize[4:], dtype) - precision = None - - if strategy == "fsdp": - strategy = FSDPStrategy(auto_wrap_policy={Block}, cpu_offload=False) - - fabric = L.Fabric(devices=devices, precision=precision, strategy=strategy, plugins=plugins) + if quantize is not None and quantize.startswith("bnb."): + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + fabric = L.Fabric(devices=1, precision=precision, plugins=plugins) fabric.launch() check_valid_checkpoint_dir(checkpoint_dir) @@ -102,21 +88,26 @@ def main( to_head=lora_head, ) - if quantize is not None and devices > 1: - raise NotImplementedError - if quantize == "gptq.int4": - model_file = "lit_model_gptq.4bit.pth" - if not (checkpoint_dir / model_file).is_file(): - raise ValueError("Please run `python quantize/gptq.py` first") - else: - model_file = "lit_model.pth" - checkpoint_path = checkpoint_dir / model_file + checkpoint_path = checkpoint_dir / "lit_model.pth" + + tokenizer = Tokenizer(checkpoint_dir) + sample = {"instruction": prompt, "input": input} + prompt = generate_prompt(sample) + encoded = tokenizer.encode(prompt, device=fabric.device) + prompt_length = encoded.size(0) + max_returned_tokens = prompt_length + max_new_tokens fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) t0 = time.perf_counter() - with fabric.init_module(empty_init=True), gptq_quantization(quantize == "gptq.int4"): + with fabric.init_module(empty_init=True): model = GPT(config) fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + with fabric.init_tensor(): + # set the max_seq_length to limit the memory usage to what we need + model.max_seq_length = max_returned_tokens + # enable the kv cache + model.set_kv_cache(batch_size=1) + model.eval() t0 = time.perf_counter() checkpoint = lazy_load(checkpoint_path) @@ -125,23 +116,10 @@ def main( model.load_state_dict(checkpoint) fabric.print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) - model.eval() merge_lora_weights(model) model = fabric.setup(model) - tokenizer = Tokenizer(checkpoint_dir) - sample = {"instruction": prompt, "input": input} - prompt = generate_prompt(sample) - encoded = tokenizer.encode(prompt, device=fabric.device) - prompt_length = encoded.size(0) - max_returned_tokens = prompt_length + max_new_tokens - - with fabric.init_tensor(): - # set the max_seq_length to limit the memory usage to what we need - model.max_seq_length = max_returned_tokens - # enable the kv cache - model.set_kv_cache(batch_size=1) - + L.seed_everything(1234) t0 = time.perf_counter() y = generate(model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id) t = time.perf_counter() - t0 @@ -157,7 +135,6 @@ def main( if __name__ == "__main__": - from jsonargparse import CLI - torch.set_float32_matmul_precision("high") + CLI(main) diff --git a/generate/lora_ui_gen.py b/generate/lora_ui_gen.py index 588e05d..8329b69 100644 --- a/generate/lora_ui_gen.py +++ 
b/generate/lora_ui_gen.py @@ -16,7 +16,7 @@ from generate.base import generate from lit_gpt import Tokenizer from lit_gpt.lora import GPT, Block, Config, merge_lora_weights -from lit_gpt.utils import check_valid_checkpoint_dir, get_default_supported_precision, gptq_quantization, lazy_load +from lit_gpt.utils import check_valid_checkpoint_dir, get_default_supported_precision, lazy_load lora_r = 16 lora_alpha = 32 @@ -120,7 +120,8 @@ def main( fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) t0 = time.perf_counter() - with fabric.init_module(empty_init=True), gptq_quantization(quantize == "gptq.int4"): + #with fabric.init_module(empty_init=True), gptq_quantization(quantize == "gptq.int4"): + with fabric.init_module(empty_init=True): model = GPT(config) fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) diff --git a/generate/sequentially.py b/generate/sequentially.py new file mode 100644 index 0000000..cd1b194 --- /dev/null +++ b/generate/sequentially.py @@ -0,0 +1,231 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +import itertools +import logging +import re +import sys +import time +from collections import OrderedDict +from functools import partial +from pathlib import Path +from typing import Literal, Optional + +import lightning as L +import torch +from lightning.fabric.accelerators import CUDAAccelerator +from lightning.fabric.plugins import BitsandbytesPrecision +from lightning.fabric.utilities.init import _materialize_meta_tensors +from typing_extensions import Type + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +import generate.base as generate_base +from lit_gpt import GPT, Config, Tokenizer +from lit_gpt.model import Block, build_mask_cache +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision + + +@torch.inference_mode() +def sequential(model: GPT, root: torch.device, max_seq_length: int, devices: int): + if model.config.n_layer % devices: + # TODO: support smarter partitioning schemes + raise NotImplementedError( + f"Only balanced partitioning is implemented: n_layer={model.config.n_layer}, devices {devices}" + ) + layers_per_rank = model.config.n_layer // devices + # dictates where each block should be instantiated + mapping = layer_to_device(model, chunk_on=Block, chunk_size=layers_per_rank) + + # materialize each block on the appropriate device + for path, target_index in mapping.items(): + submodule = model.get_submodule(path) + target_device = torch.device(root.type, target_index) + print(f"Moving {path!r} to {target_device}", file=sys.stderr) + # submodules loaded by the checkpoint will be on CPU (if no quantization). 
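The chunking that `sequential()` relies on is easy to see in isolation: blocks are assigned to device indices in contiguous groups of `chunk_size`. A toy sketch with a stand-in module (the `Tiny*` names are illustrative, and no GPUs are needed because only indices are computed); the helper mirrors `layer_to_device` defined further down in this file.

    from collections import OrderedDict

    import torch.nn as nn

    class TinyBlock(nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.linear = nn.Linear(4, 4)

    class TinyModel(nn.Module):
        def __init__(self, n_layer: int) -> None:
            super().__init__()
            self.h = nn.ModuleList(TinyBlock() for _ in range(n_layer))

    def layer_to_device(module: nn.Module, chunk_on, chunk_size: int) -> "OrderedDict[str, int]":
        # same idea as generate/sequentially.py: definition order stands in for execution order
        hits = [name for name, sub in module.named_modules() if isinstance(sub, chunk_on)]
        return OrderedDict((name, i // chunk_size) for i, name in enumerate(hits))

    model = TinyModel(n_layer=8)
    print(layer_to_device(model, chunk_on=TinyBlock, chunk_size=8 // 2))
    # h.0 through h.3 land on device index 0, h.4 through h.7 on device index 1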
move them + replace_device(submodule, replace=torch.device("cpu"), by=target_device) + # in case the checkpoint was partial, materialize leftover metas + _materialize_meta_tensors(submodule, target_device) + # and build the kv cache + submodule.attn.kv_cache = submodule.attn.build_kv_cache(1, max_seq_length, model.cos.size(-1), target_device) + # rebuild odd ends + with root: + model.max_seq_length = max_seq_length + # the rope cache which is on meta device + model.cos, model.sin = model.rope_cache() + # the mask cache which cannot be created with `set_kv_cache` because that will set it for all layers + model.mask_cache = build_mask_cache(max_seq_length) + # and everything that is not a block in the root + _materialize_meta_tensors(model, root) + replace_device(model, replace=torch.device("cpu"), by=root) + + if devices > 1: + # install hooks to move layer inputs/output between devices + for layer_num, (path, target_index) in enumerate(mapping.items()): + submodule = model.get_submodule(path) + if layer_num >= layers_per_rank: + # we need to move the block input on the boundaries between devices + # and also on every non-root device because the RoPE and mask cache is shared + # TODO: the second case could be optimized and then we would only need this hook for + # `layer_num in [layers_per_rank * i - 1 for i in range(1, devices + 1)]` + target_device = torch.device(root.type, target_index) + submodule.register_forward_pre_hook(partial(move_block_input, target_device)) + if layer_num == model.config.n_layer - 1: + submodule.register_forward_hook(partial(move_block_output, root)) + + return model + + +def layer_to_device( + module: torch.nn.Module, chunk_on: Type[torch.nn.Module], chunk_size: int +) -> "OrderedDict[str, int]": + """Create a mapping from layer (block) to device.""" + # this assumes that the definition order is the same as the execution order + hits = [name for name, submodule in module.named_modules() if isinstance(submodule, chunk_on)] + return OrderedDict((name, i // chunk_size) for i, name in enumerate(hits)) + + +def move_block_input(device: torch.device, module: torch.nn.Module, ins): + """``forward_pre_hook`` to move a Block's input before forward.""" + # during inference, none of the inputs are None: x, cos, sin, mask, input_pos + return tuple(t.to(device) for t in ins) + + +def move_block_output(device: torch.device, module: torch.nn.Module, ins, outs) -> torch.Tensor: + """``forward_hook`` to move a Block's output after forward.""" + return outs.to(device) + + +def replace_device(module: torch.nn.Module, replace: torch.device, by: torch.device) -> torch.nn.Module: + for name, submodule in module.named_modules(): + tensors = dict( + itertools.chain(submodule.named_parameters(recurse=False), submodule.named_buffers(recurse=False)) + ) + if not tensors: + continue + devices = {t.device for t in tensors.values()} + if len(devices) != 1: + # since this is using `submodule.to`, different devices in the same submodule is a problem + path_to_device = {f"{name}.{p}": t.device for p, t in tensors.items()} + raise ValueError(f"Found multiple devices: {path_to_device}") + if devices.pop() == replace: + submodule.to(by) + return module + + +@torch.inference_mode() +def main( + prompt: str = "What food do llamas eat?", + *, + num_samples: int = 1, + max_new_tokens: int = 50, + top_k: Optional[int] = 200, + temperature: float = 0.8, + checkpoint_dir: Path = Path("checkpoints/mistralai/Mistral-7B-Instruct-v0.1"), + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", 
"bnb.fp4-dq"]] = None, + precision: Optional[str] = None, + compile: bool = False, +) -> None: + """Generates text samples based on a pre-trained model and tokenizer. + + Args: + prompt: The prompt string to use for generating the samples. + num_samples: The number of text samples to generate. + max_new_tokens: The number of generation steps to take. + top_k: The number of top most probable tokens to consider in the sampling process. + temperature: A value controlling the randomness of the sampling process. Higher values result in more random + samples. + checkpoint_dir: The checkpoint directory to load. + quantize: Whether to quantize the model and using which method: + - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes + for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md + precision: Indicates the Fabric precision setting to use. + compile: Whether to compile the model. + """ + precision = precision or get_default_supported_precision(training=False) + + plugins = None + if quantize is not None: + if compile: + raise NotImplementedError # untested + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + fabric = L.Fabric(devices=1, precision=precision, accelerator="cuda", plugins=plugins) + + total_devices = CUDAAccelerator.auto_device_count() + print(f"Using {total_devices} devices", file=sys.stderr) + + check_valid_checkpoint_dir(checkpoint_dir) + + config = Config.from_json(checkpoint_dir / "lit_config.json") + + checkpoint_path = checkpoint_dir / "lit_model.pth" + + tokenizer = Tokenizer(checkpoint_dir) + encoded = tokenizer.encode(prompt, device=fabric.device) + prompt_length = encoded.size(0) + max_returned_tokens = prompt_length + max_new_tokens + + print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + t0 = time.perf_counter() + # cannot use `init_module` because if bitsandbytes is used, the Linear layers will be replaced + # which means that the weights will get quantized on cuda:0 on checkpoint load. we need to load and then convert + # still, use init_tensor for the precision + with fabric.init_tensor(), torch.device("meta"): + model = GPT(config) + print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + t0 = time.perf_counter() + state_dict = torch.load(str(checkpoint_path), mmap=True, map_location="cpu") + # TODO: this assumes that the model fits on CPU. 
Use lazy_load and make the materialization checkpoint aware + model.load_state_dict(state_dict, assign=True) + print(f"Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + model = fabric.setup_module(model, move_to_device=False) + + t0 = time.perf_counter() + model = sequential(model, fabric.device, max_returned_tokens, total_devices) + print(f"Time to sequential-ize the model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + if compile: + # TODO: raises an internal compile AssertionError caused by fabric.strategy.precision.forward_context + raise NotImplementedError + # silence developer warning on nightly builds + # https://github.com/pytorch/pytorch/blob/v2.2.0-rc5/torch/_inductor/ir.py#L4166 + pattern = re.compile(".*DeviceCopy in input program.*") + logging.getLogger("torch._inductor.utils").addFilter(lambda record: not pattern.search(record.getMessage())) + torch._dynamo.config.automatic_dynamic_shapes = True + torch._inductor.config.triton.unique_kernel_names = True + torch._inductor.config.coordinate_descent_tuning = True + # cannot use cudagraphs because it doesn't support multiple device indices + # https://github.com/pytorch/pytorch/blob/v2.2.0-rc5/torch/_inductor/compile_fx.py#L371-L375 + generate_base.next_token = torch.compile(generate_base.next_token) + + L.seed_everything(1234) + for i in range(num_samples): + t0 = time.perf_counter() + y = generate_base.generate( + model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id + ) + t = time.perf_counter() - t0 + for block in model.transformer.h: + block.attn.kv_cache.reset_parameters() + print(tokenizer.decode(y)) + tokens_generated = y.size(0) - prompt_length + print( + f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr + ) + print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + + +if __name__ == "__main__": + torch.set_float32_matmul_precision("high") + + logging.getLogger("lightning.fabric.plugins.precision.bitsandbytes").setLevel(logging.DEBUG) + + CLI(main) diff --git a/generate/tp.py b/generate/tp.py new file mode 100644 index 0000000..abd93cc --- /dev/null +++ b/generate/tp.py @@ -0,0 +1,225 @@ +"""Tensor-parallel implementation adapted from https://github.com/pytorch-labs/gpt-fast/blob/14df27/tp.py""" + +import logging +import sys +import time +from functools import partial +from pathlib import Path +from typing import Literal, Optional, Union + +import lightning as L +import torch +import torch._dynamo.config +import torch._inductor.config +from lightning.fabric.plugins import BitsandbytesPrecision +from lightning.fabric.utilities import rank_zero_only +from torch.distributed._functional_collectives import all_reduce + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +import generate.base as generate_base +from lit_gpt import GPT, Config, Tokenizer +from lit_gpt.model import CausalSelfAttention, GptNeoxMLP, LLaMAMLP, LLaMAMoE +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision + + +def tensor_parallel_linear(fabric: L.Fabric, linear: torch.nn.Linear, style: str) -> None: + world_size = fabric.world_size + dim, attr = {"colwise": (0, "out_features"), "rowwise": (1, "in_features")}[style] + size = getattr(linear, attr) + if size % world_size != 0: + raise ValueError( + f"This linear's {attr} value ({size}) is not 
evenly divisible by the world size ({world_size})" + ) + + shard = torch.tensor_split(linear.weight, world_size, dim=dim)[fabric.global_rank] + # overwrite `.data` instead of recreating the parameter for quantization (bitsandbytes) support. + # the bitsandbytes linear classes use custom `torch.nn.Parameter` subclasses + linear.weight.data = shard + setattr(linear, attr, shard.size(dim)) + + if linear.bias is not None and dim == 0: + shard = torch.tensor_split(linear.bias, world_size)[fabric.global_rank] + linear.bias = torch.nn.Parameter(shard, requires_grad=linear.bias.requires_grad) + + +def tensor_parallel_mlp(fabric: L.Fabric, mlp: Union[GptNeoxMLP, LLaMAMLP, LLaMAMoE]) -> None: + if isinstance(mlp, LLaMAMLP): + tensor_parallel_linear(fabric, mlp.fc_1, "colwise") + tensor_parallel_linear(fabric, mlp.fc_2, "colwise") + tensor_parallel_linear(fabric, mlp.proj, "rowwise") + mlp.register_forward_hook(partial(all_reduce_output, fabric.world_size)) + elif isinstance(mlp, GptNeoxMLP): + tensor_parallel_linear(fabric, mlp.fc, "colwise") + tensor_parallel_linear(fabric, mlp.proj, "rowwise") + mlp.register_forward_hook(partial(all_reduce_output, fabric.world_size)) + elif isinstance(mlp, LLaMAMoE): + # we use expert slicing across ranks, alternatively, we could create a expert parallelism group + # when the number of experts is a multiple of the world size + for expert in mlp.experts: + tensor_parallel_mlp(fabric, expert) + else: + raise NotImplementedError + + +def tensor_parallel_attn(fabric: L.Fabric, attn: CausalSelfAttention) -> None: + tensor_parallel_linear(fabric, attn.attn, "colwise") + tensor_parallel_linear(fabric, attn.proj, "rowwise") + attn.register_forward_hook(partial(all_reduce_output, fabric.world_size)) + + +def all_reduce_output(world_size: int, module: torch.nn.Module, ins, outs) -> torch.Tensor: + return all_reduce(outs, "sum", list(range(world_size))) + + +def tensor_parallel(fabric: L.Fabric, model: GPT) -> GPT: + for block in model.transformer.h: + tensor_parallel_mlp(fabric, block.mlp) + tensor_parallel_attn(fabric, block.attn) + + # update the config values to the shard sizes + # this is only relevant for `tensor_parallel_attn`, but it needs to run only once + world_size = fabric.world_size + attrs = ["n_head", "n_embd", "n_query_groups"] + for attr in attrs: + size = getattr(model.config, attr) + if size % world_size != 0: + raise ValueError(f"This {attr} value ({size}) is not evenly divisible by the world size ({world_size})") + setattr(model.config, attr, size // world_size) + + return model + + +@torch.inference_mode() +def main( + prompt: str = "What food do llamas eat?", + *, + num_samples: int = 1, + max_new_tokens: int = 50, + top_k: Optional[int] = 200, + temperature: float = 0.8, + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq"]] = None, + precision: Optional[str] = None, + compile: bool = False, +) -> None: + """Generates text samples based on a pre-trained model and tokenizer. + + Args: + prompt: The prompt string to use for generating the samples. + num_samples: The number of text samples to generate. + max_new_tokens: The number of generation steps to take. + top_k: The number of top most probable tokens to consider in the sampling process. + temperature: A value controlling the randomness of the sampling process. Higher values result in more random + samples. + checkpoint_dir: The checkpoint directory to load. 
+ quantize: Whether to quantize the model and using which method: + - bnb.nf4, bnb.nf4-dq, bnb.fp4, bnb.fp4-dq: 4-bit quantization from bitsandbytes + for more details, see https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md + precision: Indicates the Fabric precision setting to use. + compile: Whether to compile the model. + """ + precision = precision or get_default_supported_precision(training=False) + + plugins = None + if quantize is not None: + if compile: + raise NotImplementedError # untested + if "mixed" in precision: + raise ValueError("Quantization and mixed precision is not supported.") + dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision] + plugins = BitsandbytesPrecision(quantize[4:], dtype) + precision = None + + # set "ddp" as the strategy for the launching functionality, but there's no data-parallelism + fabric = L.Fabric(devices="auto", strategy="ddp", precision=precision, plugins=plugins) + fabric.launch() + + check_valid_checkpoint_dir(checkpoint_dir) + + config = Config.from_json(checkpoint_dir / "lit_config.json") + + model_file = "lit_model.pth" + checkpoint_path = checkpoint_dir / model_file + + tokenizer = Tokenizer(checkpoint_dir) + encoded = tokenizer.encode(prompt, device=fabric.device) + prompt_length = encoded.size(0) + max_returned_tokens = prompt_length + max_new_tokens + + fabric.print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}", file=sys.stderr) + t0 = time.perf_counter() + # cannot use `init_module` because if bitsandbytes is used, the Linear layers will be replaced + # which means that the weights will get quantized on cuda:0 on checkpoint load. we need to load and then convert + # still, use init_tensor for the precision + with fabric.init_tensor(), torch.device("meta"): + model = GPT(config) + fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + # sequentially do: load the checkpoint on CPU -> quantize -> apply tp -> move to device + # so that the CPU RAM doesn't OOM with larger models + for rank in range(fabric.world_size): + if fabric.global_rank == rank: + t0 = time.perf_counter() + state_dict = torch.load(str(checkpoint_path), mmap=True, map_location="cpu") + model.load_state_dict(state_dict, assign=True) + print(f"[{rank}] Time to load the model weights: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + + # cannot use `.setup_module` because it will wrap with DDP + model = fabric._precision.convert_module(model) + + t0 = time.perf_counter() + model = tensor_parallel(fabric, model) + print( + f"[{rank}] Time to tensor-parallelize the model: {time.perf_counter() - t0:.02f} seconds.", + file=sys.stderr, + ) + + with fabric.init_tensor(): + # set the max_seq_length to limit the memory usage to what we need + model.max_seq_length = max_returned_tokens + # the rope cache which is on meta device + model.cos, model.sin = model.rope_cache() + # enable the kv cache + model.set_kv_cache(batch_size=1) + model.eval() + + t0 = time.perf_counter() + model = fabric.to_device(model) + print(f"[{rank}] Time to move the model: {time.perf_counter() - t0:.02f} seconds.", file=sys.stderr) + fabric.barrier() + + if compile: + torch._dynamo.config.automatic_dynamic_shapes = True + torch._inductor.config.triton.unique_kernel_names = True + torch._inductor.config.coordinate_descent_tuning = True + generate_base.next_token = torch.compile(generate_base.next_token, mode="reduce-overhead") + + L.seed_everything(1234) + for 
i in range(num_samples): + t0 = time.perf_counter() + y = generate_base.generate( + model, encoded, max_returned_tokens, temperature=temperature, top_k=top_k, eos_id=tokenizer.eos_id + ) + t = time.perf_counter() - t0 + for block in model.transformer.h: + block.attn.kv_cache.reset_parameters() + fabric.print(tokenizer.decode(y)) + tokens_generated = y.size(0) - prompt_length + fabric.print( + f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_generated / t:.02f} tokens/sec", file=sys.stderr + ) + if fabric.device.type == "cuda": + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB", file=sys.stderr) + + +if __name__ == "__main__": + torch.set_float32_matmul_precision("high") + + bnb_logger = logging.getLogger("lightning.fabric.plugins.precision.bitsandbytes") + bnb_logger.setLevel(logging.DEBUG) + bnb_logger.debug = rank_zero_only(bnb_logger.debug) + + CLI(main) diff --git a/lit_gpt/__init__.py b/lit_gpt/__init__.py index 264ac94..856e7cd 100644 --- a/lit_gpt/__init__.py +++ b/lit_gpt/__init__.py @@ -1,15 +1,27 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +import re +import logging + from lit_gpt.model import GPT from lit_gpt.config import Config from lit_gpt.tokenizer import Tokenizer from lightning_utilities.core.imports import RequirementCache -_LIGHTNING_AVAILABLE = RequirementCache("lightning>=2.1.0.dev0") +_LIGHTNING_AVAILABLE = RequirementCache("lightning>=2.2.0.dev0") if not bool(_LIGHTNING_AVAILABLE): raise ImportError( - "Lit-GPT requires lightning==2.1. Please run:\n" + "Lit-GPT requires lightning nightly. Please run:\n" f" pip uninstall -y lightning; pip install -r requirements.txt\n{str(_LIGHTNING_AVAILABLE)}" ) +# Suppress excessive warnings, see https://github.com/pytorch/pytorch/issues/111632 +pattern = re.compile(".*Profiler function .* will be ignored") +logging.getLogger("torch._dynamo.variables.torch").addFilter(lambda record: not pattern.search(record.getMessage())) + +# Avoid printing state-dict profiling output at the WARNING level when saving a checkpoint +logging.getLogger("torch.distributed.fsdp._optim_utils").disabled = True +logging.getLogger("torch.distributed.fsdp._debug_utils").disabled = True __all__ = ["GPT", "Config", "Tokenizer"] diff --git a/lit_gpt/adapter.py b/lit_gpt/adapter.py index a99f85f..044b75d 100644 --- a/lit_gpt/adapter.py +++ b/lit_gpt/adapter.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + """Implementation of the paper: LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention @@ -5,6 +7,7 @@ Port for Lit-GPT """ + from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union diff --git a/lit_gpt/adapter_v2.py b/lit_gpt/adapter_v2.py index e9e4c69..51b826a 100644 --- a/lit_gpt/adapter_v2.py +++ b/lit_gpt/adapter_v2.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
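Referring back to generate/tp.py above: the column-wise/row-wise split in `tensor_parallel_mlp` works because a row-wise-sharded projection applied to a column-wise-sharded hidden state yields per-rank partial results that only need to be summed, which is what the `all_reduce_output` hook does. A single-process sketch that emulates `world_size` ranks without `torch.distributed` (all sizes below are illustrative):

    import torch

    torch.manual_seed(0)
    world_size, d_in, d_hidden, d_out = 2, 8, 16, 8

    fc = torch.nn.Linear(d_in, d_hidden, bias=False)     # sharded column-wise (dim 0 of weight)
    proj = torch.nn.Linear(d_hidden, d_out, bias=False)  # sharded row-wise (dim 1 of weight)
    x = torch.randn(3, d_in)

    reference = proj(fc(x))

    # each "rank" owns a slice of fc's output features and the matching slice of proj's
    # input features, so its partial output only needs a final sum (the all-reduce)
    fc_shards = torch.tensor_split(fc.weight, world_size, dim=0)
    proj_shards = torch.tensor_split(proj.weight, world_size, dim=1)
    partials = [(x @ w_fc.T) @ w_proj.T for w_fc, w_proj in zip(fc_shards, proj_shards)]
    reconstructed = torch.stack(partials).sum(0)

    print(torch.allclose(reference, reconstructed, atol=1e-5))  # True

In the actual script each rank keeps only its own weight slices, so per-GPU weight memory shrinks by roughly 1 / world_size.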
+ """Implementation of the paper: LLaMA-Adapter V2: Parameter-Efficient Visual Instruction Model @@ -5,6 +7,7 @@ Port for Lit-GPT """ + from dataclasses import dataclass from typing import Any, Dict, Optional, Tuple, Type @@ -89,7 +92,7 @@ def _init_weights(self, module: nn.Module) -> None: def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: """For compatibility with base checkpoints.""" - mapping = {"lm_head.weight": "lm_head.linear.weight"} + mapping = {"lm_head.weight": "lm_head.linear.weight", "lm_head.bias": "lm_head.linear.bias"} state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) @@ -120,7 +123,8 @@ def __init__(self, config: Config, block_idx: int) -> None: # key, query, value projections for all heads, but in a batch self.attn = AdapterV2Linear(in_features=config.n_embd, out_features=shape, bias=config.bias) # output projection - self.proj = AdapterV2Linear(config.n_embd, config.n_embd, bias=config.bias) + # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head` + self.proj = AdapterV2Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias) # disabled by default self.kv_cache: Optional[KVCache] = None @@ -191,6 +195,29 @@ def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwa super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) +class GemmaMLP(LLaMAMLP): + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_fc_1 = self.fc_1(x) + x_fc_2 = self.fc_2(x) + x = torch.nn.functional.gelu(x_fc_1) * x_fc_2 + return self.proj(x) + + +class LLaMAMoE(lit_gpt.model.LLaMAMoE): + def __init__(self, config: Config) -> None: + nn.Module.__init__(self) + self.gate = AdapterV2Linear(config.n_embd, config.n_expert, bias=False) + self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert)) + + self.config = config + + def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + """For compatibility with base checkpoints.""" + mapping = {"gate.weight": "gate.linear.weight"} + state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) + super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + def mark_only_adapter_v2_as_trainable(model: GPT) -> None: """Sets requires_grad=False for all non-adapter weights""" for name, param in model.named_parameters(): diff --git a/lit_gpt/args.py b/lit_gpt/args.py new file mode 100644 index 0000000..6221707 --- /dev/null +++ b/lit_gpt/args.py @@ -0,0 +1,81 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + + +@dataclass +class TrainArgs: + """Training related arguments""" + + save_interval: int = 1000 + """Number of optimizer steps between checkpoints""" + log_interval: int = 1 + """Number of iterations between logging calls""" + global_batch_size: int = 64 + """Number of samples between optimizer steps across data-parallel ranks""" + micro_batch_size: int = 4 + """Number of samples per data-parallel rank""" + lr_warmup_steps: int = 100 + """Number of iterations with learning rate warmup active""" + epochs: Optional[int] = None + """Number of epochs to run""" + epoch_size: Optional[int] = None + """Size of the epoch""" + # TODO: pretrain/tinyllama is the only script using `max_tokens` explicitly. replace it with epoch_size*epochs? 
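The helpers defined on `TrainArgs` below tie these knobs together: the per-rank batch size is `global_batch_size // devices`, and gradient accumulation makes up the difference to `micro_batch_size`. A tiny worked example (the numbers are illustrative, not defaults of any particular script):

    devices = 4
    global_batch_size = 64  # samples per optimizer step, across all ranks
    micro_batch_size = 4    # samples per forward/backward pass on one rank

    batch_size = global_batch_size // devices                     # 16 samples per rank per step
    gradient_accumulation_iters = batch_size // micro_batch_size  # 4 micro-batches per step

    assert batch_size * devices == global_batch_size
    assert micro_batch_size * gradient_accumulation_iters * devices == global_batch_size
    print(batch_size, gradient_accumulation_iters)  # 16 4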
+ max_tokens: Optional[int] = None + """Total number of tokens to train on""" + max_seq_length: Optional[int] = None + """Limits the length of samples. Off by default""" + + # Optimization args + learning_rate: float = 1e-3 + weight_decay: float = 0.02 + beta1: float = 0.9 + beta2: float = 0.95 + max_norm: Optional[float] = None + min_lr: float = 6e-5 + + def max_iters(self, devices: int) -> int: + """Number of iterations""" + max_iters = self.epochs * self.epoch_size // devices // self.micro_batch_size + assert max_iters > 0 + return max_iters + + def gradient_accumulation_iters(self, devices: int) -> int: + """Number of iterations between gradient synchronizations""" + gradient_accumulation_iters = self.batch_size(devices) // self.micro_batch_size + assert gradient_accumulation_iters > 0 + return gradient_accumulation_iters + + def batch_size(self, devices: int) -> int: + """Number of samples between optimizer steps per data-parallel rank""" + batch_size = self.global_batch_size // devices + assert batch_size > 0 + return batch_size + + +@dataclass +class EvalArgs: + """Evaluation related arguments""" + + interval: int = 600 + """Number of optimizer steps between evaluation calls""" + max_new_tokens: Optional[int] = None + """Number of tokens to generate""" + max_iters: int = 100 + """Number of iterations""" + + +@dataclass +class IOArgs: + """Inputs and outputs related arguments""" + + # Optional because pretrain/tinyllama hardcodes the path + train_data_dir: Optional[Path] = Path("data/alpaca") + """Where to read training data from""" + val_data_dir: Optional[Path] = None + """Where to read validation data from""" + checkpoint_dir: Optional[Path] = None + """Where to read weights and tokenizer data from""" + out_dir: Path = Path("out/adapter/alpaca") + """Where to save artifacts""" diff --git a/lit_gpt/config.py b/lit_gpt/config.py index 43c4dcc..7778d70 100644 --- a/lit_gpt/config.py +++ b/lit_gpt/config.py @@ -1,5 +1,8 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import json -from dataclasses import dataclass +from copy import deepcopy +from dataclasses import dataclass, field from pathlib import Path from typing import Any, Literal, Optional, Type, Union @@ -12,14 +15,16 @@ @dataclass class Config: - org: str = "Lightning-AI" - name: str = "lit-GPT" + name: str = "" + hf_config: dict = field(default_factory=dict) + scale_embeddings: bool = False block_size: int = 4096 vocab_size: int = 50254 padding_multiple: int = 512 padded_vocab_size: Optional[int] = None n_layer: int = 16 n_head: int = 32 + head_size: Optional[int] = None n_embd: int = 4096 rotary_percentage: float = 0.25 parallel_residual: bool = True @@ -49,15 +54,21 @@ class Config: shared_attention_norm: bool = False _norm_class: Literal["LayerNorm", "RMSNorm"] = "LayerNorm" norm_eps: float = 1e-5 - _mlp_class: Literal["GptNeoxMLP", "LLaMAMLP"] = "GptNeoxMLP" + _mlp_class: Literal["GptNeoxMLP", "LLaMAMLP", "GemmaMLP", "LLaMAMoE"] = "GptNeoxMLP" gelu_approximate: str = "none" intermediate_size: Optional[int] = None rope_condense_ratio: int = 1 rope_base: int = 10000 + n_expert: int = 0 + n_expert_per_token: int = 0 def __post_init__(self): - assert self.n_embd % self.n_head == 0 - self.head_size = self.n_embd // self.n_head + if not self.name: + self.name = self.hf_config.get("name", self.name) + + if self.head_size is None: + assert self.n_embd % self.n_head == 0 + self.head_size = self.n_embd // self.n_head # vocab size should be a power of 2 to be optimal on hardware. 
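Two patterns introduced in this config rework are worth seeing end to end: `head_size` is now an explicit field that defaults to `n_embd // n_head`, and the lookup (`from_name`, just below) also matches on the nested `hf_config` name. The following is a self-contained stand-in, not the real `lit_gpt.Config`; the `MiniConfig` name and the reduced field set are assumptions for illustration.

    from dataclasses import dataclass, field
    from typing import Any, Optional

    @dataclass
    class MiniConfig:
        name: str = ""
        hf_config: dict = field(default_factory=dict)
        n_head: int = 32
        n_embd: int = 4096
        head_size: Optional[int] = None

        def __post_init__(self) -> None:
            if not self.name:
                self.name = self.hf_config.get("name", self.name)
            if self.head_size is None:
                assert self.n_embd % self.n_head == 0
                self.head_size = self.n_embd // self.n_head

        @classmethod
        def from_name(cls, name: str, registry: list, **kwargs: Any) -> "MiniConfig":
            # fall back to the hf_config name, mirroring the lookup defined below
            for entry in registry:
                if name in (entry.get("name"), entry["hf_config"]["name"]):
                    return cls(**{**entry, **kwargs})
            raise ValueError(f"{name!r} is not a supported config name")

    registry = [dict(name="pythia-70m", hf_config=dict(org="EleutherAI", name="pythia-70m"), n_embd=512, n_head=8)]
    cfg = MiniConfig.from_name("pythia-70m", registry)
    print(cfg.head_size)  # 64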
compute the closest value if self.padded_vocab_size is None: @@ -82,7 +93,16 @@ def __post_init__(self): @classmethod def from_name(cls, name: str, **kwargs: Any) -> Self: - conf_dict = name_to_config[name].copy() + if name not in name_to_config: + # search through all `config['hf_config']['name']` + try: + conf_dict = next(config for config in configs if name == config["hf_config"]["name"]) + except StopIteration: + raise ValueError(f"{name!r} is not a supported config name") + else: + conf_dict = name_to_config[name] + + conf_dict = conf_dict.copy() if "condense_ratio" in kwargs: # legacy name kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio") conf_dict.update(kwargs) @@ -96,9 +116,22 @@ def from_json(cls, path: Union[str, Path], **kwargs: Any) -> Self: json_kwargs["rope_condense_ratio"] = json_kwargs.pop("condense_ratio") if "condense_ratio" in kwargs: # legacy name kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio") + if "org" in json_kwargs: # legacy name + json_kwargs["hf_config"] = {"name": json_kwargs["name"], "org": json_kwargs.pop("org")} + if "org" in kwargs: # legacy name + kwargs["hf_config"] = {"name": kwargs.get("name", json_kwargs["name"]), "org": kwargs.pop("org")} json_kwargs.update(kwargs) return cls(**json_kwargs) + @classmethod + def from_checkpoint(cls, path: Path, **kwargs: Any) -> Self: + """Automatically load `lit_config.json` and if it doesn't exist - a matching config from `lit_gpt/config.py`.""" + if (config_path := path / "lit_config.json").is_file(): + return cls.from_json(config_path, **kwargs) + if (model_name := path.name) in name_to_config: + return cls.from_name(model_name, **kwargs) + raise FileNotFoundError(f"For {str(path)!r} neither 'lit_config.json' nor matching config exists.") + @property def mlp_class(self) -> Type: # `self._mlp_class` cannot be the type to keep the config json serializable @@ -108,9 +141,11 @@ def mlp_class(self) -> Type: def norm_class(self) -> Type: # `self._norm_class` cannot be the type to keep the config json serializable if self._norm_class == "RMSNorm": + from functools import partial + from lit_gpt.rmsnorm import RMSNorm - return RMSNorm + return partial(RMSNorm, add_unit_offset="Gemma" in self.name) return getattr(torch.nn, self._norm_class) @@ -119,57 +154,254 @@ def norm_class(self) -> Type: ######################## configs = [ # https://huggingface.co/stabilityai/stablelm-base-alpha-3b/blob/main/config.json - dict(org="stabilityai", name="stablelm-base-alpha-3b"), + dict(name="stablelm-base-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-base-alpha-3b")), # https://huggingface.co/stabilityai/stablelm-base-alpha-7b/blob/main/config.json - dict(org="stabilityai", name="stablelm-base-alpha-7b", n_head=48, n_embd=6144, padding_multiple=256), + dict( + name="stablelm-base-alpha-7b", + hf_config=dict(org="stabilityai", name="stablelm-base-alpha-7b"), + n_head=48, + n_embd=6144, + padding_multiple=256, + ), # https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b/blob/main/config.json - dict(org="stabilityai", name="stablelm-tuned-alpha-3b", n_head=32), + dict(name="stablelm-tuned-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"), n_head=32), # https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b/blob/main/config.json - dict(org="stabilityai", name="stablelm-tuned-alpha-7b", n_head=48, n_embd=6144, padding_multiple=256), + dict( + name="stablelm-tuned-alpha-7b", + hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-7b"), + n_head=48, + n_embd=6144, 
+ padding_multiple=256, + ), + # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json + dict( + name="stablelm-3b-4e1t", + hf_config=dict(org="stabilityai", name="stablelm-3b-4e1t"), + padded_vocab_size=50304, + n_layer=32, + n_head=32, + n_embd=2560, + parallel_residual=False, + bias=False, + _mlp_class="LLaMAMLP", + intermediate_size=6912, + ), + # https://huggingface.co/stabilityai/stablelm-zephyr-3b/blob/main/config.json + dict( + name="stablelm-zephyr-3b", + hf_config=dict(org="stabilityai", name="stablelm-zephyr-3b"), + padded_vocab_size=50304, + n_layer=32, + n_head=32, + n_embd=2560, + parallel_residual=False, + bias=False, + _mlp_class="LLaMAMLP", + intermediate_size=6912, + ), +] + + +########################## +# Stability AI StableCode +########################## +stablecode = [ + # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json + dict( + name="stablecode-completion-alpha-3b", + hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"), + block_size=16384, + vocab_size=49152, + n_layer=32, + n_embd=2560, + ), + # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json + dict( + name="stablecode-completion-alpha-3b-4k", + hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"), + vocab_size=49152, + n_layer=32, + n_embd=2560, + ), + # https://huggingface.co/stabilityai/stablecode-instruct-alpha-3b/blob/main/config.json + dict( + name="stablecode-instruct-alpha-3b", + hf_config=dict(org="stabilityai", name="stablecode-instruct-alpha-3b"), + vocab_size=49152, + n_layer=32, + n_embd=2560, + ), + # https://huggingface.co/stabilityai/stable-code-3b/blob/main/config.json + dict( + name="stable-code-3b", + hf_config=dict(org="stabilityai", name="stable-code-3b"), + padded_vocab_size=50304, + n_layer=32, + n_embd=2560, + block_size=16384, + parallel_residual=False, + bias=False, + _mlp_class="LLaMAMLP", + intermediate_size=6912, + ), ] +configs.extend(stablecode) + #################### # EleutherAI Pythia #################### pythia = [ + # https://huggingface.co/EleutherAI/pythia-14m/blob/main/config.json + dict( + name="pythia-14m", + hf_config=dict(org="EleutherAI", name="pythia-14m"), + block_size=512, + n_layer=6, + n_embd=128, + n_head=4, + padding_multiple=128, + ), + # https://huggingface.co/EleutherAI/pythia-31m/blob/main/config.json + dict( + name="pythia-31m", + hf_config=dict(org="EleutherAI", name="pythia-31m"), + block_size=1024, + n_layer=6, + n_embd=256, + n_head=8, + padding_multiple=128, + ), # https://huggingface.co/EleutherAI/pythia-70m/blob/main/config.json - dict(org="EleutherAI", name="pythia-70m", block_size=2048, n_layer=6, n_embd=512, n_head=8, padding_multiple=128), + dict( + name="pythia-70m", + hf_config=dict(org="EleutherAI", name="pythia-70m"), + block_size=2048, + n_layer=6, + n_embd=512, + n_head=8, + padding_multiple=128, + ), # https://huggingface.co/EleutherAI/pythia-160m/blob/main/config.json dict( - org="EleutherAI", name="pythia-160m", block_size=2048, n_layer=12, n_embd=768, n_head=12, padding_multiple=128 + name="pythia-160m", + hf_config=dict(org="EleutherAI", name="pythia-160m"), + block_size=2048, + n_layer=12, + n_embd=768, + n_head=12, + padding_multiple=128, ), # https://huggingface.co/EleutherAI/pythia-410m/blob/main/config.json dict( - org="EleutherAI", name="pythia-410m", block_size=2048, n_layer=24, n_embd=1024, n_head=16, padding_multiple=128 + name="pythia-410m", + hf_config=dict(org="EleutherAI", 
name="pythia-410m"), + block_size=2048, + n_layer=24, + n_embd=1024, + n_head=16, + padding_multiple=128, ), # https://huggingface.co/EleutherAI/pythia-1b/blob/main/config.json - dict(org="EleutherAI", name="pythia-1b", block_size=2048, n_embd=2048, n_head=8, padding_multiple=128), + dict( + name="pythia-1b", + hf_config=dict(org="EleutherAI", name="pythia-1b"), + block_size=2048, + n_embd=2048, + n_head=8, + padding_multiple=128, + ), # https://huggingface.co/EleutherAI/pythia-1.4b/blob/main/config.json dict( - org="EleutherAI", name="pythia-1.4b", block_size=2048, n_layer=24, n_embd=2048, n_head=16, padding_multiple=128 + name="pythia-1.4b", + hf_config=dict(org="EleutherAI", name="pythia-1.4b"), + block_size=2048, + n_layer=24, + n_embd=2048, + n_head=16, + padding_multiple=128, ), # https://huggingface.co/EleutherAI/pythia-2.8b/blob/main/config.json - dict(org="EleutherAI", name="pythia-2.8b", block_size=2048, n_layer=32, n_embd=2560, padding_multiple=128), + dict( + name="pythia-2.8b", + hf_config=dict(org="EleutherAI", name="pythia-2.8b"), + block_size=2048, + n_layer=32, + n_embd=2560, + padding_multiple=128, + ), # https://huggingface.co/EleutherAI/pythia-6.9b/blob/main/config.json - dict(org="EleutherAI", name="pythia-6.9b", block_size=2048, n_layer=32, padding_multiple=256), + dict( + name="pythia-6.9b", + hf_config=dict(org="EleutherAI", name="pythia-6.9b"), + block_size=2048, + n_layer=32, + padding_multiple=256, + ), # https://huggingface.co/EleutherAI/pythia-12b/blob/main/config.json - dict(org="EleutherAI", name="pythia-12b", block_size=2048, n_layer=36, n_embd=5120, n_head=40), + dict( + name="pythia-12b", + hf_config=dict(org="EleutherAI", name="pythia-12b"), + block_size=2048, + n_layer=36, + n_embd=5120, + n_head=40, + ), ] configs.extend(pythia) for c in pythia: - copy = c.copy() + # "pythia-14m" and "pythia-31m" don't have deduped version + if c["name"] in ("pythia-14m", "pythia-31m"): + continue + copy = deepcopy(c) copy["name"] = f"{c['name']}-deduped" + copy["hf_config"]["name"] = f"{c['hf_config']['name']}-deduped" configs.append(copy) +################### +# databricks Dolly +################### +dolly = [ + # https://huggingface.co/databricks/dolly-v2-3b/blob/main/config.json + dict( + name="dolly-v2-3b", + hf_config=dict(org="databricks", name="dolly-v2-3b"), + block_size=2048, + n_layer=32, + n_embd=2560, + padded_vocab_size=50280, + ), + # https://huggingface.co/databricks/dolly-v2-7b/blob/main/config.json + dict( + name="dolly-v2-7b", + hf_config=dict(org="databricks", name="dolly-v2-7b"), + block_size=2048, + n_layer=32, + padded_vocab_size=50280, + ), + # https://huggingface.co/databricks/dolly-v2-12b/blob/main/config.json + dict( + name="dolly-v2-12b", + hf_config=dict(org="databricks", name="dolly-v2-12b"), + block_size=2048, + n_layer=36, + n_embd=5120, + n_head=40, + padded_vocab_size=50280, + ), +] +configs.extend(dolly) + + #################################### # togethercomputer RedPajama INCITE #################################### redpajama_incite = [ # https://huggingface.co/togethercomputer/RedPajama-INCITE-Base-3B-v1/blob/main/config.json dict( - org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1", + hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1"), block_size=2048, n_layer=32, n_embd=2560, @@ -179,8 +411,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Base/blob/main/config.json dict( - org="togethercomputer", name="RedPajama-INCITE-7B-{}", + 
hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-7B-{}"), block_size=2048, n_layer=32, padding_multiple=256, @@ -189,8 +421,8 @@ def norm_class(self) -> Type: ), # this redirects to the checkpoint above. kept for those who had the old weights already downloaded dict( - org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1", + hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1"), block_size=2048, n_layer=32, padding_multiple=256, @@ -200,8 +432,9 @@ def norm_class(self) -> Type: ] for c in redpajama_incite: for kind in ("Base", "Chat", "Instruct"): - copy = c.copy() + copy = deepcopy(c) copy["name"] = c["name"].format(kind) + copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) configs.append(copy) @@ -211,8 +444,8 @@ def norm_class(self) -> Type: falcon = [ # https://huggingface.co/tiiuae/falcon-7b/blob/main/config.json dict( - org="tiiuae", name="falcon-7b{}", + hf_config=dict(org="tiiuae", name="falcon-7b{}"), block_size=2048, vocab_size=65024, padded_vocab_size=65024, @@ -227,8 +460,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/tiiuae/falcon-40b/blob/main/config.json dict( - org="tiiuae", name="falcon-40b{}", + hf_config=dict(org="tiiuae", name="falcon-40b{}"), block_size=2048, vocab_size=65024, padded_vocab_size=65024, @@ -242,14 +475,15 @@ def norm_class(self) -> Type: ] for c in falcon: for kind in ("", "-instruct"): - copy = c.copy() + copy = deepcopy(c) copy["name"] = c["name"].format(kind) + copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) configs.append(copy) # https://huggingface.co/tiiuae/falcon-180b/blob/main/config.json falcon180b = dict( - org="tiiuae", name="falcon-180B{}", + hf_config=dict(org="tiiuae", name="falcon-180B{}"), block_size=2048, vocab_size=65024, padded_vocab_size=65024, @@ -262,8 +496,9 @@ def norm_class(self) -> Type: ) for kind in ("", "-chat"): - copy = falcon180b.copy() + copy = deepcopy(falcon180b) copy["name"] = falcon180b["name"].format(kind) + copy["hf_config"]["name"] = falcon180b["hf_config"]["name"].format(kind) configs.append(copy) @@ -273,8 +508,8 @@ def norm_class(self) -> Type: open_LLaMA = [ # https://huggingface.co/openlm-research/open_llama_3b/blob/main/config.json dict( - org="openlm-research", name="open_llama_3b", + hf_config=dict(org="openlm-research", name="open_llama_3b"), block_size=2048, vocab_size=32000, padding_multiple=64, @@ -290,8 +525,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/openlm-research/open_llama_7b/blob/main/config.json dict( - org="openlm-research", name="open_llama_7b", + hf_config=dict(org="openlm-research", name="open_llama_7b"), block_size=2048, vocab_size=32000, padding_multiple=64, @@ -306,8 +541,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/openlm-research/open_llama_13b/blob/main/config.json dict( - org="openlm-research", name="open_llama_13b", + hf_config=dict(org="openlm-research", name="open_llama_13b"), block_size=2048, vocab_size=32000, padding_multiple=64, @@ -332,8 +567,8 @@ def norm_class(self) -> Type: vicuna = [ # https://huggingface.co/lmsys/vicuna-7b-v1.3/blob/main/config.json dict( - org="lmsys", name="vicuna-7b-v1.3", + hf_config=dict(org="lmsys", name="vicuna-7b-v1.3"), block_size=2048, vocab_size=32000, padding_multiple=64, @@ -348,8 +583,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/lmsys/vicuna-13b-v1.3/blob/main/config.json dict( - org="lmsys", name="vicuna-13b-v1.3", + hf_config=dict(org="lmsys", name="vicuna-13b-v1.3"), block_size=2048, vocab_size=32000, 
padding_multiple=64, @@ -366,8 +601,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/lmsys/vicuna-33b-v1.3/blob/main/config.json dict( - org="lmsys", name="vicuna-33b-v1.3", + hf_config=dict(org="lmsys", name="vicuna-33b-v1.3"), block_size=2048, vocab_size=32000, padding_multiple=64, @@ -384,8 +619,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/lmsys/vicuna-7b-v1.5/blob/main/config.json dict( - org="lmsys", name="vicuna-7b-v1.5", + hf_config=dict(org="lmsys", name="vicuna-7b-v1.5"), vocab_size=32000, padding_multiple=64, n_layer=32, @@ -398,8 +633,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/lmsys/vicuna-7b-v1.5-16k/blob/main/config.json dict( - org="lmsys", name="vicuna-7b-v1.5-16k", + hf_config=dict(org="lmsys", name="vicuna-7b-v1.5-16k"), block_size=16384, vocab_size=32000, padding_multiple=64, @@ -414,8 +649,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/lmsys/vicuna-13b-v1.5/blob/main/config.json dict( - org="lmsys", name="vicuna-13b-v1.5", + hf_config=dict(org="lmsys", name="vicuna-13b-v1.5"), vocab_size=32000, padding_multiple=64, n_layer=40, @@ -430,8 +665,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/lmsys/vicuna-13b-v1.5-16k/blob/main/config.json dict( - org="lmsys", name="vicuna-13b-v1.5-16k", + hf_config=dict(org="lmsys", name="vicuna-13b-v1.5-16k"), block_size=16384, vocab_size=32000, padding_multiple=64, @@ -456,8 +691,8 @@ def norm_class(self) -> Type: long_chat = [ # https://huggingface.co/lmsys/longchat-7b-16k/blob/main/config.json dict( - org="lmsys", name="longchat-7b-16k", + hf_config=dict(org="lmsys", name="longchat-7b-16k"), block_size=16384, vocab_size=32000, padding_multiple=64, @@ -473,8 +708,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/lmsys/longchat-13b-16k/blob/main/config.json dict( - org="lmsys", name="longchat-13b-16k", + hf_config=dict(org="lmsys", name="longchat-13b-16k"), block_size=16384, vocab_size=32000, padding_multiple=64, @@ -500,8 +735,8 @@ def norm_class(self) -> Type: nous_research = [ # https://huggingface.co/NousResearch/Nous-Hermes-llama-2-7b/blob/main/config.json dict( - org="NousResearch", name="Nous-Hermes-llama-2-7b", + hf_config=dict(org="NousResearch", name="Nous-Hermes-llama-2-7b"), padded_vocab_size=32000, n_layer=32, rotary_percentage=1.0, @@ -514,8 +749,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/NousResearch/Nous-Hermes-13B/blob/main/config.json dict( - org="NousResearch", name="Nous-Hermes-13b", + hf_config=dict(org="NousResearch", name="Nous-Hermes-13b"), block_size=2048, vocab_size=32000, padded_vocab_size=32001, @@ -532,8 +767,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/NousResearch/Nous-Hermes-Llama2-13b dict( - org="NousResearch", name="Nous-Hermes-Llama2-13b", + hf_config=dict(org="NousResearch", name="Nous-Hermes-Llama2-13b"), vocab_size=32000, padded_vocab_size=32032, n_layer=40, @@ -557,8 +792,8 @@ def norm_class(self) -> Type: llama_2 = [ # https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/main/config.json dict( - org="meta-llama", name="Llama-2-7b{}-hf", + hf_config=dict(org="meta-llama", name="Llama-2-7b{}-hf"), vocab_size=32000, padding_multiple=64, n_layer=32, @@ -571,8 +806,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/meta-llama/Llama-2-13b-hf/blob/main/config.json dict( - org="meta-llama", name="Llama-2-13b{}-hf", + hf_config=dict(org="meta-llama", name="Llama-2-13b{}-hf"), vocab_size=32000, padding_multiple=64, n_layer=40, @@ -587,8 +822,8 @@ def 
norm_class(self) -> Type: ), # https://huggingface.co/meta-llama/Llama-2-70b-hf/blob/main/config.json dict( - org="meta-llama", name="Llama-2-70b{}-hf", + hf_config=dict(org="meta-llama", name="Llama-2-70b{}-hf"), vocab_size=32000, padding_multiple=64, n_layer=80, @@ -605,19 +840,71 @@ def norm_class(self) -> Type: ] for c in llama_2: for kind in ("", "-chat"): - copy = c.copy() + copy = deepcopy(c) copy["name"] = c["name"].format(kind) + copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) configs.append(copy) +############### +# Google Gemma +############### +gemma = [ + # https://huggingface.co/google/gemma-2b/blob/main/config.json + dict( + name="Gemma-2b", + hf_config=dict(org="google", name="gemma-2b"), + scale_embeddings=True, + vocab_size=256000, + padding_multiple=64, + n_embd=2048, + n_layer=18, + n_head=8, + n_query_groups=1, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="GemmaMLP", + gelu_approximate="tanh", + intermediate_size=16384, + ), + # https://huggingface.co/google/gemma-7b/blob/main/config.json + dict( + name="Gemma-7b", + hf_config=dict(org="google", name="gemma-7b"), + scale_embeddings=True, + vocab_size=256000, + padding_multiple=64, + n_embd=3072, + n_layer=28, + n_head=16, + head_size=256, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="GemmaMLP", + gelu_approximate="tanh", + intermediate_size=24576, + ), +] +configs.extend(gemma) +for c in gemma: + copy = deepcopy(c) + copy["name"] = f"{c['name']}-it" + copy["hf_config"]["name"] = f"{c['hf_config']['name']}-it" + configs.append(copy) + + ########################## # Stability AI FreeWilly2 ########################## freewilly_2 = [ # https://huggingface.co/stabilityai/FreeWilly2/blob/main/config.json dict( - org="stabilityai", name="FreeWilly2", + hf_config=dict(org="stabilityai", name="FreeWilly2"), vocab_size=32000, padding_multiple=64, n_layer=80, @@ -641,8 +928,8 @@ def norm_class(self) -> Type: code_llama = [ # https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json dict( - org="codellama", name="CodeLlama-7b-hf", + hf_config=dict(org="codellama", name="CodeLlama-7b-hf"), block_size=16384, vocab_size=32016, padding_multiple=16, @@ -658,8 +945,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/codellama/CodeLlama-13b-hf/blob/main/config.json dict( - org="codellama", name="CodeLlama-13b-hf", + hf_config=dict(org="codellama", name="CodeLlama-13b-hf"), block_size=16384, vocab_size=32016, padding_multiple=16, @@ -677,11 +964,11 @@ def norm_class(self) -> Type: ), # https://huggingface.co/codellama/CodeLlama-34b-hf/blob/main/config.json dict( - org="codellama", name="CodeLlama-34b-hf", + hf_config=dict(org="codellama", name="CodeLlama-34b-hf"), block_size=16384, vocab_size=32000, - padding_multiple=64, + padded_vocab_size=32000, n_layer=48, n_head=64, n_embd=8192, @@ -695,13 +982,33 @@ def norm_class(self) -> Type: intermediate_size=22016, rope_base=1000000, ), + # https://huggingface.co/codellama/CodeLlama-70b-hf/blob/main/config.json + dict( + name="CodeLlama-70b-hf", + hf_config=dict(org="codellama", name="CodeLlama-70b-hf"), + block_size=16384, + vocab_size=32016, + padding_multiple=16, + n_layer=80, + n_head=64, + n_embd=8192, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=28672, + rope_base=1000000, + ), # 
https://huggingface.co/codellama/CodeLlama-7b-Python-hf/blob/main/config.json dict( - org="codellama", name="CodeLlama-7b-Python-hf", + hf_config=dict(org="codellama", name="CodeLlama-7b-Python-hf"), block_size=16384, vocab_size=32000, - padding_multiple=64, + padded_vocab_size=32000, n_layer=32, rotary_percentage=1.0, parallel_residual=False, @@ -714,11 +1021,11 @@ def norm_class(self) -> Type: ), # https://huggingface.co/codellama/CodeLlama-13b-Python-hf/blob/main/config.json dict( - org="codellama", name="CodeLlama-13b-Python-hf", + hf_config=dict(org="codellama", name="CodeLlama-13b-Python-hf"), block_size=16384, vocab_size=32000, - padding_multiple=64, + padded_vocab_size=32000, n_layer=40, n_head=40, n_embd=5120, @@ -733,11 +1040,11 @@ def norm_class(self) -> Type: ), # https://huggingface.co/codellama/CodeLlama-34b-Python-hf/blob/main/config.json dict( - org="codellama", name="CodeLlama-34b-Python-hf", + hf_config=dict(org="codellama", name="CodeLlama-34b-Python-hf"), block_size=16384, vocab_size=32000, - padding_multiple=64, + padded_vocab_size=32000, n_layer=48, n_head=64, n_embd=8192, @@ -751,10 +1058,30 @@ def norm_class(self) -> Type: intermediate_size=22016, rope_base=1000000, ), - # https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/tree/main/config.json + # https://huggingface.co/codellama/CodeLlama-70b-Python-hf/blob/main/config.json + dict( + name="CodeLlama-70b-Python-hf", + hf_config=dict(org="codellama", name="CodeLlama-70b-Python-hf"), + block_size=16384, + vocab_size=32016, + padding_multiple=16, + n_layer=80, + n_head=64, + n_embd=8192, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=28672, + rope_base=1000000, + ), + # https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/blob/main/config.json dict( - org="codellama", name="CodeLlama-7b-Instruct-hf", + hf_config=dict(org="codellama", name="CodeLlama-7b-Instruct-hf"), block_size=16384, vocab_size=32016, padding_multiple=16, @@ -770,8 +1097,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf/blob/main/config.json dict( - org="codellama", name="CodeLlama-13b-Instruct-hf", + hf_config=dict(org="codellama", name="CodeLlama-13b-Instruct-hf"), block_size=2048, vocab_size=32016, padding_multiple=16, @@ -789,11 +1116,11 @@ def norm_class(self) -> Type: ), # https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf/blob/main/config.json dict( - org="codellama", name="CodeLlama-34b-Instruct-hf", + hf_config=dict(org="codellama", name="CodeLlama-34b-Instruct-hf"), block_size=16384, vocab_size=32000, - padding_multiple=64, + padded_vocab_size=32000, n_layer=48, n_head=64, n_embd=8192, @@ -807,6 +1134,26 @@ def norm_class(self) -> Type: intermediate_size=22016, rope_base=1000000, ), + # https://huggingface.co/codellama/CodeLlama-70b-Instruct-hf/blob/main/config.json + dict( + name="CodeLlama-70b-Instruct-hf", + hf_config=dict(org="codellama", name="CodeLlama-70b-Instruct-hf"), + block_size=16384, + vocab_size=32016, + padding_multiple=16, + n_layer=80, + n_head=64, + n_embd=8192, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=28672, + rope_base=1000000, + ), ] configs.extend(code_llama) @@ -817,8 +1164,8 @@ def norm_class(self) -> Type: platypus = [ # https://huggingface.co/garage-bAInd/Platypus-30B/blob/main/config.json 
dict( - org="garage-bAInd", name="Platypus-30B", + hf_config=dict(org="garage-bAInd", name="Platypus-30B"), block_size=2048, padded_vocab_size=32000, n_layer=60, @@ -834,8 +1181,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/garage-bAInd/Platypus2-7B/blob/main/config.json dict( - org="garage-bAInd", name="Platypus2-7B", + hf_config=dict(org="garage-bAInd", name="Platypus2-7B"), padded_vocab_size=32000, n_layer=32, rotary_percentage=1.0, @@ -848,8 +1195,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/garage-bAInd/Platypus2-13B/blob/main/config.json dict( - org="garage-bAInd", name="Platypus2-13B", + hf_config=dict(org="garage-bAInd", name="Platypus2-13B"), padded_vocab_size=32000, n_layer=40, n_head=40, @@ -864,8 +1211,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/garage-bAInd/Platypus2-70B/blob/main/config.json dict( - org="garage-bAInd", name="Platypus2-70B", + hf_config=dict(org="garage-bAInd", name="Platypus2-70B"), padded_vocab_size=32000, n_layer=80, n_head=64, @@ -879,8 +1226,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/garage-bAInd/Camel-Platypus2-13B/blob/main/config.json dict( - org="garage-bAInd", name="Camel-Platypus2-13B", + hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-13B"), padded_vocab_size=32000, n_layer=40, n_head=40, @@ -894,8 +1241,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/garage-bAInd/Camel-Platypus2-70B/blob/main/config.json dict( - org="garage-bAInd", name="Camel-Platypus2-70B", + hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-70B"), padded_vocab_size=32000, n_layer=80, n_head=64, @@ -910,8 +1257,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/garage-bAInd/Stable-Platypus2-13B/blob/main/config.json dict( - org="garage-bAInd", name="Stable-Platypus2-13B", + hf_config=dict(org="garage-bAInd", name="Stable-Platypus2-13B"), padded_vocab_size=32000, n_layer=40, n_head=40, @@ -925,8 +1272,8 @@ def norm_class(self) -> Type: ), # https://huggingface.co/garage-bAInd/Platypus2-70B-instruct/blob/main/config.json dict( - org="garage-bAInd", name="Platypus2-70B-instruct", + hf_config=dict(org="garage-bAInd", name="Platypus2-70B-instruct"), padded_vocab_size=32000, n_layer=80, n_head=64, @@ -943,35 +1290,14 @@ def norm_class(self) -> Type: configs.extend(platypus) -########################## -# Stability AI StableCode -########################## -stablecode = [ - # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json - dict( - org="stabilityai", - name="stablecode-completion-alpha-3b", - block_size=16384, - vocab_size=49152, - n_layer=32, - n_embd=2560, - ), - # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json - dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k", vocab_size=49152, n_layer=32, n_embd=2560), - # https://huggingface.co/stabilityai/stablecode-instruct-alpha-3b/blob/main/config.json - dict(org="stabilityai", name="stablecode-instruct-alpha-3b", vocab_size=49152, n_layer=32, n_embd=2560), -] -configs.extend(stablecode) - - ################################## # togethercomputer LLaMA-2-7B-32K ################################## together_llama2_32k = [ # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/main/config.json dict( - org="togethercomputer", name="LLaMA-2-7B-32K", + hf_config=dict(org="togethercomputer", name="LLaMA-2-7B-32K"), vocab_size=32000, padding_multiple=64, n_layer=32, @@ -993,8 +1319,8 @@ def norm_class(self) -> Type: phi = [ # 
https://huggingface.co/microsoft/phi-1_5/blob/main/config.json dict( - org="microsoft", name="phi-1_5", + hf_config=dict(org="microsoft", name="phi-1_5"), vocab_size=50257, padded_vocab_size=51200, block_size=2048, @@ -1004,7 +1330,21 @@ def norm_class(self) -> Type: shared_attention_norm=True, lm_head_bias=True, gelu_approximate="tanh", - ) + ), + # https://huggingface.co/microsoft/phi-2/blob/main/config.json + dict( + name="phi-2", + hf_config=dict(org="microsoft", name="phi-2"), + vocab_size=50257, + padded_vocab_size=51200, + block_size=2048, + n_embd=2560, + n_layer=32, + rotary_percentage=0.4, # 32 / (n_embd / n_head) = 32 / 80 + shared_attention_norm=True, + lm_head_bias=True, + gelu_approximate="tanh", + ), ] configs.extend(phi) @@ -1015,8 +1355,8 @@ def norm_class(self) -> Type: mistral = [ # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json dict( - org="mistralai", name="Mistral-7B-{}v0.1", + hf_config=dict(org="mistralai", name="Mistral-7B-{}v0.1"), padded_vocab_size=32000, block_size=4096, # should be 32768 but sliding window attention is not implemented n_layer=32, @@ -1028,13 +1368,109 @@ def norm_class(self) -> Type: norm_eps=1e-05, _mlp_class="LLaMAMLP", intermediate_size=14336, - ) + ), + # https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/blob/main/config.json + dict( + name="Mixtral-8x7B-{}v0.1", + hf_config=dict(org="mistralai", name="Mixtral-8x7B-{}v0.1"), + padded_vocab_size=32000, + block_size=32768, + n_layer=32, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMoE", + intermediate_size=14336, + rope_base=1000000, + n_expert=8, + n_expert_per_token=2, + ), ] for c in mistral: for kind in ("", "Instruct-"): - copy = c.copy() + copy = deepcopy(c) copy["name"] = c["name"].format(kind) + copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) configs.append(copy) +configs.append( + # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/blob/main/config.json + dict( + name="Mistral-7B-Instruct-v0.2", + hf_config=dict(org="mistralai", name="Mistral-7B-Instruct-v0.2"), + padded_vocab_size=32000, + block_size=32768, + n_layer=32, + n_query_groups=8, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + norm_eps=1e-05, + _mlp_class="LLaMAMLP", + intermediate_size=14336, + ) +) + + +############ +# TinyLlama +############ +tiny_llama = [ + dict( + name="tiny-llama-1.1b{}", + hf_config=dict(org="TinyLlama", name="TinyLlama-1.1B{}"), + block_size=2048, + vocab_size=32000, + padding_multiple=64, + n_layer=22, + n_head=32, + n_embd=2048, + rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", # original TinyLlama uses FusedRMSNorm + norm_eps=1e-5, + _mlp_class="LLaMAMLP", + intermediate_size=5632, + n_query_groups=4, + ) +] +for c in tiny_llama: + for kind, hf_postfix in (("", "-intermediate-step-1431k-3T"), ("-chat", "-Chat-v1.0")): + copy = deepcopy(c) + copy["name"] = c["name"].format(kind) + copy["hf_config"]["name"] = c["hf_config"]["name"].format(hf_postfix) + configs.append(copy) + + +########################## +# Trelis Function Calling +########################## +llama_2_function_calling = [ + # https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2/blob/main/config.json + dict( + name="Llama-2-7b-chat-hf-function-calling-v2", + hf_config=dict(org="Trelis", name="Llama-2-7b-chat-hf-function-calling-v2"), + padding_multiple=64, + n_layer=32, + 
rotary_percentage=1.0, + parallel_residual=False, + bias=False, + _norm_class="RMSNorm", + _mlp_class="LLaMAMLP", + intermediate_size=11008, + norm_eps=1e-6, + block_size=4096, + vocab_size=32000, + n_head=32, + n_embd=4096, + rope_base=10000, + ) +] +configs.extend(llama_2_function_calling) name_to_config = {config["name"]: config for config in configs} diff --git a/lit_gpt/data/__init__.py b/lit_gpt/data/__init__.py new file mode 100644 index 0000000..7d47ad7 --- /dev/null +++ b/lit_gpt/data/__init__.py @@ -0,0 +1,10 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +from lit_gpt.data.base import LitDataModule, SFTDataset, apply_prompt_template, get_sft_collate_fn +from lit_gpt.data.alpaca import Alpaca +from lit_gpt.data.json import JSON +from lit_gpt.data.dolly import Dolly +from lit_gpt.data.flan import FLAN +from lit_gpt.data.lima import LIMA +from lit_gpt.data.longform import LongForm +from lit_gpt.data.tinyllama import TinyLlama diff --git a/lit_gpt/data/alpaca.py b/lit_gpt/data/alpaca.py new file mode 100644 index 0000000..a8b6687 --- /dev/null +++ b/lit_gpt/data/alpaca.py @@ -0,0 +1,138 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +"""Implementation derived from https://github.com/tloen/alpaca-lora""" + +import json +from pathlib import Path +from typing import Optional, Dict + +import torch +from torch.utils.data import random_split, DataLoader +from lightning_utilities.core.imports import RequirementCache +from lit_gpt.data import SFTDataset, get_sft_collate_fn, LitDataModule +from lit_gpt.tokenizer import Tokenizer + +_URL = "https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json" + + +class Alpaca(LitDataModule): + """Alpaca data module for supervised finetuning. + + Provides train- and val-dataloaders. The batches return keys "input_ids" and "labels". 
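A minimal usage sketch of this data module, assuming the shared connect/prepare_data/setup flow shown in this file; the tokenizer checkpoint directory is a placeholder, not something this module provides:

from pathlib import Path

from lit_gpt.data import Alpaca
from lit_gpt.tokenizer import Tokenizer

data = Alpaca()
tokenizer = Tokenizer(Path("checkpoints/meta-llama/Llama-2-7b-hf"))  # placeholder checkpoint dir
data.connect(tokenizer=tokenizer, batch_size=4, max_seq_length=512)
data.prepare_data()   # downloads the Alpaca JSON file if it is missing
data.setup()
batch = next(iter(data.train_dataloader()))
print(batch["input_ids"].shape, batch["labels"].shape)  # both (4, T)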
+ """ + + def __init__( + self, + mask_prompt: bool = False, + test_split_fraction: float = 0.03865, # to get exactly 2000 test samples, + ignore_index: int = -1, + seed: int = 42, + num_workers: int = 4, + data_file_url: str = _URL, + data_file_name: str = "alpaca_data_cleaned_archive.json", + download_dir: Path = Path("./data/alpaca"), + ) -> None: + super().__init__() + self.mask_prompt = mask_prompt + self.test_split_fraction = test_split_fraction + self.ignore_index = ignore_index + self.seed = seed + self.num_workers = num_workers + self.data_file_url = data_file_url + self.data_file_name = data_file_name + self.download_dir = download_dir + + self.tokenizer: Optional[Tokenizer] = None + self.batch_size: int = 1 + self.max_seq_length: int = -1 + self.train_dataset: Optional[SFTDataset] = None + self.test_dataset: Optional[SFTDataset] = None + + def connect( + self, + tokenizer: Optional[Tokenizer] = None, + batch_size: int = 1, + max_seq_length: Optional[int] = None + ) -> None: + self.tokenizer = tokenizer + self.batch_size = batch_size + self.max_seq_length = -1 if max_seq_length is None else max_seq_length + + def prepare_data(self) -> None: + self.download_dir.mkdir(parents=True, exist_ok=True) + download_if_missing(self.download_dir / self.data_file_name, self.data_file_url) + + def setup(self, stage: str = "") -> None: + with open(self.download_dir / self.data_file_name, "r", encoding="utf-8") as file: + data = json.load(file) + + # Partition the dataset into train and test + train_data, test_data = random_split( + data, + [1.0 - self.test_split_fraction, self.test_split_fraction], + generator=torch.Generator().manual_seed(self.seed) + ) + train_data, test_data = list(train_data), list(test_data) + + self.train_dataset = SFTDataset( + data=train_data, + tokenizer=self.tokenizer, + prompt_template=prompt_template, + max_seq_length=self.max_seq_length, + mask_prompt=self.mask_prompt, + ignore_index=self.ignore_index, + ) + self.test_dataset = SFTDataset( + data=test_data, + tokenizer=self.tokenizer, + prompt_template=prompt_template, + max_seq_length=self.max_seq_length, + mask_prompt=self.mask_prompt, + ignore_index=self.ignore_index, + ) + + def train_dataloader(self) -> DataLoader: + return DataLoader( + self.train_dataset, + batch_size=self.batch_size, + shuffle=True, + generator=torch.Generator().manual_seed(self.seed), + num_workers=self.num_workers, + collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index) + ) + + def val_dataloader(self) -> DataLoader: + return DataLoader( + self.test_dataset, + batch_size=self.batch_size, + shuffle=False, + num_workers=self.num_workers, + collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index) + ) + + +def download_if_missing(file_path: Path, file_url: str) -> None: + """Downloads the raw json data file and saves it in the given destination.""" + if file_path.exists() and file_path.stat().st_size > 0: + return + requests_available = RequirementCache("requests") + if not requests_available: + raise ModuleNotFoundError(str(requests_available)) + import requests + + with open(file_path, "w", encoding="utf-8") as f: + f.write(requests.get(file_url).text) + + +def prompt_template(example: Dict[str, str]) -> str: + """The Alpaca prompt template.""" + if example.get("input"): + return ( + "Below is an instruction that describes a task, paired with an input that provides further context. 
" + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n" + ) + return ( + "Below is an instruction that describes a task. " + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['instruction']}\n\n### Response:\n" + ) diff --git a/lit_gpt/data/base.py b/lit_gpt/data/base.py new file mode 100644 index 0000000..318d3d5 --- /dev/null +++ b/lit_gpt/data/base.py @@ -0,0 +1,123 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +from abc import abstractmethod +from functools import partial +from typing import List, Dict, Union, Optional + +import torch +from torch import Tensor +from torch.utils.data import Dataset + +from lightning import LightningDataModule +from lit_gpt import Tokenizer + + +class LitDataModule(LightningDataModule): + """Base class for all data modules in Lit-GPT.""" + + @abstractmethod + def connect( + self, + tokenizer: Optional[Tokenizer] = None, + batch_size: int = 1, + max_seq_length: Optional[int] = None + ) -> None: + """All settings that can't be determined at the time of instantiation need to be passed through here + before any dataloaders can be accessed. + """ + + def setup(self, stage: str = "") -> None: + # Stub is to redefine the default signature, because the concept of 'stage' does not exist in Lit-GPT + pass + + +class SFTDataset(Dataset): + """An in-memory dataset for supervised finetuning with `input_ids` and `labels`. + + Args: + data: A list of samples (dicts). The target/label must be stored under the key 'output' and the instruction + or other data can be stored under any key as long as it is compatible with the given prompt template. + tokenizer: The tokenizer to use. Should match the one that was used to pretrain the model. + prompt_template: A prompt template (format string or callable). + max_seq_length: Truncate sequences that are longer than this value. By default, no truncation is applied. + mask_prompt: Whether to mask the prompt section from the label (with ``ignore_index``). + ignore_index: The index to use for elements to be ignored in the label. + + Returns a dict with two keys: + input_ids: The encoded prompt + response + labels: Same as input_ids, unless ``mask_prompt=True`` in which case the 'prompt' part is replaced with + the ``ignore_index``. 
+ """ + def __init__( + self, + data: List[Dict[str, str]], + tokenizer: Tokenizer, + prompt_template: Union[str, callable], + max_seq_length: int = -1, + mask_prompt: bool = True, + ignore_index: int = -1, + ) -> None: + self.data = data + self.tokenizer = tokenizer + self.prompt_template = prompt_template + self.max_seq_length = max_seq_length + self.mask_prompt = mask_prompt + self.ignore_index = ignore_index + + def __len__(self) -> int: + return len(self.data) + + def __getitem__(self, idx: int) -> Dict[str, Tensor]: + example = self.data[idx] + prompt = apply_prompt_template(self.prompt_template, example) + prompt_and_response = prompt + example["output"] + encoded_prompt = self.tokenizer.encode(prompt, max_length=self.max_seq_length) + encoded_prompt_and_response = self.tokenizer.encode( + prompt_and_response, + eos=True, + max_length=self.max_seq_length, + ) + + # The labels are the full prompt with response, but with the prompt masked out + labels = encoded_prompt_and_response.clone() + if self.mask_prompt: + labels[: len(encoded_prompt)] = self.ignore_index + + return {"input_ids": encoded_prompt_and_response.type(torch.int64), "labels": labels.type(torch.int64)} + + +def apply_prompt_template(template: Union[str, callable], example: Dict[str, str]) -> str: + if isinstance(template, str): + prompt = template.format(**example) + else: + prompt = template(example) + return prompt + + +def get_sft_collate_fn(max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -1): + """Returns the collate function for supervised finetuning (needed in the DataLoader). + + The collate function gets a list of dicts with keys `input_ids` and `labels`. + It returns a dict with batched `input_ids` and `labels`. Also pads short sequences to the longest element in + the batch. Optionally truncates all sequences to the specified maximum length. + """ + return partial(_sft_collate_fn, max_seq_length=max_seq_length, pad_id=pad_id, ignore_index=ignore_index) + + +def _sft_collate_fn( + samples: List[Dict[str, Tensor]], max_seq_length: int = -1, pad_id: int = 0, ignore_index: int = -1 +) -> Dict[str, Tensor]: + + batched = {} + for key in ("input_ids", "labels"): + pad_value = pad_id if key == "input_ids" else ignore_index + + # Pad right based on the longest sequence + batched[key] = torch.nn.utils.rnn.pad_sequence( + [sample[key] for sample in samples], batch_first=True, padding_value=pad_value + ) + + # Truncate if needed + if max_seq_length > 0: + batched[key] = batched[key][:, :max_seq_length] + + return batched diff --git a/lit_gpt/data/dolly.py b/lit_gpt/data/dolly.py new file mode 100644 index 0000000..8069ad5 --- /dev/null +++ b/lit_gpt/data/dolly.py @@ -0,0 +1,73 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +import json +from pathlib import Path + +import torch +from torch.utils.data import random_split +from lit_gpt.data import SFTDataset, Alpaca +from lit_gpt.data.alpaca import prompt_template + +_URL: str = "https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl" + + +class Dolly(Alpaca): + """Dolly data module for supervised finetuning. + + Provides train- and val-dataloaders. The batches return keys "input_ids" and "labels". 
+ """ + + def __init__( + self, + mask_prompt: bool = False, + test_split_fraction: float = 0.1, + ignore_index: int = -1, + seed: int = 42, + num_workers: int = 4, + data_file_url: str = _URL, + data_file_name: str = "dolly_data_cleaned.json", + download_dir: Path = Path("./data/dolly"), + ) -> None: + super().__init__( + mask_prompt=mask_prompt, + test_split_fraction=test_split_fraction, + ignore_index=ignore_index, + seed=seed, + num_workers=num_workers, + data_file_url=data_file_url, + data_file_name=data_file_name, + download_dir=download_dir, + ) + + def setup(self, stage: str = "") -> None: + with open(self.download_dir / self.data_file_name, "r", encoding="utf-8") as file: + data = file.readlines() + data = [json.loads(line) for line in data] + for item in data: + item["input"] = item.pop("context") + item["output"] = item.pop("response") + + # Partition the dataset into train and test + train_data, test_data = random_split( + data, + [1.0 - self.test_split_fraction, self.test_split_fraction], + generator=torch.Generator().manual_seed(self.seed) + ) + train_data, test_data = list(train_data), list(test_data) + + self.train_dataset = SFTDataset( + data=train_data, + tokenizer=self.tokenizer, + prompt_template=prompt_template, + max_seq_length=self.max_seq_length, + mask_prompt=self.mask_prompt, + ignore_index=self.ignore_index, + ) + self.test_dataset = SFTDataset( + data=test_data, + tokenizer=self.tokenizer, + prompt_template=prompt_template, + max_seq_length=self.max_seq_length, + mask_prompt=self.mask_prompt, + ignore_index=self.ignore_index, + ) diff --git a/lit_gpt/data/flan.py b/lit_gpt/data/flan.py new file mode 100644 index 0000000..38e1c4b --- /dev/null +++ b/lit_gpt/data/flan.py @@ -0,0 +1,194 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +import json +from pathlib import Path +from typing import Optional, Dict, List, Set + +import torch +from torch.utils.data import DataLoader +from lit_gpt.data import SFTDataset, get_sft_collate_fn, LitDataModule +from lit_gpt.data.alpaca import download_if_missing +from lit_gpt.tokenizer import Tokenizer + +_URL = "https://huggingface.co/datasets/Muennighoff/flan/resolve/main" + + +# TODO: Including all subsets, FLAN is too large to be loaded in memory. Switch the implementation to cache +# on disk or use Lightning Data +class FLAN(LitDataModule): + """FLAN data module for supervised finetuning. + + Provides train- and val-dataloaders. The batches return keys "input_ids" and "labels". 
+ """ + + def __init__( + self, + mask_prompt: bool = False, + test_split_fraction: float = 0.03865, # to get exactly 2000 test samples, + ignore_index: int = -1, + seed: int = 42, + num_workers: int = 4, + data_url: str = _URL, + download_dir: Path = Path("./data/flan"), + subsets: Optional[str] = None, + ) -> None: + super().__init__() + self.mask_prompt = mask_prompt + self.test_split_fraction = test_split_fraction + self.ignore_index = ignore_index + self.seed = seed + self.num_workers = num_workers + self.data_url = data_url + self.download_dir = download_dir + + supported_subsets = _supported_subsets() + if subsets is not None: + self.subsets = subsets.split(",") + for subset in self.subsets: + if subset not in supported_subsets: + raise ValueError(f"{subset} not in {supported_subsets}") + else: + self.subsets = list(supported_subsets) + + self.tokenizer: Optional[Tokenizer] = None + self.batch_size: int = 1 + self.max_seq_length: int = -1 + self.train_dataset: Optional[SFTDataset] = None + self.test_dataset: Optional[SFTDataset] = None + + def connect( + self, + tokenizer: Optional[Tokenizer] = None, + batch_size: int = 1, + max_seq_length: Optional[int] = None + ) -> None: + self.tokenizer = tokenizer + self.batch_size = batch_size + self.max_seq_length = -1 if max_seq_length is None else max_seq_length + + def prepare_data(self) -> None: + self.download_dir.mkdir(parents=True, exist_ok=True) + for subset in self.subsets: + for split in ("train", "test"): + data_file_path = self.download_dir / f"{subset}_{split}.jsonl" + data_file_url = f"{self.data_url}/{split}/{subset}_{split}.jsonl" + download_if_missing(data_file_path, data_file_url) + + def train_dataloader(self): + return self._dataloader("train") + + def val_dataloader(self): + return self._dataloader("test") + + def _dataloader(self, split: str) -> DataLoader: + data = [] + for subset in self.subsets: + data_file_path = self.download_dir / f"{subset}_{split}.jsonl" + data.extend(load_jsonl(data_file_path)) + + for item in data: + item["output"] = item.pop("targets") + + dataset = SFTDataset( + data=data, + tokenizer=self.tokenizer, + prompt_template=prompt_template, + max_seq_length=self.max_seq_length, + mask_prompt=self.mask_prompt, + ignore_index=self.ignore_index, + ) + return DataLoader( + dataset=dataset, + batch_size=self.batch_size, + shuffle=(split == "train"), + generator=torch.Generator().manual_seed(self.seed), + num_workers=self.num_workers, + collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index) + ) + + +def load_jsonl(filename: Path) -> List[Dict[str, str]]: + data = [] + with open(filename, "r", encoding="utf-8") as f: + for line in f: + data.append(json.loads(line)) + return data + + +def prompt_template(example: dict) -> str: + return ( + "Below is an instruction that describes a task. 
" + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['inputs']}\n\n### Response:\n" + ) + + +def _supported_subsets() -> Set[str]: + return { + "aeslc_10templates", + "ag_news_subset_10templates", + "anli_r1_10templates", + "anli_r2_10templates", + "anli_r3_10templates", + "arc_challenge_10templates", + "arc_easy_10templates", + "bool_q_10templates", + "cb_10templates", + "cnn_dailymail_10templates", + "cola_10templates", + "common_gen_10templates", + "copa_10templates", + "coqa_10templates", + "cosmos_qa_10templates", + "dart_10templates", + "definite_pronoun_resolution_10templates", + "drop_10templates", + "e2e_nlg_10templates", + "fix_punct_10templates", + "gigaword_10templates", + "glue_mrpc_10templates", + "glue_qqp_10templates", + "hellaswag_10templates", + "imdb_reviews_10templates", + "math_dataset_10templates", + "mnli_matched_10templates", + "mnli_mismatched_10templates", + "multi_news_10templates", + "multirc_10templates", + "natural_questions_10templates", + "openbookqa_10templates", + "opinion_abstracts_idebate_10templates", + "opinion_abstracts_rotten_tomatoes_10templates", + "para_crawl_enes_10templates", + "paws_wiki_10templates", + "piqa_10templates", + "qnli_10templates", + "quac_10templates", + "record_10templates", + "rte_10templates", + "samsum_10templates", + "sentiment140_10templates", + "snli_10templates", + "squad_v1_10templates", + "squad_v2_10templates", + "sst2_10templates", + "story_cloze_10templates", + "stsb_10templates", + "trec_10templates", + "trivia_qa_10templates", + "true_case_10templates", + "web_nlg_en_10templates", + "wic_10templates", + "wiki_lingua_english_en_10templates", + "wmt14_enfr_10templates", + "wmt16_translate_csen_10templates", + "wmt16_translate_deen_10templates", + "wmt16_translate_fien_10templates", + "wmt16_translate_roen_10templates", + "wmt16_translate_ruen_10templates", + "wmt16_translate_tren_10templates", + "wnli_10templates", + "word_segment_10templates", + "wsc_10templates", + "yelp_polarity_reviews_10templates", + } diff --git a/lit_gpt/data/json.py b/lit_gpt/data/json.py new file mode 100644 index 0000000..3f47643 --- /dev/null +++ b/lit_gpt/data/json.py @@ -0,0 +1,113 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +import json +from pathlib import Path +from typing import Optional + +import torch +from torch.utils.data import random_split, DataLoader +from lit_gpt.data import SFTDataset, get_sft_collate_fn, LitDataModule +from lit_gpt.data.alpaca import prompt_template +from lit_gpt.tokenizer import Tokenizer + + +class JSON(LitDataModule): + """Loads JSON data for supervised finetuning. + + Provides train- and val-dataloaders. The batches return keys "input_ids" and "labels". + + Args: + json_path: A path to a JSON file containing the data. The file should contain a list of samples (dicts). + Each dict must have the keys 'instruction' and 'output', and can optionally have a key 'input' + (see Alpaca). + mask_prompt: Whether to mask the prompt section from the label (with ``ignore_index``). + test_split_fraction: A number in the range [0, 1] that determines the fraction of the dataset + to use for testing. + ignore_index: The index to use for elements to be ignored in the label. + seed: The random seed for creating the train/val splits and shuffling the dataset. + num_workers: How many DataLoader processes to use for loading. 
+ """ + + def __init__( + self, + json_path: Path, + mask_prompt: bool = False, + test_split_fraction: float = 0.1, + ignore_index: int = -1, + seed: int = 42, + num_workers: int = 4, + ) -> None: + super().__init__() + self.json_path = json_path + self.mask_prompt = mask_prompt + self.test_split_fraction = test_split_fraction + self.ignore_index = ignore_index + self.seed = seed + self.num_workers = num_workers + + self.tokenizer: Optional[Tokenizer] = None + self.batch_size: int = 1 + self.max_seq_length: int = -1 + self.train_dataset: Optional[SFTDataset] = None + self.test_dataset: Optional[SFTDataset] = None + + if not self.json_path.is_file(): + raise FileNotFoundError(f"The file {self.json_path} does not exist.") + + def connect( + self, + tokenizer: Optional[Tokenizer] = None, + batch_size: int = 1, + max_seq_length: Optional[int] = None + ) -> None: + self.tokenizer = tokenizer + self.batch_size = batch_size + self.max_seq_length = -1 if max_seq_length is None else max_seq_length + + def setup(self, stage: str = "") -> None: + with open(self.json_path, "r", encoding="utf-8") as file: + data = json.load(file) + + # Partition the dataset into train and test + train_data, test_data = random_split( + data, + [1.0 - self.test_split_fraction, self.test_split_fraction], + generator=torch.Generator().manual_seed(self.seed) + ) + train_data, test_data = list(train_data), list(test_data) + + self.train_dataset = SFTDataset( + data=train_data, + tokenizer=self.tokenizer, + prompt_template=prompt_template, + max_seq_length=self.max_seq_length, + mask_prompt=self.mask_prompt, + ignore_index=self.ignore_index, + ) + self.test_dataset = SFTDataset( + data=test_data, + tokenizer=self.tokenizer, + prompt_template=prompt_template, + max_seq_length=self.max_seq_length, + mask_prompt=self.mask_prompt, + ignore_index=self.ignore_index, + ) + + def train_dataloader(self) -> DataLoader: + return DataLoader( + self.train_dataset, + batch_size=self.batch_size, + shuffle=True, + generator=torch.Generator().manual_seed(self.seed), + num_workers=self.num_workers, + collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index) + ) + + def val_dataloader(self) -> DataLoader: + return DataLoader( + self.test_dataset, + batch_size=self.batch_size, + shuffle=False, + num_workers=self.num_workers, + collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index) + ) diff --git a/lit_gpt/data/lima.py b/lit_gpt/data/lima.py new file mode 100644 index 0000000..d13f86d --- /dev/null +++ b/lit_gpt/data/lima.py @@ -0,0 +1,131 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +"""Implementation derived from https://github.com/tloen/alpaca-lora""" +import os + +from typing import Optional, List + +import torch +from torch.utils.data import random_split, DataLoader +from lit_gpt.data import LitDataModule, SFTDataset, get_sft_collate_fn +from lit_gpt.data.alpaca import prompt_template +from lit_gpt.tokenizer import Tokenizer + + +class LIMA(LitDataModule): + """LIMA data module for supervised finetuning. + + Provides train- and val-dataloaders. The batches return keys "input_ids" and "labels". 
+ """ + + def __init__( + self, + mask_prompt: bool = False, + test_split_fraction: float = 0.1, + ignore_index: int = -1, + seed: int = 42, + include_multiturn_conversations: bool = False, + data_repo_id: str = "GAIR/lima", + access_token: Optional[str] = os.getenv("HF_TOKEN"), + num_workers: int = 4, + ) -> None: + super().__init__() + if access_token is None: + raise ValueError( + "LIMA requires authentication, please set the `HF_TOKEN=your_token` environment" + " variable or pass --access_token=your_token. You can find your token by visiting" + " https://huggingface.co/settings/tokens" + ) + self.mask_prompt = mask_prompt + self.test_split_fraction = test_split_fraction + self.ignore_index = ignore_index + self.seed = seed + self.num_workers = num_workers + + self.access_token = access_token + self.data_repo_id = data_repo_id + self.include_multiturn_conversations = include_multiturn_conversations + + self.tokenizer: Optional[Tokenizer] = None + self.batch_size = 1 + self.max_seq_length = -1 + self.train_dataset: Optional[SFTDataset] = None + self.test_dataset: Optional[SFTDataset] = None + + def connect( + self, + tokenizer: Optional[Tokenizer] = None, + batch_size: int = 1, + max_seq_length: Optional[int] = None + ) -> None: + self.tokenizer = tokenizer + self.batch_size = batch_size + self.max_seq_length = -1 if max_seq_length is None else max_seq_length + + def prepare_data(self) -> None: + from datasets import load_dataset + + load_dataset(self.data_repo_id, token=self.access_token) + + def setup(self, stage: str = "") -> None: + from datasets import load_dataset + + dataset = load_dataset(self.data_repo_id, token=self.access_token) + data = format_dataset(dataset["train"], self.include_multiturn_conversations) + + # Partition the dataset into train and test + train_data, test_data = random_split( + data, + [1.0 - self.test_split_fraction, self.test_split_fraction], + generator=torch.Generator().manual_seed(self.seed) + ) + train_data, test_data = list(train_data), list(test_data) + + self.train_dataset = SFTDataset( + data=train_data, + tokenizer=self.tokenizer, + prompt_template=prompt_template, + max_seq_length=self.max_seq_length, + mask_prompt=self.mask_prompt, + ignore_index=self.ignore_index, + ) + self.test_dataset = SFTDataset( + data=test_data, + tokenizer=self.tokenizer, + prompt_template=prompt_template, + max_seq_length=self.max_seq_length, + mask_prompt=self.mask_prompt, + ignore_index=self.ignore_index, + ) + + def train_dataloader(self) -> DataLoader: + return DataLoader( + self.train_dataset, + batch_size=self.batch_size, + shuffle=True, + generator=torch.Generator().manual_seed(self.seed), + num_workers=self.num_workers, + collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index), + ) + + def val_dataloader(self) -> DataLoader: + return DataLoader( + self.test_dataset, + batch_size=self.batch_size, + shuffle=False, + num_workers=self.num_workers, + collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index) + ) + + +def format_dataset(dataset_partition: dict, include_multi_turn_conversations: bool) -> List[dict]: + formatted_ds = [] + + for entry in dataset_partition: + convo = entry["conversations"] + if include_multi_turn_conversations: + for i in range(0, len(convo) - 1, 2): + formatted_ds.append({"instruction": convo[i], "input": "", "output": convo[i + 1]}) + else: + formatted_ds.append({"instruction": convo[0], "input": "", "output": convo[1]}) + + return formatted_ds diff --git 
a/lit_gpt/data/longform.py b/lit_gpt/data/longform.py new file mode 100644 index 0000000..d8d1cef --- /dev/null +++ b/lit_gpt/data/longform.py @@ -0,0 +1,93 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +import json +from pathlib import Path +from typing import Optional + +import torch +from torch.utils.data import DataLoader +from lit_gpt.data import SFTDataset, get_sft_collate_fn, LitDataModule +from lit_gpt.data.alpaca import download_if_missing +from lit_gpt.tokenizer import Tokenizer + + +_URL = "https://raw.githubusercontent.com/akoksal/LongForm/main/dataset" + + +class LongForm(LitDataModule): + """LongForm data module for supervised finetuning. + + Provides train- and val-dataloaders. The batches return keys "input_ids" and "labels". + """ + + def __init__( + self, + mask_prompt: bool = False, + ignore_index: int = -1, + seed: int = 42, + num_workers: int = 4, + download_dir: Path = Path("./data/longform"), + ) -> None: + super().__init__() + self.mask_prompt = mask_prompt + self.ignore_index = ignore_index + self.seed = seed + self.num_workers = num_workers + self.download_dir = download_dir + + self.tokenizer: Optional[Tokenizer] = None + self.batch_size: int = 1 + self.max_seq_length: int = -1 + self.train_dataset: Optional[SFTDataset] = None + self.test_dataset: Optional[SFTDataset] = None + + def connect( + self, + tokenizer: Optional[Tokenizer] = None, + batch_size: int = 1, + max_seq_length: Optional[int] = None + ) -> None: + self.tokenizer = tokenizer + self.batch_size = batch_size + self.max_seq_length = -1 if max_seq_length is None else max_seq_length + + def prepare_data(self) -> None: + self.download_dir.mkdir(parents=True, exist_ok=True) + download_if_missing(self.download_dir / "train.json", f"{_URL}/train.json") + download_if_missing(self.download_dir / "val.json", f"{_URL}/val.json") + + def train_dataloader(self): + return self._dataloader("train") + + def val_dataloader(self): + return self._dataloader("val") + + def _dataloader(self, split: str) -> DataLoader: + with open(self.download_dir / f"{split}.json", "r", encoding="utf-8") as file: + data = json.load(file) + + dataset = SFTDataset( + data=data, + tokenizer=self.tokenizer, + prompt_template=prompt_template, + max_seq_length=self.max_seq_length, + mask_prompt=self.mask_prompt, + ignore_index=self.ignore_index, + ) + return DataLoader( + dataset=dataset, + batch_size=self.batch_size, + shuffle=(split == "train"), + generator=torch.Generator().manual_seed(self.seed), + num_workers=self.num_workers, + collate_fn=get_sft_collate_fn(max_seq_length=self.max_seq_length, ignore_index=self.ignore_index) + ) + + +def prompt_template(example: dict) -> str: + """A modified Alpaca prompt template without the 'input'.""" + return ( + "Below is an instruction that describes a task, paired with an input that provides further context. " + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['input']}\n\n### Response:\n" + ) diff --git a/lit_gpt/data/tinyllama.py b/lit_gpt/data/tinyllama.py new file mode 100644 index 0000000..21ab713 --- /dev/null +++ b/lit_gpt/data/tinyllama.py @@ -0,0 +1,99 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ +from pathlib import Path +from typing import Union, Optional + +from torch.utils.data import DataLoader + +from lit_gpt import Tokenizer +from lit_gpt.data import LitDataModule + + +class TinyLlama(LitDataModule): + """The TinyLlama data module is composed of a mix of SlimPajama and Starcoder data. + + Provides training and validation streaming dataloaders that return batches of tokens. + + Args: + data_path: The path to the data directory, containing two folders 'slimpajama' and 'starcoder' + which are the output of the preprocessing step done in advance. See the `tutorial/pretrain_tinyllama.md` + for instructions. The path can also be a remote path (e.g., s3://). + seed: The seed to use for shuffling the training data. + num_workers: The number of workers to use for the dataloaders. + """ + + def __init__( + self, + data_path: Union[str, Path] = Path("data/"), + seed: int = 42, + num_workers: int = 8, + ) -> None: + super().__init__() + self.seed = seed + self.num_workers = num_workers + + self.batch_size = 1 + self.seq_length = 2048 + + # Could be a remote path (s3://) or a local path + self.slimpajama_train = str(data_path).rstrip("/") + "/slimpajama/train" + self.slimpajama_val = str(data_path).rstrip("/") + "/slimpajama/val" + self.starcoder_train = str(data_path).rstrip("/") + "/starcoder" + + def connect( + self, + tokenizer: Optional[Tokenizer] = None, + batch_size: int = 1, + max_seq_length: Optional[int] = None + ) -> None: + self.batch_size = batch_size + self.seq_length = max_seq_length + 1 # Increase by one because we need the next token as well + + def prepare_data(self) -> None: + for path in (self.slimpajama_train, self.slimpajama_val, self.starcoder_train): + if not path.startswith("s3://") and not Path(path).is_dir(): + raise FileNotFoundError( + "The data path for TinyLlama is expected to be the directory containing these subdirectories:" + f" `slimpajama/train`, `slimpajama/val`, `starcoder`. The directory {path} does not exist." 
+ ) + + def train_dataloader(self) -> DataLoader: + from lightning.data.streaming import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset, TokensLoader + + train_datasets = [ + StreamingDataset( + input_dir=self.slimpajama_train, + item_loader=TokensLoader(block_size=self.seq_length), + shuffle=True, + drop_last=True, + ), + StreamingDataset( + input_dir=self.starcoder_train, + item_loader=TokensLoader(block_size=self.seq_length), + shuffle=True, + drop_last=True, + ), + ] + + # Mix SlimPajama data and Starcoder data with these proportions: + weights = (0.693584, 0.306416) + combined_dataset = CombinedStreamingDataset(datasets=train_datasets, seed=self.seed, weights=weights) + train_dataloader = StreamingDataLoader( + combined_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True + ) + return train_dataloader + + def val_dataloader(self) -> DataLoader: + from lightning.data.streaming import StreamingDataset, TokensLoader + + val_dataset = StreamingDataset( + input_dir=self.slimpajama_val, + item_loader=TokensLoader(block_size=self.seq_length), + shuffle=True, + # Consider setting to False, but we would lose some samples due to truncation when world size > 1 + drop_last=True, + ) + val_dataloader = DataLoader( + val_dataset, batch_size=self.batch_size, pin_memory=True, num_workers=self.num_workers, drop_last=True + ) + return val_dataloader diff --git a/lit_gpt/lora.py b/lit_gpt/lora.py index 3382fe0..bfc7adc 100644 --- a/lit_gpt/lora.py +++ b/lit_gpt/lora.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + # Derived from https://github.com/microsoft/LoRA # ------------------------------------------------------------------------------------------ # Copyright (c) Microsoft Corporation. All rights reserved. @@ -95,7 +97,7 @@ def __init__( r: int = 0, lora_alpha: int = 1, lora_dropout: float = 0.0, - **kwargs, + **kwargs: Any, ): """LoRA wrapper around linear class. 
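The merge performed in the hunk below folds the low-rank update into the frozen weight; the identity it relies on can be checked in isolation on illustrative shapes (random factors stand in for trained lora_A/lora_B):

import torch

torch.manual_seed(0)
in_features, out_features, r, lora_alpha = 128, 384, 8, 16
scaling = lora_alpha / r

# double precision keeps the comparison free of float32 rounding noise
W = torch.randn(out_features, in_features, dtype=torch.float64)   # frozen pretrained weight
lora_A = torch.randn(r, in_features, dtype=torch.float64)         # trained low-rank factors
lora_B = torch.randn(out_features, r, dtype=torch.float64)        # (zero-initialised before training)

x = torch.randn(4, in_features, dtype=torch.float64)
two_branch = x @ W.T + (x @ lora_A.T @ lora_B.T) * scaling        # unmerged forward pass
W_merged = W + (lora_B @ lora_A) * scaling                        # what merge() folds into the weight
torch.testing.assert_close(x @ W_merged.T, two_branch)            # same outputs after merging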
@@ -133,11 +135,36 @@ def reset_parameters(self) -> None: nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) nn.init.zeros_(self.lora_B) + def get_lora_AB(self) -> torch.Tensor: + """Return merged lora_A and lora_B matrices with the same shape as the pretrained weights.""" + return (self.lora_B @ self.lora_A) * self.scaling + def merge(self) -> None: """Merges the LoRA weights into the full-rank weights (W = W + delta_W).""" if self.r > 0 and not self.merged: - # Merge the weights and mark it - self.linear.weight.data += (self.lora_B @ self.lora_A) * self.scaling + pretrained_dtype = self.linear.weight.data.dtype + lora_data = self.get_lora_AB() + # if the pretrained weights and LoRA weights are of the same dtype - simply sum them + if pretrained_dtype == lora_data.dtype: + self.linear.weight.data += lora_data + # if only the pretrained are in quantized form - dequantize, sum with LoRA and quantize the result + elif pretrained_dtype == torch.uint8: + import bitsandbytes as bnb + + weight = self.linear.weight + # dequantize the pretrained weights + weight_data = bnb.functional.dequantize_4bit(weight.data, weight.quant_state).to(lora_data.dtype) + # add pretrained and LoRA weights + weight_data += lora_data + # assign updated weights and quantize by moving to CUDA device + self.linear.weight = bnb.nn.Params4bit(weight_data, requires_grad=False, **weight.__dict__) + self.linear.weight.cuda(weight.device) + else: + raise NotImplementedError( + f"Cannot merge the pretrained weights of type {pretrained_dtype}" + f" and LoRA weights of type {lora_data.dtype}" + ) + self.merged = True def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -164,7 +191,7 @@ def __init__( lora_alpha: int = 1, lora_dropout: float = 0.0, enable_lora: Union[bool, Tuple[bool, bool, bool]] = False, - **kwargs, + **kwargs: Any, ): """LoRA wrapper around linear class that is used for calculation of q, k and v matrices. @@ -234,23 +261,21 @@ def __init__( self.scaling = self.lora_alpha / self.r # Compute the indices - # Indices are needed to properly pad weight updates with zeros. If we want to fine-tune queries and values, - # but not keys, then the weights update should be: - # - # [[Ξ”W,Ξ”W,Ξ”W, ..., 0,0,0, ..., Ξ”W,Ξ”W,Ξ”W,], - # [....................................], - # [Ξ”W,Ξ”W,Ξ”W, ..., 0,0,0, ..., Ξ”W,Ξ”W,Ξ”W,]] - # ↑ ↑ ↑ - # ________________________________________ - # | query | key | value | - # ---------------------------------------- + # Indices are needed to properly pad weight updates with zeros in `zero_pad` method. 
+ q_per_kv = self.n_head // self.n_query_groups + total_qkv = q_per_kv + 2 + head_size = out_features // (self.n_query_groups * total_qkv) + ind = range(out_features) self.lora_ind = [] if enable_q: - self.lora_ind.extend(range(0, self.linear.in_features)) + q_ind = [x for x in ind if (x // head_size) % total_qkv < total_qkv - 2] + self.lora_ind.extend(q_ind) if enable_k: - self.lora_ind.extend(range(self.linear.in_features, self.linear.in_features + self.kv_embd_size)) + k_ind = [x for x in ind if (x // head_size) % total_qkv == total_qkv - 2] + self.lora_ind.extend(k_ind) if enable_v: - self.lora_ind.extend(range(self.linear.in_features + self.kv_embd_size, self.linear.out_features)) + v_ind = [x for x in ind if (x // head_size) % total_qkv == total_qkv - 1] + self.lora_ind.extend(v_ind) self.reset_parameters() def zero_pad(self, x: torch.Tensor) -> torch.Tensor: @@ -266,6 +291,27 @@ def zero_pad(self, x: torch.Tensor) -> torch.Tensor: ________________________________________ | query | key | value | ---------------------------------------- + For Llama2's GQA support, Q, K, and V weights are interleaved, so that weights for grouped + queries are adjacent to their associated key and value weights. + For example, suppose we have n_head = 12 with 3 query groups. + Then along the embedding dimension the interleaved weights would look like + + [Q, Q, Q, Q, K, V, Q, Q, Q, Q, K, V, Q, Q, Q, Q, K, V], + + where each Q, K, and V has size head_size. + + In this case, the previously-described weight update applies separately to each + individual block, so the update will take the form + + [[Ξ”W,Ξ”W,Ξ”W, ..., 0,0,0, ..., Ξ”W,Ξ”W,Ξ”W, Ξ”W,Ξ”W,Ξ”W, ..., 0,0,0, ..., Ξ”W,Ξ”W,Ξ”W, ...], + [.............................................................................], + [Ξ”W,Ξ”W,Ξ”W, ..., 0,0,0, ..., Ξ”W,Ξ”W,Ξ”W, Ξ”W,Ξ”W,Ξ”W, ..., 0,0,0, ..., Ξ”W,Ξ”W,Ξ”W, ...]] + ↑ ↑ ↑ ↑ ↑ ↑ + ________________________________________________________________________________ + | q block 1 | k block 1 | v block 1 | q block 2 | k block 2 | v block 2 | ... + -------------------------------------------------------------------------------- + Note that in the above diagram, the size of each q block will equal q_per_kv + times the size of each k and v block. 
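A small, runnable reproduction of the index bookkeeping above for the docstring's example of n_head = 12 heads in 3 query groups; head_size = 2 is made up here only to keep the index lists short.

# Per group the fused QKV layout is [Q, Q, Q, Q, K, V], repeated for each query group.
n_head, n_query_groups, head_size = 12, 3, 2
q_per_kv = n_head // n_query_groups            # 4 queries share each key/value pair
total_qkv = q_per_kv + 2                       # 4 Q heads + 1 K + 1 V per group
out_features = n_query_groups * total_qkv * head_size

ind = range(out_features)
q_ind = [x for x in ind if (x // head_size) % total_qkv < total_qkv - 2]
k_ind = [x for x in ind if (x // head_size) % total_qkv == total_qkv - 2]
v_ind = [x for x in ind if (x // head_size) % total_qkv == total_qkv - 1]

# With head_size = 2 the first group occupies rows 0..11: queries 0..7, key 8..9, value 10..11.
assert q_ind[:8] == list(range(0, 8)) and k_ind[:2] == [8, 9] and v_ind[:2] == [10, 11]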
Args: x: tensor with weights update that will be padded with zeros if necessary @@ -330,23 +376,24 @@ def conv1d(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: [F.conv1d(a, b) for a, b in zip(input_splitted, weight_splitted)], dim=1 # (B, C_output', T) ) # (B, C_output, T) - def merge(self) -> None: - """Merges the LoRA weights into the full-rank weights (W = W + delta_W).""" - + def get_lora_AB(self) -> torch.Tensor: + """Return merged lora_A and lora_B matrices with the same shape as the pretrained weights.""" # Let's assume that: # ⚬ self.linear.weight.data: (384, 128) or (3 * embedding_size, embedding_size) # ⚬ self.lora_A.data: (4, 128) # ⚬ self.lora_B.data: (256, 2) + lora = self.conv1d( + self.lora_A.data.unsqueeze(0), # (4, 128) -> (1, 4, 128) + self.lora_B.data.unsqueeze(-1), # (256, 2) -> (256, 2, 1) + ).squeeze( + 0 + ) # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128) -> (256, 128) + return self.zero_pad(lora * self.scaling) # (256, 128) after zero_pad (384, 128) + + def merge(self) -> None: + """Merges the LoRA weights into the full-rank weights (W = W + delta_W).""" if self.r > 0 and any(self.enable_lora) and not self.merged: - delta_w = self.conv1d( - self.lora_A.data.unsqueeze(0), # (4, 128) -> (1, 4, 128) - self.lora_B.data.unsqueeze(-1), # (256, 2) -> (256, 2, 1) - ).squeeze( - 0 - ) # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128) -> (256, 128) - # W = W + delta_W (merge) - self.linear.weight.data += self.zero_pad(delta_w * self.scaling) # (256, 128) after zero_pad (384, 128) - self.merged = True + super().merge() def forward(self, x: torch.Tensor) -> torch.Tensor: """Do the forward pass. @@ -514,7 +561,7 @@ def _init_weights(self, module: nn.Module) -> None: def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: """For compatibility with base checkpoints.""" - mapping = {"lm_head.weight": "lm_head.linear.weight"} + mapping = {"lm_head.weight": "lm_head.linear.weight", "lm_head.bias": "lm_head.linear.bias"} state_dict = map_old_state_dict_weights(state_dict, mapping, prefix) super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) @@ -551,8 +598,9 @@ def __init__(self, config: Config) -> None: n_query_groups=config.n_query_groups, ) # output projection + # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head` self.proj = LoRALinear( - config.n_embd, + config.head_size * config.n_head, config.n_embd, bias=config.bias, r=(config.r if config.to_projection else 0), @@ -652,6 +700,36 @@ def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwa super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) +class GemmaMLP(LLaMAMLP): + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_fc_1 = self.fc_1(x) + x_fc_2 = self.fc_2(x) + x = torch.nn.functional.gelu(x_fc_1) * x_fc_2 + return self.proj(x) + + +class LLaMAMoE(lit_gpt.model.LLaMAMoE): + def __init__(self, config: Config) -> None: + nn.Module.__init__(self) + self.gate = LoRALinear( + config.n_embd, + config.n_expert, + bias=False, + r=(config.r if config.to_mlp else 0), + lora_alpha=config.alpha, + lora_dropout=config.dropout, + ) + self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert)) + + self.config = config + + def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None: + """For compatibility with base checkpoints.""" + mapping = {"gate.weight": "gate.linear.weight"} + state_dict = 
map_old_state_dict_weights(state_dict, mapping, prefix) + super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + def merge_lora_weights(model: GPT) -> None: """Merge LoRA weights into the full-rank weights to speed up inference.""" for module in model.modules(): diff --git a/lit_gpt/model.py b/lit_gpt/model.py index f4eb243..29f6f38 100644 --- a/lit_gpt/model.py +++ b/lit_gpt/model.py @@ -1,20 +1,20 @@ -"""Full definition of a GPT NeoX Language Model, all of it in this single file. +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +"""Full definition of a decoder-only transformer-based language model, all of it in this single file. Based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT and https://github.com/EleutherAI/gpt-neox/tree/main/megatron/model. """ + import math from typing import Any, Optional, Tuple import torch import torch.nn as nn -from lightning_utilities.core.imports import RequirementCache from typing_extensions import Self from lit_gpt.config import Config -FlashAttention2Available = bool(RequirementCache("flash-attn>=2.0.0.post1")) - class GPT(nn.Module): def __init__(self, config: Config) -> None: @@ -51,15 +51,15 @@ def max_seq_length(self, value: int) -> None: cos, sin = self.rope_cache() self.register_buffer("cos", cos, persistent=False) self.register_buffer("sin", sin, persistent=False) + # override elif value != self.cos.size(0): - # override self.cos, self.sin = self.rope_cache(device=self.cos.device) # the mask and kv cache size will get updated on `set_kv_cache`. we cannot update it here because we don't know # if the kv cache is expected def reset_parameters(self) -> None: # Trigger resetting the rope-cache - self.max_seq_length = self.config.block_size + self.cos, self.sin = self.rope_cache(device=self.cos.device) def _init_weights(self, module: nn.Module) -> None: """Meant to be used with `gpt.apply(gpt._init_weights)`.""" @@ -87,6 +87,9 @@ def forward(self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None) - mask = None x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) + if self.config.scale_embeddings: + x = x * (self.config.n_embd**0.5) + for block in self.transformer.h: x = block(x, cos, sin, mask, input_pos) x = self.transformer.ln_f(x) @@ -100,7 +103,6 @@ def rope_cache(self, device: Optional[torch.device] = None) -> Tuple[torch.Tenso return build_rope_cache( seq_len=self.max_seq_length, n_elem=self.config.rope_n_elem, - dtype=torch.get_default_dtype(), device=device, condense_ratio=self.config.rope_condense_ratio, base=self.config.rope_base, @@ -124,11 +126,9 @@ def set_kv_cache( ) if self.mask_cache is None or self.mask_cache.size(3) != max_seq_length: - # passing `attn_mask` to SDPA downgrades it to use the inefficient implementation. since we only need the mask + # passing `attn_mask` to SDPA disables the flash implementation. 
since we only need the mask # for the kv-cache support (only during inference), we only create it in that situation - # this will be resolved by https://github.com/pytorch/pytorch/issues/96099 - ones = torch.ones((max_seq_length, max_seq_length), device=device, dtype=torch.bool) - self.mask_cache = torch.tril(ones).unsqueeze(0).unsqueeze(0) + self.mask_cache = build_mask_cache(max_seq_length, device) def clear_kv_cache(self) -> None: self.mask_cache = None @@ -158,15 +158,15 @@ def forward( h = self.attn(n_1, cos, sin, mask, input_pos) if self.config.parallel_residual: n_2 = n_1 if self.config.shared_attention_norm else self.norm_2(x) - x = x + h + self.mlp(n_2) + x = self.mlp(n_2) + h + x else: if self.config.shared_attention_norm: raise NotImplementedError( "No checkpoint amongst the ones we support uses this configuration" " (non-parallel residual and shared attention norm)." ) - x = x + h - x = x + self.mlp(self.norm_2(x)) + x = h + x + x = self.mlp(self.norm_2(x)) + x return x @@ -177,7 +177,8 @@ def __init__(self, config: Config) -> None: # key, query, value projections for all heads, but in a batch self.attn = nn.Linear(config.n_embd, shape, bias=config.bias) # output projection - self.proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias) + # if `head_size` is explicitly specified in the config, `n_emd` might not be equal to `head_size * n_head` + self.proj = nn.Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias) # disabled by default self.kv_cache: Optional[KVCache] = None @@ -204,9 +205,10 @@ def forward( # split batched computation into three q, k, v = qkv.split((q_per_kv, 1, 1), dim=2) - # repeat k and v if necessary - if self.config.n_query_groups != 1: # doing this would require a full kv cache with MQA (inefficient!) 
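The mask-cache call site above now delegates to a `build_mask_cache` helper (defined later in this file). A short sketch of what that cache holds and how a single row of it can serve as the attention mask for one decode position; the row-slicing by `input_pos` is an assumption about how the cache is consumed elsewhere in model.py, not part of this hunk.

import torch

def build_mask_cache(max_seq_length: int) -> torch.Tensor:
    # Lower-triangular boolean mask with a broadcastable (1, 1, T, T) shape,
    # matching the helper this hunk factors out of `set_kv_cache`.
    ones = torch.ones((max_seq_length, max_seq_length), dtype=torch.bool)
    return torch.tril(ones).unsqueeze(0).unsqueeze(0)

mask_cache = build_mask_cache(6)
input_pos = torch.tensor([3])                  # hypothetical decode position
mask = mask_cache.index_select(2, input_pos)   # (1, 1, 1, 6): positions 0..3 visible
assert mask.squeeze().tolist() == [True, True, True, True, False, False]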
- # for MHA this is a no-op + # maybe repeat k and v if for the non multi-head attention cases + # training: flash attention requires it + # inference: multi-query would require a full kv cache so avoid it to limit its memory usage + if self.config.n_query_groups != self.config.n_head and (input_pos is None or self.config.n_query_groups != 1): k = k.expand(B, self.config.n_query_groups, q_per_kv, T, self.config.head_size) v = v.expand(B, self.config.n_query_groups, q_per_kv, T, self.config.head_size) @@ -226,7 +228,7 @@ def forward( y = self.scaled_dot_product_attention(q, k, v, mask) - y = y.reshape(B, T, C) # re-assemble all head outputs side by side + y = y.reshape(B, T, self.config.head_size * self.config.n_head) # re-assemble all head outputs side by side # output projection return self.proj(y) @@ -235,19 +237,6 @@ def scaled_dot_product_attention( self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None ) -> torch.Tensor: scale = 1.0 / math.sqrt(self.config.head_size) - if ( - FlashAttention2Available - and mask is None - and q.device.type == "cuda" - and q.dtype in (torch.float16, torch.bfloat16) - ): - from flash_attn import flash_attn_func - - # flash-attn requires (B, T, nh, hs) - q = q.transpose(1, 2) - k = k.transpose(1, 2) - v = v.transpose(1, 2) - return flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=scale, causal=True) y = torch.nn.functional.scaled_dot_product_attention( q, k, v, attn_mask=mask, dropout_p=0.0, scale=scale, is_causal=mask is None ) @@ -298,6 +287,8 @@ def __init__(self, config: Config) -> None: self.fc_2 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias) self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias) + self.config = config + def forward(self, x: torch.Tensor) -> torch.Tensor: x_fc_1 = self.fc_1(x) x_fc_2 = self.fc_2(x) @@ -305,13 +296,43 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return self.proj(x) +class GemmaMLP(LLaMAMLP): + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_fc_1 = self.fc_1(x) + x_fc_2 = self.fc_2(x) + x = torch.nn.functional.gelu(x_fc_1, approximate=self.config.gelu_approximate) * x_fc_2 + return self.proj(x) + + +class LLaMAMoE(nn.Module): + def __init__(self, config: Config) -> None: + super().__init__() + self.gate = nn.Linear(config.n_embd, config.n_expert, bias=False) + self.experts = nn.ModuleList(LLaMAMLP(config) for _ in range(config.n_expert)) + + self.config = config + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Derived from: https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219 + See also figure 1 in https://arxiv.org/abs/2211.15841 + """ + B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd) + x = x.view(-1, C) # (B*T, C) + router = self.gate(x) # (B*T, n_expert) + probs, indices = torch.topk(router, self.config.n_expert_per_token) # (B*T, n_expert_per_token) + probs = probs.softmax(dim=1, dtype=torch.float).to(dtype=x.dtype) + masks = indices.unsqueeze(-1) == torch.arange(self.config.n_expert, device=x.device) + masks = masks.permute(2, 0, 1) # (n_expert, B*T, n_expert_per_token) + y = torch.zeros_like(x) # (B*T, C) + for mask, expert in zip(masks, self.experts): + token_idx, expert_idx = torch.where(mask) + y[token_idx] += probs[token_idx, expert_idx, None] * expert(x[token_idx]) + return y.view(B, T, C) + + def build_rope_cache( - seq_len: int, - n_elem: int, - dtype: torch.dtype, - device: Optional[torch.device] = None, - base: int = 
10000, - condense_ratio: int = 1, + seq_len: int, n_elem: int, device: Optional[torch.device] = None, base: int = 10000, condense_ratio: int = 1 ) -> Tuple[torch.Tensor, torch.Tensor]: """Enhanced Transformer with Rotary Position Embedding. @@ -320,7 +341,7 @@ def build_rope_cache( https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. """ # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ - theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device) / n_elem)) + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem)) # Create position indexes `[0, 1, ..., seq_len - 1]` seq_idx = torch.arange(seq_len, device=device) / condense_ratio @@ -328,12 +349,7 @@ def build_rope_cache( # Calculate the product of position index and $\theta_i$ idx_theta = torch.outer(seq_idx, theta).repeat(1, 2) - cos, sin = torch.cos(idx_theta), torch.sin(idx_theta) - - # this is to mimic the behaviour of complex32, else we will get different results - if dtype in (torch.float16, torch.bfloat16, torch.int8): - return cos.half(), sin.half() - return cos, sin + return torch.cos(idx_theta), torch.sin(idx_theta) def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: @@ -342,7 +358,7 @@ def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.T x2 = x[..., head_size // 2 :] # (B, nh, T, hs/2) rotated = torch.cat((-x2, x1), dim=-1) # (B, nh, T, hs) roped = (x * cos) + (rotated * sin) - return roped.type_as(x) + return roped.to(dtype=x.dtype) class KVCache(nn.Module): @@ -365,3 +381,12 @@ def forward(self, input_pos: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> k = self.k.index_copy_(2, input_pos, k) v = self.v.index_copy_(2, input_pos, v) return k, v + + def reset_parameters(self) -> None: + torch.nn.init.zeros_(self.k) + torch.nn.init.zeros_(self.v) + + +def build_mask_cache(max_seq_length: int, device: Optional[torch.device] = None) -> torch.Tensor: + ones = torch.ones((max_seq_length, max_seq_length), device=device, dtype=torch.bool) + return torch.tril(ones).unsqueeze(0).unsqueeze(0) diff --git a/lit_gpt/packed_dataset.py b/lit_gpt/packed_dataset.py index 12f85f7..2b5b3d6 100644 --- a/lit_gpt/packed_dataset.py +++ b/lit_gpt/packed_dataset.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + # Very loosely inspired by indexed_dataset in Fairseq, Megatron # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/data/indexed_dataset.py @@ -219,6 +221,8 @@ def __init__(self, datasets, seed, weights=None): n_datasets = len(datasets) if weights is None: self._weights = [1 / n_datasets] * n_datasets + else: + self._weights = [w / sum(weights) for w in weights] def __iter__(self): return CombinedDatasetIterator(self._datasets, self._seed, self._weights) diff --git a/lit_gpt/rmsnorm.py b/lit_gpt/rmsnorm.py index e68a87c..dcaab67 100644 --- a/lit_gpt/rmsnorm.py +++ b/lit_gpt/rmsnorm.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import torch @@ -8,17 +10,25 @@ class RMSNorm(torch.nn.Module): https://github.com/bzhangGo/rmsnorm/blob/master/LICENSE. 
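A quick usage sketch of the reworked RoPE helpers above: the cache is now always built in float32 and `apply_rope` casts the result back to the activation dtype. Shapes and dtypes here are arbitrary, and the imports assume the package layout of this repo.

import torch
from lit_gpt.model import apply_rope, build_rope_cache

B, n_head, T, head_size = 1, 2, 8, 16
x = torch.randn(B, n_head, T, head_size)

cos, sin = build_rope_cache(seq_len=T, n_elem=head_size)
assert cos.shape == (T, head_size) and cos.dtype == torch.float32

roped = apply_rope(x.half(), cos, sin)
assert roped.dtype == torch.float16   # output follows the activation dtype, not the cache dtype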
""" - def __init__(self, size: int, dim: int = -1, eps: float = 1e-5) -> None: + def __init__(self, size: int, dim: int = -1, eps: float = 1e-6, add_unit_offset: bool = False) -> None: super().__init__() self.weight = torch.nn.Parameter(torch.ones(size)) self.eps = eps self.dim = dim + self.add_unit_offset = add_unit_offset def forward(self, x: torch.Tensor) -> torch.Tensor: + dtype = x.dtype + x = x.float() # NOTE: the original RMSNorm paper implementation is not equivalent norm_x = torch.mean(x * x, dim=self.dim, keepdim=True) x_normed = x * torch.rsqrt(norm_x + self.eps) - return self.weight * x_normed + x_normed = x_normed.to(dtype=dtype) + if self.add_unit_offset: + # Gemma model requires a unit offset + # https://github.com/google/gemma_pytorch/blob/main/gemma/model.py#L176 + return x_normed * (1 + self.weight) + return x_normed * self.weight def reset_parameters(self) -> None: torch.nn.init.ones_(self.weight) diff --git a/lit_gpt/speed_monitor.py b/lit_gpt/speed_monitor.py deleted file mode 100644 index 4d4b1b0..0000000 --- a/lit_gpt/speed_monitor.py +++ /dev/null @@ -1,425 +0,0 @@ -import time -from collections import deque -from contextlib import nullcontext -from typing import Any, Callable, Deque, Dict, Optional - -import torch -from lightning import Callback, Fabric, LightningModule, Trainer -from lightning.fabric.accelerators.xla import _XLA_GREATER_EQUAL_2_1 -from lightning.fabric.plugins import ( - BitsandbytesPrecision, - DoublePrecision, - FSDPPrecision, - HalfPrecision, - MixedPrecision, - Precision, - TransformerEnginePrecision, - XLAPrecision, -) -from lightning.fabric.utilities.rank_zero import rank_zero_only as fabric_rank_zero_only -from lightning.pytorch.plugins import ( - DoublePrecisionPlugin, - FSDPPrecisionPlugin, - HalfPrecisionPlugin, - MixedPrecisionPlugin, - XLAPrecisionPlugin, -) -from lightning.pytorch.utilities.rank_zero import rank_zero_only as trainer_rank_zero_only -from torch.utils.flop_counter import FlopCounterMode - -from lit_gpt import GPT -from lit_gpt.utils import num_parameters - -GPU_AVAILABLE_FLOPS = { - # source: https://resources.nvidia.com/en-us-tensor-core/nvidia-tensor-core-gpu-datasheet - # nvidia publishes spec sheet with a 2x sparsity factor - "h100-sxm": { - torch.float64: 67e12, - torch.float32: 67e12, - torch.bfloat16: 1.979e15 / 2, - torch.float16: 1.979e15 / 2, - torch.int8: 3.958e15 / 2, - }, - "h100-pcie": { - torch.float64: 51e12, - torch.float32: 51e12, - torch.bfloat16: 1.513e15 / 2, - torch.float16: 1.513e15 / 2, - torch.int8: 3.026e15 / 2, - }, - # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf - # sxm and pcie have same flop counts - "a100": {torch.float64: 19.5e12, torch.float32: 19.5e12, torch.bfloat16: 312e12, torch.float16: 312e12}, - # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a10/pdf/a10-datasheet.pdf - "a10g": {torch.float32: 31.2e12, torch.bfloat16: 125e12, torch.float16: 125e12}, - # source: https://images.nvidia.com/content/technologies/volta/pdf/volta-v100-datasheet-update-us-1165301-r5.pdf - "v100-sxm": {torch.float64: 7.8e12, torch.float32: 15.7e12, torch.float16: 125e12}, - "v100-pcie": {torch.float64: 7e12, torch.float32: 14e12, torch.float16: 112e12}, - "v100s-pcie": {torch.float64: 8.2e12, torch.float32: 16.4e12, torch.float16: 130e12}, - # source: https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/tesla-t4/t4-tensor-core-datasheet-951643.pdf - # sxm and pcie have same flop counts - 
"t4": {torch.float32: 8.1e12, torch.float16: 65e12, torch.int8: 130e12}, - # https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/quadro-product-literature/quadro-rtx-5000-data-sheet-us-nvidia-704120-r4-web.pdf - "quadro rtx 5000": {torch.float32: 11.2e12, torch.float16: 89.2e12}, -} - -TPU_AVAILABLE_FLOPS = { - # flop count for each TPU generation is the same for all precisions - # since bfloat16 precision is always used for performing matrix operations - # for more info: https://cloud.google.com/tpu/docs/bfloat16#choosing_bfloat16 - # source: https://arxiv.org/pdf/1907.10701.pdf - "v2": 45e12, - # source: https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#tpu_v3 - "v3": 123e12, - # source: https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#tpu_v4 - "v4": 275e12, - # source: https://cloud.google.com/tpu/docs/v5e-training - "v5litepod": 197e12, -} - - -def get_flops_available(device: torch.device, dtype: torch.dtype) -> Optional[float]: - if device.type == "cuda": - device_name = torch.cuda.get_device_name(device).lower() - if "h100" in device_name and "hbm3" in device_name: - device_name = "h100-sxm" - elif "h100" in device_name and ("pcie" in device_name or "hbm2e" in device_name): - device_name = "h100-pcie" - elif "a100" in device_name: - device_name = "a100" - elif "a10g" in device_name: - device_name = "a10g" - elif "v100-sxm" in device_name: - device_name = "v100-sxm" - elif "v100-pcie" in device_name: - device_name = "v100-pcie" - elif "t4" in device_name: - device_name = "t4" - elif "quadro rtx 5000" in device_name: - device_name = "quadro rtx 5000" - else: - device_name = None - - if device_name is not None: - try: - return int(GPU_AVAILABLE_FLOPS[device_name][dtype]) - except KeyError: - raise KeyError( - f"flop count not found for {device_name} with dtype: {dtype}; " - "MFU cannot be calculated and reported." - ) - elif device.type == "xla": - if _XLA_GREATER_EQUAL_2_1: - from torch_xla._internal import tpu - else: - from torch_xla.experimental import tpu - - device_name = tpu.get_tpu_env()["TYPE"].lower() - try: - return int(TPU_AVAILABLE_FLOPS[device_name]) - except KeyError: - raise KeyError( - f"flop count not found for {device_name} with dtype: {dtype}; MFU cannot be calculated and reported." - ) - - return None - - -# Adapted from https://github.com/mosaicml/composer/blob/f2a2dc820cb75023b9eb7c46fdfd25273712abd0/composer/callbacks/speed_monitor.py - - -class SpeedMonitorBase: - """Logs the training throughput and utilization. - - +-------------------------------------+-----------------------------------------------------------+ - | Key | Logged data | - +=====================================+===========================================================+ - | | Rolling average (over `window_size` most recent | - | `throughput/batches_per_sec` | batches) of the number of batches processed per second | - | | | - +-------------------------------------+-----------------------------------------------------------+ - | | Rolling average (over `window_size` most recent | - | `throughput/samples_per_sec` | batches) of the number of samples processed per second | - | | | - +-------------------------------------+-----------------------------------------------------------+ - | | Rolling average (over `window_size` most recent | - | `throughput/tokens_per_sec` | batches) of the number of tokens processed per second. 
| - | | This may include padding depending on dataset | - +-------------------------------------+-----------------------------------------------------------+ - | | Estimates flops by `flops_per_batch * batches_per_sec` | - | `throughput/flops_per_sec` | | - | | | - +-------------------------------------+-----------------------------------------------------------+ - | `throughput/device/batches_per_sec` | `throughput/batches_per_sec` divided by world size | - +-------------------------------------+-----------------------------------------------------------+ - | `throughput/device/samples_per_sec` | `throughput/samples_per_sec` divided by world size | - +-------------------------------------+-----------------------------------------------------------+ - | | `throughput/tokens_per_sec` divided by world size. This | - | `throughput/device/tokens_per_sec` | may include pad tokens depending on dataset | - | | | - +-------------------------------------+-----------------------------------------------------------+ - | | `throughput/flops_per_sec` divided by world size. Only | - | `throughput/device/flops_per_sec` | logged when model has attribute `flops_per_batch` | - | | | - +-------------------------------------+-----------------------------------------------------------+ - | | `throughput/device/flops_per_sec` divided by world size. | - | `throughput/device/mfu` | | - | | | - +-------------------------------------+-----------------------------------------------------------+ - | `time/train` | Total elapsed training time | - +-------------------------------------+-----------------------------------------------------------+ - | `time/val` | Total elapsed validation time | - +-------------------------------------+-----------------------------------------------------------+ - | `time/total` | Total elapsed time (time/train + time/val) | - +-------------------------------------+-----------------------------------------------------------+ - - Notes: - - The implementation assumes that devices are homogeneous as it normalizes by the world size. - - Tokens/sec, flops/sec and MFU do not account for padding tokens if present. We suggest using samples/sec or - batches/sec to measure throughput under this circumstance. - - Be careful when comparing MFU numbers across projects, as this will highly depend on the ``flops_per_batch``. - There is no widespread, realistic, and reliable implementation to compute them. - We suggest using our ``measure_flops`` function, but many other works will use ``estimated_flops`` which - will almost always be an overestimate when compared to the true value. - - Args: - window_size (int, optional): Number of batches to use for a rolling average of throughput. - Defaults to 100. - time_unit (str, optional): Time unit to use for `time` logging. Can be one of - 'seconds', 'minutes', 'hours', or 'days'. Defaults to 'hours'. 
- """ - - def __init__( - self, - flops_available: float, - log_dict: Callable[[Dict, int], None], - window_size: int = 100, - time_unit: str = "hours", - ): - self.flops_available = flops_available - self.log_dict = log_dict - - # Track the batch num samples and wct to compute throughput over a window of batches - self.history_samples: Deque[int] = deque(maxlen=window_size + 1) - self.history_wct: Deque[float] = deque(maxlen=window_size + 1) - self.history_lengths: Deque[int] = deque(maxlen=window_size + 1) - self.history_flops: Deque[int] = deque(maxlen=window_size + 1) - - self.divider = 1 - if time_unit == "seconds": - self.divider = 1 - elif time_unit == "minutes": - self.divider = 60 - elif time_unit == "hours": - self.divider = 60 * 60 - elif time_unit == "days": - self.divider = 60 * 60 * 24 - else: - raise ValueError( - f'Invalid time_unit: {time_unit}. Must be one of "seconds", "minutes", "hours", or "days".' - ) - - # Keep track of time spent evaluating - self.total_eval_wct = 0.0 - self.step = -1 - - def on_train_batch_end( - self, - samples: int, # total samples seen (per device) - train_elapsed: float, # total training time (seconds) - world_size: int, - flops_per_batch: Optional[int] = None, # (per device) - lengths: Optional[int] = None, # total length of the samples seen (per device) - ) -> None: - self.step += 1 - step = self.step - metrics = {} - - self.history_samples.append(samples) - if lengths is not None: - self.history_lengths.append(lengths) - # if lengths are passed, there should be as many values as samples - assert len(self.history_samples) == len(self.history_lengths) - self.history_wct.append(train_elapsed) - if len(self.history_wct) == self.history_wct.maxlen: - elapsed_batches = len(self.history_samples) - 1 - elapsed_samples = self.history_samples[-1] - self.history_samples[0] - elapsed_wct = self.history_wct[-1] - self.history_wct[0] - samples_per_sec = elapsed_samples * world_size / elapsed_wct - dev_samples_per_sec = elapsed_samples / elapsed_wct - metrics.update( - { - "throughput/batches_per_sec": elapsed_batches * world_size / elapsed_wct, - "throughput/samples_per_sec": samples_per_sec, - "throughput/device/batches_per_sec": elapsed_batches / elapsed_wct, - "throughput/device/samples_per_sec": dev_samples_per_sec, - } - ) - if lengths is not None: - elapsed_lengths = int(self.history_lengths[-1]) - int(self.history_lengths[0]) - avg_length = elapsed_lengths / elapsed_batches - metrics.update( - { - "throughput/tokens_per_sec": samples_per_sec * avg_length, - "throughput/device/tokens_per_sec": dev_samples_per_sec * avg_length, - } - ) - - if flops_per_batch is not None: - # sum of flops per batch across ranks - self.history_flops.append(flops_per_batch * world_size) - if len(self.history_flops) == self.history_flops.maxlen: - elapsed_flops = sum(self.history_flops) - self.history_flops[0] - elapsed_wct = self.history_wct[-1] - self.history_wct[0] - flops_per_sec = elapsed_flops / elapsed_wct - device_flops_per_sec = flops_per_sec / world_size - metrics.update( - {"throughput/flops_per_sec": flops_per_sec, "throughput/device/flops_per_sec": device_flops_per_sec} - ) - if self.flops_available: - metrics["throughput/device/mfu"] = device_flops_per_sec / self.flops_available - - metrics.update( - { - "time/train": train_elapsed / self.divider, - "time/val": self.total_eval_wct / self.divider, - "time/total": (train_elapsed + self.total_eval_wct) / self.divider, - "samples": samples, - } - ) - - self.log_dict(metrics, step) - - def eval_end(self, 
eval_elapsed: float) -> None: - self.total_eval_wct += eval_elapsed # seconds - - -def plugin_to_compute_dtype(plugin: Precision) -> torch.dtype: - if isinstance(plugin, BitsandbytesPrecision): - return plugin.dtype - if isinstance(plugin, (HalfPrecision, MixedPrecision, HalfPrecisionPlugin)): - return plugin._desired_input_dtype - if isinstance(plugin, MixedPrecisionPlugin): - return torch.bfloat16 if plugin.precision == "bf16-mixed" else torch.half - if isinstance(plugin, (DoublePrecision, DoublePrecisionPlugin)): - return torch.double - if isinstance(plugin, (XLAPrecision, XLAPrecisionPlugin)): - return plugin._desired_dtype - if isinstance(plugin, TransformerEnginePrecision): - return torch.int8 - if isinstance(plugin, (FSDPPrecision, FSDPPrecisionPlugin)): - return plugin.mixed_precision_config.reduce_dtype - if isinstance(plugin, Precision): - return torch.float32 - raise NotImplementedError(plugin) - - -class SpeedMonitorFabric(SpeedMonitorBase): - def __init__(self, fabric: Fabric, *args: Any, **kwargs: Any) -> None: - dtype = plugin_to_compute_dtype(fabric.strategy.precision) - flops_available = get_flops_available(fabric.device, dtype) - super().__init__(flops_available, fabric.log_dict, *args, **kwargs) - - @fabric_rank_zero_only - def on_train_batch_end(self, *args: Any, **kwargs: Any) -> None: - super().on_train_batch_end(*args, **kwargs) - - -class SpeedMonitorCallback(Callback): - def __init__(self, length_fn: Callable[[Any], int], batch_size: int, **kwargs: Any) -> None: - super().__init__() - self.speed_monitor: Optional[SpeedMonitorBase] = None - self.speed_monitor_kwargs = kwargs - self.length_fn = length_fn - self.batch_size = batch_size - self.eval_t0: int = 0 - self.train_t0: int = 0 - self.total_lengths: int = 0 - - def setup(self, trainer: Trainer, pl_module: LightningModule, stage: str) -> None: - if self.speed_monitor is not None: - return # already setup - dtype = plugin_to_compute_dtype(trainer.precision_plugin) - flops_available = get_flops_available(trainer.strategy.root_device, dtype) - self.speed_monitor = SpeedMonitorBase(flops_available, trainer.logger.log_metrics, **self.speed_monitor_kwargs) - - @trainer_rank_zero_only - def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: - if trainer.fit_loop._should_accumulate(): - return - - self.train_t0 = time.perf_counter() - - @trainer_rank_zero_only - def on_train_batch_end( - self, trainer: Trainer, pl_module: LightningModule, outputs: Any, batch: Any, batch_idx: int - ) -> None: - self.total_lengths += self.length_fn(batch) - if trainer.fit_loop._should_accumulate(): - return - train_elapsed = time.perf_counter() - self.train_t0 - assert self.speed_monitor is not None - iter_num = trainer.fit_loop.total_batch_idx - assert (measured_flops := pl_module.measured_flops) is not None - self.speed_monitor.on_train_batch_end( - (iter_num + 1) * self.batch_size, - train_elapsed, - # this assumes that device FLOPs are the same and that all devices have the same batch size - trainer.world_size, - flops_per_batch=measured_flops, - lengths=self.total_lengths, - ) - - @trainer_rank_zero_only - def on_validation_start(self, trainer: Trainer, pl_module: LightningModule) -> None: - self.eval_t0 = time.perf_counter() - - @trainer_rank_zero_only - def on_validation_end(self, trainer: Trainer, pl_module: LightningModule) -> None: - eval_elapsed = time.perf_counter() - self.eval_t0 - assert self.speed_monitor is not None - self.speed_monitor.eval_end(eval_elapsed) - - -def flops_per_param(max_seq_length: 
int, n_layer: int, n_embd: int, n_params: int) -> int: - flops_per_token = 2 * n_params # each parameter is used for a MAC (2 FLOPS) per network operation - # this assumes that all samples have a fixed length equal to the block size - # which is most likely false during finetuning - flops_per_seq = flops_per_token * max_seq_length - attn_flops_per_seq = n_layer * 2 * 2 * (n_embd * (max_seq_length**2)) - return flops_per_seq + attn_flops_per_seq - - -def estimate_flops(model: GPT) -> int: - """Measures estimated FLOPs for MFU. - - Refs: - * https://ar5iv.labs.arxiv.org/html/2205.05198#A1 - * https://ar5iv.labs.arxiv.org/html/2204.02311#A2 - """ - # using all parameters for this is a naive over estimation because not all model parameters actually contribute to - # this FLOP computation (e.g. embedding, norm). For this reason, the result will be higher by a fixed percentage - # (~10%) compared to the measured FLOPs, making those lower but more realistic. - # For a proper estimate, this needs a more fine-grained calculation as in Appendix A of the paper. - n_trainable_params = num_parameters(model, requires_grad=True) - trainable_flops = flops_per_param( - model.max_seq_length, model.config.n_layer, model.config.n_embd, n_trainable_params - ) - # forward + backward + gradients (assumes no gradient accumulation) - ops_per_step = 3 if model.training else 1 - n_frozen_params = num_parameters(model, requires_grad=False) - frozen_flops = flops_per_param(model.max_seq_length, model.config.n_layer, model.config.n_embd, n_frozen_params) - # forward + backward - frozen_ops_per_step = 2 if model.training else 1 - return ops_per_step * trainable_flops + frozen_ops_per_step * frozen_flops - - -def measure_flops(model: GPT, x: torch.Tensor) -> int: - """Measures real FLOPs for HFU""" - flop_counter = FlopCounterMode(model, display=False) - ctx = nullcontext() if model.training else torch.no_grad() - with ctx, flop_counter: - y = model(x) - if model.training: - y.sum().backward() - return flop_counter.get_total_flops() diff --git a/lit_gpt/tokenizer.py b/lit_gpt/tokenizer.py index 1907ca7..3a6758e 100644 --- a/lit_gpt/tokenizer.py +++ b/lit_gpt/tokenizer.py @@ -1,12 +1,18 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import json from pathlib import Path -from typing import Optional +from typing import Optional, Union import torch class Tokenizer: - def __init__(self, checkpoint_dir: Path) -> None: + def __init__(self, checkpoint_dir: Union[Path, str]) -> None: + checkpoint_dir = Path(checkpoint_dir) + if not checkpoint_dir.exists(): + raise NotADirectoryError(f"The checkpoint directory does not exist: {str(checkpoint_dir)}") + self.use_bos = self.check_if_bos_token_used(checkpoint_dir) self.bos_id = None self.eos_id = None diff --git a/lit_gpt/utils.py b/lit_gpt/utils.py index cfe0bc2..c910279 100644 --- a/lit_gpt/utils.py +++ b/lit_gpt/utils.py @@ -1,11 +1,13 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
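The Tokenizer constructor now accepts a plain string and fails fast when the directory is missing. A hypothetical usage sketch (the checkpoint path is made up):

from lit_gpt.tokenizer import Tokenizer

checkpoint_dir = "checkpoints/some-org/some-model"   # hypothetical; a plain str is now accepted
try:
    tokenizer = Tokenizer(checkpoint_dir)            # coerced to Path internally
except NotADirectoryError as err:
    # Raised immediately when the directory does not exist.
    print(err)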
+ """Utility functions for training and inference.""" + import math import pickle import sys -from contextlib import nullcontext from io import BytesIO from pathlib import Path -from typing import ContextManager, Dict, List, Mapping, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, TypeVar, Union import lightning as L import torch @@ -14,6 +16,10 @@ from lightning.fabric.strategies import FSDPStrategy from lightning.fabric.utilities.load import _lazy_load as lazy_load from torch.serialization import normalize_storage_type +from typing_extensions import Self + +if TYPE_CHECKING: + from lit_gpt import GPT def find_multiple(n: int, k: int) -> int: @@ -35,28 +41,12 @@ def num_parameters(module: nn.Module, requires_grad: Optional[bool] = None) -> i return total -def gptq_quantization(enabled: bool = False) -> ContextManager: - if not enabled: - return nullcontext() - - from lightning.fabric.plugins.precision.utils import _ClassReplacementContextManager - - from quantize.gptq import ColBlockQuantizedLinear - - class QuantizedLinear(ColBlockQuantizedLinear): - def __init__(self, *args, **kwargs): - super().__init__(*args, bits=4, tile_cols=-1, **kwargs) - - return _ClassReplacementContextManager({"torch.nn.Linear": QuantizedLinear}) - - def check_valid_checkpoint_dir(checkpoint_dir: Path) -> None: files = { "lit_model.pth": (checkpoint_dir / "lit_model.pth").is_file(), "lit_config.json": (checkpoint_dir / "lit_config.json").is_file(), - "tokenizer.json OR tokenizer.model": (checkpoint_dir / "tokenizer.json").is_file() or ( - checkpoint_dir / "tokenizer.model" - ).is_file(), + "tokenizer.json OR tokenizer.model": (checkpoint_dir / "tokenizer.json").is_file() + or (checkpoint_dir / "tokenizer.model").is_file(), "tokenizer_config.json": (checkpoint_dir / "tokenizer_config.json").is_file(), } if checkpoint_dir.is_dir(): @@ -236,7 +226,10 @@ def __exit__(self, type, value, traceback): def chunked_cross_entropy( - logits: Union[torch.Tensor, List[torch.Tensor]], targets: torch.Tensor, chunk_size: int = 128 + logits: Union[torch.Tensor, List[torch.Tensor]], + targets: torch.Tensor, + chunk_size: int = 128, + ignore_index: int = -1, ) -> torch.Tensor: # with large max_sequence_lengths, the beginning of `backward` allocates a large memory chunk which can dominate # the memory usage in fine-tuning settings with low number of parameters. 
@@ -250,31 +243,33 @@ def chunked_cross_entropy( logits = torch.cat(logits, dim=1) logits = logits.reshape(-1, logits.size(-1)) targets = targets.reshape(-1) - return torch.nn.functional.cross_entropy(logits, targets, ignore_index=-1) + return torch.nn.functional.cross_entropy(logits, targets, ignore_index=ignore_index) # chunk cross entropy logit_chunks = [logit_chunk.reshape(-1, logit_chunk.size(-1)) for logit_chunk in logits] target_chunks = [target_chunk.reshape(-1) for target_chunk in targets.split(logits[0].size(1), dim=1)] loss_chunks = [ - torch.nn.functional.cross_entropy(logit_chunk, target_chunk, ignore_index=-1, reduction="none") + torch.nn.functional.cross_entropy(logit_chunk, target_chunk, ignore_index=ignore_index, reduction="none") for logit_chunk, target_chunk in zip(logit_chunks, target_chunks) ] - return torch.cat(loss_chunks).mean() + non_masked_elems = (targets != ignore_index).sum() + return torch.cat(loss_chunks).sum() / max(1, non_masked_elems) # no chunking at all logits = logits.reshape(-1, logits.size(-1)) targets = targets.reshape(-1) if chunk_size == 0: - return torch.nn.functional.cross_entropy(logits, targets, ignore_index=-1) + return torch.nn.functional.cross_entropy(logits, targets, ignore_index=ignore_index) # lm_head wasn't chunked, chunk cross entropy logit_chunks = logits.split(chunk_size) target_chunks = targets.split(chunk_size) loss_chunks = [ - torch.nn.functional.cross_entropy(logit_chunk, target_chunk, ignore_index=-1, reduction="none") + torch.nn.functional.cross_entropy(logit_chunk, target_chunk, ignore_index=ignore_index, reduction="none") for logit_chunk, target_chunk in zip(logit_chunks, target_chunks) ] - return torch.cat(loss_chunks).mean() + non_masked_elems = (targets != ignore_index).sum() + return torch.cat(loss_chunks).sum() / max(1, non_masked_elems) def map_old_state_dict_weights(state_dict: Dict, mapping: Mapping, prefix: str) -> Dict: @@ -309,3 +304,76 @@ def load_checkpoint(fabric: L.Fabric, model: nn.Module, checkpoint_path: Path, s state_dict = lazy_load(checkpoint_path) state_dict = state_dict.get("model", state_dict) model.load_state_dict(state_dict, strict=strict) + + +def flops_per_param(max_seq_length: int, n_layer: int, n_embd: int, n_params: int) -> int: + flops_per_token = 2 * n_params # each parameter is used for a MAC (2 FLOPS) per network operation + # this assumes that all samples have a fixed length equal to the block size + # which is most likely false during finetuning + flops_per_seq = flops_per_token * max_seq_length + attn_flops_per_seq = n_layer * 2 * 2 * (n_embd * (max_seq_length**2)) + return flops_per_seq + attn_flops_per_seq + + +def estimate_flops(model: "GPT", training: bool) -> int: + """Measures estimated FLOPs for MFU. + + Refs: + * https://ar5iv.labs.arxiv.org/html/2205.05198#A1 + * https://ar5iv.labs.arxiv.org/html/2204.02311#A2 + """ + # using all parameters for this is a naive over estimation because not all model parameters actually contribute to + # this FLOP computation (e.g. embedding, norm). For this reason, the result will be higher by a fixed percentage + # (~10%) compared to the measured FLOPs, making those lower but more realistic. + # For a proper estimate, this needs a more fine-grained calculation as in Appendix A of the paper. 
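A back-of-the-envelope use of the `flops_per_param` arithmetic above, with a hypothetical Pythia-70m-like shape; the numbers are purely illustrative.

n_layer, n_embd, max_seq_length = 6, 512, 2048
n_params = 70_000_000

flops_per_token = 2 * n_params                       # one multiply-accumulate per parameter
flops_per_seq = flops_per_token * max_seq_length
attn_flops_per_seq = n_layer * 2 * 2 * (n_embd * max_seq_length**2)
total = flops_per_seq + attn_flops_per_seq
print(f"{total / 1e12:.2f} TFLOPs per sequence")     # ~0.34 TFLOPs with these numbers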
+ n_trainable_params = num_parameters(model, requires_grad=True) + trainable_flops = flops_per_param( + model.max_seq_length, model.config.n_layer, model.config.n_embd, n_trainable_params + ) + # forward + backward + gradients (assumes no gradient accumulation) + ops_per_step = 3 if training else 1 + n_frozen_params = num_parameters(model, requires_grad=False) + frozen_flops = flops_per_param(model.max_seq_length, model.config.n_layer, model.config.n_embd, n_frozen_params) + # forward + backward + frozen_ops_per_step = 2 if training else 1 + return ops_per_step * trainable_flops + frozen_ops_per_step * frozen_flops + + +class CycleIterator: + """An iterator that cycles through an iterable indefinitely. + + Example: + >>> iterator = CycleIterator([1, 2, 3]) + >>> [next(iterator) for _ in range(5)] + [1, 2, 3, 1, 2] + + Note: + Unlike ``itertools.cycle``, this iterator does not cache the values of the iterable. + """ + + def __init__(self, iterable: Iterable) -> None: + self.iterable = iterable + self.epoch = 0 + self._iterator = None + + def __next__(self) -> Any: + if self._iterator is None: + self._iterator = iter(self.iterable) + try: + return next(self._iterator) + except StopIteration: + self._iterator = iter(self.iterable) + self.epoch += 1 + return next(self._iterator) + + def __iter__(self) -> Self: + return self + + +def CLI(*args: Any, **kwargs: Any) -> Any: + from jsonargparse import CLI, set_docstring_parse_options + + set_docstring_parse_options(attribute_docstrings=True) + + kwargs.setdefault("as_positional", False) + return CLI(*args, **kwargs) diff --git a/notebooks/falcon-inference.ipynb b/notebooks/falcon-inference.ipynb index 4ce0de2..3dab497 100644 --- a/notebooks/falcon-inference.ipynb +++ b/notebooks/falcon-inference.ipynb @@ -48,7 +48,7 @@ "outputs": [], "source": [ "# install the dependencies\n", - "!pip install huggingface_hub tokenizers sentencepiece -r requirements.txt -q" + "!pip install -r requirements-all.txt -q" ] }, { diff --git a/pretrain/openwebtext.py b/pretrain/openwebtext.py index d1d4c95..653b21f 100644 --- a/pretrain/openwebtext.py +++ b/pretrain/openwebtext.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
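A usage sketch of the `CycleIterator` added above, highlighting the `epoch` counter that the docstring example does not show; the import assumes the class is available from lit_gpt.utils as added in this patch.

from lit_gpt.utils import CycleIterator

loader = CycleIterator(["batch-0", "batch-1", "batch-2"])   # any re-iterable works, e.g. a DataLoader
seen = [next(loader) for _ in range(7)]
# After wrapping around twice, the epoch counter reflects how many full passes finished.
assert seen == ["batch-0", "batch-1", "batch-2"] * 2 + ["batch-0"]
assert loader.epoch == 2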
+ import math import sys import time @@ -9,6 +11,7 @@ import torch from lightning.fabric.loggers import CSVLogger from lightning.fabric.strategies import FSDPStrategy +from lightning.fabric.utilities import ThroughputMonitor, measure_flops from torch.utils.data import DataLoader, IterableDataset # support running without installing as a package @@ -16,41 +19,36 @@ sys.path.append(str(wd)) from lit_gpt import Config +from lit_gpt.args import EvalArgs, IOArgs, TrainArgs from lit_gpt.model import GPT, Block -from lit_gpt.speed_monitor import SpeedMonitorBase, estimate_flops, measure_flops -from lit_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor -from lit_gpt.utils import chunked_cross_entropy, get_default_supported_precision, num_parameters - -model_name = "pythia-70m" -name = "openwebtext" -out_dir = Path("out") / name -data_dir = Path("data") / name -save_interval = 10 -eval_interval = 1000 -eval_iters = 100 -log_interval = 1 - -# Hyperparameters -learning_rate = 6e-4 -batch_size = 125 -micro_batch_size = 5 -gradient_accumulation_steps = batch_size // micro_batch_size -assert gradient_accumulation_steps > 0 -max_iters = 600000 # num_epochs * (epoch_size // micro_batch_size) // devices -weight_decay = 1e-1 -beta1 = 0.9 -beta2 = 0.95 -grad_clip = 1.0 -decay_lr = True -warmup_iters = 2000 -lr_decay_iters = max_iters -min_lr = 6e-5 - -hparams = {k: v for k, v in locals().items() if isinstance(v, (int, float, str)) and not k.startswith("_")} -logger = CSVLogger("out", name, flush_logs_every_n_steps=log_interval) - - -def setup(devices: int = 1, precision: Optional[str] = None, resume: Union[bool, Path] = False) -> None: +from lit_gpt.utils import CLI, chunked_cross_entropy, estimate_flops, get_default_supported_precision, num_parameters + + +def setup( + model_name: str = "pythia-70m", + precision: Optional[str] = None, + resume: Union[bool, Path] = False, + seed: int = 1337, + devices: int = 1, + io: IOArgs = IOArgs(train_data_dir=Path("data/openwebtext"), val_data_dir=None, out_dir=Path("out/openwebtext")), + train: TrainArgs = TrainArgs( + save_interval=1000, + log_interval=1, + global_batch_size=125, + micro_batch_size=5, + lr_warmup_steps=100, + epochs=1, + epoch_size=600000, + learning_rate=6e-4, + weight_decay=1e-1, + beta1=0.9, + beta2=0.95, + max_norm=1.0, + min_lr=6e-5, + ), + eval: EvalArgs = EvalArgs(interval=1000, max_iters=100), +) -> None: + print(locals()) precision = precision or get_default_supported_precision(training=True) if devices > 1: @@ -64,143 +62,165 @@ def setup(devices: int = 1, precision: Optional[str] = None, resume: Union[bool, else: strategy = "auto" + logger = CSVLogger(io.out_dir.parent, io.out_dir.name, flush_logs_every_n_steps=train.log_interval) fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger) - fabric.print(hparams) - fabric.launch(main, resume=resume) + + fabric.launch(main, devices, resume, seed, Config.from_name(name=model_name), io, train, eval) -def main(fabric: L.Fabric, resume: Union[bool, Path]) -> None: - speed_monitor = SpeedMonitor(fabric, window_size=50, time_unit="seconds") +def main( + fabric: L.Fabric, + devices: int, + resume: Union[bool, Path], + seed: int, + config: Config, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, +) -> None: + validate_args(io, train, eval) if fabric.global_rank == 0: - out_dir.mkdir(parents=True, exist_ok=True) + io.out_dir.mkdir(parents=True, exist_ok=True) - fabric.seed_everything(1337, workers=True) # same seed for every process to init model (FSDP) + 
fabric.seed_everything(seed, workers=True) # same seed for every process to init model (FSDP) - config = Config.from_name(model_name) fabric.print(f"Loading model with {config.__dict__}") t0 = time.perf_counter() - with fabric.init_module(empty_init=True): + with fabric.init_module(empty_init=(fabric.world_size > 1)): model = GPT(config) - model.apply(model._init_weights) + model.apply(model._init_weights) fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.") fabric.print(f"Total parameters {num_parameters(model):,}") model = fabric.setup(model) optimizer = torch.optim.AdamW( - model.parameters(), lr=learning_rate, weight_decay=weight_decay, betas=(beta1, beta2), foreach=False + model.parameters(), + lr=train.learning_rate, + weight_decay=train.weight_decay, + betas=(train.beta1, train.beta2), + foreach=False, ) optimizer = fabric.setup_optimizers(optimizer) - train_data, val_data = load_datasets(data_dir, max_seq_length=model.max_seq_length) - train_dataloader = DataLoader(train_data, batch_size=micro_batch_size, num_workers=2) - val_dataloader = DataLoader(val_data, batch_size=micro_batch_size, num_workers=2) + train_data, val_data = load_datasets(io, max_seq_length=model.max_seq_length) + train_dataloader = DataLoader(train_data, batch_size=train.micro_batch_size, num_workers=2) + val_dataloader = DataLoader(val_data, batch_size=train.micro_batch_size, num_workers=2) train_dataloader, val_dataloader = fabric.setup_dataloaders(train_dataloader, val_dataloader) - state = {"model": model, "optimizer": optimizer, "hparams": hparams, "iter_num": 0, "step_count": 0} + state = {"model": model, "optimizer": optimizer, "iter_num": 0, "step_count": 0} if resume is True: - resume = sorted(out_dir.glob("*.pth"))[-1] + resume = max(io.out_dir.glob("*.pth"), key=lambda p: int(p.name.split("-")[1])) if resume: fabric.print(f"Resuming training from {resume}") fabric.load(resume, state) train_time = time.perf_counter() - train(fabric, state, train_dataloader, val_dataloader, speed_monitor) + fit(fabric, devices, state, train_dataloader, val_dataloader, io, train, eval) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") -def train( +def fit( fabric: L.Fabric, + devices: int, state: dict, train_dataloader: DataLoader, val_dataloader: DataLoader, - speed_monitor: SpeedMonitorBase, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, ) -> None: model = state["model"] optimizer = state["optimizer"] - validate(fabric, model, val_dataloader) # sanity check + validate(fabric, model, val_dataloader, max_iters=2) # sanity check with torch.device("meta"): meta_model = GPT(model.config) # "estimated" is not as precise as "measured". Estimated is optimistic but widely used in the wild. 
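The resume logic above now picks the latest checkpoint by parsing the step number out of the filename rather than taking the last entry of a lexicographic sort. A tiny illustration with hypothetical names following the `iter-{iter_num:06d}-ckpt.pth` pattern this script saves:

from pathlib import Path

ckpts = [Path("out/iter-999999-ckpt.pth"), Path("out/iter-1000000-ckpt.pth")]

latest = max(ckpts, key=lambda p: int(p.name.split("-")[1]))
assert latest.name == "iter-1000000-ckpt.pth"
# A plain lexicographic sort would pick "iter-999999-..." here, since "9" > "1" as a character.
assert sorted(ckpts)[-1].name == "iter-999999-ckpt.pth"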
# When comparing MFU or FLOP numbers with other projects that use estimated FLOPs, - # consider passing `SpeedMonitor(flops_per_batch=estimated_flops)` instead - estimated_flops = estimate_flops(meta_model) * micro_batch_size + # consider passing `flops_per_batch=estimated_flops` instead + estimated_flops = estimate_flops(meta_model, training=True) * train.micro_batch_size fabric.print(f"Estimated TFLOPs: {estimated_flops * fabric.world_size / 1e12:.2f}") - x = torch.randint(0, 1, (micro_batch_size, model.max_seq_length)) - measured_flops = measure_flops(meta_model, x) + x = torch.randint(0, 1, (train.micro_batch_size, model.max_seq_length)) + forward_fn = lambda: meta_model(x) + loss_fn = lambda y: chunked_cross_entropy(y, x, chunk_size=0) + measured_flops = measure_flops(meta_model, forward_fn, loss_fn) fabric.print(f"Measured TFLOPs: {measured_flops * fabric.world_size / 1e12:.2f}") del meta_model, x - total_lengths = 0 + throughput = ThroughputMonitor(fabric, window_size=50) total_t0 = time.perf_counter() train_iter = iter(train_dataloader) - for state["iter_num"] in range(state["iter_num"], max_iters): + lr_warmup_iters = train.lr_warmup_steps * train.gradient_accumulation_iters(devices) + for state["iter_num"] in range(state["iter_num"], train.max_iters(devices)): # determine and set the learning rate for this iteration - lr = get_lr(state["iter_num"]) if decay_lr else learning_rate + lr = get_lr( + train.learning_rate, state["iter_num"], lr_warmup_iters, train.max_iters(devices), min_lr=train.min_lr + ) for param_group in optimizer.param_groups: param_group["lr"] = lr + iter_num = state["iter_num"] + 1 iter_t0 = time.perf_counter() input_ids, targets = next(train_iter) - is_accumulating = (state["iter_num"] + 1) % gradient_accumulation_steps != 0 + is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids) loss = chunked_cross_entropy(logits, targets, chunk_size=0) - fabric.backward(loss / gradient_accumulation_steps) + fabric.backward(loss / train.gradient_accumulation_iters(devices)) if not is_accumulating: - fabric.clip_gradients(model, optimizer, max_norm=grad_clip) + fabric.clip_gradients(model, optimizer, max_norm=train.max_norm) optimizer.step() optimizer.zero_grad() state["step_count"] += 1 - t1 = time.perf_counter() - total_lengths += input_ids.size(1) - speed_monitor.on_train_batch_end( - (state["iter_num"] + 1) * micro_batch_size, - t1 - total_t0, - # this assumes that device FLOPs are the same and that all devices have the same batch size - fabric.world_size, - flops_per_batch=measured_flops, - lengths=total_lengths, - ) - if state["iter_num"] % log_interval == 0: + if iter_num % train.log_interval == 0: + loss_item = loss.item() # expensive device-to-host synchronization + t1 = time.perf_counter() + throughput.update( + time=t1 - total_t0, + batches=iter_num, + samples=iter_num * train.micro_batch_size, + lengths=iter_num * train.micro_batch_size * model.max_seq_length, + flops=measured_flops * train.log_interval, + ) + throughput.compute_and_log(step=iter_num) fabric.print( - f"iter {state['iter_num']} step {state['step_count']}: loss {loss.item():.4f}, iter time:" + f"iter {iter_num} step {state['step_count']}: loss {loss_item:.4f}, iter time:" f" {(t1 - iter_t0) * 1000:.2f}ms{' (optimizer.step)' if not is_accumulating else ''}" ) - if not is_accumulating and state["step_count"] % eval_interval == 0: + if not is_accumulating and state["step_count"] % eval.interval 
== 0: t0 = time.perf_counter() - val_loss = validate(fabric, model, val_dataloader) + val_loss = validate(fabric, model, val_dataloader, max_iters=eval.max_iters) t1 = time.perf_counter() - t0 - speed_monitor.eval_end(t1) - fabric.print(f"step {state['iter_num']}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f}ms") + fabric.print(f"step {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f}ms") fabric.barrier() - if not is_accumulating and state["step_count"] % save_interval == 0: - checkpoint_path = out_dir / f"iter-{state['iter_num']:06d}-ckpt.pth" + if not is_accumulating and state["step_count"] % train.save_interval == 0: + checkpoint_path = io.out_dir / f"iter-{iter_num:06d}-ckpt.pth" fabric.print(f"Saving checkpoint to {str(checkpoint_path)!r}") fabric.save(checkpoint_path, state) -@torch.inference_mode() -def validate(fabric: L.Fabric, model: torch.nn.Module, val_dataloader: DataLoader) -> torch.Tensor: +# FSDP has issues with `inference_mode` +@torch.no_grad() +def validate(fabric: L.Fabric, model: torch.nn.Module, val_dataloader: DataLoader, max_iters: int) -> torch.Tensor: fabric.print("Validating ...") model.eval() val_iter = iter(val_dataloader) - losses = torch.zeros(eval_iters, device=fabric.device) - for k in range(eval_iters): + losses = torch.zeros(max_iters, device=fabric.device) + for k in range(max_iters): input_ids, targets = next(val_iter) logits = model(input_ids) losses[k] = chunked_cross_entropy(logits, targets, chunk_size=0) @@ -210,9 +230,9 @@ def validate(fabric: L.Fabric, model: torch.nn.Module, val_dataloader: DataLoade return out -def load_datasets(data_dir: Path, max_seq_length: int) -> Tuple["Dataset", "Dataset"]: - train_data = Dataset(data_dir / "train.bin", max_seq_length) - val_data = Dataset(data_dir / "val.bin", max_seq_length) +def load_datasets(io: IOArgs, max_seq_length: int) -> Tuple["Dataset", "Dataset"]: + train_data = Dataset(io.train_data_dir / "train.bin", max_seq_length) + val_data = Dataset(io.val_data_dir / "val.bin", max_seq_length) return train_data, val_data @@ -231,26 +251,38 @@ def __iter__(self): yield x, y -# learning rate decay scheduler (cosine with warmup) -def get_lr(it: int) -> float: +# learning rate decay scheduler (cosine with linear warmup) +def get_lr(learning_rate: float, it: int, warmup_iters: int, max_iters: int, min_lr: float) -> float: # 1) linear warmup for warmup_iters steps if it < warmup_iters: return learning_rate * it / warmup_iters - # 2) if it > lr_decay_iters, return min learning rate - if it > lr_decay_iters: + # 2) if it > max_iters, return min learning rate + if it > max_iters: return min_lr # 3) in between, use cosine decay down to min learning rate - decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters) + decay_ratio = (it - warmup_iters) / (max_iters - warmup_iters) assert 0 <= decay_ratio <= 1 coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1 return min_lr + coeff * (learning_rate - min_lr) +def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + issues = [] + unsupported = [(io, ["checkpoint_dir"]), (train, ["max_tokens"]), (eval, ["max_new_tokens"])] + for args, names in unsupported: + for name in names: + if getattr(args, name) is not None: + issues.append(f"{__file__} doesn't support the {name!r} argument. 
This is set in {args}") + required = [(io, ["train_data_dir", "val_data_dir"]), (train, ["epoch_size", "epochs", "max_norm"])] + for args, names in required: + for name in names: + if getattr(args, name) is None: + issues.append(f"{__file__} requires the {name!r} argument. This is set in {args}") + if issues: + raise ValueError("\n".join(issues)) + + if __name__ == "__main__": - # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false" - # torch.backends.cuda.enable_flash_sdp(False) torch.set_float32_matmul_precision("high") - from jsonargparse import CLI - CLI(setup) diff --git a/pretrain/openwebtext_trainer.py b/pretrain/openwebtext_trainer.py deleted file mode 100644 index dd26e37..0000000 --- a/pretrain/openwebtext_trainer.py +++ /dev/null @@ -1,199 +0,0 @@ -import math -import sys -import time -from pathlib import Path -from typing import Any, Optional - -import lightning as L -import numpy as np -import torch -from lightning.pytorch.callbacks import ModelCheckpoint -from lightning.pytorch.loggers import CSVLogger -from lightning.pytorch.strategies import FSDPStrategy -from torch.utils.data import DataLoader, IterableDataset - -# support running without installing as a package -wd = Path(__file__).parent.parent.resolve() -sys.path.append(str(wd)) - -from lit_gpt import Config -from lit_gpt.model import GPT, Block -from lit_gpt.speed_monitor import SpeedMonitorCallback, estimate_flops, measure_flops -from lit_gpt.utils import chunked_cross_entropy, get_default_supported_precision - -model_name = "pythia-70m" -name = "openwebtext" -out_dir = Path("out") / name -data_dir = Path("data") / name -save_interval = 1000 -eval_interval = 1000 -eval_iters = 100 -log_interval = 1 - -# Hyperparameters -learning_rate = 6e-4 -batch_size = 125 -micro_batch_size = 5 -gradient_accumulation_steps = batch_size // micro_batch_size -assert gradient_accumulation_steps > 0 -max_iters = 600000 # num_epochs * (epoch_size // micro_batch_size) // devices -weight_decay = 1e-1 -beta1 = 0.9 -beta2 = 0.95 -decay_lr = True -warmup_iters = 2000 -lr_decay_iters = max_iters -min_lr = 6e-5 - -hparams = {k: v for k, v in locals().items() if isinstance(v, (int, float, str)) and not k.startswith("_")} - - -class LightningGPTModule(L.LightningModule): - def __init__(self, config: Config) -> None: - super().__init__() - self.config = config - self.module: Optional[torch.nn.Module] = None - self.measured_flops: Optional[int] = None - - def configure_model(self) -> None: - self.module = GPT(self.config) - self.module.apply(self.module._init_weights) - - def configure_optimizers(self) -> torch.optim.Optimizer: - return torch.optim.AdamW( - self.module.parameters(), lr=learning_rate, weight_decay=weight_decay, betas=(beta1, beta2), foreach=False - ) - - def on_fit_start(self) -> None: - trainer = self.trainer - with torch.device("meta"): - meta_model = GPT(self.module.config) - # "estimated" is not as precise as "measured". Estimated is optimistic but widely used in the wild. 
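# NOTE (illustrative sketch, not part of the patch): a quick sanity check of the reworked
# cosine-with-linear-warmup schedule in the pretraining script above, now that every constant is an
# explicit argument of get_lr() instead of a module-level global. The numbers below are example
# values only, not the defaults of any particular config.
def _check_get_lr() -> None:
    lr_max, lr_min, warmup, horizon = 6e-4, 6e-5, 100, 600_000
    assert get_lr(lr_max, 50, warmup, horizon, lr_min) == lr_max * 50 / warmup    # linear ramp during warmup
    assert abs(get_lr(lr_max, warmup, warmup, horizon, lr_min) - lr_max) < 1e-12  # peak at the end of warmup
    assert get_lr(lr_max, horizon + 1, warmup, horizon, lr_min) == lr_min         # clamped past max_iters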
- # When comparing MFU or FLOP numbers with other projects that use estimated FLOPs, - # consider setting `self.measured_flops = estimated_flops` instead - estimated_flops = estimate_flops(meta_model) * micro_batch_size - self.print(f"Estimated TFLOPs: {estimated_flops * trainer.world_size / 1e12:.2f}") - x = torch.randint(0, 1, (micro_batch_size, meta_model.max_seq_length)) - self.measured_flops = measure_flops(meta_model, x) - self.print(f"Measured TFLOPs: {self.measured_flops * trainer.world_size / 1e12:.2f}") - - def on_train_batch_start(self, batch: Any, batch_idx: int) -> None: - if not decay_lr: - return - # determine and set the learning rate for this iteration - lr = get_lr(self.trainer.fit_loop.total_batch_idx) - for optimizer in self.trainer.strategy.optimizers: - for param_group in optimizer.param_groups: - param_group["lr"] = lr - - def training_step(self, batch: Any, batch_idx: int) -> torch.Tensor: - input_ids, targets = batch - logits = self.module(input_ids) - loss = chunked_cross_entropy(logits, targets, chunk_size=0) - self.log("train_loss", loss, on_step=True, on_epoch=False, prog_bar=True) - return loss - - def validation_step(self, batch: Any, batch_idx: int) -> None: - input_ids, targets = batch - logits = self.module(input_ids) - loss = chunked_cross_entropy(logits, targets, chunk_size=0) - self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True) - - -def main(devices: int = 1, precision: Optional[str] = None) -> None: - precision = precision or get_default_supported_precision(training=True) - - if devices > 1: - strategy = FSDPStrategy( - auto_wrap_policy={Block}, - activation_checkpointing_policy={Block}, - # the argument is not available in the Trainer strategy, but it's the default anyways - # state_dict_type="full", - limit_all_gathers=True, - cpu_offload=False, - ) - else: - strategy = "auto" - - logger = CSVLogger("out", name, flush_logs_every_n_steps=log_interval) - speed_monitor = SpeedMonitorCallback( - length_fn=lambda batch: batch[0].size(1), batch_size=micro_batch_size, window_size=50, time_unit="seconds" - ) - model_checkpoint = ModelCheckpoint(dirpath=out_dir, every_n_train_steps=save_interval, save_last=True, verbose=True) - trainer = L.Trainer( - devices=devices, - strategy=strategy, - precision=precision, - logger=logger, - callbacks=[speed_monitor, model_checkpoint], - max_steps=max_iters, - max_epochs=1, - limit_val_batches=eval_iters, - accumulate_grad_batches=gradient_accumulation_steps, - log_every_n_steps=log_interval, - val_check_interval=eval_interval, - ) - - L.seed_everything(1337, workers=True) # same seed for every process to init model (FSDP) - - trainer.print(hparams) - - if trainer.global_rank == 0: - out_dir.mkdir(parents=True, exist_ok=True) - - config = Config.from_name(model_name) - trainer.print(f"Loading model with {config.__dict__}") - t0 = time.perf_counter() - model = LightningGPTModule(config) - trainer.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.") - - train_data = Dataset(str(data_dir / "train.bin"), config.block_size) - val_data = Dataset(str(data_dir / "val.bin"), config.block_size) - train_dataloader = DataLoader(train_data, batch_size=micro_batch_size, num_workers=2) - val_dataloader = DataLoader(val_data, batch_size=micro_batch_size, num_workers=2) - - t0 = time.perf_counter() - trainer.fit(model, train_dataloader, val_dataloader, ckpt_path="last") - trainer.print(f"Training time: {(time.perf_counter()-t0):.2f}s") - if trainer.strategy.root_device.type == "cuda": - 
trainer.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") - - -class Dataset(IterableDataset): - def __init__(self, data_file: Path, block_size: int): - super().__init__() - self.data_file = data_file - self.block_size = block_size - - def __iter__(self): - data = np.memmap(self.data_file, dtype=np.uint16, mode="r") - while True: - i = torch.randint(len(data) - self.block_size, (1,)).item() - x = torch.from_numpy((data[i : i + self.block_size]).astype(np.int64)) - y = torch.from_numpy((data[i + 1 : i + 1 + self.block_size]).astype(np.int64)) - yield x, y - - -# learning rate decay scheduler (cosine with warmup) -def get_lr(it: int) -> float: - # 1) linear warmup for warmup_iters steps - if it < warmup_iters: - return learning_rate * it / warmup_iters - # 2) if it > lr_decay_iters, return min learning rate - if it > lr_decay_iters: - return min_lr - # 3) in between, use cosine decay down to min learning rate - decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters) - assert 0 <= decay_ratio <= 1 - coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1 - return min_lr + coeff * (learning_rate - min_lr) - - -if __name__ == "__main__": - # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false" - # torch.backends.cuda.enable_flash_sdp(False) - torch.set_float32_matmul_precision("high") - - from jsonargparse import CLI - - CLI(main) diff --git a/pretrain/redpajama.py b/pretrain/redpajama.py index 820dbcc..31e5431 100644 --- a/pretrain/redpajama.py +++ b/pretrain/redpajama.py @@ -1,4 +1,5 @@ -import glob +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import math import sys import time @@ -9,42 +10,18 @@ import torch from lightning.fabric.loggers import CSVLogger from lightning.fabric.strategies import FSDPStrategy +from lightning.fabric.utilities import ThroughputMonitor, measure_flops from torch.utils.data import DataLoader # support running without installing as a package wd = Path(__file__).parent.parent.resolve() sys.path.append(str(wd)) -from lit_gpt.model import GPT, Block, Config +from lit_gpt import Config +from lit_gpt.args import EvalArgs, IOArgs, TrainArgs +from lit_gpt.model import GPT, Block from lit_gpt.packed_dataset import CombinedDataset, PackedDataset -from lit_gpt.speed_monitor import SpeedMonitorBase, estimate_flops, measure_flops -from lit_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor -from lit_gpt.utils import chunked_cross_entropy, get_default_supported_precision, num_parameters - -model_name = "Llama-2-7b-hf" -name = "redpajama" -out_dir = Path("out") / name -save_interval = 1000 -eval_interval = 1000 -eval_iters = 100 -log_interval = 1 - -# Hyperparameters -learning_rate = 6e-4 -batch_size = 125 -micro_batch_size = 6 -gradient_accumulation_steps = batch_size // micro_batch_size -assert gradient_accumulation_steps > 0 -max_iters = 600000 # num_epochs * (epoch_size // micro_batch_size) // devices -weight_decay = 1e-1 -beta1 = 0.9 -beta2 = 0.95 -grad_clip = 1.0 -decay_lr = True -warmup_iters = 2000 -lr_decay_iters = max_iters -min_lr = 6e-5 - +from lit_gpt.utils import CLI, chunked_cross_entropy, estimate_flops, get_default_supported_precision, num_parameters # Data proportions from https://arxiv.org/pdf/2302.13971.pdf Table 1 data_config = [ @@ -57,17 +34,33 @@ ("wikipedia", 4.5), ] -hparams = {k: v for k, v in locals().items() if isinstance(v, (int, float, str)) and not k.startswith("_")} -logger = CSVLogger("out", name, 
flush_logs_every_n_steps=log_interval) - def setup( - devices: int = 4, - train_data_dir: Path = Path("data/redpajama_sample"), + model_name: str = "Llama-2-7b-hf", val_data_dir: Optional[Path] = None, precision: Optional[str] = None, resume: Union[bool, Path] = False, + seed: int = 1337, + devices: int = 4, + io: IOArgs = IOArgs(train_data_dir=Path("data/redpajama_sample"), val_data_dir=None, out_dir=Path("out/redpajama")), + train: TrainArgs = TrainArgs( + save_interval=1000, + log_interval=1, + global_batch_size=125, + micro_batch_size=6, + lr_warmup_steps=100, + epochs=1, + epoch_size=600000, + learning_rate=6e-4, + weight_decay=1e-1, + beta1=0.9, + beta2=0.95, + max_norm=1.0, + min_lr=6e-5, + ), + eval: EvalArgs = EvalArgs(interval=1000, max_iters=100), ) -> None: + print(locals()) precision = precision or get_default_supported_precision(training=True) if devices > 1: @@ -81,154 +74,177 @@ def setup( else: strategy = "auto" + logger = CSVLogger(io.out_dir.parent, io.out_dir.name, flush_logs_every_n_steps=train.log_interval) fabric = L.Fabric(devices=devices, strategy=strategy, precision=precision, loggers=logger) - fabric.print(hparams) - fabric.launch(main, train_data_dir, val_data_dir, resume) + fabric.launch(main, devices, resume, seed, Config.from_name(name=model_name), io, train, eval) -def main(fabric: L.Fabric, train_data_dir: Path, val_data_dir: Path, resume: Union[bool, Path]) -> None: - speed_monitor = SpeedMonitor(fabric, window_size=50, time_unit="seconds") - if fabric.global_rank == 0: - out_dir.mkdir(parents=True, exist_ok=True) +def main( + fabric: L.Fabric, + devices: int, + resume: Union[bool, Path], + seed: int, + config: Config, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, +) -> None: + validate_args(io, train, eval) - config = Config.from_name(model_name) + if fabric.global_rank == 0: + io.out_dir.mkdir(parents=True, exist_ok=True) train_dataloader, val_dataloader = create_dataloaders( - batch_size=micro_batch_size, + batch_size=train.micro_batch_size, block_size=config.block_size, fabric=fabric, - train_data_dir=train_data_dir, - val_data_dir=val_data_dir, - seed=(1337 + fabric.global_rank), + train_data_dir=io.train_data_dir, + val_data_dir=io.val_data_dir, + seed=(seed + fabric.global_rank), ) if val_dataloader is None: train_dataloader = fabric.setup_dataloaders(train_dataloader) else: train_dataloader, val_dataloader = fabric.setup_dataloaders(train_dataloader, val_dataloader) - fabric.seed_everything(1337) # same seed for every process to init model (FSDP) + fabric.seed_everything(seed) # same seed for every process to init model (FSDP) fabric.print(f"Loading model with {config.__dict__}") t0 = time.perf_counter() - with fabric.init_module(empty_init=True): + with fabric.init_module(empty_init=(fabric.world_size > 1)): model = GPT(config) - model.apply(model._init_weights) + model.apply(model._init_weights) fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.") fabric.print(f"Total parameters {num_parameters(model):,}") model = fabric.setup(model) optimizer = torch.optim.AdamW( - model.parameters(), lr=learning_rate, weight_decay=weight_decay, betas=(beta1, beta2), foreach=False + model.parameters(), + lr=train.learning_rate, + weight_decay=train.weight_decay, + betas=(train.beta1, train.beta2), + foreach=False, ) optimizer = fabric.setup_optimizers(optimizer) - state = {"model": model, "optimizer": optimizer, "hparams": hparams, "iter_num": 0, "step_count": 0} + state = {"model": model, "optimizer": optimizer, "iter_num": 
0, "step_count": 0} if resume is True: - resume = sorted(out_dir.glob("*.pth"))[-1] + resume = max(io.out_dir.glob("*.pth"), key=lambda p: int(p.name.split("-")[1])) if resume: fabric.print(f"Resuming training from {resume}") fabric.load(resume, state) train_time = time.perf_counter() - train(fabric, state, train_dataloader, val_dataloader, speed_monitor) + fit(fabric, devices, state, train_dataloader, val_dataloader, io, train, eval) fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") if fabric.device.type == "cuda": fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") -def train( +def fit( fabric: L.Fabric, + devices: int, state: dict, train_dataloader: DataLoader, - val_dataloader: DataLoader, - speed_monitor: SpeedMonitorBase, + val_dataloader: Optional[DataLoader], + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, ) -> None: model = state["model"] optimizer = state["optimizer"] if val_dataloader is not None: - validate(fabric, model, val_dataloader) # sanity check + validate(fabric, model, val_dataloader, max_iters=2) # sanity check with torch.device("meta"): meta_model = GPT(model.config) # "estimated" is not as precise as "measured". Estimated is optimistic but widely used in the wild. # When comparing MFU or FLOP numbers with other projects that use estimated FLOPs, - # consider passing `SpeedMonitor(flops_per_batch=estimated_flops)` instead - estimated_flops = estimate_flops(meta_model) * micro_batch_size + # consider passing `flops_per_batch=estimated_flops` instead + estimated_flops = estimate_flops(meta_model, training=True) * train.micro_batch_size fabric.print(f"Estimated TFLOPs: {estimated_flops * fabric.world_size / 1e12:.2f}") - x = torch.randint(0, 1, (micro_batch_size, model.max_seq_length)) - measured_flops = measure_flops(meta_model, x) + x = torch.randint(0, 1, (train.micro_batch_size, model.max_seq_length)) + forward_fn = lambda: meta_model(x) + loss_fn = lambda y: chunked_cross_entropy(y, x, chunk_size=0) + measured_flops = measure_flops(meta_model, forward_fn, loss_fn) fabric.print(f"Measured TFLOPs: {measured_flops * fabric.world_size / 1e12:.2f}") del meta_model, x - total_lengths = 0 + throughput = ThroughputMonitor(fabric, window_size=50) total_t0 = time.perf_counter() + lr_warmup_iters = train.lr_warmup_steps * train.gradient_accumulation_iters(devices) for state["iter_num"], train_data in enumerate(train_dataloader, state["iter_num"]): - if state["iter_num"] >= max_iters: + if state["iter_num"] >= train.max_iters(devices): break # determine and set the learning rate for this iteration - lr = get_lr(state["iter_num"]) if decay_lr else learning_rate + lr = get_lr( + train.learning_rate, state["iter_num"], lr_warmup_iters, train.max_iters(devices), min_lr=train.min_lr + ) for param_group in optimizer.param_groups: param_group["lr"] = lr + iter_num = state["iter_num"] + 1 iter_t0 = time.perf_counter() input_ids = train_data[:, 0 : model.max_seq_length].contiguous() targets = train_data[:, 1 : model.max_seq_length + 1].contiguous() - is_accumulating = (state["iter_num"] + 1) % gradient_accumulation_steps != 0 + is_accumulating = iter_num % train.gradient_accumulation_iters(devices) != 0 with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids) loss = chunked_cross_entropy(logits, targets, chunk_size=0) - fabric.backward(loss / gradient_accumulation_steps) + fabric.backward(loss / train.gradient_accumulation_iters(devices)) if not is_accumulating: - fabric.clip_gradients(model, 
optimizer, max_norm=grad_clip) + fabric.clip_gradients(model, optimizer, max_norm=train.max_norm) optimizer.step() optimizer.zero_grad() state["step_count"] += 1 - t1 = time.perf_counter() - total_lengths += input_ids.size(1) - speed_monitor.on_train_batch_end( - (state["iter_num"] + 1) * micro_batch_size, - t1 - total_t0, - # this assumes that device FLOPs are the same and that all devices have the same batch size - fabric.world_size, - flops_per_batch=measured_flops, - lengths=total_lengths, - ) - if state["iter_num"] % log_interval == 0: + if iter_num % train.log_interval == 0: + loss_item = loss.item() # expensive device-to-host synchronization + t1 = time.perf_counter() + throughput.update( + time=t1 - total_t0, + batches=iter_num, + samples=iter_num * train.micro_batch_size, + lengths=iter_num * train.micro_batch_size * model.max_seq_length, + flops=measured_flops * train.log_interval, + ) + throughput.compute_and_log(step=iter_num) fabric.print( - f"iter {state['iter_num']} step {state['step_count']}: loss {loss.item():.4f}, iter time:" + f"iter {iter_num} step {state['step_count']}: loss {loss_item:.4f}, iter time:" f" {(t1 - iter_t0) * 1000:.2f}ms{' (optimizer.step)' if not is_accumulating else ''}" ) - if val_dataloader is not None and not is_accumulating and state["step_count"] % eval_interval == 0: + if val_dataloader is not None and not is_accumulating and state["step_count"] % eval.interval == 0: t0 = time.perf_counter() - val_loss = validate(fabric, model, val_dataloader) + val_loss = validate(fabric, model, val_dataloader, max_iters=eval.max_iters) t1 = time.perf_counter() - t0 - speed_monitor.eval_end(t1) - fabric.print(f"step {state['iter_num']}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f}ms") + fabric.print(f"step {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f}ms") fabric.barrier() - if not is_accumulating and state["step_count"] % save_interval == 0: - checkpoint_path = out_dir / f"iter-{state['iter_num']:06d}-ckpt.pth" + if not is_accumulating and state["step_count"] % train.save_interval == 0: + checkpoint_path = io.out_dir / f"iter-{iter_num:06d}-ckpt.pth" fabric.print(f"Saving checkpoint to {str(checkpoint_path)!r}") fabric.save(checkpoint_path, state) -@torch.inference_mode() -def validate(fabric: L.Fabric, model: torch.nn.Module, val_dataloader: DataLoader) -> torch.Tensor: +# FSDP has issues with `inference_mode` +@torch.no_grad() +def validate(fabric: L.Fabric, model: torch.nn.Module, val_dataloader: DataLoader, max_iters: int) -> torch.Tensor: fabric.print("Validating ...") model.eval() - losses = torch.zeros(eval_iters, device=fabric.device) + losses = torch.zeros(max_iters, device=fabric.device) for k, val_data in enumerate(val_dataloader): + if k >= max_iters: + break input_ids = val_data[:, 0 : model.max_seq_length].contiguous() targets = val_data[:, 1 : model.max_seq_length + 1].contiguous() logits = model(input_ids) @@ -244,7 +260,11 @@ def create_dataloader( ) -> DataLoader: datasets = [] for prefix, _ in data_config: - filenames = glob.glob(str(data_dir / f"{prefix}*")) + filenames = list(data_dir.glob(f"{prefix}*")) + if not filenames: + raise FileNotFoundError( + f"No files found at {str(data_dir)} with prefix {prefix}. Did you forget to run `prepare_redpajama.py`?" 
+ ) dataset = PackedDataset( filenames, n_chunks=4, @@ -277,7 +297,7 @@ def create_dataloaders( train_data_dir: Path = Path("data/redpajama_sample"), val_data_dir: Optional[Path] = None, seed: int = 12345, -) -> Tuple[DataLoader, DataLoader]: +) -> Tuple[DataLoader, Optional[DataLoader]]: # Increase by one because we need the next word as well effective_block_size = block_size + 1 train_dataloader = create_dataloader( @@ -303,26 +323,38 @@ def create_dataloaders( return train_dataloader, val_dataloader -# learning rate decay scheduler (cosine with warmup) -def get_lr(it: int) -> float: +# learning rate decay scheduler (cosine with linear warmup) +def get_lr(learning_rate: float, it: int, warmup_iters: int, max_iters: int, min_lr: float) -> float: # 1) linear warmup for warmup_iters steps if it < warmup_iters: return learning_rate * it / warmup_iters - # 2) if it > lr_decay_iters, return min learning rate - if it > lr_decay_iters: + # 2) if it > max_iters, return min learning rate + if it > max_iters: return min_lr # 3) in between, use cosine decay down to min learning rate - decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters) + decay_ratio = (it - warmup_iters) / (max_iters - warmup_iters) assert 0 <= decay_ratio <= 1 coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1 return min_lr + coeff * (learning_rate - min_lr) +def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + issues = [] + unsupported = [(io, ["checkpoint_dir"]), (train, ["max_tokens"]), (eval, ["max_new_tokens"])] + for args, names in unsupported: + for name in names: + if getattr(args, name) is not None: + issues.append(f"{__file__} doesn't support the {name!r} argument. This is set in {args}") + required = [(io, ["train_data_dir"]), (train, ["epoch_size", "epochs", "max_norm"])] + for args, names in required: + for name in names: + if getattr(args, name) is None: + issues.append(f"{__file__} requires the {name!r} argument. This is set in {args}") + if issues: + raise ValueError("\n".join(issues)) + + if __name__ == "__main__": - # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false" - # torch.backends.cuda.enable_flash_sdp(False) torch.set_float32_matmul_precision("high") - from jsonargparse import CLI - CLI(setup) diff --git a/pretrain/tinyllama.py b/pretrain/tinyllama.py new file mode 100644 index 0000000..7da9dae --- /dev/null +++ b/pretrain/tinyllama.py @@ -0,0 +1,351 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
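# NOTE (illustration, assumed invocation): with the redpajama hyperparameters now grouped into the
# IOArgs/TrainArgs/EvalArgs dataclasses shown above, individual fields are overridden through
# jsonargparse's nested dot syntax rather than by editing module-level constants, e.g.
#
#   python pretrain/redpajama.py --devices 4 \
#       --io.train_data_dir data/redpajama_sample --io.out_dir out/redpajama \
#       --train.micro_batch_size 6 --train.lr_warmup_steps 100 \
#       --eval.interval 1000
#
# The paths and numbers are placeholders. validate_args() then fails fast with a ValueError listing
# any required field left unset (e.g. io.train_data_dir) and any field this script does not support
# (e.g. train.max_tokens). The same pattern is used by the new pretrain/tinyllama.py that follows.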
+ +""" +This script is adapted from TinyLlama: +https://github.com/jzhang38/TinyLlama/blob/main/pretrain/tinyllama.py +""" + +import math +import os +import sys +import time +from functools import partial +from pathlib import Path +from typing import Tuple, Union + +import lightning as L +import torch +import torch.nn as nn +from lightning.fabric.loggers import CSVLogger, TensorBoardLogger +from lightning.fabric.strategies import FSDPStrategy +from lightning.fabric.utilities.throughput import ThroughputMonitor, measure_flops +from lightning.pytorch.loggers import WandbLogger +from torch.utils.data import DataLoader +from torchmetrics.aggregation import RunningMean +from typing_extensions import Literal + + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt.args import EvalArgs, IOArgs, TrainArgs +from lit_gpt.model import GPT, Block, CausalSelfAttention, Config, LLaMAMLP +from lit_gpt.utils import CLI, CycleIterator, chunked_cross_entropy, num_parameters +from lit_gpt.data import TinyLlama, LitDataModule + + +def setup( + model: Config = Config(name="tiny-llama-1.1b"), + logger_name: Literal["wandb", "tensorboard", "csv"] = "tensorboard", + resume: Union[bool, Path] = False, + devices: int = torch.cuda.device_count() or 1, + seed: int = 1337, + data: LitDataModule = TinyLlama(), + io: IOArgs = IOArgs( + out_dir=Path(os.getenv("LIGHTNING_ARTIFACTS_DIR", "out")) / "lit-tiny-llama-1.1b", train_data_dir=None + ), + train: TrainArgs = TrainArgs( + save_interval=1000, + log_interval=1, + global_batch_size=512, + micro_batch_size=4, + max_tokens=int(3e12), # 3 trillion + learning_rate=1e-1, + weight_decay=4e-4, + beta1=0.9, + beta2=0.95, + max_norm=1.0, + min_lr=4e-5, + lr_warmup_steps=2000, + ), + eval: EvalArgs = EvalArgs(interval=1000, max_iters=100), +): + hparams = locals() + logger = choose_logger(io.out_dir, logger_name, name=f"pretrain-{model.name}", resume=resume) + + strategy = FSDPStrategy(auto_wrap_policy={Block}, state_dict_type="full", sharding_strategy="HYBRID_SHARD") + fabric = L.Fabric(devices=devices, strategy=strategy, precision="bf16-mixed", loggers=[logger]) + fabric.launch() + + fabric.print(hparams) + if logger_name in ("tensorboard", "wandb"): + fabric.logger.log_hyperparams(hparams) + + fabric.launch(main, devices, seed, resume, model, data, io, train, eval) + + +def main( + fabric: L.Fabric, + devices: int, + seed: int, + resume: Union[bool, Path], + config: Config, + data: LitDataModule, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, +) -> None: + validate_args(io, train, eval) + + if fabric.global_rank == 0: + io.out_dir.mkdir(parents=True, exist_ok=True) + + train_dataloader, val_dataloader = get_dataloaders(fabric, data, train, config.block_size) + train_dataloader, val_dataloader = fabric.setup_dataloaders(train_dataloader, val_dataloader) + + fabric.seed_everything(seed) # same seed for every process to init model (FSDP) + + fabric.print(f"Loading model with {config.__dict__}") + t0 = time.perf_counter() + with fabric.init_module(empty_init=False): + model = GPT(config) + model.apply(partial(init_weights, n_layer=config.n_layer, n_embd=config.n_embd)) + + fabric.print(f"Time to instantiate model: {time.perf_counter() - t0:.02f} seconds.") + fabric.print(f"Total parameters: {num_parameters(model):,}") + + model = torch.compile(model) + model = fabric.setup(model) + optimizer = torch.optim.AdamW( + model.parameters(), + lr=train.learning_rate, + 
weight_decay=train.weight_decay, + betas=(train.beta1, train.beta2), + fused=True, + ) + optimizer = fabric.setup_optimizers(optimizer) + + state = { + "model": model, + "optimizer": optimizer, + "train_dataloader": train_dataloader, + "iter_num": 0, + "step_count": 0, + } + + if resume is True: + resume = max(io.out_dir.glob("*.pth"), key=(lambda p: int(p.name.split("-")[1]))) + if resume: + fabric.print(f"Resuming training from {resume}") + fabric.load(resume, state) + + train_time = time.perf_counter() + fit(fabric, devices, state, train_dataloader, val_dataloader, io, train, eval) + fabric.print(f"Training time: {(time.perf_counter()-train_time):.2f}s") + if fabric.device.type == "cuda": + fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + + +def fit( + fabric, + devices: int, + state: dict, + train_dataloader: DataLoader, + val_dataloader: DataLoader, + io: IOArgs, + train: TrainArgs, + eval: EvalArgs, +) -> None: + model = state["model"] + optimizer = state["optimizer"] + + validate(fabric, model, val_dataloader, max_iters=2) # sanity check + throughput = ThroughputMonitor(fabric, window_size=5) + + with torch.device("meta"): + meta_model = GPT(model.config) + x = torch.randint(0, 1, (train.micro_batch_size, meta_model.config.block_size)) + model_fwd = lambda: meta_model(x) + model_loss = lambda y: chunked_cross_entropy(y, x, chunk_size=0) + measured_flops = measure_flops(meta_model, model_fwd, model_loss) + fabric.print(f"Measured TFLOPs: {measured_flops * fabric.world_size / 1e12:.2f}") + del meta_model, x + + max_tokens_per_device = train.max_tokens // fabric.world_size + tokens_per_iter = train.micro_batch_size * model.config.block_size + max_iters = max_tokens_per_device // tokens_per_iter + log_iter_interval = train.log_interval * train.gradient_accumulation_iters(devices) + initial_iter = state["iter_num"] + train_iterator = CycleIterator(train_dataloader) + + running_loss = RunningMean(window=train.gradient_accumulation_iters(devices), sync_on_compute=False).to( + fabric.device + ) + fabric.barrier() + total_t0 = time.perf_counter() + + warmup_iters = train.lr_warmup_steps * train.gradient_accumulation_iters(devices) + for train_data in train_iterator: + if state["iter_num"] >= max_iters: + break + + # determine and set the learning rate for this iteration + lr = get_lr(train.learning_rate, state["iter_num"], warmup_iters, max_iters, train.min_lr) + for param_group in optimizer.param_groups: + param_group["lr"] = lr + + state["iter_num"] += 1 + iter_t0 = time.perf_counter() + + input_ids = train_data[:, 0 : model.config.block_size].contiguous().long() + targets = train_data[:, 1 : (model.config.block_size + 1)].contiguous().long() + + is_accumulating = state["iter_num"] % train.gradient_accumulation_iters(devices) != 0 + with fabric.no_backward_sync(model, enabled=is_accumulating): + logits = model(input_ids) + loss = chunked_cross_entropy(logits, targets) + fabric.backward(loss / train.gradient_accumulation_iters(devices)) + + running_loss.update(loss.detach()) + + if not is_accumulating: + fabric.clip_gradients(model, optimizer, max_norm=train.max_norm) + optimizer.step() + optimizer.zero_grad() + state["step_count"] += 1 + + if state["iter_num"] % log_iter_interval == 0: + loss = running_loss.compute().item() # expensive device-to-host synchronization + t1 = time.perf_counter() + throughput.update( + time=(t1 - total_t0), + flops=(measured_flops * log_iter_interval), + batches=state["iter_num"], + samples=(state["iter_num"] * 
train.micro_batch_size), + lengths=(state["iter_num"] * train.micro_batch_size * model.config.block_size), + ) + metrics = { + "loss": loss, + "iter": state["iter_num"], + "step": state["step_count"], + "epoch": train_iterator.epoch, + "iter_time": t1 - iter_t0, + "remaining_time": ( + (t1 - total_t0) / (state["iter_num"] - initial_iter) * (max_iters - state["iter_num"]) + ), + "tokens": state["iter_num"] * train.micro_batch_size * model.config.block_size, + "total_tokens": ( + state["iter_num"] * train.micro_batch_size * model.config.block_size * fabric.world_size + ), + "learning_rate": lr, + } + + fabric.print( + f"iter {metrics['iter']} | step {metrics['step']}: loss {metrics['loss']:.4f}, iter time:" + f" {metrics['iter_time'] * 1000:.2f} ms{' (optimizer.step),' if not is_accumulating else ','}" + f" remaining time: {metrics['remaining_time'] / 3600 / 24:.2f} days" + ) + + throughput_metrics = throughput.compute() + metrics.update(throughput_metrics) + fabric.log_dict(metrics, step=state["iter_num"]) + + if val_dataloader is not None and not is_accumulating and state["step_count"] % eval.interval == 0: + t0 = time.perf_counter() + val_loss = validate(fabric, model, val_dataloader, max_iters=eval.max_iters) + val_loss = val_loss.item() + td = time.perf_counter() - t0 + + fabric.print(f"iter {state['iter_num']}: val loss {val_loss:.4f}, val time: {td * 1000:.2f} ms") + metrics = {"val_loss": val_loss, "val_ppl": math.exp(val_loss)} + fabric.log_dict(metrics, step=state["iter_num"]) + fabric.barrier() + + if not is_accumulating and state["step_count"] % train.save_interval == 0: + checkpoint_path = io.out_dir / f"step-{state['step_count']:08d}.pth" + fabric.print(f"Saving checkpoint to {str(checkpoint_path)!r}") + fabric.save(checkpoint_path, state) + + +@torch.no_grad() +def validate(fabric: L.Fabric, model: nn.Module, val_dataloader: DataLoader, max_iters: int) -> torch.Tensor: + fabric.print("Validating ...") + model.eval() + + losses = torch.zeros(max_iters, device=fabric.device) + for k, val_data in enumerate(val_dataloader): + if k >= max_iters: + break + input_ids = val_data[:, 0 : model.config.block_size].contiguous().long() + targets = val_data[:, 1 : (model.config.block_size + 1)].contiguous().long() + logits = model(input_ids) + loss = chunked_cross_entropy(logits, targets) + losses[k] = loss + + model.train() + return losses.mean() + + +def get_dataloaders(fabric: L.Fabric, data: LitDataModule, train: TrainArgs, block_size: int) -> Tuple[DataLoader, DataLoader]: + data.connect(batch_size=train.micro_batch_size, max_seq_length=block_size) + with fabric.rank_zero_first(): + data.prepare_data() + data.setup() + train_dataloader = data.train_dataloader() + val_dataloader = data.val_dataloader() + return train_dataloader, val_dataloader + + +# learning rate decay scheduler (cosine with linear warmup) +def get_lr(learning_rate: float, it: int, warmup_iters: int, max_iters: int, min_lr: float) -> float: + # 1) linear warmup for warmup_iters steps + if it < warmup_iters: + return learning_rate * it / warmup_iters + # 2) if it > max_iters, return min learning rate + if it > max_iters: + return min_lr + # 3) in between, use cosine decay down to min learning rate + decay_ratio = (it - warmup_iters) / (max_iters - warmup_iters) + assert 0 <= decay_ratio <= 1 + coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1 + return min_lr + coeff * (learning_rate - min_lr) + + +def init_weights(module: nn.Module, n_layer: int, n_embd: int): + # Follows GPT-NeoX: 
https://arxiv.org/abs/2204.06745 + if isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, mean=0.0, std=math.sqrt(2.0 / 5 / n_embd)) + elif isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=math.sqrt(2.0 / 5 / n_embd)) + if module.bias is not None: + nn.init.zeros_(module.bias) + for name, param in module.named_parameters(): + if name == "proj.weight" and isinstance(module, (LLaMAMLP, CausalSelfAttention)): + nn.init.normal_(param, mean=0.0, std=(1 / math.sqrt(n_embd) / n_layer)) + + +def choose_logger(out_dir: Path, logger_name: str, name: str, resume: Union[bool, Path], *args, **kwargs): + if logger_name == "csv": + return CSVLogger(root_dir=(out_dir / "logs"), name="csv", *args, **kwargs) + if logger_name == "tensorboard": + return TensorBoardLogger(root_dir=(out_dir / "logs"), name="tensorboard", *args, **kwargs) + if logger_name == "wandb": + return WandbLogger(project="tinyllama", name=name, resume=(resume is not False), *args, **kwargs) + raise ValueError(f"`logger={logger_name}` is not a valid option.") + + +def validate_args(io: IOArgs, train: TrainArgs, eval: EvalArgs) -> None: + issues = [] + unsupported = [ + (io, ["train_data_dir", "val_data_dir", "checkpoint_dir"]), + (train, ["epoch_size", "epochs"]), + (eval, ["max_new_tokens"]), + ] + for args, names in unsupported: + for name in names: + if getattr(args, name) is not None: + issues.append(f"{__file__} doesn't support the {name!r} argument. This is set in {args}") + required = [(train, ["max_tokens", "max_norm"])] + for args, names in required: + for name in names: + if getattr(args, name) is None: + issues.append(f"{__file__} requires the {name!r} argument. This is set in {args}") + if issues: + raise ValueError("\n".join(issues)) + + +if __name__ == "__main__": + torch.set_float32_matmul_precision("high") + + CLI(setup) diff --git a/quantize/gptq.py b/quantize/gptq.py deleted file mode 100644 index 34d2bcb..0000000 --- a/quantize/gptq.py +++ /dev/null @@ -1,637 +0,0 @@ -# This adapts GPTQ's quantization process: https://github.com/IST-DASLab/gptq/ -# E. Frantar et al GPTQ: Accurate Post-training Compression for GPT, arXiv:2210.17323 -# portions copyright by the authors licensed under the Apache License 2.0 -import gc -import math -import sys -import time -from pathlib import Path -from typing import Optional - -import torch -from lightning import Fabric - -# support running without installing as a package -wd = Path(__file__).parent.parent.resolve() -sys.path.append(str(wd)) - -from lightning_utilities.core.imports import RequirementCache - -from lit_gpt import GPT, Config, Tokenizer -from lit_gpt.utils import check_valid_checkpoint_dir, get_default_supported_precision, lazy_load - -_TRITON_AVAILABLE = RequirementCache("triton") -if _TRITON_AVAILABLE: - import triton - import triton.language as tl - - # This is adapted from the OpenAI Triton matmul example. 
- @triton.autotune( - configs=[ - triton.Config( - {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, - num_stages=3, - num_warps=8, - ), - triton.Config( - {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, - num_stages=3, - num_warps=8, - ), - triton.Config( - {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, - num_stages=4, - num_warps=4, - ), - triton.Config( - {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, - num_stages=4, - num_warps=4, - ), - triton.Config( - {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, - num_stages=4, - num_warps=4, - ), - triton.Config( - {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, - num_stages=4, - num_warps=4, - ), - triton.Config( - {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, - num_stages=4, - num_warps=4, - ), - triton.Config( - {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, - num_stages=4, - num_warps=4, - ), - triton.Config( - {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, - num_stages=5, - num_warps=2, - ), - triton.Config( - {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, - num_stages=5, - num_warps=2, - ), - ], - key=["M", "N", "K"], - ) - @triton.jit - def linear_kernel_4bit_weight( - # Pointers to matrices - a_ptr, - b_ptr, - c_ptr, - bscales_ptr, - bzeros_ptr, - # bdequant, - # Matrix dimensions - M, - N, - K, - # The stride variables represent how much to increase the ptr by when moving by 1 - # element in a particular dimension. E.g. stride_am is how much to increase a_ptr - # by to get the element one row down (A has M rows) - stride_am, - stride_ak, - stride_bk, - stride_bn, - stride_cm, - stride_cn, - # Meta-parameters - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - ): - """Kernel for computing the matmul C = A x B.T. - A has shape (M, K), B has shape (N, K) and C has shape (M, N) - """ - # ----------------------------------------------------------- - # Map program ids `pid` to the block of C it should compute. - # This is done in a grouped ordering to promote L2 data reuse - # See above `L2 Cache Optimizations` section for details - pid = tl.program_id(axis=0) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) - num_pid_in_group = GROUP_SIZE_M * num_pid_n - group_id = pid // num_pid_in_group - first_pid_m = group_id * GROUP_SIZE_M - group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) - pid_m = first_pid_m + (pid % group_size_m) - pid_n = (pid % num_pid_in_group) // group_size_m - - # ---------------------------------------------------------- - # Create pointers for the first blocks of A and B. 
- # We will advance this pointer as we move in the K direction - # and accumulate - # a_ptrs is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers - # b_ptrs is a block of [BLOCK_SIZE_K, BLOCK_SIZE_n] pointers - # see above `Pointer Arithmetics` section for details - offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - a_mask = offs_am[:, None] < M - b_mask = offs_bn[None, :] < N - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) - b_ptrs = b_ptr + ((offs_k[:, None] // 2) * stride_bk + offs_bn[None, :] * stride_bn) - - bscales_ptrs = bscales_ptr + offs_bn[None, :] - bzeros_ptrs = bzeros_ptr + offs_bn[None, :] - - scale = tl.load(bscales_ptrs) - zero = tl.load(bzeros_ptrs) - # ----------------------------------------------------------- - # Iterate to compute a block of the C matrix - # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block - # of fp32 values for higher accuracy. - # `accumulator` will be converted back to fp16 after the loop - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - for k in range(0, K, BLOCK_SIZE_K): - # wasteful as it is to load everything twice, my attempts at avoiding it lead to slower code - b12 = tl.load(b_ptrs, mask=b_mask) - # Note that for simplicity, we don't apply a mask in K here. - a = tl.load(a_ptrs, mask=a_mask).to(tl.float32) - b = (((b12.to(tl.uint8) >> ((offs_k[:, None] % 2) * 4)) & 0xF).to(tl.float32) - zero) * scale - accumulator += tl.dot(a, b) - - # Advance the ptrs to the next K block - a_ptrs += BLOCK_SIZE_K * stride_ak - b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk - c = accumulator - - # ----------------------------------------------------------- - # Write back the block of the output matrix C - offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) - tl.store(c_ptrs, c, mask=c_mask) - - -def qlinear_4bit_weight(inp, weight, scales, zeros): - weight = weight.t().contiguous() - c_shape = inp.shape[:-1] + weight.shape[-1:] - inp = inp.reshape(-1, inp.shape[-1]).contiguous() - # we pad the input to amortize triton compilation cost better - PAD_TO = 256 - if inp.shape[0] % PAD_TO != 0: - c_crop = inp.shape[0] - new_inp_shape0 = inp.shape[0] + PAD_TO - inp.shape[0] % PAD_TO - inp2 = inp.new_empty((new_inp_shape0, inp.shape[1])) - inp2[: inp.shape[0]] = inp - inp2[inp.shape[0] :].zero_() - inp = inp2 - else: - c_crop = None - - assert inp.shape[1] == weight.shape[0] * 2, "incompatible dimensions" - - assert scales.shape == (weight.shape[1], 1) - assert zeros.shape == (weight.shape[1], 1) - scales = scales.contiguous() - zeros = zeros.contiguous() - K, N = weight.shape - M, K = inp.shape - assert K % 32 == 0, "We don't check memory-out-of-bounds with K so K must be divisible by BLOCK_SIZE_K" - # allocates output - c = torch.empty((M, N), device=inp.device, dtype=inp.dtype) - # 1D launch kernel where each block gets its own program. 
- grid = lambda META: (triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),) - linear_kernel_4bit_weight[grid]( - inp, - weight, - c, - scales, - zeros, - M, - N, - K, - inp.stride(0), - inp.stride(1), - weight.stride(0), - weight.stride(1), - c.stride(0), - c.stride(1), - ) - return c[:c_crop].reshape(c_shape) - - -# for correctness but with terrible perf -class ColBlockQuantizedLinear(torch.nn.Module): - def __init__(self, in_features, out_features, bias: bool, *, bits, tile_cols): - if not _TRITON_AVAILABLE: - raise ModuleNotFoundError(str(_TRITON_AVAILABLE)) - - super().__init__() - self.in_features = in_features - self.out_features = out_features - self.tile_cols = tile_cols if tile_cols != -1 else self.in_features - self.bits = bits - self.entries_per_byte = 8 // bits - assert self.entries_per_byte > 0 - assert self.entries_per_byte * self.bits == 8 - assert in_features % self.entries_per_byte == 0 - self.register_buffer( - "quant_weight", - torch.empty((self.out_features, self.in_features // self.entries_per_byte), dtype=torch.uint8) - .t() - .contiguous() - .t(), - ) - self.register_buffer( - "scales", torch.empty((self.out_features, (self.in_features + self.tile_cols - 1) // self.tile_cols)) - ) - self.register_buffer("zeros", torch.empty_like(self.scales)) - assert isinstance(bias, bool) - if bias: - self.register_buffer("bias", torch.empty((self.out_features,))) - else: - self.register_buffer("bias", None) - - def pack_weight(self, weight): - weight = weight.to(device=self.quant_weight.device, copy=True) - for j in range(self.scales.size(1)): - weight[:, j * self.tile_cols : (j + 1) * self.tile_cols] /= self.scales[:, j : j + 1] - weight[:, j * self.tile_cols : (j + 1) * self.tile_cols] += self.zeros[:, j : j + 1] - weight = weight.clamp_(min=0, max=2**self.bits - 1).to(dtype=torch.uint8) - self.quant_weight.zero_() - for nr in range(self.entries_per_byte): - self.quant_weight += weight[:, nr :: self.entries_per_byte] << (nr * self.bits) - - def get_weight(self, dtype=torch.float): - weight = torch.empty((self.out_features, self.in_features), device=self.quant_weight.device, dtype=dtype) - mask = (1 << self.bits) - 1 - for nr in range(self.entries_per_byte): - weight[:, nr :: self.entries_per_byte] = ((self.quant_weight >> (nr * self.bits)) & mask).float() - self.quant_weight.to(dtype) - for j in range(self.scales.size(1)): - weight[:, j * self.tile_cols : (j + 1) * self.tile_cols] -= self.zeros[:, j : j + 1] - weight[:, j * self.tile_cols : (j + 1) * self.tile_cols] *= self.scales[:, j : j + 1] - return weight - - def forward(self, inp): - if ( - self.bits == 4 - and self.quant_weight.device.type == "cuda" - and self.zeros.shape[1] == 1 - and self.quant_weight.shape[1] % 32 == 0 - ): - return qlinear_4bit_weight(inp, self.quant_weight, self.scales, self.zeros) - weight = self.get_weight(dtype=inp.dtype) - return torch.nn.functional.linear(inp, weight, self.bias) - - -class GPTQQuantizer: - # The algorithm and code has been taken from https://github.com/IST-DASLab/gptq/ - # E. Frantar et al GPTQ: Accurate Post-training Compression for GPT, arXiv:2210.17323 - # portions copyright by the authors licensed under the Apache License 2.0 - # All errors are our own. 
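# NOTE (illustrative sketch of the removed code, example values only): GPTQQuantizer's
# quantize_weight() below maps each weight onto a uniform grid of 2**bits levels,
#     q     = clamp(round(w / scale) + zero, 0, maxq)   # integer code
#     w_hat = scale * (q - zero)                        # dequantized value
# and ColBlockQuantizedLinear packs two such 4-bit codes per byte. A minimal per-tensor check
# (the class itself computes scale/zero per row or per column group):
import torch
w = torch.tensor([-0.30, -0.05, 0.00, 0.12, 0.27])
maxq = 2**4 - 1                                    # 4-bit grid: codes 0..15
scale = (w.max() - w.min()) / maxq
zero = torch.round(-w.min() / scale)
q = torch.clamp(torch.round(w / scale) + zero, 0, maxq)
w_hat = scale * (q - zero)
assert (w - w_hat).abs().max() <= scale / 2 + 1e-6  # round-to-nearest keeps error within half a step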
- - def __init__( - self, - linear_module, - *, - bits, - perchannel=True, - sym=False, - blocksize=128, - percdamp=0.01, - groupsize=-1, - actorder=False, - ): - if not _TRITON_AVAILABLE: - raise ModuleNotFoundError(str(_TRITON_AVAILABLE)) - - assert isinstance(linear_module, torch.nn.Linear) - - self.linear_module = linear_module - self.dev = self.linear_module.weight.device - self.rows = linear_module.weight.shape[0] - self.columns = linear_module.weight.shape[1] - self.H = torch.zeros((self.columns, self.columns), device=self.dev) - self.nsamples = 0 - self.bits = bits - self.maxq = 2**bits - 1 - self.perchannel = perchannel - self.sym = sym - self.blocksize = blocksize - self.percdamp = percdamp - self.groupsize = groupsize - self.actorder = actorder - self.tile_cols = self.columns if groupsize == -1 else groupsize - self.scales = torch.zeros( - (self.rows, (self.columns + self.tile_cols - 1) // self.tile_cols), - dtype=self.linear_module.weight.dtype, - device=self.dev, - ) - self.zeros = torch.zeros_like(self.scales) - assert not ( - self.actorder and self.groupsize != -1 - ), "The permutation trick does not work for grouped quantization" - - @staticmethod - def quantize_weight(x, scale, zero, maxq): - q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) - return scale * (q - zero) - - def find_params_weight(self, x): - dev = x.device - - shape = x.shape - x = x.flatten(1) if self.perchannel else x.flatten().unsqueeze(0) - - tmp = torch.zeros(x.shape[0], device=dev) - xmin = torch.minimum(x.min(1)[0], tmp) - xmax = torch.maximum(x.max(1)[0], tmp) - - if self.sym: - xmax = torch.maximum(torch.abs(xmin), xmax) - tmp = xmin < 0 - if torch.any(tmp): - xmin[tmp] = -xmax[tmp] - tmp = (xmin == 0) & (xmax == 0) - xmin[tmp] = -1 - xmax[tmp] = +1 - - scale = (xmax - xmin) / self.maxq - zero = torch.full_like(scale, (self.maxq + 1) / 2) if self.sym else torch.round(-xmin / scale) - - if not self.perchannel: - tmp = shape[0] - scale = scale.repeat(tmp) - zero = zero.repeat(tmp) - - shape = [-1] + [1] * (len(shape) - 1) - scale = scale.reshape(shape) - zero = zero.reshape(shape) - return scale, zero - - def collect_input_stats(self, _1, inp, _2): - inp = inp[0].detach() - self.last_inp = inp - if len(inp.shape) == 2: - inp = inp.unsqueeze(0) - tmp = inp.shape[0] - if len(inp.shape) == 3: - inp = inp.reshape((-1, inp.shape[-1])) - inp = inp.t() - self.H *= self.nsamples / (self.nsamples + tmp) - self.nsamples += tmp - # inp = inp.float() - inp = math.sqrt(2 / self.nsamples) * inp.float() - # self.H += 2 / self.nsamples * inp.matmul(inp.t()) - self.H += inp.matmul(inp.t()) - - def quantize(self): - W = self.linear_module.weight.detach().to(dtype=torch.float, copy=True) - - scale, zero = self.find_params_weight(W) - self.scales[:] = scale - self.zeros[:] = zero - - H = self.H - del self.H - dead = torch.diag(H) == 0 - H[dead, dead] = 1 - W[:, dead] = 0 - if self.actorder: - perm = torch.argsort(torch.diag(H), descending=True) - W = W[:, perm] - H = H[perm][:, perm] - - Losses = torch.zeros_like(W) - Q = torch.zeros_like(W) - - damp = self.percdamp * torch.mean(torch.diag(H)) - diag = torch.arange(self.columns, device=self.dev) - H[diag, diag] += damp - H = torch.linalg.cholesky(H) - H = torch.cholesky_inverse(H) - H = torch.linalg.cholesky(H, upper=True) - Hinv = H - - for i1 in range(0, self.columns, self.blocksize): - i2 = min(i1 + self.blocksize, self.columns) - count = i2 - i1 - - W1 = W[:, i1:i2].clone() - Q1 = torch.zeros_like(W1) - Err1 = torch.zeros_like(W1) - Losses1 = 
torch.zeros_like(W1) - Hinv1 = Hinv[i1:i2, i1:i2] - - for i in range(count): - w = W1[:, i] - d = Hinv1[i, i] - - if self.groupsize != -1 and (i1 + i) % self.groupsize == 0: - scale, zero = self.find_params_weight(W[:, (i1 + i) : (i1 + i + self.groupsize)]) - self.scales[:, (i1 + i) // self.groupsize] = scale - self.zeros[:, (i1 + i) // self.groupsize] = zero - - q = self.quantize_weight(w.unsqueeze(1), scale, zero, self.maxq) - q = q.squeeze(1) - assert q.dim() == 1 - Q1[:, i] = q - Losses1[:, i] = (w - q) ** 2 / d**2 - - err1 = (w - q) / d - W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) - Err1[:, i] = err1 - - Q[:, i1:i2] = Q1 - Losses[:, i1:i2] = Losses1 / 2 - - W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) - - if self.actorder: - invperm = torch.argsort(perm) - Q = Q[:, invperm] - - weight = Q.reshape(self.linear_module.weight.shape).to(self.linear_module.weight.data.dtype) - error = torch.sum(Losses).item() - - q_module = ColBlockQuantizedLinear( - self.linear_module.in_features, - self.linear_module.out_features, - self.linear_module.bias is not None, - bits=self.bits, - tile_cols=self.groupsize, - ).to(self.dev) - q_module.scales = self.scales - q_module.zeros = self.zeros - q_module.pack_weight(weight) - q_module.bias = self.linear_module.bias - return q_module, error - - -def get_sample_data(): - from datasets import load_dataset - - traindata = load_dataset( - "allenai/c4", "allenai--c4", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, split="train" - ) - # heuristic for the data size? - return "\n".join(traindata[i]["text"] for i in torch.randperm(len(traindata))[:2000].tolist()) - - -@torch.no_grad() -def blockwise_quantization(model, sample_inputs, working_device, *, bits=4, groupsize=-1): - """ - This is the classic post-training quantization of all linear layers. - We quantize in order, i.e. when observing the inputs, we use the outputs of the previously quantized layers rather - than doing them all at once. - """ - if not _TRITON_AVAILABLE: - raise ModuleNotFoundError(str(_TRITON_AVAILABLE)) - - print(model) - print(model.config) - - print("Getting inputs for first block") - model.transformer.wte.to(working_device) - sample_inputs = sample_inputs.to(working_device) - inps = model.transformer.wte(sample_inputs) - model.transformer.wte.to("cpu") - torch.cuda.empty_cache() - - print("Starting to quantize blocks") - outs = torch.zeros_like(inps) - - # better than relying on enumeration? 
originally the code bundled - # the two mlp fc layers - # we could automate this with a lot of hooks and another iteration - submodules_to_process = ["attn.attn", "attn.proj", "mlp.proj"] - if model.config._mlp_class == "GptNeoxMLP": - submodules_to_process.append("mlp.fc") - else: - submodules_to_process.extend(["mlp.fc_1", "mlp.fc_2"]) - - for i, block in enumerate(model.transformer.h): - block.to(working_device) - - for name in submodules_to_process: - print(i, name, end=" ") - t0 = time.perf_counter() - print("collecting stats", end=" ") - sys.stdout.flush() - module = block.get_submodule(name) - - gptq = GPTQQuantizer(module, bits=bits, groupsize=groupsize, actorder=(groupsize == -1)) - handle = module.register_forward_hook(gptq.collect_input_stats) - for j in range(inps.size(0)): - outs[j : j + 1] = block(inps[j : j + 1], cos=model.cos, sin=model.sin) - - handle.remove() - - print("quantizing", end=" ") - sys.stdout.flush() - q_module, error = gptq.quantize() - - # replace the linear module with the quantized module - pname, dname = name.rsplit(".", 1) - setattr(block.get_submodule(pname), dname, q_module) - - # cleanup in an attempt to not run out of memory - del gptq - gc.collect() - torch.cuda.empty_cache() - t1 = time.perf_counter() - print(f"time {int(t1 - t0 + 0.5)}s quantization error {error:.1f}") - - for j in range(inps.size(0)): - outs[j : j + 1] = block(inps[j : j + 1], cos=model.cos, sin=model.sin) - - block.cpu() - gc.collect() - torch.cuda.empty_cache() - - # the outputs are the next block's inputs and we'll reuse the old inputs - inps, outs = outs, inps - - model.transformer.ln_f.to(working_device) - for j in range(inps.size(0)): - outs[j : j + 1] = model.transformer.ln_f(inps[j : j + 1]) - model.transformer.ln_f.to("cpu") - inps, outs = outs, inps - - model.lm_head.to(working_device) - gptq = GPTQQuantizer(model.lm_head, bits=bits, groupsize=groupsize, actorder=(groupsize == -1)) - handle = model.lm_head.register_forward_hook(gptq.collect_input_stats) - for j in range(inps.size(0)): - model.lm_head(inps[j : j + 1]) - handle.remove() - q_module, error = gptq.quantize() - model.lm_head = q_module - model.lm_head.to("cpu") - - -def main( - *, - checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - output_path: Optional[Path] = None, - n_samples: int = 128, - precision: Optional[str] = None, -) -> None: - """Generates text samples based on a pre-trained LLM and tokenizer. - - Args: - checkpoint_dir: The checkpoint directory to load. - output_path: Path to write the quantized model's state dict to. - n_samples: Number of example inputs to use for statistics (default: 128) - precision: The precision to use to load the model. 
- """ - if not _TRITON_AVAILABLE: - raise ModuleNotFoundError(str(_TRITON_AVAILABLE)) - - precision = precision or get_default_supported_precision(training=False) - - if output_path is None: - output_path = checkpoint_dir / "lit_model_gptq.4bit.pth" - check_valid_checkpoint_dir(checkpoint_dir) - - config = Config.from_json(checkpoint_dir / "lit_config.json") - - device = "cuda" - fabric = Fabric(accelerator="cuda", devices=1, precision=precision) - - # we avoid loading the entire model on the GPU and do this block by block - checkpoint_path = checkpoint_dir / "lit_model.pth" - print(f"Loading model {str(checkpoint_path)!r} with {config.__dict__}") - t0 = time.perf_counter() - with fabric.init_module(empty_init=True): - model = GPT(config) - checkpoint = lazy_load(checkpoint_path) - model.load_state_dict(checkpoint) - print(f"Time to load model: {time.perf_counter() - t0:.02f} seconds.") - - model.eval() - - tokenizer = Tokenizer(checkpoint_dir) - - test_string = get_sample_data() - encoded_text = tokenizer.encode(test_string, eos=True) - block_size = config.block_size - encoded_text = encoded_text[: n_samples * block_size].reshape(n_samples, block_size) - - t0 = time.perf_counter() - blockwise_quantization(model, encoded_text, device, bits=4) - t = time.perf_counter() - t0 - - print(f"\n\nTime for quantization: {t:.02f} sec total") - print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") - - torch.save(model.state_dict(), output_path) - - -if __name__ == "__main__": - from jsonargparse import CLI - - torch.set_float32_matmul_precision("high") - CLI(main) diff --git a/requirements-all.txt b/requirements-all.txt index 84a1a8a..a6cdf33 100644 --- a/requirements-all.txt +++ b/requirements-all.txt @@ -1,9 +1,17 @@ -r requirements.txt -bitsandbytes # quantization -scipy # required by bitsandbytes -sentencepiece # pythia, falcon, redpajama -tokenizers # llama-based models -datasets # quantize/gptq.py -zstandard # scripts/prepare_redpajama.py -git+https://github.com/EleutherAI/lm-evaluation-harness.git@master # eval -pandas # prepare_csv.py +jsonargparse[signatures] # CLI +bitsandbytes==0.41.0 # quantization +scipy # required by bitsandbytes +sentencepiece # llama-based models +tokenizers # pythia, falcon, redpajama +datasets # eval +requests # scripts/prepare_* +zstandard # scripts/prepare_redpajama.py, scripts/prepare_starcoder.py +pandas # scripts/prepare_csv.py, scripts/prepare_starcoder.py +pyarrow # scripts/prepare_starcoder.py +tensorboard # pretrain/tinyllama.py +torchmetrics # pretrain/tinyllama.py +# eval +git+https://github.com/EleutherAI/lm-evaluation-harness.git@115206dc89dad67b8beaa90051fb52db77f0a529 +# scripts/prepare_slimpajama.py, scripts/prepare_starcoder.py, pretrain/tinyllama.py +lightning[data] @ git+https://github.com/Lightning-AI/lightning@ed367ca675861cdf40dbad2e4d66f7eee2ec50af diff --git a/requirements.txt b/requirements.txt index 493eb51..a6ecd90 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -torch>=2.1.0 -lightning @ git+https://github.com/Lightning-AI/lightning@71aed751f7f0ca8422ddca256e602099070f490b jsonargparse[signatures] # CLI jinja2 +torch>=2.2.0 +lightning @ git+https://github.com/Lightning-AI/lightning@ed367ca675861cdf40dbad2e4d66f7eee2ec50af diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py index a0058db..b033a49 100644 --- a/scripts/convert_hf_checkpoint.py +++ b/scripts/convert_hf_checkpoint.py @@ -1,6 +1,9 @@ +# Copyright Lightning AI. 
Licensed under the Apache License 2.0, see LICENSE file. + import gc import json import sys +from collections import defaultdict from dataclasses import asdict from functools import partial from pathlib import Path @@ -119,24 +122,46 @@ def copy_weights_hf_llama( ) -> None: weight_map = { "model.embed_tokens.weight": "transformer.wte.weight", - "model.layers.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight", + "model.layers.{}.input_layernorm.weight": "transformer.h.{l}.norm_1.weight", + "model.layers.{}.input_layernorm.bias": "transformer.h.{l}.norm_1.bias", "model.layers.{}.self_attn.q_proj.weight": None, "model.layers.{}.self_attn.k_proj.weight": None, "model.layers.{}.self_attn.v_proj.weight": None, - "model.layers.{}.self_attn.o_proj.weight": "transformer.h.{}.attn.proj.weight", + "model.layers.{}.self_attn.o_proj.weight": "transformer.h.{l}.attn.proj.weight", "model.layers.{}.self_attn.rotary_emb.inv_freq": None, - "model.layers.{}.post_attention_layernorm.weight": "transformer.h.{}.norm_2.weight", - "model.layers.{}.mlp.gate_proj.weight": "transformer.h.{}.mlp.fc_1.weight", - "model.layers.{}.mlp.up_proj.weight": "transformer.h.{}.mlp.fc_2.weight", - "model.layers.{}.mlp.down_proj.weight": "transformer.h.{}.mlp.proj.weight", + "model.layers.{}.post_attention_layernorm.weight": "transformer.h.{l}.norm_2.weight", + "model.layers.{}.post_attention_layernorm.bias": "transformer.h.{l}.norm_2.bias", "model.norm.weight": "transformer.ln_f.weight", + "model.norm.bias": "transformer.ln_f.bias", "lm_head.weight": "lm_head.weight", } + if config._mlp_class == "LLaMAMoE": + weight_map.update( + { + "model.layers.{}.block_sparse_moe.gate.weight": "transformer.h.{l}.mlp.gate.weight", + "model.layers.{}.block_sparse_moe.experts.{}.w1.weight": "transformer.h.{l}.mlp.experts.{e}.fc_1.weight", + "model.layers.{}.block_sparse_moe.experts.{}.w3.weight": "transformer.h.{l}.mlp.experts.{e}.fc_2.weight", + "model.layers.{}.block_sparse_moe.experts.{}.w2.weight": "transformer.h.{l}.mlp.experts.{e}.proj.weight", + } + ) + elif config._mlp_class in ("LLaMAMLP", "GemmaMLP"): + weight_map.update( + { + "model.layers.{}.mlp.gate_proj.weight": "transformer.h.{l}.mlp.fc_1.weight", + "model.layers.{}.mlp.up_proj.weight": "transformer.h.{l}.mlp.fc_2.weight", + "model.layers.{}.mlp.down_proj.weight": "transformer.h.{l}.mlp.proj.weight", + } + ) + else: + raise NotImplementedError for name, param in hf_weights.items(): if "model.layers" in name: - from_name, number = layer_template(name, 2) - qkv = qkv_weights.setdefault(number, [None, None, None]) + from_name, l = layer_template(name, 2) + e = None + if "block_sparse_moe.experts" in name: + from_name, e = layer_template(from_name, 5) + qkv = qkv_weights.setdefault(l, [None, None, None]) if "q_proj" in name: qkv[0] = param elif "k_proj" in name: @@ -146,7 +171,7 @@ def copy_weights_hf_llama( to_name = weight_map[from_name] if to_name is None: continue - to_name = to_name.format(number) + to_name = to_name.format(l=l, e=e) else: to_name = weight_map[name] param = load_param(param, name, dtype) @@ -154,6 +179,10 @@ def copy_weights_hf_llama( param = saver.store_early(param) state_dict[to_name] = param + if "lm_head.weight" not in state_dict: + state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"] + + # convert separate q, k, v matrices into an interleaved qkv for i, (q, k, v) in list(qkv_weights.items()): if q is None or k is None or v is None: # split across different .bin files @@ -170,59 +199,77 @@ def copy_weights_hf_llama( 
state_dict[f"transformer.h.{i}.attn.attn.weight"] = qkv del qkv_weights[i] - def copy_weights_phi( config: Config, + qkv_weights: dict, state_dict: Dict[str, torch.Tensor], hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], saver: Optional[incremental_save] = None, dtype: Optional[torch.dtype] = None, ) -> None: + if any(layer_name.startswith(("layers.", "transformer.")) for layer_name in hf_weights): + raise ValueError( + "You are using an outdated Phi checkpoint. Please reload it as described in 'tutorials/download_phi.md'" + ) + weight_map = { - "layers.0.wte.weight": "transformer.wte.weight", - "layers.{}.ln.bias": "transformer.h.{}.norm_1.bias", - "layers.{}.ln.weight": "transformer.h.{}.norm_1.weight", - "layers.{}.mixer.Wqkv.bias": "transformer.h.{}.attn.attn.bias", - "layers.{}.mixer.Wqkv.weight": "transformer.h.{}.attn.attn.weight", - "layers.{}.mixer.out_proj.bias": "transformer.h.{}.attn.proj.bias", - "layers.{}.mixer.out_proj.weight": "transformer.h.{}.attn.proj.weight", - "layers.{}.mixer.rotary_emb.inv_freq": None, - "layers.{}.mlp.fc1.bias": "transformer.h.{}.mlp.fc.bias", - "layers.{}.mlp.fc1.weight": "transformer.h.{}.mlp.fc.weight", - "layers.{}.mlp.fc2.bias": "transformer.h.{}.mlp.proj.bias", - "layers.{}.mlp.fc2.weight": "transformer.h.{}.mlp.proj.weight", - f"layers.{config.n_layer + 1}.ln.bias": "transformer.ln_f.bias", - f"layers.{config.n_layer + 1}.ln.weight": "transformer.ln_f.weight", - f"layers.{config.n_layer + 1}.linear.weight": "lm_head.weight", - f"layers.{config.n_layer + 1}.linear.bias": "lm_head.bias", + "model.embed_tokens.weight": "transformer.wte.weight", + "model.layers.{}.input_layernorm.weight": "transformer.h.{}.norm_1.weight", + "model.layers.{}.input_layernorm.bias": "transformer.h.{}.norm_1.bias", + "model.layers.{}.self_attn.q_proj.weight": None, + "model.layers.{}.self_attn.q_proj.bias": None, + "model.layers.{}.self_attn.k_proj.weight": None, + "model.layers.{}.self_attn.k_proj.bias": None, + "model.layers.{}.self_attn.v_proj.weight": None, + "model.layers.{}.self_attn.v_proj.bias": None, + "model.layers.{}.self_attn.dense.weight": "transformer.h.{}.attn.proj.weight", + "model.layers.{}.self_attn.dense.bias": "transformer.h.{}.attn.proj.bias", + "model.layers.{}.mlp.fc1.weight": "transformer.h.{}.mlp.fc.weight", + "model.layers.{}.mlp.fc1.bias": "transformer.h.{}.mlp.fc.bias", + "model.layers.{}.mlp.fc2.weight": "transformer.h.{}.mlp.proj.weight", + "model.layers.{}.mlp.fc2.bias": "transformer.h.{}.mlp.proj.bias", + "model.final_layernorm.weight": "transformer.ln_f.weight", + "model.final_layernorm.bias": "transformer.ln_f.bias", + "lm_head.weight": "lm_head.weight", + "lm_head.bias": "lm_head.bias", } for name, param in hf_weights.items(): - if "layers" in name: - from_name, number = layer_template(name, 1) - if number in (0, config.n_layer + 1): - # these are part of the layers in phi, but not in our implementation - to_name = weight_map[name] - else: - to_name = weight_map[from_name] - if to_name is None: - continue - # the phi layer numbering is off by 1 compared to ours - to_name = to_name.format(number - 1) + if name.startswith("model.layers."): + from_name, l = layer_template(name, 2) + qkv = qkv_weights.setdefault(l, defaultdict(dict)) + if any(w in from_name for w in ("q_proj", "k_proj", "v_proj")): + weight_name, weight_type = from_name.split(".")[-2:] + qkv[weight_type][weight_name] = param + to_name = weight_map[from_name] + if to_name is None: + continue + to_name = to_name.format(l) else: to_name = weight_map[name] 
param = load_param(param, name, dtype) - if "Wqkv" in name: - q_per_kv = config.n_head // config.n_query_groups - total_qkv = q_per_kv + 2 # each group has 1+ queries, 1 key, and 1 value - param = param.view(total_qkv, config.n_query_groups, -1).transpose(0, 1) - param = param.reshape(config.n_embd * 3, -1) - if "bias" in name: - param = param.squeeze() if saver is not None: param = saver.store_early(param) state_dict[to_name] = param + for i in list(qkv_weights): + for weight_type in list(qkv_weights[i]): + qkv = qkv_weights[i][weight_type] + if len(qkv) != 3: + # split across different .bin files + continue + q = load_param(qkv["q_proj"], f"layer {i} q {weight_type}", dtype) + k = load_param(qkv["k_proj"], f"layer {i} k {weight_type}", dtype) + v = load_param(qkv["v_proj"], f"layer {i} v {weight_type}", dtype) + q_per_kv = config.n_head // config.n_query_groups + qs = torch.split(q, config.head_size * q_per_kv) + ks = torch.split(k, config.head_size) + vs = torch.split(v, config.head_size) + cycled = [t for group in zip(qs, ks, vs) for t in group] + qkv = torch.cat(cycled) + state_dict[f"transformer.h.{i}.attn.attn.{weight_type}"] = qkv + del qkv_weights[i][weight_type] + def layer_template(layer_name: str, idx: int) -> Tuple[str, int]: split = layer_name.split(".") @@ -263,12 +310,14 @@ def convert_hf_checkpoint( if "falcon" in model_name: copy_fn = partial(copy_weights_falcon, model_name) - elif config._mlp_class == "LLaMAMLP": + elif config._mlp_class in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): # holder to reconstitute the split q, k, v qkv_weights = {} copy_fn = partial(copy_weights_hf_llama, config, qkv_weights) elif "phi" in model_name: - copy_fn = partial(copy_weights_phi, config) + # holder to reconstitute the split q, k, v + qkv_weights = {} + copy_fn = partial(copy_weights_phi, config, qkv_weights) else: copy_fn = copy_weights_gpt_neox diff --git a/scripts/convert_lit_checkpoint.py b/scripts/convert_lit_checkpoint.py index 012aa68..1f04a9c 100644 --- a/scripts/convert_lit_checkpoint.py +++ b/scripts/convert_lit_checkpoint.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
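The conversion loops above rebuild lit-gpt's fused attn.attn matrix by interleaving each group's query block with its key and value rows. A toy illustration of that torch.split/zip/cat pattern, using made-up sizes (n_head=4, n_query_groups=2, head_size=2, n_embd=8) and no lit_gpt code:

import torch

n_head, n_query_groups, head_size, n_embd = 4, 2, 2, 8
q = torch.randn(n_head * head_size, n_embd)
k = torch.randn(n_query_groups * head_size, n_embd)
v = torch.randn(n_query_groups * head_size, n_embd)

q_per_kv = n_head // n_query_groups
qs = torch.split(q, head_size * q_per_kv)  # one query block per key/value group
ks = torch.split(k, head_size)
vs = torch.split(v, head_size)
cycled = [t for group in zip(qs, ks, vs) for t in group]  # q0 k0 v0 q1 k1 v1
qkv = torch.cat(cycled)
assert qkv.shape == ((q_per_kv + 2) * n_query_groups * head_size, n_embd)

The qkv_split helper used by convert_lit_checkpoint.py further below performs the inverse split when exporting back to the Hugging Face layout.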
+ import gc import sys from functools import partial @@ -12,7 +14,7 @@ sys.path.append(str(wd)) from lit_gpt import Config -from lit_gpt.utils import incremental_save, lazy_load +from lit_gpt.utils import CLI, incremental_save, lazy_load from scripts.convert_hf_checkpoint import layer_template, load_param @@ -104,26 +106,48 @@ def copy_weights_llama( config: Config, state_dict: Dict[str, torch.Tensor], lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]], + untie_weights: bool = False, saver: Optional[incremental_save] = None, ) -> None: weight_map = { "transformer.wte.weight": "model.embed_tokens.weight", - "transformer.h.{}.norm_1.weight": "model.layers.{}.input_layernorm.weight", - "transformer.h.{}.attn.proj.weight": "model.layers.{}.self_attn.o_proj.weight", - "transformer.h.{}.norm_2.weight": "model.layers.{}.post_attention_layernorm.weight", - "transformer.h.{}.mlp.fc_1.weight": "model.layers.{}.mlp.gate_proj.weight", - "transformer.h.{}.mlp.fc_2.weight": "model.layers.{}.mlp.up_proj.weight", - "transformer.h.{}.mlp.proj.weight": "model.layers.{}.mlp.down_proj.weight", + "transformer.h.{}.norm_1.weight": "model.layers.{l}.input_layernorm.weight", + "transformer.h.{}.norm_1.bias": "model.layers.{l}.input_layernorm.bias", + "transformer.h.{}.attn.proj.weight": "model.layers.{l}.self_attn.o_proj.weight", + "transformer.h.{}.norm_2.weight": "model.layers.{l}.post_attention_layernorm.weight", + "transformer.h.{}.norm_2.bias": "model.layers.{l}.post_attention_layernorm.bias", "transformer.ln_f.weight": "model.norm.weight", + "transformer.ln_f.bias": "model.norm.bias", "lm_head.weight": "lm_head.weight", } + if config._mlp_class == "LLaMAMoE": + weight_map.update( + { + "transformer.h.{}.mlp.gate.weight": "model.layers.{l}.block_sparse_moe.gate.weight", + "transformer.h.{}.mlp.experts.{}.fc_1.weight": "model.layers.{l}.block_sparse_moe.experts.{e}.w1.weight", + "transformer.h.{}.mlp.experts.{}.fc_2.weight": "model.layers.{l}.block_sparse_moe.experts.{e}.w3.weight", + "transformer.h.{}.mlp.experts.{}.proj.weight": "model.layers.{l}.block_sparse_moe.experts.{e}.w2.weight", + } + ) + elif config._mlp_class in ("LLaMAMLP", "GemmaMLP"): + weight_map.update( + { + "transformer.h.{}.mlp.fc_1.weight": "model.layers.{l}.mlp.gate_proj.weight", + "transformer.h.{}.mlp.fc_2.weight": "model.layers.{l}.mlp.up_proj.weight", + "transformer.h.{}.mlp.proj.weight": "model.layers.{l}.mlp.down_proj.weight", + } + ) + else: + raise NotImplementedError for name, param in lit_weights.items(): + if name == "lm_head.weight" and untie_weights: + continue if name.endswith(".attn.attn.weight"): - from_name, number = layer_template(name, 2) - q = "model.layers.{}.self_attn.q_proj.weight".format(number) - k = "model.layers.{}.self_attn.k_proj.weight".format(number) - v = "model.layers.{}.self_attn.v_proj.weight".format(number) + from_name, l = layer_template(name, 2) + q = "model.layers.{}.self_attn.q_proj.weight".format(l) + k = "model.layers.{}.self_attn.k_proj.weight".format(l) + v = "model.layers.{}.self_attn.v_proj.weight".format(l) qkv = load_param(param, name, None) qp, kp, vp = qkv_split(qkv, config) for to_name, param in zip((q, k, v), (qp, kp, vp)): @@ -132,9 +156,12 @@ def copy_weights_llama( state_dict[to_name] = param else: if "transformer.h" in name: - from_name, number = layer_template(name, 2) + from_name, l = layer_template(name, 2) + e = None + if "mlp.experts" in name: + from_name, e = layer_template(from_name, 5) to_name = weight_map[from_name] - to_name = to_name.format(number) + to_name 
= to_name.format(l=l, e=e) else: to_name = weight_map[name] param = load_param(param, name, None) @@ -150,36 +177,45 @@ def copy_weights_phi( saver: Optional[incremental_save] = None, ) -> None: weight_map = { - "transformer.wte.weight": "layers.0.wte.weight", - "transformer.h.{}.norm_1.bias": "layers.{}.ln.bias", - "transformer.h.{}.norm_1.weight": "layers.{}.ln.weight", - "transformer.h.{}.attn.attn.bias": "layers.{}.mixer.Wqkv.bias", - "transformer.h.{}.attn.attn.weight": "layers.{}.mixer.Wqkv.weight", - "transformer.h.{}.attn.proj.bias": "layers.{}.mixer.out_proj.bias", - "transformer.h.{}.attn.proj.weight": "layers.{}.mixer.out_proj.weight", - "transformer.h.{}.mlp.fc.bias": "layers.{}.mlp.fc1.bias", - "transformer.h.{}.mlp.fc.weight": "layers.{}.mlp.fc1.weight", - "transformer.h.{}.mlp.proj.bias": "layers.{}.mlp.fc2.bias", - "transformer.h.{}.mlp.proj.weight": "layers.{}.mlp.fc2.weight", - "transformer.ln_f.bias": f"layers.{config.n_layer + 1}.ln.bias", - "transformer.ln_f.weight": f"layers.{config.n_layer + 1}.ln.weight", - "lm_head.weight": f"layers.{config.n_layer + 1}.linear.weight", - "lm_head.bias": f"layers.{config.n_layer + 1}.linear.bias", + "transformer.wte.weight": "model.embed_tokens.weight", + "transformer.h.{}.norm_1.weight": "model.layers.{}.input_layernorm.weight", + "transformer.h.{}.norm_1.bias": "model.layers.{}.input_layernorm.bias", + "transformer.h.{}.attn.proj.weight": "model.layers.{}.self_attn.dense.weight", + "transformer.h.{}.attn.proj.bias": "model.layers.{}.self_attn.dense.bias", + "transformer.h.{}.mlp.fc.weight": "model.layers.{}.mlp.fc1.weight", + "transformer.h.{}.mlp.fc.bias": "model.layers.{}.mlp.fc1.bias", + "transformer.h.{}.mlp.proj.weight": "model.layers.{}.mlp.fc2.weight", + "transformer.h.{}.mlp.proj.bias": "model.layers.{}.mlp.fc2.bias", + "transformer.ln_f.weight": "model.final_layernorm.weight", + "transformer.ln_f.bias": "model.final_layernorm.bias", + "lm_head.weight": "lm_head.weight", + "lm_head.bias": "lm_head.bias", } for name, param in lit_weights.items(): - if "transformer.h" in name: - from_name, number = layer_template(name, 2) - to_name = weight_map[from_name] - to_name = to_name.format(number + 1) + if name.endswith((".attn.attn.weight", ".attn.attn.bias")): + from_name, l = layer_template(name, 2) + weight_type = name.split(".")[-1] # weight or bias + q = f"model.layers.{l}.self_attn.q_proj.{weight_type}" + k = f"model.layers.{l}.self_attn.k_proj.{weight_type}" + v = f"model.layers.{l}.self_attn.v_proj.{weight_type}" + qkv = load_param(param, name, None) + qp, kp, vp = qkv_split(qkv, config) + for to_name, param in zip((q, k, v), (qp, kp, vp)): + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param else: - to_name = weight_map[name] - param = load_param(param, name, None) - if "attn.attn." 
in name: - param = torch.cat(qkv_split(param, config)) - if saver is not None: - param = saver.store_early(param) - state_dict[to_name] = param + if "transformer.h" in name: + from_name, l = layer_template(name, 2) + to_name = weight_map[from_name] + to_name = to_name.format(l) + else: + to_name = weight_map[name] + param = load_param(param, name, None) + if saver is not None: + param = saver.store_early(param) + state_dict[to_name] = param def qkv_split( @@ -211,10 +247,15 @@ def check_conversion_supported(lit_weights: Dict[str, torch.Tensor]) -> None: def convert_lit_checkpoint(checkpoint_path: Path, output_path: Path, config_path: Path) -> None: config = Config.from_json(config_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + if "falcon" in config.name: copy_fn = partial(copy_weights_falcon, config.name) - elif config._mlp_class == "LLaMAMLP": - copy_fn = partial(copy_weights_llama, config) + elif config._mlp_class in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"): + untie_weights = "Gemma" in config.name + copy_fn = partial(copy_weights_llama, config, untie_weights=untie_weights) + elif "phi" in config.name: + copy_fn = partial(copy_weights_phi, config) else: copy_fn = copy_weights_gpt_neox @@ -230,6 +271,4 @@ def convert_lit_checkpoint(checkpoint_path: Path, output_path: Path, config_path if __name__ == "__main__": - from jsonargparse import CLI - - CLI(convert_lit_checkpoint, as_positional=False) + CLI(convert_lit_checkpoint) diff --git a/scripts/convert_pretrained_checkpoint.py b/scripts/convert_pretrained_checkpoint.py new file mode 100644 index 0000000..b32103e --- /dev/null +++ b/scripts/convert_pretrained_checkpoint.py @@ -0,0 +1,78 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +import json +import shutil +import sys +from dataclasses import asdict +from pathlib import Path + +import torch + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt import Config +from lit_gpt.utils import CLI, incremental_save + + +@torch.inference_mode() +def convert_checkpoint(checkpoint_file: Path, tokenizer_dir: Path, config_name: str, output_dir: Path) -> None: + """Convert a checkpoint after pretraining. + + The pretrained checkpoint contains optimizer states and several other metadata that are not needed after training + is finished. This script will export the state-dict of the model and place it in the chosen output folder together + with the tokenizer and model config, which then can be loaded by other scripts for inference, evaluation, etc. + + Args: + checkpoint_file: Path to a checkpoint file scripts produced by the scripts in ``lit_gpt/pretrain/``. + tokenizer_dir: A path to the folder that holds the tokenizer configuration files that were used to train + the model. All files with a name starting with 'tokenizer' will be copied to the output folder. + config_name: The name of the model loaded with the ``lit_gpt.Config``. The configuration will be saved as a + JSON file to the output folder. + output_dir: The output folder where model state-dict file, the tokenizer config file, and the model config + file will be saved. + """ + + if output_dir.is_dir() and output_dir.glob("*"): + raise FileExistsError( + f"The output folder exists and is not empty: {str(output_dir)}." + " Please delete it first or choose a different name." 
+ ) + if not tokenizer_dir.is_dir(): + raise FileNotFoundError(f"The tokenizer_dir must be a directory: {str(output_dir)}.") + + output_dir.mkdir(parents=True) + output_checkpoint_file = output_dir / "lit_model.pth" + output_config_file = output_dir / "lit_config.json" + + # Save the config to output folder + config = Config.from_name(config_name) + with open(output_config_file, "w") as json_config: + json.dump(asdict(config), json_config) + + # Export the tokenizer configuration to output folder + for tokenizer_file in tokenizer_dir.glob("tokenizer*"): + shutil.copyfile(tokenizer_file, output_dir / tokenizer_file.name) + + # Copy config for tokenization if found + if (tokenizer_dir / "generation_config.json").is_file(): + shutil.copyfile(tokenizer_dir / "generation_config.json", output_dir / "generation_config.json") + + # Extract the model state dict and save to output folder + with incremental_save(output_checkpoint_file) as saver: + print("Processing", checkpoint_file) + full_checkpoint = torch.load(str(checkpoint_file), mmap=True) + loaded_state_dict = full_checkpoint["model"] + converted_state_dict = {} + for param_name, param in loaded_state_dict.items(): + saver.store_early(param) + # remove prefix for compiled model (if any) + param_name = param_name.replace("_orig_mod.", "") + converted_state_dict[param_name] = param + print(f"Saving converted checkpoint to {str(output_checkpoint_file)}.") + saver.save(converted_state_dict) + + +if __name__ == "__main__": + CLI(convert_checkpoint) diff --git a/scripts/download.py b/scripts/download.py index a971141..7bb5883 100644 --- a/scripts/download.py +++ b/scripts/download.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import os import sys from pathlib import Path @@ -10,16 +12,23 @@ wd = Path(__file__).parent.parent.resolve() sys.path.append(str(wd)) +from lit_gpt.utils import CLI + _SAFETENSORS_AVAILABLE = RequirementCache("safetensors") +_HF_TRANSFER_AVAILABLE = RequirementCache("hf_transfer") def download_from_hub( - repo_id: Optional[str] = None, access_token: Optional[str] = os.getenv("HF_TOKEN"), from_safetensors: bool = False + repo_id: Optional[str] = None, + access_token: Optional[str] = os.getenv("HF_TOKEN"), + from_safetensors: bool = False, + tokenizer_only: bool = False, + checkpoint_dir: Path = Path("checkpoints"), ) -> None: if repo_id is None: from lit_gpt.config import configs - options = [f"{config['org']}/{config['name']}" for config in configs] + options = [f"{config['hf_config']['org']}/{config['hf_config']['name']}" for config in configs] print("Please specify --repo_id . 
Available values:") print("\n".join(options)) return @@ -33,15 +42,28 @@ def download_from_hub( " https://huggingface.co/settings/tokens" ) - download_files = ["tokenizer*", "generation_config.json"] - if from_safetensors: - if not _SAFETENSORS_AVAILABLE: - raise ModuleNotFoundError(str(_SAFETENSORS_AVAILABLE)) - download_files.append("*.safetensors") - else: - download_files.append("*.bin*") + download_files = ["tokenizer*", "generation_config.json", "config.json"] + if not tokenizer_only: + if from_safetensors: + if not _SAFETENSORS_AVAILABLE: + raise ModuleNotFoundError(str(_SAFETENSORS_AVAILABLE)) + download_files.append("*.safetensors") + else: + # covers `.bin` files and `.bin.index.json` + download_files.append("*.bin*") + elif from_safetensors: + raise ValueError("`--from_safetensors=True` won't have an effect with `--tokenizer_only=True`") + + import huggingface_hub._snapshot_download as download + import huggingface_hub.constants as constants - directory = Path("checkpoints") / repo_id + previous = constants.HF_HUB_ENABLE_HF_TRANSFER + if _HF_TRANSFER_AVAILABLE and not previous: + print("Setting HF_HUB_ENABLE_HF_TRANSFER=1") + constants.HF_HUB_ENABLE_HF_TRANSFER = True + download.HF_HUB_ENABLE_HF_TRANSFER = True + + directory = checkpoint_dir / repo_id snapshot_download( repo_id, local_dir=directory, @@ -51,6 +73,9 @@ def download_from_hub( token=access_token, ) + constants.HF_HUB_ENABLE_HF_TRANSFER = previous + download.HF_HUB_ENABLE_HF_TRANSFER = previous + # convert safetensors to PyTorch binaries if from_safetensors: from safetensors import SafetensorError @@ -69,6 +94,4 @@ def download_from_hub( if __name__ == "__main__": - from jsonargparse import CLI - CLI(download_from_hub) diff --git a/scripts/merge_lora.py b/scripts/merge_lora.py index d72f2f5..e1c0217 100644 --- a/scripts/merge_lora.py +++ b/scripts/merge_lora.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + """This script merges the LoRA weights with the base model""" import sys @@ -12,7 +14,7 @@ sys.path.append(str(wd)) from lit_gpt.lora import GPT, Config, lora_filter, merge_lora_weights -from lit_gpt.utils import check_valid_checkpoint_dir, get_default_supported_precision, lazy_load +from lit_gpt.utils import CLI, check_valid_checkpoint_dir, get_default_supported_precision, lazy_load lora_r = 16 lora_alpha = 32 @@ -24,12 +26,20 @@ lora_mlp = True lora_head = True - def merge_lora( lora_path: Path = Path("out/lora/alpaca/lit_model_lora_finetuned.pth"), checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), out_dir: Path = Path("out/lora/checkpoint"), precision: Optional[str] = None, + lora_r: int = 8, + lora_alpha: int = 16, + lora_dropout: float = 0.05, + lora_query: bool = True, + lora_key: bool = False, + lora_value: bool = True, + lora_projection: bool = False, + lora_mlp: bool = False, + lora_head: bool = False, ) -> None: """Generates a response based on a given instruction and an optional input. This script will only work with checkpoints from the instruction-tuned GPT-LoRA model. @@ -42,11 +52,12 @@ def merge_lora( out_dir: The path to the merged model that is created by this script. precision: Indicates the Fabric precision setting to use. 
""" + check_valid_checkpoint_dir(checkpoint_dir) + out_dir.mkdir(parents=True, exist_ok=True) + precision = precision or get_default_supported_precision(training=False) fabric = L.Fabric(devices=1, precision=precision) - check_valid_checkpoint_dir(checkpoint_dir) - config = Config.from_json( checkpoint_dir / "lit_config.json", r=lora_r, @@ -78,6 +89,4 @@ def merge_lora( if __name__ == "__main__": - from jsonargparse import CLI - CLI(merge_lora) diff --git a/scripts/prepare_alpaca.py b/scripts/prepare_alpaca.py index a7af4ac..61ca7bf 100644 --- a/scripts/prepare_alpaca.py +++ b/scripts/prepare_alpaca.py @@ -1,11 +1,14 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + """Implementation derived from https://github.com/tloen/alpaca-lora""" + import json import sys from pathlib import Path from typing import Optional -import requests import torch +from lightning_utilities.core.imports import RequirementCache from torch.utils.data import random_split from tqdm import tqdm @@ -14,6 +17,7 @@ sys.path.append(str(wd)) from lit_gpt.tokenizer import Tokenizer +from lit_gpt.utils import CLI def prepare( @@ -87,6 +91,11 @@ def download_if_missing(file_path: Path, file_url: str) -> None: """Downloads the raw json data file and saves it in the given destination.""" if file_path.exists() and file_path.stat().st_size > 0: return + requests_available = RequirementCache("requests") + if not requests_available: + raise ModuleNotFoundError(str(requests_available)) + import requests + with open(file_path, "w", encoding="utf-8") as f: f.write(requests.get(file_url).text) @@ -118,12 +127,7 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in if mask_inputs: labels[: len(encoded_full_prompt)] = ignore_index - return { - **example, - "input_ids": encoded_full_prompt_and_response, - "input_ids_no_response": encoded_full_prompt, - "labels": labels, - } + return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} def generate_prompt(example: dict) -> str: @@ -144,6 +148,4 @@ def generate_prompt(example: dict) -> str: if __name__ == "__main__": - from jsonargparse import CLI - CLI(prepare) diff --git a/scripts/prepare_csv.py b/scripts/prepare_csv.py index fe5ad7a..89dd43f 100644 --- a/scripts/prepare_csv.py +++ b/scripts/prepare_csv.py @@ -1,8 +1,10 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import json import logging import sys from pathlib import Path -from typing import Optional +from typing import Optional, Tuple import torch from torch.utils.data import random_split @@ -14,8 +16,7 @@ sys.path.append(str(wd)) from lit_gpt.tokenizer import Tokenizer - -COLUMNS = ("instruction", "input", "output") +from lit_gpt.utils import CLI def prepare( @@ -27,6 +28,7 @@ def prepare( mask_inputs: bool = False, ignore_index: int = -1, max_seq_length: Optional[int] = None, + columns: Tuple[str, ...] = ("instruction", "input", "output"), ) -> None: """Prepare a CSV dataset for instruction tuning. 
@@ -43,8 +45,8 @@ def prepare( import pandas as pd df = pd.read_csv(csv_path, dtype=str).fillna("") - if not (df.columns.values == COLUMNS).all(): - raise ValueError(f"CSV columns must be {COLUMNS}, found {df.columns.values}") + if not (df.columns.values == columns).all(): + raise ValueError(f"CSV columns must be {columns}, found {df.columns.values}") data = json.loads(df.to_json(orient="records", indent=4)) print("Loading tokenizer...") @@ -113,12 +115,7 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in if mask_inputs: labels[: len(encoded_full_prompt)] = ignore_index - return { - **example, - "input_ids": encoded_full_prompt_and_response, - "input_ids_no_response": encoded_full_prompt, - "labels": labels, - } + return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} def generate_prompt(example: dict) -> str: @@ -139,6 +136,4 @@ def generate_prompt(example: dict) -> str: if __name__ == "__main__": - from jsonargparse import CLI - - CLI(prepare, as_positional=False) + CLI(prepare) diff --git a/scripts/prepare_dolly.py b/scripts/prepare_dolly.py index 2ae57b4..56da37c 100644 --- a/scripts/prepare_dolly.py +++ b/scripts/prepare_dolly.py @@ -1,10 +1,12 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + """Implementation derived from https://github.com/tloen/alpaca-lora""" + import json import sys from pathlib import Path from typing import Optional -import requests import torch from torch.utils.data import random_split from tqdm import tqdm @@ -14,6 +16,8 @@ sys.path.append(str(wd)) from lit_gpt.tokenizer import Tokenizer +from lit_gpt.utils import CLI +from scripts.prepare_alpaca import download_if_missing def prepare( @@ -89,15 +93,7 @@ def prepare( torch.save(test_set, destination_path / "test.pt") -def download_if_missing(file_path: Path, file_url: str) -> None: - """Downloads the raw json data file and saves it in the given destination.""" - if file_path.exists(): - return - with open(file_path, "w", encoding="utf-8") as f: - f.write(requests.get(file_url).text) - - -def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> None: +def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict: """Processes a single sample. Each sample in the dataset consists of: @@ -124,12 +120,7 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in if mask_inputs: labels[: len(encoded_full_prompt)] = ignore_index - return { - **example, - "input_ids": encoded_full_prompt_and_response, - "input_ids_no_response": encoded_full_prompt, - "labels": labels, - } + return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} def generate_prompt(example: dict) -> str: @@ -150,6 +141,4 @@ def generate_prompt(example: dict) -> str: if __name__ == "__main__": - from jsonargparse import CLI - CLI(prepare) diff --git a/scripts/prepare_flan.py b/scripts/prepare_flan.py new file mode 100644 index 0000000..59d3a7f --- /dev/null +++ b/scripts/prepare_flan.py @@ -0,0 +1,232 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ +"""Implementation derived from https://github.com/tloen/alpaca-lora""" +import json +import sys +from pathlib import Path +from typing import Optional + +import torch +from tqdm import tqdm + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt.tokenizer import Tokenizer +from lit_gpt.utils import CLI +from scripts.prepare_alpaca import download_if_missing + + +def load_jsonl(filename): + data = [] + with open(filename, "r", encoding="utf-8") as f: + for line in f: + data.append(json.loads(line)) + return data + + +def prepare( + destination_path: Path = Path("data/flan"), + checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), + mask_inputs: bool = False, # as in alpaca-lora + subsets: Optional[str] = None, + ignore_index: int = -1, + max_seq_length: Optional[int] = None, +) -> None: + """Prepare the FLAN-collection datasets for instruction tuning. + + The output is a training and test dataset saved as `train.pt` and `test.pt`, + which stores the preprocessed and tokenized prompts and labels. + + Since the original test set does not have responses, the validation set + is used as the test set. + """ + + supported_subsets = { + "aeslc_10templates", + "ag_news_subset_10templates", + "anli_r1_10templates", + "anli_r2_10templates", + "anli_r3_10templates", + "arc_challenge_10templates", + "arc_easy_10templates", + "bool_q_10templates", + "cb_10templates", + "cnn_dailymail_10templates", + "cola_10templates", + "common_gen_10templates", + "copa_10templates", + "coqa_10templates", + "cosmos_qa_10templates", + "dart_10templates", + "definite_pronoun_resolution_10templates", + "drop_10templates", + "e2e_nlg_10templates", + "fix_punct_10templates", + "gigaword_10templates", + "glue_mrpc_10templates", + "glue_qqp_10templates", + "hellaswag_10templates", + "imdb_reviews_10templates", + "math_dataset_10templates", + "mnli_matched_10templates", + "mnli_mismatched_10templates", + "multi_news_10templates", + "multirc_10templates", + "natural_questions_10templates", + "openbookqa_10templates", + "opinion_abstracts_idebate_10templates", + "opinion_abstracts_rotten_tomatoes_10templates", + "para_crawl_enes_10templates", + "paws_wiki_10templates", + "piqa_10templates", + "qnli_10templates", + "quac_10templates", + "record_10templates", + "rte_10templates", + "samsum_10templates", + "sentiment140_10templates", + "snli_10templates", + "squad_v1_10templates", + "squad_v2_10templates", + "sst2_10templates", + "story_cloze_10templates", + "stsb_10templates", + "trec_10templates", + "trivia_qa_10templates", + "true_case_10templates", + "web_nlg_en_10templates", + "wic_10templates", + "wiki_lingua_english_en_10templates", + "wmt14_enfr_10templates", + "wmt16_translate_csen_10templates", + "wmt16_translate_deen_10templates", + "wmt16_translate_fien_10templates", + "wmt16_translate_roen_10templates", + "wmt16_translate_ruen_10templates", + "wmt16_translate_tren_10templates", + "wnli_10templates", + "word_segment_10templates", + "wsc_10templates", + "yelp_polarity_reviews_10templates", + } + + if subsets is not None: + subsets = subsets.split(",") + for sub in subsets: + if sub not in supported_subsets: + raise ValueError(f"{sub} not in {supported_subsets}") + else: + subsets = list(supported_subsets) + + if max_seq_length is None: + with open(checkpoint_dir / "lit_config.json", "r", encoding="utf-8") as file: + config = json.load(file) + max_seq_length = config["block_size"] + + 
destination_path.mkdir(parents=True, exist_ok=True) + print("Loading data file...") + + base_url = "https://huggingface.co/datasets/Muennighoff/flan/resolve/main/" + + train_set, test_set = [], [] + for sub in subsets: + train_sub = sub + "_train" + data_file_name = train_sub + ".jsonl" + data_file_path = destination_path / data_file_name + data_file_url = base_url + "train/" + data_file_name + + print(f"Loading training data file {sub}...") + download_if_missing(data_file_path, data_file_url) + sub_train_set = load_jsonl(data_file_path) + train_set.extend(sub_train_set) + + test_sub = sub + "_test" + data_file_name = test_sub + ".jsonl" + data_file_path = destination_path / data_file_name + data_file_url = base_url + "test/" + data_file_name + + print(f"Loading test data file {sub}...") + download_if_missing(data_file_path, data_file_url) + sub_test_set = load_jsonl(data_file_path) + test_set.extend(sub_test_set) + + print("Loading tokenizer...") + tokenizer = Tokenizer(checkpoint_dir) + + train_set, test_set = list(train_set), list(test_set) + + print(f"train has {len(train_set):,} samples") + print(f"test has {len(test_set):,} samples") + + print("Processing train split ...") + train_set = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + mask_inputs=mask_inputs, + ignore_index=ignore_index, + ) + for sample in tqdm(train_set) + ] + torch.save(train_set, destination_path / "train.pt") + + print("Processing test split ...") + test_set = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + mask_inputs=mask_inputs, + ignore_index=ignore_index, + ) + for sample in tqdm(test_set) + ] + torch.save(test_set, destination_path / "test.pt") + + +def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int): + """Processes a single sample. + + Each sample in the dataset consists of: + - instruction: A string describing the task + - input: A string holding a special input value for the instruction. + This only applies to some samples, and in others this is empty. + - output: The response string + + This function processes this data to produce a prompt text and a label for + supervised training. The prompt text is formed as a single message including both + the instruction and the input. The label/target is the same message but with the + response attached. + + Finally, both the prompt and the label get tokenized. If desired, all tokens + in the label that correspond to the original input prompt get masked out (default). + """ + full_prompt = generate_prompt(example) + full_prompt_and_response = full_prompt + example["targets"] + encoded_full_prompt = tokenizer.encode(full_prompt, max_length=max_length) + encoded_full_prompt_and_response = tokenizer.encode(full_prompt_and_response, eos=True, max_length=max_length) + + # The labels are the full prompt with response, but with the prompt masked out + labels = encoded_full_prompt_and_response.clone() + if mask_inputs: + labels[: len(encoded_full_prompt)] = ignore_index + + return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} + + +def generate_prompt(example): + """Generates a standardized message to prompt the model with an instruction, optional input and a + 'response' field.""" + + return ( + "Below is an instruction that describes a task. 
" + "Write a response that appropriately completes the request.\n\n" + f"### Instruction:\n{example['inputs']}\n\n### Response:" + ) + + +if __name__ == "__main__": + CLI(prepare) diff --git a/scripts/prepare_lima.py b/scripts/prepare_lima.py index d234568..ca35e62 100644 --- a/scripts/prepare_lima.py +++ b/scripts/prepare_lima.py @@ -1,4 +1,7 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + """Implementation derived from https://github.com/tloen/alpaca-lora""" + import json import os import sys @@ -14,6 +17,7 @@ sys.path.append(str(wd)) from lit_gpt.tokenizer import Tokenizer +from lit_gpt.utils import CLI def prepare( @@ -51,7 +55,7 @@ def prepare( from datasets import load_dataset - dataset = load_dataset(data_repo_id, use_auth_token=access_token) + dataset = load_dataset(data_repo_id, token=access_token) train_data = format_dataset(dataset["train"], include_multiturn_conversations) # test set is present but doesn't have any solutions, so we cannot use it here @@ -140,12 +144,7 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in if mask_inputs: labels[: len(encoded_full_prompt)] = ignore_index - return { - **example, - "input_ids": encoded_full_prompt_and_response, - "input_ids_no_response": encoded_full_prompt, - "labels": labels, - } + return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} def generate_prompt(example: dict) -> str: @@ -166,6 +165,4 @@ def generate_prompt(example: dict) -> str: if __name__ == "__main__": - from jsonargparse import CLI - CLI(prepare) diff --git a/scripts/prepare_longform.py b/scripts/prepare_longform.py index 30fde72..2a46e7d 100644 --- a/scripts/prepare_longform.py +++ b/scripts/prepare_longform.py @@ -1,10 +1,12 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + """Implementation derived from https://github.com/tloen/alpaca-lora""" + import json import sys from pathlib import Path from typing import Optional -import requests import torch from tqdm import tqdm @@ -13,12 +15,13 @@ sys.path.append(str(wd)) from lit_gpt.tokenizer import Tokenizer +from lit_gpt.utils import CLI +from scripts.prepare_alpaca import download_if_missing def prepare( destination_path: Path = Path("data/longform"), checkpoint_dir: Path = Path("checkpoints/stabilityai/stablelm-base-alpha-3b"), - seed: int = 42, mask_inputs: bool = False, # as in alpaca-lora ignore_index: int = -1, max_seq_length: Optional[int] = None, @@ -88,14 +91,6 @@ def prepare( torch.save(test_data, destination_path / "test.pt") -def download_if_missing(file_path: Path, file_url: str) -> None: - """Downloads the raw json data file and saves it in the given destination.""" - if file_path.exists() and file_path.stat().st_size > 0: - return - with open(file_path, "w", encoding="utf-8") as f: - f.write(requests.get(file_url).text) - - def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict: """Processes a single sample. 
@@ -123,12 +118,7 @@ def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_in if mask_inputs: labels[: len(encoded_full_prompt)] = ignore_index - return { - **example, - "input_ids": encoded_full_prompt_and_response, - "input_ids_no_response": encoded_full_prompt, - "labels": labels, - } + return {**example, "input_ids": encoded_full_prompt_and_response, "labels": labels} def generate_prompt(example: dict) -> str: @@ -143,6 +133,4 @@ def generate_prompt(example: dict) -> str: if __name__ == "__main__": - from jsonargparse import CLI - CLI(prepare) diff --git a/scripts/prepare_openwebtext.py b/scripts/prepare_openwebtext.py index 7e70999..2578ab9 100644 --- a/scripts/prepare_openwebtext.py +++ b/scripts/prepare_openwebtext.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + # saves the openwebtext dataset to a binary file for training. following was helpful: # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py import os @@ -13,6 +15,7 @@ sys.path.append(str(wd)) from lit_gpt import Tokenizer +from lit_gpt.utils import CLI def prepare( @@ -75,6 +78,4 @@ def process(example): if __name__ == "__main__": - from jsonargparse import CLI - CLI(prepare) diff --git a/scripts/prepare_redpajama.py b/scripts/prepare_redpajama.py index b44a504..f2c87a3 100644 --- a/scripts/prepare_redpajama.py +++ b/scripts/prepare_redpajama.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import glob import json import os @@ -13,6 +15,7 @@ import lit_gpt.packed_dataset as packed_dataset from lit_gpt import Config, Tokenizer +from lit_gpt.utils import CLI filenames_sample = [ "arxiv_sample.jsonl", @@ -147,7 +150,7 @@ def prepare( match: str = "", ) -> None: """Prepare the "Red Pajama" dataset. We assume tokenizer has been trained.""" - config = Config.from_json(checkpoint_dir / "lit_config.json") + config = Config.from_checkpoint(checkpoint_dir) prepare_fn = prepare_sample if sample else prepare_full prepare_fn( @@ -160,6 +163,4 @@ def prepare( if __name__ == "__main__": - from jsonargparse import CLI - CLI(prepare) diff --git a/scripts/prepare_slimpajama.py b/scripts/prepare_slimpajama.py new file mode 100644 index 0000000..7a83316 --- /dev/null +++ b/scripts/prepare_slimpajama.py @@ -0,0 +1,63 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
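Each of these scripts now ends with CLI(prepare) imported from lit_gpt.utils rather than importing jsonargparse locally. A rough sketch of the pattern, assuming lit_gpt.utils.CLI behaves like the jsonargparse.CLI calls it replaces; the demo function and flags below are made up:

from pathlib import Path


def prepare(destination_path: Path = Path("data/demo"), sample: bool = False) -> None:
    print(destination_path, sample)


if __name__ == "__main__":
    from jsonargparse import CLI

    # keyword arguments become flags, e.g.
    #   python demo.py --destination_path data/other --sample true
    CLI(prepare)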
+ +import json +import os +import sys +import time +from pathlib import Path + +import zstandard as zstd +from lightning.data.streaming import DataChunkRecipe, DataProcessor + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt import Tokenizer +from lit_gpt.utils import CLI + + +class SlimPajamaDataRecipe(DataChunkRecipe): + def __init__(self, tokenizer: Tokenizer, chunk_size: int): + super().__init__(chunk_size) + self.tokenizer = tokenizer + + def prepare_structure(self, input_dir): + files = Path(input_dir).rglob("*.zst") + return [str(file) for file in files] + + def prepare_item(self, filepath): + with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: + for row in f: + text = json.loads(row)["text"] + if json.loads(row)["meta"]["redpajama_set_name"] == "RedPajamaGithub": + continue # exclude the GitHub data since it overlaps with starcoder + text_ids = self.tokenizer.encode(text, bos=False, eos=True) + yield text_ids + + +def prepare( + input_dir: Path = Path("data/SlimPajama-627B/train"), + output_dir: Path = Path("data/slimpajama/train"), + tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), + chunk_size: int = (2049 * 16384), + fast_dev_run: bool = False, +) -> None: + tokenizer = Tokenizer(tokenizer_path) + data_recipe = SlimPajamaDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) + data_processor = DataProcessor( + input_dir=str(input_dir), + output_dir=str(output_dir), + fast_dev_run=fast_dev_run, + num_workers=os.cpu_count(), + num_downloaders=1, + ) + + start_time = time.time() + data_processor.run(data_recipe) + elapsed_time = time.time() - start_time + print(f"Time taken: {elapsed_time:.2f} seconds") + + +if __name__ == "__main__": + CLI(prepare) diff --git a/scripts/prepare_starcoder.py b/scripts/prepare_starcoder.py new file mode 100644 index 0000000..ea260eb --- /dev/null +++ b/scripts/prepare_starcoder.py @@ -0,0 +1,74 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
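A minimal sketch of the compressed-JSONL streaming that SlimPajamaDataRecipe.prepare_item performs, with a hypothetical input file and the tokenizer left out:

import json

import zstandard as zstd

# hypothetical shard with one {"text": ..., "meta": {...}} JSON object per line
with zstd.open(open("example.jsonl.zst", "rb"), "rt", encoding="utf-8") as f:
    for row in f:
        record = json.loads(row)
        if record["meta"]["redpajama_set_name"] == "RedPajamaGithub":
            continue  # the GitHub subset overlaps with starcoderdata, so it is skipped
        text = record["text"]
        # `text` is what the recipe feeds to Tokenizer.encode(text, bos=False, eos=True)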
+ +import os +import sys +import time +import traceback +from pathlib import Path + +import pyarrow.parquet as pq +from lightning.data.streaming import DataChunkRecipe, DataProcessor + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +from lit_gpt import Tokenizer +from lit_gpt.utils import CLI + + +class StarcoderDataRecipe(DataChunkRecipe): + def __init__(self, tokenizer: Tokenizer, chunk_size: int): + super().__init__(chunk_size) + self.tokenizer = tokenizer + + def prepare_structure(self, input_dir): + files = Path(input_dir).rglob("*.parquet") + return [str(file) for file in files] + + def prepare_item(self, item_metadata): + filepath = item_metadata + start = time.time() + + try: + parquet_file = pq.ParquetFile(filepath) + # reduce RAM usage + for batch in parquet_file.iter_batches(batch_size=8192, columns=["content"]): + for text in batch.to_pandas()["content"]: + yield self.tokenizer.encode(text, bos=False, eos=True) + + except Exception: + print(traceback.format_exc()) + print(f"Error reading {filepath}") + return + + parquet_file.close() + end = time.time() + print(f"Took {end - start:.2f} seconds total", filepath) + + +def prepare( + input_dir: Path = Path("data/starcoderdata"), + output_dir: Path = Path("data/starcoder"), + tokenizer_path: Path = Path("checkpoints/Llama-2-7b-hf/"), + chunk_size: int = (2049 * 8192), + fast_dev_run: bool = False, +) -> None: + tokenizer = Tokenizer(tokenizer_path) + data_recipe = StarcoderDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size) + data_processor = DataProcessor( + input_dir=str(input_dir), + output_dir=str(output_dir), + fast_dev_run=fast_dev_run, + num_workers=os.cpu_count(), + num_downloaders=1, + ) + + start_time = time.time() + data_processor.run(data_recipe) + elapsed_time = time.time() - start_time + print(f"Time taken: {elapsed_time:.2f} seconds") + + +if __name__ == "__main__": + CLI(prepare) diff --git a/setup.py b/setup.py index 3720f56..30393cf 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import os from setuptools import find_packages, setup @@ -14,8 +16,8 @@ author="Lightning AI", url="https://github.com/lightning-AI/lit-gpt", install_requires=[ - "torch>=2.1.0", - "lightning @ git+https://github.com/Lightning-AI/lightning@71aed751f7f0ca8422ddca256e602099070f490b", + "torch>=2.2.0", + "lightning @ git+https://github.com/Lightning-AI/lightning@ed367ca675861cdf40dbad2e4d66f7eee2ec50af", ], packages=find_packages(), long_description=readme, diff --git a/tests/conftest.py b/tests/conftest.py index 456256b..3414ce4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,13 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
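The starcoder recipe above reads each parquet shard in batches to keep memory bounded; a minimal sketch with a hypothetical file:

import pyarrow.parquet as pq

parquet_file = pq.ParquetFile("sample.parquet")  # hypothetical shard with a "content" column
for batch in parquet_file.iter_batches(batch_size=8192, columns=["content"]):
    for text in batch.to_pandas()["content"]:
        pass  # each `text` becomes one tokenized document in the output chunks
parquet_file.close()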
+ import os import sys from pathlib import Path +from typing import List import pytest import torch +from lightning.fabric.utilities.testing import _runif_reasons wd = Path(__file__).parent.parent.absolute() @@ -34,3 +38,85 @@ def __eq__(self, other): @pytest.fixture() def tensor_like(): return TensorLike() + + +class FloatLike: + def __eq__(self, other): + return not isinstance(other, int) and isinstance(other, float) + + +@pytest.fixture() +def float_like(): + return FloatLike() + + +@pytest.fixture(autouse=True) +def restore_default_dtype(): + # just in case + torch.set_default_dtype(torch.float32) + + +class MockTokenizer: + """A dummy tokenizer that encodes each character as its ASCII code.""" + def encode(self, text: str, eos: bool = False, max_length: int = -1) -> torch.Tensor: + output = [ord(c) for c in text] + if eos: + output.append(1) + output = output[:max_length] if max_length > 0 else output + return torch.tensor(output) + + def decode(self, tokens: torch.Tensor) -> str: + return "".join(chr(int(t)) for t in tokens.tolist()) + + +@pytest.fixture() +def mock_tockenizer(): + return MockTokenizer() + + +def RunIf(**kwargs): + reasons, marker_kwargs = _runif_reasons(**kwargs) + return pytest.mark.skipif(condition=len(reasons) > 0, reason=f"Requires: [{' + '.join(reasons)}]", **marker_kwargs) + + +# https://github.com/Lightning-AI/lightning/blob/6e517bd55b50166138ce6ab915abd4547702994b/tests/tests_fabric/conftest.py#L140 +def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.Config) -> None: + initial_size = len(items) + conditions = [] + filtered, skipped = 0, 0 + + options = {"standalone": "PL_RUN_STANDALONE_TESTS", "min_cuda_gpus": "PL_RUN_CUDA_TESTS"} + if os.getenv(options["standalone"], "0") == "1" and os.getenv(options["min_cuda_gpus"], "0") == "1": + # special case: we don't have a CPU job for standalone tests, so we shouldn't run only cuda tests. 
+ # by deleting the key, we avoid filtering out the CPU tests + del options["min_cuda_gpus"] + + for kwarg, env_var in options.items(): + # this will compute the intersection of all tests selected per environment variable + if os.getenv(env_var, "0") == "1": + conditions.append(env_var) + for i, test in reversed(list(enumerate(items))): # loop in reverse, since we are going to pop items + already_skipped = any(marker.name == "skip" for marker in test.own_markers) + if already_skipped: + # the test was going to be skipped anyway, filter it out + items.pop(i) + skipped += 1 + continue + has_runif_with_kwarg = any( + marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers + ) + if not has_runif_with_kwarg: + # the test has `@RunIf(kwarg=True)`, filter it out + items.pop(i) + filtered += 1 + + if config.option.verbose >= 0 and (filtered or skipped): + writer = config.get_terminal_writer() + writer.write( + f"\nThe number of tests has been filtered from {initial_size} to {initial_size - filtered} after the" + f" filters {conditions}.\n{skipped} tests are marked as unconditional skips.\nIn total," + f" {len(items)} tests will run.\n", + flush=True, + bold=True, + purple=True, # oh yeah, branded pytest messages + ) diff --git a/tests/data/__init__.py b/tests/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/conftest.py b/tests/data/conftest.py new file mode 100644 index 0000000..f048f5d --- /dev/null +++ b/tests/data/conftest.py @@ -0,0 +1,28 @@ +import shutil +from pathlib import Path + +import pytest + + +@pytest.fixture() +def alpaca_path(tmp_path): + file = Path(__file__).parent / "fixtures" / "alpaca.json" + shutil.copyfile(file, tmp_path / "alpaca.json") + return tmp_path / "alpaca.json" + + +@pytest.fixture() +def dolly_path(tmp_path): + file = Path(__file__).parent / "fixtures" / "dolly.json" + shutil.copyfile(file, tmp_path / "dolly.json") + return tmp_path / "dolly.json" + + +@pytest.fixture() +def longform_path(tmp_path): + path = tmp_path / "longform" + path.mkdir() + for split in ("train", "val"): + file = Path(__file__).parent / "fixtures" / f"longform_{split}.json" + shutil.copyfile(file, path / f"{split}.json") + return path diff --git a/tests/data/fixtures/alpaca.json b/tests/data/fixtures/alpaca.json new file mode 100644 index 0000000..10f50be --- /dev/null +++ b/tests/data/fixtures/alpaca.json @@ -0,0 +1,117 @@ +[ + { + "instruction": "Give three tips for staying healthy.", + "input": "", + "output": "1. Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule." + }, + { + "instruction": "What are the three primary colors?", + "input": "", + "output": "The three primary colors are red, blue, and yellow." + }, + { + "instruction": "Describe the structure of an atom.", + "input": "", + "output": "An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom." 
+ }, + { + "instruction": "How can we reduce air pollution?", + "input": "", + "output": "There are a number of ways to reduce air pollution, such as shifting to renewable energy sources, encouraging the use of public transportation, prohibiting the burning of fossil fuels, implementing policies to reduce emissions from industrial sources, and implementing vehicle emissions standards. Additionally, individuals can do their part to reduce air pollution by reducing car use, avoiding burning materials such as wood, and changing to energy efficient appliances." + }, + { + "instruction": "Pretend you are a project manager of a construction company. Describe a time when you had to make a difficult decision.", + "input": "", + "output": "I had to make a difficult decision when I was working as a project manager at a construction company. I was in charge of a project that needed to be completed by a certain date in order to meet the client\u2019s expectations. However, due to unexpected delays, we were not able to meet the deadline and so I had to make a difficult decision. I decided to extend the deadline, but I had to stretch the team\u2019s resources even further and increase the budget. Although it was a risky decision, I ultimately decided to go ahead with it to ensure that the project was completed on time and that the client\u2019s expectations were met. The project was eventually successfully completed and this was seen as a testament to my leadership and decision-making abilities." + }, + { + "instruction": "Identify the odd one out.", + "input": "Twitter, Instagram, Telegram", + "output": "Telegram" + }, + { + "instruction": "Explain why the following fraction is equivalent to 1/4", + "input": "4/16", + "output": "The fraction 4/16 is equivalent to 1/4 because both numerators and denominators are divisible by 4. Dividing both the top and bottom numbers by 4 yields the fraction 1/4." + }, + { + "instruction": "Write a short story in third person narration about a protagonist who has to make an important career decision.", + "input": "", + "output": "John was at a crossroads in his life. He had just graduated college and was now facing the big decision of what career to pursue. After much deliberation, he decided that he wanted to be an accountant and help the financially disadvantaged. He had always been good with numbers and enjoyed seeing the tangible results of his work. \n\nJohn enrolled in accounting courses and initially found it quite challenging. He had to learn multiple systems and regulations quickly, but he worked hard and eventually excelled in his studies. After a few years, John started working at an accounting firm in his city. He was eager to put his knowledge of taxes and accounting to use in a real-world setting.\n\nJohn loved his job, as it let him express his creativity in finding strategies to save his clients money. After a few years at the firm, he became a senior accountant and was asked to manage bigger and more challenging cases. He was now a respected figure in the financial industry, but he still remembers when he was just a recent college graduate, unsure of the direction in which his life would take him." + }, + { + "instruction": "Evaluate this sentence for spelling and grammar mistakes", + "input": "He finnished his meal and left the resturant", + "output": "He finished his meal and left the restaurant." 
+ }, + { + "instruction": "How did Julius Caesar die?", + "input": "", + "output": "Julius Caesar was assassinated by a group of up to 60 conspirators, led by Gaius Cassius Longinus and Marcus Junius Brutus, in the Senate House on the Ides of March (15 March) of 44 BC." + }, + { + "instruction": "What is the capital of France?", + "input": "", + "output": "The capital of France is Paris." + }, + { + "instruction": "Generate a list of ten items a person might need for a camping trip", + "input": "", + "output": "1. Tent\n2. Sleeping bags\n3. Flashlight\n4. Matches/lighter\n5. Insect repellent\n6. Extra clothes\n7. First aid kit\n8. Camp stove\n9. Cooking utensils\n10. Biodegradable soaps/shampoos" + }, + { + "instruction": "Discuss the causes of the Great Depression", + "input": "", + "output": "The Great Depression was a severe global economic downturn that lasted from 1929 to 1939. It was caused by several factors, including the stock market crash of 1929, structural weaknesses in the economy, low consumer spending, high unemployment, and a lack of confidence in the banking system. Other contributing factors included overproduction of commodities, reliance on borrowing to finance investments, agricultural overproduction, and a decrease in international trade." + }, + { + "instruction": "Classify the following into animals, plants, and minerals", + "input": "Oak tree, copper ore, elephant", + "output": "Oak tree: Plant\n Copper ore: Mineral\n Elephant: Animal" + }, + { + "instruction": "Explain the use of word embeddings in Natural Language Processing", + "input": "", + "output": "Word embeddings are one of the most powerful tools available for Natural Language Processing (NLP). They are mathematical representations of words or phrases in a vector space, allowing similarities between words and the context in which they are used to be measured. Word embeddings are useful for tasks such as sentiment analysis, text classification, predicting the next word in a sequence, and understanding synonyms and analogies. They allow for words to be processed as numerical values, giving machines an easier way to perform NLP tasks." + }, + { + "instruction": "Describe the function of a computer motherboard", + "input": "", + "output": "A computer motherboard is the main circuit board in a computer. It is the backbone of the system, connecting all other components such as the processor, RAM, storage, graphics card and peripheral ports. It provides communication pathways between all of these components, allowing them to interact and work together. The motherboard is responsible for regulating power, and also houses slots for expansion cards, as well as ports for USB, audio, Ethernet, and other connection types." + }, + { + "instruction": "Reverse engineer this code to create a new version", + "input": "def factorialize(num):\n factorial = 1\n for i in range(1, num):\n factorial *= i\n \n return factorial", + "output": "def factorialize(num):\n factorial = 1\n for i in range(num, 0, -1):\n factorial *= i\n \n return factorial" + }, + { + "instruction": "Propose an ethical solution to the problem of data privacy", + "input": "", + "output": "One ethical solution to the problem of data privacy is to implement an opt-in data collection policy. This means that companies cannot collect and use personal data without the explicit permission of the data owners. This approach gives users the power to determine which data companies can use and for what purpose. 
Companies should also be transparent about how they use the collected data, so that users can make informed decisions and have control over their own data." + }, + { + "instruction": "Generate three verbs that mean the same as \"to apologize\"", + "input": "", + "output": "Acknowledge, express regret, make amends." + }, + { + "instruction": "What does DNA stand for?", + "input": "", + "output": "DNA stands for deoxyribonucleic acid." + }, + { + "instruction": "Compare and contrast the Cuban Missile Crisis and the Vietnam War.", + "input": "", + "output": "The Cuban Missile Crisis and the Vietnam War were two of the largest and deadliest conflicts of the 20th century. The Cuban Missile Crisis involved a standoff between the United States and the Soviet Union over Soviet nuclear missile bases in Cuba, while the Vietnam War was a prolonged military conflict in Indochina between the North, which was backed by the Soviet Union and the South, which was backed by the United States. The Cuban Missile Crisis unfolded over a period of 13 days in October 1962, with the resolution coming in the form of a diplomatic agreement. The Vietnam War lasted for 19 years, beginning in 1954 and ending in 1973, with the signing of the Paris Peace Accords. While both of these conflicts were Cold War related, they were also very different in terms of causes, objectives, strategic objectives and outcomes. The Cuban Missile Crisis did not result in any direct military conflict and was resolved primarily through diplomacy and the decision to withdraw nuclear weapons from Cuba. On the other hand, the Vietnam War was a prolonged series of battles resulting in significant losses for both sides. The Vietnam War also had a significant impact on the United States, as it highlighted the realities of a prolonged conflict and the limits of U.S. military intervention. Additionally, the Vietnam War also drastically changed the way public opinion viewed the military and the use of force." + }, + { + "instruction": "Generate a list of random words.", + "input": "", + "output": "sneeze, conflict, ancestor, thunder, companion, amulet, forge, serpent, magnify, jester, cascade, universe, surge, nausea, oasis, anomaly, pursue, rapture, pagoda, mosaic." + }, + { + "instruction": "Transcribe the recording into text.", + "input": "Recording about the history of Waterloo", + "output": "The Battle of Waterloo, fought on June 18, 1815, was the last major battle of the Napoleonic Wars. It marked the end of Napoleon Bonaparte's rule. The battle was fought between Napoleon Bonaparte's French forces and an allied force of British, Dutch, Belgian and Prussian troops, led by the Duke of Wellington. It was fought near the town of Waterloo in the former Austrian Netherlands, now present-day Belgium. The battle was a defeat for Napoleon's forces and ended Napoleon's rule over France. Napoleon himself was exiled to St. Helena, where he would die in 1821. The battle marked the end of French domination of Europe and the beginning of a period of peace in the region. It was also the last major battle of the Napoleonic Wars and is remembered as one of the most important battles in history. The victory of the allied forces at Waterloo helped to bring an end to French imperial ambitions in Europe. It also established the British as one of the most powerful nations in Europe. The battle was fought with a combination of infantry, cavalry, and artillery tactics, and showed the beginning of the changing nature of warfare. 
Although the French forces greatly outnumbered the allies, their strategic and tactical mistake led to the loss of the battle. This defeat signaled the end of French imperial power in Europe. The legacy of Waterloo still stands today, and it cemented the British Empire's position for the next one hundred years." + } +] \ No newline at end of file diff --git a/tests/data/fixtures/dolly.json b/tests/data/fixtures/dolly.json new file mode 100644 index 0000000..8324344 --- /dev/null +++ b/tests/data/fixtures/dolly.json @@ -0,0 +1,12 @@ +{"instruction": "When did Virgin Australia start operating?", "context": "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.", "response": "Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.", "category": "closed_qa"} +{"instruction": "Which is a species of fish? Tope or Rope", "context": "", "response": "Tope", "category": "classification"} +{"instruction": "Why can camels survive for long without water?", "context": "", "response": "Camels use the fat in their humps to keep them filled with energy and hydration for long periods of time.", "category": "open_qa"} +{"instruction": "Alice's parents have three daughters: Amy, Jessy, and what\u2019s the name of the third daughter?", "context": "", "response": "The name of the third daughter is Alice", "category": "open_qa"} +{"instruction": "When was Tomoaki Komorida born?", "context": "Komorida was born in Kumamoto Prefecture on July 10, 1981. After graduating from high school, he joined the J1 League club Avispa Fukuoka in 2000. Although he debuted as a midfielder in 2001, he did not play much and the club was relegated to the J2 League at the end of the 2001 season. In 2002, he moved to the J2 club Oita Trinita. He became a regular player as a defensive midfielder and the club won the championship in 2002 and was promoted in 2003. He played many matches until 2005. In September 2005, he moved to the J2 club Montedio Yamagata. In 2006, he moved to the J2 club Vissel Kobe. Although he became a regular player as a defensive midfielder, his gradually was played less during the summer. In 2007, he moved to the Japan Football League club Rosso Kumamoto (later Roasso Kumamoto) based in his local region. He played as a regular player and the club was promoted to J2 in 2008. Although he did not play as much, he still played in many matches. In 2010, he moved to Indonesia and joined Persela Lamongan. In July 2010, he returned to Japan and joined the J2 club Giravanz Kitakyushu. He played often as a defensive midfielder and center back until 2012 when he retired.", "response": "Tomoaki Komorida was born on July 10,1981.", "category": "closed_qa"} +{"instruction": "If I have more pieces at the time of stalemate, have I won?", "context": "Stalemate is a situation in chess where the player whose turn it is to move is not in check and has no legal move. Stalemate results in a draw. During the endgame, stalemate is a resource that can enable the player with the inferior position to draw the game rather than lose. 
In more complex positions, stalemate is much rarer, usually taking the form of a swindle that succeeds only if the superior side is inattentive.[citation needed] Stalemate is also a common theme in endgame studies and other chess problems.\n\nThe outcome of a stalemate was standardized as a draw in the 19th century. Before this standardization, its treatment varied widely, including being deemed a win for the stalemating player, a half-win for that player, or a loss for that player; not being permitted; and resulting in the stalemated player missing a turn. Stalemate rules vary in other games of the chess family.", "response": "No. \nStalemate is a drawn position. It doesn't matter who has captured more pieces or is in a winning position", "category": "information_extraction"} +{"instruction": "Given a reference text about Lollapalooza, where does it take place, who started it and what is it?", "context": "Lollapalooza /\u02ccl\u0252l\u0259p\u0259\u02c8lu\u02d0z\u0259/ (Lolla) is an annual American four-day music festival held in Grant Park in Chicago. It originally started as a touring event in 1991, but several years later, Chicago became its permanent location. Music genres include but are not limited to alternative rock, heavy metal, punk rock, hip hop, and electronic dance music. Lollapalooza has also featured visual arts, nonprofit organizations, and political organizations. The festival, held in Grant Park, hosts an estimated 400,000 people each July and sells out annually. Lollapalooza is one of the largest and most iconic music festivals in the world and one of the longest-running in the United States.\n\nLollapalooza was conceived and created in 1991 as a farewell tour by Perry Farrell, singer of the group Jane's Addiction.", "response": "Lollapalooze is an annual musical festival held in Grant Park in Chicago, Illinois. It was started in 1991 as a farewell tour by Perry Farrell, singe of the group Jane's Addiction. The festival includes an array of musical genres including alternative rock, heavy metal, punk rock, hip hop, and electronic dance music. The festivals welcomes an estimated 400,000 people each year and sells out annually. Some notable headliners include: the Red Hot Chili Peppers, Chance the Rapper, Metallica, and Lady Gage. Lollapalooza is one of the largest and most iconic festivals in the world and a staple of Chicago.", "category": "closed_qa"} +{"instruction": "Who gave the UN the land in NY to build their HQ", "context": "", "response": "John D Rockerfeller", "category": "open_qa"} +{"instruction": "Why mobile is bad for human", "context": "", "response": "We are always engaged one phone which is not good.", "category": "brainstorming"} +{"instruction": "Who was John Moses Browning?", "context": "John Moses Browning (January 23, 1855 \u2013 November 26, 1926) was an American firearm designer who developed many varieties of military and civilian firearms, cartridges, and gun mechanisms \u2013 many of which are still in use around the world. He made his first firearm at age 13 in his father's gun shop and was awarded the first of his 128 firearm patents on October 7, 1879, at the age of 24. He is regarded as one of the most successful firearms designers of the 19th and 20th centuries and pioneered the development of modern repeating, semi-automatic, and automatic firearms.\n\nBrowning influenced nearly all categories of firearms design, especially the autoloading of ammunition. 
He invented, or made significant improvements to, single-shot, lever-action, and pump-action rifles and shotguns. He developed the first reliable and compact autoloading pistols by inventing the telescoping bolt, then integrating the bolt and barrel shroud into what is known as the pistol slide. Browning's telescoping bolt design is now found on nearly every modern semi-automatic pistol, as well as several modern fully automatic weapons. He also developed the first gas-operated firearm, the Colt\u2013Browning Model 1895 machine gun \u2013 a system that surpassed mechanical recoil operation to become the standard for most high-power self-loading firearm designs worldwide. He also made significant contributions to automatic cannon development.\n\nBrowning's most successful designs include the M1911 pistol, the water-cooled M1917, the air-cooled M1919, and heavy M2 machine guns, the M1918 Browning Automatic Rifle, and the Browning Auto-5 \u2013 the first semi-automatic shotgun. Some of these arms are still manufactured, often with only minor changes in detail and cosmetics to those assembled by Browning or his licensees. The Browning-designed M1911 and Hi-Power are some of the most copied firearms in the world.", "response": "John Moses Browning is one of the most well-known designer of modern firearms. He started building firearms in his father's shop at the age of 13, and was awarded his first patent when he was 24.\n\nHe designed the first reliable automatic pistol, and the first gas-operated firearm, as well inventing or improving single-shot, lever-action, and pump-action rifles and shotguns.\n\nToday, he is most well-known for the M1911 pistol, the Browning Automatic Rifle, and the Auto-5 shotgun, all of which are in still in current production in either their original design, or with minor changes. His M1911 and Hi-Power pistols designs are some of the most reproduced firearms in the world today.", "category": "information_extraction"} +{"instruction": "Who is Thomas Jefferson?", "context": "Thomas Jefferson (April 13, 1743 \u2013 July 4, 1826) was an American statesman, diplomat, lawyer, architect, philosopher, and Founding Father who served as the third president of the United States from 1801 to 1809. Among the Committee of Five charged by the Second Continental Congress with authoring the Declaration of Independence, Jefferson was the Declaration's primary author. Following the American Revolutionary War and prior to becoming the nation's third president in 1801, Jefferson was the first United States secretary of state under George Washington and then the nation's second vice president under John Adams.", "response": "Thomas Jefferson (April 13, 1743 \u2013 July 4, 1826) was an American statesman, diplomat, lawyer, architect, philosopher, and Founding Father who served as the third president of the United States from 1801 to 1809. Among the Committee of Five charged by the Second Continental Congress with authoring the Declaration of Independence, Jefferson was the Declaration's primary author. Following the American Revolutionary War and prior to becoming the nation's third president in 1801, Jefferson was the first United States secretary of state under George Washington and then the nation's second vice president under John Adams.\n\nAmong the nation's Founding Fathers, Jefferson is considered unmatched in his intellectual depth and breadth. 
His passionate writings and advocacy for human rights, including freedom of thought, speech, and religion, were a leading inspiration behind the American Revolution, which ultimately gave rise to the American Revolutionary War, American independence, and the United States Constitution. Jefferson's ideas were globally influential in shaping and inspiring the Age of Enlightenment, which proved transformational in the late 17th and 18th centuries. He was a leading proponent of democracy, republicanism, and individual rights, and produced formative documents and decisions at the state, national, and international levels.\n\nDuring the American Revolution, Jefferson represented Virginia in the Second Continental Congress in Philadelphia, which adopted the Declaration of Independence on July 4, 1776. As a Virginia legislator, he drafted a state law for religious freedom. He served as the second Governor of Virginia from 1779 to 1781, during the Revolutionary War. In 1785, Jefferson was appointed the United States Minister to France, and subsequently, the nation's first secretary of state under President George Washington from 1790 to 1793. Jefferson and James Madison organized the Democratic-Republican Party to oppose the Federalist Party during the formation of the First Party System. With Madison, he anonymously wrote the Kentucky and Virginia Resolutions in 1798 and 1799, which sought to strengthen states' rights by nullifying the federal Alien and Sedition Acts.\n\nJefferson and Federalist John Adams became friends as well as political rivals, serving in the Continental Congress and drafting the Declaration of Independence together. In the 1796 presidential election between the two, Jefferson came in second, which according to electoral procedure at the time, made him vice president to Adams. Jefferson challenged Adams again in 1800 and won the presidency. After his term in office, Jefferson eventually reconciled with Adams and they shared a correspondence that lasted 14 years. He and Adams both died on the same day, July 4, 1826, which was also the 50th anniversary of Declaration of Independence.\n\nAs president, Jefferson pursued the nation's shipping and trade interests against Barbary pirates and aggressive British trade policies. Starting in 1803, he promoted a western expansionist policy with the Louisiana Purchase, which doubled the nation's claimed land area. To make room for settlement, Jefferson began the process of Indian tribal removal from the newly acquired territory. As a result of peace negotiations with France, his administration reduced military forces. He was re-elected in 1804, but his second term was beset with difficulties at home, including the trial of former vice president Aaron Burr. In 1807, American foreign trade was diminished when Jefferson implemented the Embargo Act in response to British threats to U.S. shipping. The same year, Jefferson signed the Act Prohibiting Importation of Slaves.\n\nJefferson was a plantation owner, lawyer, and politician, and mastered many disciplines including surveying, mathematics, horticulture, and mechanics. He was also an architect in the Palladian tradition. Jefferson's keen interest in religion and philosophy led to his appointment as president of the American Philosophical Society. He largely shunned organized religion but was influenced by Christianity, Epicureanism, and deism. Jefferson rejected fundamental Christianity, denying Christ's divinity. A philologist, Jefferson knew several languages. 
He was a prolific letter writer and corresponded with many prominent people, including Edward Carrington, John Taylor of Caroline, and James Madison. In 1785, Jefferson authored Notes on the State of Virginia, considered perhaps the most important American book published before 1800. Jefferson championed the ideals, values, and teachings of the Enlightenment.\n\nSince the 1790s, Jefferson was rumored to have had children by his sister-in-law and slave Sally Hemings, leading to what is known as the Jefferson-Hemings controversy. A 1998 DNA test concluded that one of Sally Hemings's children, Eston Hemings, was of the Jefferson male line. According to scholarly consensus, based on documentary and statistical evaluation, as well as oral history, Jefferson probably fathered at least six children with Hemings, including four that survived to adulthood.\n\nAfter retiring from public office, Jefferson founded the University of Virginia. Presidential scholars and historians generally praise Jefferson's public achievements, including his advocacy of religious freedom and tolerance in Virginia, his peaceful acquisition of the Louisiana Territory from France without war or controversy, and his ambitious and successful Lewis and Clark Expedition. Some modern historians are critical of Jefferson's personal involvement with slavery. Jefferson is consistently ranked among the top ten presidents of American history.", "category": "information_extraction"} +{"instruction": "Who was Kyle Van Zyl playing against when he scored 36 of hisa teams 61 points?", "context": "Van Zyl joined the Eastern Province Kings Academy, where he played for the Eastern Province U19 side in the 2010 Under-19 Provincial Championship. He was a key player for the Eastern Province U21 side in the 2012 Under-21 Provincial Championship, scoring 71 points in eight appearances. Van Zyl was under the Top SARU Performers, scoring the most tries at 6 in the 2012 Provincial Under 21 in the Rugby Junior Provincials.\n\nThis included a record and a remarkable personal haul in their opening match, when he scored 36 of his team's points in a 61\u20133 victory over Boland U21, consisting of four tries and eight conversions and was awarded Man of the Match.", "response": "Kyle Van Zyl was playing against Boland U21 when he scored 36 points, leading his team to victory in a 61-3 win.", "category": "closed_qa"} diff --git a/tests/data/fixtures/longform_train.json b/tests/data/fixtures/longform_train.json new file mode 100644 index 0000000..457fb81 --- /dev/null +++ b/tests/data/fixtures/longform_train.json @@ -0,0 +1,104 @@ +[ + { + "input": "What are the positions held by Beto O'Rourke, Lupe Valdez, and Veronica Escobar on decriminalizing unauthorized border crossings in the US?", + "output": "The liberal mainstream media squealed with delight last week when they announced Ted Cruz was only 4 points ahead of Democrat Beto O\u2019Rourke.\nThe the bottom dropped out on Beto\u2019s campaign.\nFirst O\u2019Rourke told a liberal Texas crowd that there was nothing more American than kneeling for the National Anthem.\nBeto O\u2019Rourke is now pushing to legalize unauthorized border crossers into America.\nA trio of Democrats running for top offices in Texas want to decriminalize unauthorized border crossings.\nU.S. Rep. Beto O\u2019Rourke, who\u2019s challenging Republican Sen. 
Ted Cruz for his Senate seat, gubernatorial candidate Lupe Valdez, and congressional candidate Veronica Escobar told HuffPost they are in favor of such a move, which the report says would be simpler to implement than abolishing Immigration and Customs Enforcement, as some Democrats have demanded.\nDecriminalizing unauthorized border crossings would go against the Trump administration\u2019s \u201czero tolerance\u201d policy that led to family separations at the border. The Department of Homeland Security would still be able to charge unauthorized immigrants with civil violations, the report notes, but it would save the federal government billions in incarceration costs.", + "source": "C4", + "subset": "search-engine" + }, + { + "input": "For example, for the original quote we shall use, 'One swallow does not a summer make'. If someone were to say 'One good quote does not a philosopher make' (No dig at Aristotle intended...), or 'A fancy suit does not a businessman make'. Essentially, a clear allusion to a quote, but not the quote itself.", + "output": "The word I've heard used for this on the linguistics blog Language Log is snowclone (it's derived from phrases of the format 'If Eskimos have N words for snow, X surely have Y words for Z.') Here's a Language Log post that describes the moment the word was coined and that links to some examples: Snowclones: Lexicographical Dating to the Second And here's how it's defined by Collins Dictionaries online: a verbal formula that is adapted for reuse by changing only a few words so that the allusion to the original phrase remains clear You might also be interested in looking at the answers to the following questions: This book will change your wife trope? Is there a general name for this: 'The more you X the more you'll Y' A term describing the replacement of a specific word in a saying with one that ryhmes", + "source": "Natural Instructions", + "subset": "task225 english language answer generation" + }, + { + "input": "Describe the U.S.-Mexico border. Respond in detail.", + "output": "President Donald Trump\u2019s border wall with Mexico will have an initial price tag of between $12 billion and $15 billion, according to Congressional Republican leaders, but the method of paying for Trump\u2019s promised project is still not settled.\nAt the Congressional Republican Retreat in Philadelphia on Thursday, House Speaker Paul Ryan said the wall would cost up to $15 billion, more than the $8 billion Trump has estimated the wall will cost.\nA U.S. Border Patrol agent removes a ladder used by undocumented immigrants to climb a border fence on Oct. 18, 2016 near McAllen, Texas.\nU.S. Customs and Border Protection agents fly near the U.S.-Mexico border while on helicopter patrol on Oct. 18, 2016 near McAllen, Texas.\nA U.S. Border Patrol agent detains a group of undocumented immigrants on Oct. 18, 2016 near McAllen, Texas.\nPeople stand in line to cross legally into the United States from Mexico on Sept. 24, 2016 in Tijuana, Mexico.\nThe Rio Grande forms a stretch of the U.S.-Mexico border on Oct. 15, 2016 in the Big Bend region of West Texas near Lajitas, Texas. Big Bend is a rugged, vast and remote region along the U.S.-Mexico border and includes the Big Bend National Park.\nU.S. Border Patrol agents with a K-9 unit detain undocumented immigrants after they illegally crossed the U.S.-Mexico border on Oct. 
18, 2016, in McAllen, Texas.\nThis photo made with a smart phone through night vision goggles shows the Rio Grande flowing along the U.S.-Mexico border, as seen from a U.S. Customs and Border Protection helicopter during a patrol over the U.S.-Mexico border on Oct. 18, 2016 in McAllen, Texas.\nThe moon rises over the swirling current of the Rio Grande on Oct. 15, 2016 in the Big Bend region of West Texas near Lajitas, Texas.\nA bullet-proof shield stands to aid U.S. Border Patrol agents on the U.S.-Mexico border on Oct. 3, 2016 in El Paso, Texas.\nA child plays in the Pacific surf near the U.S.-Mexico border fence on Sept. 25, 2016 in Tijuana, Mexico. The nearby Friendship Park is one of the few places on the 2,000-mile border where separated families are allowed to meet.\nDunes stretch into the distance near the U.S.-Mexico border on Sept. 27, 2016 in the Imperial Sand Dunes recreation area, California.\nMexican farm workers hoe a cabbage field on Sept. 27, 2016, in Holtville, Calif. Thousands of Mexican seasonal workers legally cross over daily from Mexicali, Mexico to work the fields of Imperial Valley, Calif., which is some of the most productive farmland in the United States.\nA man looks through the U.S.-Mexico border fence into the United States on Septt. 25, 2016 in Tijuana, Mexico.\nA Studebaker sits outside a cotton field near the U.S.-Mexico border on Oct. 14, 2016 near Fort Hancock, Texas.\nBodies of suspected undocumented immigrants lie in the morgue at the Office of the Pima County Medical Examiner on Sept. 29, 2016 in Tucson, Ariz. Hundreds of migrants die every year in the desert while crossing illegally from Mexico into the United States.\nPersonal effects found on the body of a suspected undocumented immigrant are stored at the Office of the Pima County Medical Examiner on Sept. 29, 2016 in Tucson, Ariz.\nDinner tables await immigrants, many of them deportees from the United States, at the Casa del Migrante Sept. 23, 2016 in Tijuana, Mexico. The shelter, run by Catholic priests, is part of the Coalicion Pro Defensa, which aids immigrants and asylum applicants seeking entry into the United States. The coalition of immigrant advocates has helped thousands of migrants on their journey north and many more after being being deported from the United States.\nMexicans enjoy a late afternoon near the U.S.-Mexico border fence, which ends in the Pacific Ocean, on Sept. 25, 2016 in Tijuana, Mexico. Friendship Park, located on the border between the two countries is one of the few places on the 2,000-mile border where separated families are allowed to meet.\nTrump signed two executive orders on Wednesday directing the construction of the border wall and increasing the number of immigration enforcement officers carrying out deportation, as well as border patrol forces.", + "source": "C4", + "subset": "instruction" + }, + { + "input": "Must a Boss use their Level Up ability?\n\nMost Level Up abilities clearly benefit the Boss, but Robobo's is:\n\nEach opponent mush choose and destroy one Room in his or her Dungeon.\n\nThe opponents get to destroy a room (often quite beneficial), and Robobo doesn't!\nIs there any way - short of only building a four room dungeon - that Robobo can avoid using this ability? What play strategies can Robobo use to maximize his Level Up ability?", + "output": "According to the rules, your boss's \"Level Up\" ability is triggered when your dungeon is 5 rooms in size. \"Triggering\" implies necessity. 
So yes, the only way to avoid levelling up is by not building a fifth room.\nSecondly, and more to your concern... according to the Advanced Rules (see the entry for Robobo in the glossary using the link above), using Robobo's level up does not trigger the \"destroy this Room\" ability for the destroyed room.", + "source": "StackExchange", + "subset": "boardgames" + }, + { + "input": "What is Podd\u0119bice?", + "output": "Podd\u0119bice () is a village in the administrative district of Gmina W\u0142oc\u0142awek, within W\u0142oc\u0142awek County, Kuyavian-Pomeranian Voivodeship, in north-central Poland. It lies approximately south of W\u0142oc\u0142awek and south-east of Toru\u0144.", + "source": "Wikipedia", + "subset": "chatbot" + }, + { + "input": "Describe the history of MV \"Alexia\" Respond in 4 sentences.", + "output": "MV \"Alexia\" was one of nine Royal Dutch/Shell oil tankers converted to become a Merchant Aircraft Carrier (MAC ship). The group is collectively known as the \"Rapana\" class.\nMV \"Alexia\" was built at Bremer Vulkan and completed in April, 1935 as an oil tanker for the Anglo Saxon Royal Dutch/Shell line. She sustained severe damage in two separate U-boat attacks in 1940 and 1942.", + "source": "Wikipedia", + "subset": "instruction" + }, + { + "input": "What is the definition of absurdity?", + "output": "1. the state or quality of being absurd.\nAttending the Gerald R. Ford Symposium on Humor and the Presidency is sort of like attending the Ayatollah Khomeini Symposium on the sexual revolution \u2014Pat Paulsen, at September 19, 1986 symposium in Grand Rapids, Michigan.\n2. absurdity - a ludicrous folly; \"the crowd laughed at the absurdity of the clown's behavior\"\nnoun ridiculousness, nonsense, folly, stupidity, foolishness, silliness, idiocy, irrationality, incongruity, meaninglessness, daftness (informal), senselessness, illogicality, ludicrousness, unreasonableness, preposterousness, farcicality, craziness (informal), b\u00eatise (rare), farcicalness, illogicalness I get angry at the absurdity of a situation.\nfolly, foolery, foolishness, idiocy, imbecility, insanity, lunacy, madness, nonsense, preposterousness, senselessness, silliness, tomfoolery, zaniness.\nBut in the Epic poem the absurdity passes unnoticed.\nFor if absurdity be the subject of laughter, doubt you not but great boldness is seldom without some absurdity.\nI am temperate to the verge of absurdity,\" replied the Tramp.\nPoets, of course, may be satisfactorily read in volumes of, selections; but to me, at least, a book of brief extracts from twenty or a hundred prose authors is an absurdity.\nThen,\" suggested the idea, with a blush for its own absurdity, \"why not go on pilgrimage and seek her?\nJudges and starters have been conveniently blind to this absurdity, but the public demonstration off St.\nAnd now that the providential occurrence was apparently close at hand, it would have been sheer absurdity to think that the supply would be short of the need: as absurd as a faith that believed in half a miracle for want of strength to believe in a whole one.\nutterances, the absurdity being attested by his motley costume.\nOr what greater absurdity can there be than putting before us an old man as a swashbuckler, a young man as a poltroon, a lackey using fine language, a page giving sage advice, a king plying as a porter, a princess who is a kitchen-maid?\nNow to say that the honour I here mean, and which was, I thought, all the honour I could be supposed to mean, will uphold, much less dictate 
an untruth, is to assert an absurdity too shocking to be conceived.\nHence the absurdity of the interview; the gulf between them was economic as well as spiritual.", + "source": "C4", + "subset": "chatbot" + }, + { + "input": "Can felons run for federal office in Minnesota?", + "output": "Minnesota law doesn't block felons from running for federal office.\nEven if enough voters choose Leonard J. Richards as the DFL candidate for U.S. Senate, there\u2019s no way he will ever get to take the oath of office and begin a six-year term.\nRichards is already serving a lifetime term. In Stillwater prison. For murder. Make that two murders.\nYes, it is legal in Minnesota for felons to run for office, so long as it is a federal seat. Nobody knows that better than Richards, who is trying to wrest the party nod away from incumbent Amy Klobuchar.\nNow 75 years old and sporting Department of Corrections ID No. 149837, Richards has run for federal office several times \u2014 without a victory \u2014 since his imprisonment, most recently when he sought the seat that U.S. Rep. Tom Emmer now holds.\nIn 1992, Richards ran in the DFL primary for the Eighth Congressional District seat and received more than 14,500 votes. He ran for the U.S. Senate in the DFL primary in 1994, winning more than 4,000 votes.\nMinnesota law does not permit inmates to run for a state-level office.\nThe official ballot for Minnesota\u2019s primary next month lists double murderer Leonard Richards among those seeking the DFL nomination for U.S. Senate.\nRichards was convicted of murder twice in Hennepin County for the 1982 slaying of his half-sister, May Wilson, and the 1987 shooting death of his attorney, Robert Stratton. His life sentence imposed nearly 30 years ago offers no parole.\nRichards did not respond to a message Thursday seeking an interview.", + "source": "C4", + "subset": "chatbot" + }, + { + "input": "What is the purpose of consciousness?", + "output": "Cardiff University and University College London provide funding as founding partners of The Conversation UK.\nMost experts think that consciousness can be divided into two parts: the experience of consciousness (or personal awareness), and the contents of consciousness, which include things such as thoughts, beliefs, sensations, perceptions, intentions, memories and emotions.\nIt\u2019s easy to assume that these contents of consciousness are somehow chosen, caused or controlled by our personal awareness \u2013 after all, thoughts don\u2019t exist until until we think them. But in a new research paper in Frontiers of Psychology, we argue that this is a mistake.\nWe suggest that our personal awareness does not create, cause or choose our beliefs, feelings or perceptions. Instead, the contents of consciousness are generated \u201cbehind the scenes\u201d by fast, efficient, non-conscious systems in our brains. 
All this happens without any interference from our personal awareness, which sits passively in the passenger seat while these processes occur.\nPut simply, we don\u2019t consciously choose our thoughts or our feelings \u2013 we become aware of them.\nIf this sounds strange, consider how effortlessly we regain consciousness each morning after losing it the night before; how thoughts and emotions \u2013 welcome or otherwise \u2013 arrive already formed in our minds; how the colours and shapes we see are constructed into meaningful objects or memorable faces without any effort or input from our conscious mind.\nConsider that all the neuropsychological processes responsible for moving your body or using words to form sentences take place without involving your personal awareness. We believe that the processes responsible for generating the contents of consciousness do the same.\nOur thinking has been influenced by research into neuropsychological and neuropsychiatric disorders, as well as more recent cognitive neuroscience studies using hypnosis. The studies using hypnosis show that a person\u2019s mood, thoughts and perceptions can be profoundly altered by suggestion.\nIn such studies, participants go through a hypnosis induction procedure, to help them to enter a mentally focused and absorbed state. Then, suggestions are made to change their perceptions and experiences.\nFor example, in one study, researchers recorded the brain activity of participants when they raised their arm intentionally, when it was lifted by a pulley, and when it moved in response to a hypnotic suggestion that it was being lifted by a pulley.\nSimilar areas of the brain were active during the involuntary and the suggested \u201calien\u201d movement, while brain activity for the intentional action was different. So, hypnotic suggestion can be seen as a means of communicating an idea or belief that, when accepted, has the power to alter a person\u2019s perceptions or behaviour.\nAll this may leave one wondering where our thoughts, emotions and perceptions actually come from. We argue that the contents of consciousness are a subset of the experiences, emotions, thoughts and beliefs that are generated by non-conscious processes within our brains.\nThis subset takes the form of a personal narrative, which is constantly being updated. The personal narrative exists in parallel with our personal awareness, but the latter has no influence over the former.\nThe personal narrative is important because it provides information to be stored in your autobiographical memory (the story you tell yourself, about yourself), and gives human beings a way of communicating the things we have perceived and experienced to others.\nThis, in turn, allows us to generate survival strategies; for example, by learning to predict other people\u2019s behaviour. Interpersonal skills like this underpin the development of social and cultural structures, which have promoted the survival of human kind for millennia.\nSo, we argue that it is the ability to communicate the contents of one\u2019s personal narrative \u2013\u2013 and not personal awareness \u2013 that gives humans their unique evolutionary advantage.\nIf the experience of consciousness does not confer any particular advantage, it\u2019s not clear what its purpose is. But as a passive accompaniment to non-conscious processes, we don\u2019t think that the phenomenon of personal awareness has a purpose, in much the same way that rainbows do not. 
Rainbows simply result from the reflection, refraction and dispersion of sunlight through water droplets \u2013 none of which serves any particular purpose.\nOur conclusions also raise questions about the notions of free will and personal responsibility. If our personal awareness does not control the contents of the personal narrative which reflects our thoughts, feelings, emotions, actions and decisions, then perhaps we should not be held responsible for them.\nIn response to this, we argue that free will and personal responsibility are notions that have been constructed by society. As such, they are built into the way we see and understand ourselves as individuals, and as a species. Because of this, they are represented within the non-conscious processes that create our personal narratives, and in the way we communicate those narratives to others.\nJust because consciousness has been placed in the passenger seat, does not mean we need to dispense with important everyday notions such as free will and personal responsibility. In fact, they are embedded in the workings of our non-conscious brain systems. They have a powerful purpose in society and have a deep impact on the way we understand ourselves.", + "source": "C4", + "subset": "chatbot" + }, + { + "input": "What is Nemaha, IA? Respond briefly.", + "output": "Nemaha is a city in Sac County, Iowa, United States. The population was 85 at the 2010 census.\nAccording to the United States Census Bureau, the city has a total area of , all of it land.", + "source": "Wikipedia", + "subset": "chatbot" + }, + { + "input": "In the book I am reading (Patrick Fairbairn, The Interpretation of Prophecy, 2nd ed., 1865), the abbreviation 'sq.' is used sometimes in referencing both Scripture and other books (as 'Typology of Scripture, vol. i. p. 100 sq.' or 'Ezek xxvi. 7, sq.'). What does 'sq.' mean in this context? (Instances of this abbreviation's usage may be found at Google Books here.).", + "output": "Sq. is short for the Latin word sequiturque, which means the following. So p. 100 sq. means page 100 and those following pages. Here's the source.", + "source": "Natural Instructions", + "subset": "task225 english language answer generation" + }, + { + "input": "\"Describe Brugnens.\" Respond in 11 words.", + "output": " Brugnens is a commune in the Gers department in southwestern France.", + "source": "Wikipedia", + "subset": "instruction" + }, + { + "input": "What is the DeLand Municipal Airport? Respond in 1 sentences.", + "output": "DeLand Municipal Airport , also known as Sidney H. Taylor Field, is a city-owned public-use airport located three nautical miles (5\u00a0km) northeast of the central business district of DeLand, a city in Volusia County, Florida, United States.", + "source": "Wikipedia", + "subset": "chatbot" + }, + { + "input": "Respond in 203 words. Describe the plot of the 1976 Filipino drama film \"Minsa'y Isang Gamu-gamo\".", + "output": "Minsa'y Isang Gamu-gamo (\"\") is a 1976 Filipino drama film directed by Lupita Aquino-Kashiwahara and written by Marina Feleo-Gonzales. It concerns a Filipino nurse, Cora de la Cruz, who dreams of moving to America. When her brother is killed, her ideas change. The film criticises American military presence in the Philippines. It was an entry to the 1976 Metro Manila Film Festival and won five awards at the 25th FAMAS Awards, including those for best picture, director and screenplay. 
The film was restored by the ABS-CBN Film Restoration through Kantana Post Production in Thailand and the restored version was released on October 12, 2018 as part of Cinema One Originals film festival.\nThe de la Cruzes and the Santos are two lower-middle-class families who live in Pampanga. Cora de la Cruz is a nurse who dreams of living in the United States. Her papers are ready and she organises employment at an American hospital. She hopes to get a green card allowing her to stay, achieve immigrant status and then bring her family to America for a better life. Cora's mother and younger brother encourage her but her father, Ingkong, disagrees. He believes that moving to America is a betrayal of their country.", + "source": "Wikipedia", + "subset": "instruction" + }, + { + "input": "Hey, when does Marvel's Runaways Season 2 start?", + "output": "The second season picks up where the first left off.\nHulu has announced that the second season of Marvel's Runaways will premiere December 21st, Deadline reports. And there won't be any weekly installments this time around as there were with the first season because all 13 episodes are getting dropped at once. Hulu renewed the series for a second season just as the first one wrapped up.\nSeason 2 will kick off right where the first ended -- with the kids having escaped their evil parents and setting out their own. Now they'll have to figure out how to live with each other, how to take care of themselves and how to take down their parents' nefarious Pride squad. But they might also have to sniff out a mole since someone's sending the Pride leader mysterious messages. Meanwhile, the Pride is bound and determined to get them back.", + "source": "C4", + "subset": "chatbot" + }, + { + "input": "I need to serve at a dinner party, but I'm not sure where to start. Can you give me 10 actionable steps?", + "output": "Which way should you pass the plates? How should you clear the table? Serving at a dinner party isn't exactly as easy as pie. Here are some basic guidelines to help you out in your next dinner party.\n1. Steamed or roasted vegetables, rice, salad, potatoes, and condiments are some of the things you can usually leave in the middle of the table for guests to help themselves to. This isn't strictly necessary, though. If you prefer to keep things simple, you don't have to leave anything more than salt and pepper on the table.\n2. This is known as \"plating-up.\" Don't expect guests to help themselves to food that requires assembly or is difficult to manage. The only way they'll get to see your creativity is if you put it out on the plate for them before you even get to the dinner table.\nKeep the rim of the plate clear of sauces, spills, anything; it frames the food.\n3. You can do this if you want to keep with tradition and the occasion is quite formal. Otherwise, choose one end of the table and move around it in a clockwise fashion, regardless of the genders of your guests.\n\nPass all dishes from the left. Guests and servers should pass dishes from left. The logic behind this is that most people are right-handed and this allows them them to serve themselves from the dish while it is being held by the passer. Nowadays it is less likely that the passer will continue to hold the dish, but will expect you to take it, so if you are left-handed, it won't be a problem. Place the dish down on your side plate to serve from it.\n\nAs the cook, or host, always serve yourself last. 
This is polite and also sensible, since you'll probably be busy anyway with host's duties.\n4. They'll get fidgety, anxious and gossipy about what you're doing.\n5. On the other hand, do not ever go into details about how the flesh portion of the meal was hunted/killed. This is bad taste and makes some guests very queasy. Leave it for discussion around the fireplace with a like-minded friend after dinner.\n6. The host or hired help should clear no more than two plates at a time to avoid bumping guests and interfering with their eating. There is nothing more annoying than the server's elbows in your face when you're just about to take the next bite.\n7. Preferably the noises should not reach the guests but this is unrealistic for most homes. Just do it as quietly as possible and try not to clank, crack, break or drop the dishes. The last thing you need on top of anything else is a dropped plate to clean up.\n8. This means all the dishes on the table, the condiments and the side plates. If you haven't already set out the dessert spoons, this is the time to do so.\n9. Chocolates will have their own method of getting around the table; it has been suspected they have legs...\n10. Don't take advantage of the situation because that guest is there to enjoy himself too but don't hesitate to ask for a quick hand with a simple task that won't risk spills on their clothes.\n", + "source": "WikiHow", + "subset": "main" + }, + { + "input": "What can you tell me about Brushcreek Township? Respond in 2 sentences.", + "output": "Brushcreek Township is one of the seventeen townships of Highland County, Ohio, United States. As of the 2010 census the population was 1,381, of whom 1,248 lived in the unincorporated portion of the township.", + "source": "Wikipedia", + "subset": "chatbot" + } +] \ No newline at end of file diff --git a/tests/data/fixtures/longform_val.json b/tests/data/fixtures/longform_val.json new file mode 100644 index 0000000..dd2f94e --- /dev/null +++ b/tests/data/fixtures/longform_val.json @@ -0,0 +1,56 @@ +[ + { + "input": "The Big Mistake\n\nThis day was full of joy and happiness, but something went wrong after when she told him that she is pregnant. \nShe was so happy about this news and she thought that he will be happy as well, but... At the moment when she told him that she is pregnant, he just frozen. It was too much for him.\nHe thought it's easier to walk away and avoid this conversation with her. He tried to understand what's going on with him. It was weird how many questions and doubts came up on his head. One part of him was really happy, because to be a father is his one of the biggest dream, but on the other hand he couldn\u2019t handle this news. \nThis woman, who stands next to him, is his girlfriend, his future wife. He should be happy and hug her and celebrate this good news. But somehow he didn't feel anything. All that he felt was anxiety. He couldn't handle the feeling, when she tried to touch him, to kiss and hug him. He knew that she loves him as much as he loves her, but everything was just too wrong on this moment. Her voice, her smile and even her body was to wrong on this moment.\nShe couldn\u2019t understand what\u2019s going on, because it was just three weeks ago when they had a conversation about kids and family and he said that he would like to have kids with her. And now, he just avoids her and he is not happy about the news. \nIt was a big mess. She was more than happy and he couldn\u2019t handle this news. 
They both knew that they love each other and they care about each other, but it was too much for him. He stood up and said to her girlfriend that he needs a break and he have to be alone. Michael closed the door and knew at that moment he had made a mistake.\n\nFix all grammar errors and style issues and rewrite this document", + "output": "The Big Mistake\n\nThis day was full of joy and happiness, but something went wrong when she told him that she was pregnant. \nShe was so happy about this news and she thought that he would be happy as well, but... At the moment when she told him that she was pregnant, he just frozen. It was too much for him.\nHe thought it was easier to walk away and avoid this conversation with her. He tried to understand what was going on with him. It was weird how many questions and doubts came into his head. One part of him was really happy, because to be a father was one of his biggest dreams, but on the other hand, he couldn\u2019t handle this news. \nThis woman, who was standing next to him, was his girlfriend, his future wife. He should be happy and hug her and celebrate this good news. But somehow he didn't feel anything. All that he felt was anxiety. He couldn't handle the feeling, when she tried to touch him, to kiss and hug him. He knew that she loved him as much as he loved her, but everything was just all wrong at this moment. Her voice, her smile and even her body was wrong on this moment.\nShe couldn\u2019t understand what was going on, because it was just three weeks ago that they had a conversation about kids and family and he said that he would like to have kids with her. And now, he was just avoiding her and he was not happy about the news. \nIt was a big mess. She was more than happy and he couldn\u2019t handle this news. They both knew that they loved each other and they cared about each other, but it was too much for him. He stood up and said to his girlfriend that he needed a break and he had to be alone. Michael closed the door and knew at that moment he had made a mistake.\n", + "source": "BEA-GEC", + "subset": "main" + }, + { + "input": "Although the fact that personal vehicle like car now is popular, in the future, the stance of public transport is still very crucial. \n\nNowaday, people use fossil fuel as the most important source of energy for travelling, manufacturing, this source of energy doesn't always maintain at the constant level, but someday it will be run out. People must find ways to use other energy sources like nuclear electricity, wind power, public transportation such as electric metro use this kind of energy. As a result, governments will invest more in researching the usage of these new types of energy, travelling using public transport will become much cheaper than personal car. The range of public services will be expanded to remote areas, it become much more convenient. Commuting, travelling using public services will be the first prior choice.\n\nIn addition, the population is increasing rapidly, in the near future, almost cities in the world will become overcrowded, It will lead to many other problems to be solved, for example, traffic jams, air pollution. In some perespectives, these problems are led by the over-using of personal vehicle. People will realise that using their own car is just to waste their time for traffic jams or to gradually killing themselves for breathing polluted air. So the best way for them to travel is public transpotations. 
\nEach person should practice saving when using any source of eneny to protect his own live. \nIn conclusion, investments on developing public transport will be increased considerably, public services of travelling have a bright potential and it's stance in the future can't be replaceble\n\nTweak this document to make it read smoother and fix any mistakes you see.", + "output": "Despite the fact that personal vehicles like the car are now popular, in the future, the existence of public transport will still absolutelyery crucial. \n\nNowadays, people use fossil fuels as the most important source of energy for travelling and manufacturing. This source of energy doesn't always remain at a constant level, and someday it will run out. People must find ways to use other energy sources like nuclear power and wind power. Public transportation such as the electric metro use this kind of energy. As a result, governments will invest more in researching the use of these new types of energy, and travelling using public transport will become much cheaper than privatepersonal car. The range of public services will be expanded to remote areas, and it will become much more convenient. Commuting and travelling using public services will be the first priority.\n\nIn addition, the population is increasing rapidly. In the near future, most cities in the world will become overcrowded. This will lead to many other problems that will need to be solved. For example, traffic jams and air pollution. In some ways, these problems are caused by the overuse of personal vehicles. People will realise that using their own car is just wasting their time in traffic jams or gradually killing themselves by breathing polluted air. So the best way for them to travel is public transport. \nEach person should practice saving energy when using any source of energy to protect his own life. \nIn conclusion, investments in developing public transport will be increased considerably. Public transport services have a bright future and their existence in the future can't be replaced.\n", + "source": "BEA-GEC", + "subset": "main" + }, + { + "input": "What can you tell me about Nang Ka Teih? Respond in 58 words.", + "output": "Founded by the people of Chin (Zo), Nang Ka Teih is a town ward within Sagain Division or next to Khampat town in the Myanmar state of Chin and Sagain Division. It has a very high concentration of the religion. Nang Ka Teih is one of the best location in Chin State and listed among the peace area.", + "source": "Wikipedia", + "subset": "chatbot" + }, + { + "input": "I'm having a serious argument with a friend on the status of the word 'suck' when I used it about him by saying 'You suck!' because he missed a train. We are both non-native English speakers. He claimed that I used it as a foul or slang term. I vehemently disagreed with him about it. I told him that the word 'suck' can be used to describe something inefficient or not good enough as well like it is used in the example 'Samsung mobile sucks'. Am I right to describe the use of the word 'suck' in such context?.", + "output": "Yes, you can use the word in the way that you have described, but it's considered more harsh than polite, and it has somewhat vulgar overtones. How it's regarded or received might be generational. I typed is suck vulgar? on Google, and found mixed responses. Feel free to do the same if you want diverse opinions on the matter. 
I thought this excerpt from a blog post, though, was worth pasting into an answer here: Some may not believe this, but suck as in 'Man, this class sucks' was also in the raw obscenity category when I was a teenager. It was used plenty in the school hallways but not in front of your teacher and never in front of your mother. I remember some agitation by certain culturally-advanced youngsters who tried to railroad their elders into accepting sucks as a safe and harmless substitute for stinks. The elders weren't having any of it, last I checked, but the liberalizing linguists seem to have carried the day. I have always assumedrightly or wrongly, I do not know that the word was originally intended to carry sexual overtones, which was the reason for its suppression. Today, the sexual overtones are either forgotten or are now acceptable in mixed company. I'm not sure which explanation disturbs me more. I think you and your friend are unlikely to come up with an agreed-upon viewpoint, because you're both right in a way. Feel free to use it on message boards and the like when you want to express a negative opinion, but realize you'll risk sounding a bit uncouth to some when you do. Then again, maybe I'm just showing my age here. As a footnote, you might want to check out our sister site, English Language Learners.", + "source": "Natural Instructions", + "subset": "task225 english language answer generation" + }, + { + "input": "How is BeeHighve Inc. in Corner Brook, NL infusing cannabis with honey products and bringing them to markets, both local and global?", + "output": "BeeHighve CEO Rita Hall intends to bring Newfoundland honey and honey-based products to the market, some infused with cannabis, while others will be \"buzz free.\"\nA selection of some of the products available from Corner Brook, N.L.-based BeeHighve Inc.\nThere's a lot of buzz around a joint venture between two Newfoundland companies that want to bring cannabis-infused honey and honey products to local and global markets.\nBeeHighve Inc., based in Corner Brook, will be cultivating the cannabis crop, while G and M Family Farms, near Placentia, will supply the honey.\n\"It ranges from pure honey to sauces as well as chocolate and health bars, and everything is organic,\" said BeeHighve CEO Rita Hall. \"And everything is very healthy for you. We don't use sugar in our products.\"\nThe partnership is the brainchild of Hall, a trailblazer who is on track to become the first Indigenous woman to gain licensed producer [of marijuana] status in Canada, as well as one of the country's few Indigenous female CEOs.\nHall intends for Newfoundland honey to be the backbone of her operation, and utilizes it in all the products, including the flagship Nuts About Honey bars.\nAlthough no official date has been set for the legalization of cannabis-infused edibles \u2014 with recreational marijuana use legal as of Oct. 17, 2018 \u2014 BeeHighve plans to produce the same line of products, without the cannabis elements before and after the legalization of edibles.\n\"I don't think it's going to have a negative impact on the business at all. The honey is really generating a lot of interest,\" Hall said.\nPart of the interest is because of Newfoundland's uniquely thriving honey-bee population, who enjoy the benefits of a closed ecosystem comparatively free from mites and diseases associated with colony collapse.\n\"They love the idea of mite-free, antibiotic-free honey. 
So I have no doubt that the production and sale of honey and the consumables without cannabis infusions will go very well.\"\nEventually BeeHighve intends to get into the beekeeping business as well, allowing them to produce larger volumes of product in less time.\nAside from the plans to export the cannabis-infused products \u2014 where legal \u2014 as well as the \"buzz-free\" ones globally, BeeHighve is looking to expand its production to another province.\nPartnering with the Madawaska Maliseet First Nation reserve in New Brunswick, crops will be cultivated on the reserve to later be infused with Newfoundland honey. Hall believes the partnership will be a fruitful one, and has great respect for the Madawaska Maliseet, whose senior leadership is entirely made of women.\n\"It shows the strength of women in any marketplace. Women are really underrepresented in the cannabis industry right now, so it's, I'll say, a feather in our caps, no pun intended, to be a part of this industry as well.\"\nHall has just as much faith in this venture as the one in Newfoundland.\n\"We're very strong women and I think very successful, and we'll succeed at this as well.\"", + "source": "C4", + "subset": "search-engine" + }, + { + "input": "Respond briefly. What is the history of the Eagle Ranger Station?", + "output": "The Eagle Ranger Station, also known as the Eagle Guard Station and presently known as the Sol Duc Ranger Station, is a complex of three buildings built in the 1930s in what would become Olympic National Park. The primary structures were built by the U.S. Forest Service in what was at the time the Olympic National Forest., While the main residence was built by the USFS, the generating plant and landscaping were built by the National Park Service using labor provided by the Civilian Conservation Corps.", + "source": "Wikipedia", + "subset": "chatbot" + }, + { + "input": "How to deal with non negotiable change\n\nOne of the issues my organisation has is where change requests are requested during a sprint cycle in a particular high volume worksteam. The development team are often forced to accept these change requests, where any resistance results in an escalation, which results with somebody high up in the business just telling the dev team to do it. Thus, team is constantly over-committing every sprint cycle.\nI have highlighted this as an issue to Senior management, showing the impact the additional work is having on morale and quality, but nothing seems to be changing at org level, and they seem to think the problem is at team level. Hence, keep trying to address the issue by adding and removing Scrum Masters; where they all are having the same problem.\nWhat is the best way to address this?\n\nA:\n\nTL;DR\nYou have both a political problem and a process problem. There is no silver bullet, and you can't fix these problems solely within the team. Instead, you must rely on the agile concepts of transparency and visibility to ensure that the costs associated with bypassing the process are charged to the project budget.\nMake Costs Visible\nIn agile frameworks, change is never non-negotiable. If you are faced with \"non-negotiable\" requirements that make the goal or plan for the current iteration obsolete, then the team must scrap the plan.\nThis is true regardless of the agile framework you're using. In Scrum, the Product Owner cancels the Sprint and the Scrum Team returns to Sprint Planning. 
In Kanban, work is pulled off the board to meet work-in-progress (WIP) limits, and queued in the appropriate bucket until capacity is once again available. Your specific framework may vary in the way it implements this freeing up of capacity, but it's an essential component of any viable agile (and even non-agile) process.\nThe cost of replanning and re-queuing is made visible through metrics such as cycle time, cumulative flow, velocity, burn-down, or other framework- and implementation-specific metrics. Communicating about these costs is also essential. By presenting the data to stakeholders and company officers, you place the responsibility for those costs squarely where it belongs.\nSchedule risk (like any risk) can be accepted, transferred, or mitigated. Which of these three options is best is a business decision, not a team decision. Regardless of the choice, senior management owns both the choice and the outcome.\nSee Also\nhttps://pm.stackexchange.com/a/10829/4271", + "output": "I agree with Todd about it being a political and process problem.\nI want to add that apart from showing them the impact on morale and quality -which I believe that they are quite important for your team- I would recommend having a discussion with them to understand why they believe it is useful to have this way of working and also prepare for them a \"business case\" that explores what is they key impact of this way of working for the senior management (trying to talk their own language basically). \nMost of the times, this comes down to how much money they are losing by having this way of working. Examples of the impact to senior management are:\n\nDelayed features due to the introduction of these changes \nBugs introduced due to quality issues that required to introduce more tasks in future releases.", + "source": "StackExchange", + "subset": "pm" + }, + { + "input": "Use of LaTeX commands\n\nIt appears that LaTeX commands cannot be used on SE Linguistics, but they can on other SE sites. Personally, I wanted to use this feature a number of times to insert a formula and here's another question with the same problem. Expressing formulae can be useful when discussing things as simple as the number of combinations that arise from the different conditions in an experiment, or when discussing statistics questions relevant to linguistics. \nCould we please allow the use of LaTeX commands? 
I don't think having this feature will be a problem for anyone not wanting to use it and it will help those who do.\n\nA:\n\nAbsolutely subscribing here!\nI am surprised that this is not a feature already, I am really missing TeX support - for a wide range of uses that I consider essential in the field of linguistics: \n\nMathematical formulas.\nAs someone who frequently answers questions on formal semantics, like here, here and here, I feel that answering (and asking) these questions in the current state is a mess, to an extent where it has sometimes kept me from writing answers to those questions altogether, simply because the process of setting them up is so annoying.\nWhile I am aware that there are tools which ease the inclusion of Unicode symbols as HTML, all of these solutions (at least the ones I am aware of) still make typing more than one line of math quite cumbersome given that every symbol has to be clicked on individually, but worse than that, many characters and necessary formatting options, like pretty much half of the inventory I needed in this linguistically originated question, aren't even available as predefined Unicode symbols at all. Not to mention the fact that the output is visually not particularly pleasing and often makes the question hard to read, as can well be seen in this post.\nBesides, I doubt that the average user is aware of the existence of such tools; it took me more than a year on SE to find out about these, while new users have repeatedly been asking for LaTeX support, because this what users are used to.\nApart from formulas for questions on logic, MathJax would prove useful in questions involving statistics or combinatorics, which time and time again turn up this site. \nIPA, as already discussed here and here, to frequently be used in posts like here, here, here and many posts by other users.\nAt this point I am not sure whether enabling MathJax will also provide the opportunity to support other LaTeX packages, but if so, being able to use tipa would be of great help in typing IPA symbols.\nI am aware of IPA typeit which I have used so far, and some other external tools, but again, clicking is cumbersome, and users will be more comfortable writing questions and answers with the tools thy are used to (and the standard way of typesetting IPA symbols is LaTeX tipa). \nOther, less urgent cases where I could imagine TeX support to be useful - again under the premise that support for special packages is in principal possible - include\n\ntree structures, most importantly syntax trees, like here, here and here; a similar request was already made on meta here.\nlinguistic example formatting, which I could have used here, and interlinear glosses for posts like here, which are rather inconvenient to do appropriately without automatic alignment.\nThe current solution to these use cases is to typeset them externally and then include them in the post via a screenshot - I could live with this solution for these particular cases, but if MathJax comes, then this would certainly be a nice for these less frequent use cases as well. \n\nA:\n\nIt won't be a problem, but we need to show that the site actually needs it. \nOn Physics SE, the same question has been asked and you can see it got at least 32 votes. If there is not enough support from the community, which means enough people that want it, then it's probably not gonna happen. 
It's the same as for Migration paths in other sites, you need to prove there is a case for it to be implemented.\nIf it gets enough support, it'll be added. ", + "output": "A clear yes. And do not forget to include tipa support for IPA in LaTeX, I find tipa input from the keyboard very convenient (compared to searching, cutting, and pasting IPA characters from a Unicode character table).", + "source": "StackExchange", + "subset": "linguistics" + }, + { + "input": "In this task, you are given text for US Congressional and California state bills, your task is to generate a summary for this bill.\n\"SECTION 1. SHORT TITLE.\n\n This Act may be cited as the ``Sequestration Relief Act of 2013''.\n\nSEC. 2. FINDINGS AND PURPOSE.\n\n (a) Findings.--Congress finds the following:\n (1) Congress must enact a comprehensive, deficit reduction \n plan to solve the country's fiscal challenges and to promote \n national security, economic stability, and the continued growth \n and prosperity of the United States.\n (2) The keys to a comprehensive, deficit reduction solution \n are increased revenues and changes in mandatory spending.\n (3) The Budget Control Act of 2011 was enacted to avert a \n default on Federal debt obligations, and it reduced \n discretionary spending by approximately $1 trillion through \n fiscal year 2021.\n (4) Because the Joint Select Committee on Deficit Reduction \n failed to recommend legislation providing an additional $1.2 \n trillion in deficit reduction, Federal law mandates that the \n additional savings be sequestered.\n (5) Sequestration was designed as a forcing mechanism for \n an agreement on a comprehensive, deficit reduction plan. It has \n failed to produce the intended results.\n (6) It no longer makes sense to rely on sequestration as a \n forcing mechanism for a balanced solution. The costs to our \n government and to the economy are too great.\n (7) Under sequestration, automatic, indiscriminate cuts \n would be applied, through fiscal year 2021, to a wide variety \n of discretionary spending programs to achieve $1.2 trillion in \n savings, forestalling the sound planning needed for prudent and \n meaningful investments in national security, the workforce, \n transportation infrastructure, education, health care, public \n safety, housing, innovation, small business development, and \n many other facets of enduring national strength.\n (8) Even the prospect of sequestration is disruptive to \n regular order and to the congressional appropriations process, \n and it fosters damaging economic uncertainty, while short-term \n solutions only suspend the prospect and continue to undermine \n the certainty needed for economic recovery.\n (9) Therefore, Congress must eliminate the threat of \n sequestration.\n (10) Given the magnitude of the Federal deficit, it is \n likely that additional cuts to discretionary spending will be \n necessary for a comprehensive deficit reduction solution.\n (11) Congress must establish a manageable, long-term \n discretionary spending plan. An additional $320 billion in \n targetable cuts to discretionary appropriations from fiscal \n year 2014 through fiscal year 2021 represents one-third of the \n net amount that would have been indiscriminately cut by \n sequestration over fiscal years 2013 through 2021.\n (12) It is recognized that a reduction of $167 billion to \n discretionary appropriations within budget function 050 from \n fiscal year 2014 through fiscal year 2021 will affect the \n National Military Strategy. 
The Department of Defense is highly \n encouraged to revisit its current strategic guidance and to \n work closely with Congress in building a new National Military \n Strategy that accounts for available resource levels.\n (b) Purposes.--The purposes of this Act are to--\n (1) eliminate the threat of sequestration to the American \n economy;\n (2) offer the Federal Government, industry, and the \n American people the predictability that economic recovery \n demands;\n (3) enable the Congress to pass appropriations legislation \n in regular order with a clear discretionary spending budget and \n grant the legislative and executive branches of government the \n flexibility needed to identify and implement specific \n discretionary spending reductions in a responsible and \n deliberate manner; and\n (4) provide a practicable, long-term discretionary spending \n plan that will contribute to a comprehensive, balanced, long-\n term, deficit reduction solution that includes affordable \n revisions to mandatory spending and new revenues.\n\nSEC. 3. REPEAL OF SECTION 251A SEQUESTRATIONS.\n\n Section 251A of the Balanced Budget and Emergency Deficit Control \nAct of 1985 is repealed.\n\nSEC. 4. $320 BILLION REDUCTION IN DISCRETIONARY SPENDING LIMITS.\n\n The discretionary spending limits set forth in paragraphs (3) \nthrough (10) of section 251(c) of the Balanced Budget and Emergency \nDeficit Control Act of 1985 are amended to read as follows:\n ``(3) for fiscal year 2014--\n ``(A) for the security category, $546,000,000,000 \n in budget authority; and\n ``(B) for the nonsecurity category, \n $501,000,000,000 in budget authority;\n ``(4) with respect to fiscal year 2015--\n ``(A) for the security category, $550,000,000,000 \n in new budget authority; and\n ``(B) for the nonsecurity category, \n $505,000,000,000 in new budget authority;\n ``(5) with respect to fiscal year 2016--\n ``(A) for the security category, $559,000,000,000 \n in new budget authority; and\n ``(B) for the nonsecurity category, \n $513,000,000,000 in new budget authority;\n ``(6) with respect to fiscal year 2017--\n ``(A) for the security category, $569,000,000,000 \n in new budget authority; and\n ``(B) for the nonsecurity category, \n $522,000,000,000 in new budget authority;\n ``(7) with respect to fiscal year 2018--\n ``(A) for the security category, $579,000,000,000 \n in new budget authority; and\n ``(B) for the nonsecurity category, \n $531,000,000,000 in new budget authority;\n ``(8) with respect to fiscal year 2019--\n ``(A) for the security category, $589,500,000,000 \n in new budget authority; and\n ``(B) for the nonsecurity category, \n $541,000,000,000 in new budget authority;\n ``(9) with respect to fiscal year 2020--\n ``(A) for the security category, $602,500,000,000 \n in new budget authority; and\n ``(B) for the nonsecurity category, \n $553,000,000,000 in new budget authority;\n ``(10) with respect to fiscal year 2021--\n ``(A) for the security category, $616,000,000,000 \n in new budget authority; and\n ``(B) for the nonsecurity category, \n $565,000,000,000 in new budget authority;''.\n\nSEC. 5. 
DEFINITION OF SECURITY CATEGORY.\n\n Section 250(c)(4)(B) of the Balanced Budget and Emergency Deficit \nControl Act of 1985 is amended to read as follows:\n ``(B)(i) For fiscal years 2012 and 2013, the term `security \n category' means discretionary appropriations associated with \n agency budgets for the Department of Defense, the Department of \n Homeland Security, the Department of Veterans Affairs, the \n National Nuclear Security Administration, the intelligence \n community management account (95-0401-0-1-054), and all budget \n accounts in budget function 150 (international affairs).\n ``(ii) For fiscal years 2014 through 2021, the term \n `security category' means discretionary appropriations in \n budget function 050 (national defense).''.\n\nSEC. 6. SUSPENSION OF STATUTORY LIMIT ON THE PUBLIC DEBT UNTIL FEBRUARY \n 1, 2017.\n\n Section 2 of the No Budget, No Pay Act of 2013 is amended--\n (1) in subsection (a), by striking ``May 18, 2013'' and \n inserting ``January 31, 2017''; and\n (2) in subsection (b), by striking ``May 19, 2013'' each \n place it appears and inserting ``February 1, 2017''.\"", + "output": "Sequestration Relief Act of 2013 - Amends the Balanced Budget and Emergency Deficit Controlled Act of 1985 (Gramm-Rudman-Hollings Act), as amended by the Budget Control Act of 2011, to repeal automatic sequestration to enforce a specified budget goal. Reduces the discretionary spending limits for security and nonsecurity categories by $320 billion for FY2014-FY2021. Confines the current meaning of "security category" under the Gramm-Rudman-Hollings Act to FY2012-FY2013. Revises the definition of "security category" for FY2014-FY2021 to mean discretionary appropriations in all of budget function 050 (national defense). Amends the No Budget, No Pay Act of 2013 to suspend through January 31, 2017, the current $16.394 trillion public debt limit. Postpones until February 1, 2017, an automatic increase in the public debt limit to the extent that: (1) the face amount of obligations issued and the face amount of obligations whose principal and interest are guaranteed by the federal government (except guaranteed obligations held by the Secretary of the Treasury) outstanding on February 1, 2017, exceeds (2) the face amount of such obligations outstanding on the date of enactment of the No Budget, No Pay Act of 2013 (February 24, 2013). Prohibits an obligation from being taken into account unless its issuance was necessary to fund a commitment incurred by the federal government that required payment before February 1, 2017.", + "source": "Natural Instructions", + "subset": "task1658 billsum summarization" + } +] \ No newline at end of file diff --git a/tests/data/test_alpaca.py b/tests/data/test_alpaca.py new file mode 100644 index 0000000..c9aa856 --- /dev/null +++ b/tests/data/test_alpaca.py @@ -0,0 +1,27 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ +def test_alpaca(mock_tockenizer, alpaca_path): + from lit_gpt.data import Alpaca + + alpaca = Alpaca( + test_split_fraction=0.5, + download_dir=alpaca_path.parent, + data_file_name=alpaca_path.name, + num_workers=0, + ) + alpaca.connect(mock_tockenizer, batch_size=2, max_seq_length=10) + alpaca.prepare_data() + alpaca.setup() + + train_dataloader = alpaca.train_dataloader() + val_dataloader = alpaca.val_dataloader() + + assert len(train_dataloader) == 6 + assert len(val_dataloader) == 6 + + train_batch = next(iter(train_dataloader)) + val_batch = next(iter(val_dataloader)) + + assert train_batch.keys() == val_batch.keys() == {"input_ids", "labels"} + assert all(seq.shape == (2, 10) for seq in train_batch.values()) + assert all(seq.shape == (2, 10) for seq in val_batch.values()) diff --git a/tests/data/test_base.py b/tests/data/test_base.py new file mode 100644 index 0000000..243d44d --- /dev/null +++ b/tests/data/test_base.py @@ -0,0 +1,86 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. +from unittest.mock import Mock + +import pytest +import torch + + +@pytest.mark.parametrize("mask_prompt", [True, False]) +@pytest.mark.parametrize("ignore_index", [-1, -100]) +@pytest.mark.parametrize("max_seq_length", [1000, 5]) +def test_sft_dataset(max_seq_length, ignore_index, mask_prompt, mock_tockenizer): + from lit_gpt.data import SFTDataset + + i = ignore_index + prompt_template = "In: {instruction} Out:" + data = [ + {"instruction": "Foo", "output": "Bar"}, + {"instruction": "Boo", "output": "Ahh"}, + ] + + dataset = SFTDataset( + data=data, + tokenizer=mock_tockenizer, + prompt_template=prompt_template, + mask_prompt=mask_prompt, + ignore_index=ignore_index, + max_seq_length=max_seq_length, + ) + assert len(dataset) == len(data) + + expected_input_ids = torch.tensor([73, 110, 58, 32, 70, 111, 111, 32, 79, 117, 116, 58, 66, 97, 114, 1]) + # If prompt is not masked, labels == input_ids + expected_labels = torch.tensor([i, i, i, i, i, i, i, i, i, i, i, i, 66, 97, 114, 1]) if mask_prompt else expected_input_ids + + assert torch.equal(dataset[0]["input_ids"], expected_input_ids[:max_seq_length]) + assert torch.equal(dataset[0]["labels"], expected_labels[:max_seq_length]) + + +@pytest.mark.parametrize("ignore_index", [-1, -100]) +@pytest.mark.parametrize("pad_id", [0, 100]) +def test_sft_collate_fn_padding(pad_id, ignore_index): + from lit_gpt.data import get_sft_collate_fn + + collate = get_sft_collate_fn(pad_id=pad_id, ignore_index=ignore_index) + samples = [ + {"input_ids": torch.tensor([1, 2, 3]), "labels": torch.tensor([10, 20, 30])}, + {"input_ids": torch.tensor([4, 5, 6, 7, 8]), "labels": torch.tensor([40, 50, 60, 70, 80])}, + ] + expected = { + "input_ids": torch.tensor([[1, 2, 3, pad_id, pad_id], [4, 5, 6, 7, 8]]), + "labels": torch.tensor([[10, 20, 30, ignore_index, ignore_index], [40, 50, 60, 70, 80]]) + } + batch = collate(samples) + assert all(torch.equal(batch[k], expected[k]) for k in ("input_ids", "labels")) + + +def test_sft_collate_fn_truncation(): + from lit_gpt.data import get_sft_collate_fn + + collate = get_sft_collate_fn(max_seq_length=2) + samples = [ + {"input_ids": torch.tensor([1, 2, 3]), "labels": torch.tensor([10, 20, 30])}, + {"input_ids": torch.tensor([4, 5, 6, 7, 8]), "labels": torch.tensor([40, 50, 60, 70, 80])}, + ] + expected = { + "input_ids": torch.tensor([[1, 2], [4, 5]]), + "labels": torch.tensor([[10, 20], [40, 50]]) + } + batch = collate(samples) + assert all(torch.equal(batch[k], expected[k]) for k in ("input_ids", 
"labels")) + + +def test_apply_prompt_template(): + from lit_gpt.data import apply_prompt_template + + # As a format-string + template = "Human: {instruction} {smile} Assistant:" + example = {"instruction": "Is a coconut a nut?", "smile": ":)"} + expected = "Human: Is a coconut a nut? :) Assistant:" + assert apply_prompt_template(template, example) == expected + + # As a callable + template = lambda x: f"Human: {x['instruction']} {x.get('smile', '')}Assistant:" + example = {"instruction": "Is a coconut a nut?"} + expected = "Human: Is a coconut a nut? Assistant:" + assert apply_prompt_template(template, example) == expected diff --git a/tests/data/test_dolly.py b/tests/data/test_dolly.py new file mode 100644 index 0000000..df464f1 --- /dev/null +++ b/tests/data/test_dolly.py @@ -0,0 +1,27 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +def test_dolly(mock_tockenizer, dolly_path): + from lit_gpt.data import Dolly + + alpaca = Dolly( + test_split_fraction=0.5, + download_dir=dolly_path.parent, + data_file_name=dolly_path.name, + num_workers=0, + ) + alpaca.connect(mock_tockenizer, batch_size=2, max_seq_length=10) + alpaca.prepare_data() + alpaca.setup() + + train_dataloader = alpaca.train_dataloader() + val_dataloader = alpaca.val_dataloader() + + assert len(train_dataloader) == 3 + assert len(val_dataloader) == 3 + + train_batch = next(iter(train_dataloader)) + val_batch = next(iter(val_dataloader)) + + assert train_batch.keys() == val_batch.keys() == {"input_ids", "labels"} + assert all(seq.shape == (2, 10) for seq in train_batch.values()) + assert all(seq.shape == (2, 10) for seq in val_batch.values()) diff --git a/tests/data/test_json.py b/tests/data/test_json.py new file mode 100644 index 0000000..5ba465e --- /dev/null +++ b/tests/data/test_json.py @@ -0,0 +1,54 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+import json
+from unittest import mock
+
+import pytest
+
+
+@mock.patch("lit_gpt.data.json.prompt_template", "X: {instruction} {input} Y:")
+def test_json(tmp_path, mock_tockenizer):
+    from lit_gpt.data import JSON
+
+    json_path = tmp_path / "data.json"
+    mock_data = [
+        {"instruction": "Add", "input": "2+2", "output": "4"},
+        {"instruction": "Subtract", "input": "5-3", "output": "2"},
+        {"instruction": "Multiply", "input": "6*4", "output": "24"},
+        {"instruction": "Divide", "input": "10/2", "output": "5"},
+        {"instruction": "Exponentiate", "input": "2^3", "output": "8"},
+        {"instruction": "Square root", "input": "√9", "output": "3"},
+    ]
+
+    with open(json_path, "w", encoding="utf-8") as fp:
+        json.dump(mock_data, fp)
+
+    with pytest.raises(FileNotFoundError):
+        JSON(tmp_path / "not exist")
+
+    # TODO: Make prompt template an argument
+    data = JSON(json_path, test_split_fraction=0.5, num_workers=0)
+    data.connect(tokenizer=mock_tockenizer, batch_size=2)
+    data.prepare_data()  # does nothing
+    data.setup()
+
+    train_dataloader = data.train_dataloader()
+    val_dataloader = data.val_dataloader()
+
+    assert len(train_dataloader) == 2
+    assert len(val_dataloader) == 2
+
+    train_data = list(train_dataloader)
+    val_data = list(val_dataloader)
+
+    assert train_data[0]["input_ids"].size(0) == 2
+    assert train_data[1]["input_ids"].size(0) == 1
+    assert val_data[0]["input_ids"].size(0) == 2
+    assert val_data[1]["input_ids"].size(0) == 1
+
+    assert mock_tockenizer.decode(train_data[0]["input_ids"][0]).startswith("X: Divide 10/2 Y:5")
+    assert mock_tockenizer.decode(train_data[0]["input_ids"][1]).startswith("X: Add 2+2 Y:4")
+    assert mock_tockenizer.decode(train_data[1]["input_ids"][0]).startswith("X: Multiply 6*4 Y:24")
+
+    assert mock_tockenizer.decode(val_data[0]["input_ids"][0]).startswith("X: Exponentiate 2^3 Y:8")
+    assert mock_tockenizer.decode(val_data[0]["input_ids"][1]).startswith("X: Subtract 5-3 Y:2")
+    assert mock_tockenizer.decode(val_data[1]["input_ids"][0]).startswith("X: Square root √9 Y:3")
diff --git a/tests/data/test_longform.py b/tests/data/test_longform.py
new file mode 100644
index 0000000..983bcc6
--- /dev/null
+++ b/tests/data/test_longform.py
@@ -0,0 +1,22 @@
+# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
+
+def test_longform(mock_tockenizer, longform_path):
+    from lit_gpt.data import LongForm
+
+    alpaca = LongForm(download_dir=longform_path, num_workers=0)
+    alpaca.connect(mock_tockenizer, batch_size=2, max_seq_length=10)
+    alpaca.prepare_data()
+    alpaca.setup()
+
+    train_dataloader = alpaca.train_dataloader()
+    val_dataloader = alpaca.val_dataloader()
+
+    assert len(train_dataloader) == 9
+    assert len(val_dataloader) == 5
+
+    train_batch = next(iter(train_dataloader))
+    val_batch = next(iter(val_dataloader))
+
+    assert train_batch.keys() == val_batch.keys() == {"input_ids", "labels"}
+    assert all(seq.shape == (2, 10) for seq in train_batch.values())
+    assert all(seq.shape == (2, 10) for seq in val_batch.values())
diff --git a/tests/data/test_tinyllama.py b/tests/data/test_tinyllama.py
new file mode 100644
index 0000000..7360ff6
--- /dev/null
+++ b/tests/data/test_tinyllama.py
@@ -0,0 +1,35 @@
+# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
+ +import pytest +from torch.utils.data import DataLoader + + +def test_tinyllama(tmp_path, monkeypatch): + from lit_gpt.data import TinyLlama + from lightning.data.streaming import StreamingDataLoader, StreamingDataset, CombinedStreamingDataset + + data = TinyLlama(data_path=(tmp_path / "data")) + assert data.seq_length == 2048 + assert data.batch_size == 1 + + data.connect(batch_size=2, max_seq_length=1024) + assert data.seq_length == 1025 + assert data.batch_size == 2 + + with pytest.raises(FileNotFoundError, match="The directory .*data/slimpajama/train does not exist"): + data.prepare_data() + + (tmp_path / "data" / "slimpajama" / "train").mkdir(parents=True) + (tmp_path / "data" / "slimpajama" / "val").mkdir(parents=True) + (tmp_path / "data" / "starcoder").mkdir(parents=True) + + data.prepare_data() + data.setup() + + train_dataloader = data.train_dataloader() + assert isinstance(train_dataloader, StreamingDataLoader) + assert isinstance(train_dataloader.dataset, CombinedStreamingDataset) + + val_dataloader = data.val_dataloader() + assert isinstance(val_dataloader, DataLoader) + assert isinstance(val_dataloader.dataset, StreamingDataset) diff --git a/tests/run_standalone_tests.sh b/tests/run_standalone_tests.sh new file mode 100644 index 0000000..ea282e0 --- /dev/null +++ b/tests/run_standalone_tests.sh @@ -0,0 +1,78 @@ +#!/bin/bash +set -e + +# Batch size for testing: Determines how many standalone test invocations run in parallel +# It can be set through the env variable PL_STANDALONE_TESTS_BATCH_SIZE +test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-1}" + +# this environment variable allows special tests to run +export PL_RUN_STANDALONE_TESTS=1 +# python arguments +defaults="-m pytest --no-header -v --disable-pytest-warnings --strict-markers --color=yes -s --timeout 120" +echo "Using defaults: ${defaults}" + +# find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster +grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py') + +# file paths, remove duplicates +files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq) + +# get the list of parametrizations. we need to call them separately. the last two lines are removed. +# note: if there's a syntax error, this will fail with some garbled output +if [[ "$OSTYPE" == "darwin"* ]]; then + parametrizations=$(python3 -m pytest $files --collect-only --quiet --disable-pytest-warnings "$@" | tail -r | sed -e '1,3d' | tail -r) +else + parametrizations=$(python3 -m pytest $files --collect-only --quiet --disable-pytest-warnings "$@" | head -n -2) +fi +# remove the "tests/" path suffix +path_suffix=$(basename "$(pwd)")"/" # https://stackoverflow.com/a/8223345 +parametrizations=${parametrizations//$path_suffix/} +parametrizations_arr=($parametrizations) + +report='' + +rm -f standalone_test_output.txt # in case it exists, remove it +function show_batched_output { + if [ -f standalone_test_output.txt ]; then # if exists + cat standalone_test_output.txt + # heuristic: stop if there's mentions of errors. this can prevent false negatives when only some of the ranks fail + if grep -iE 'error|exception|traceback|failed' standalone_test_output.txt | grep -qvE 'on_exception|xfailed'; then + echo "Potential error! Stopping." 
+ rm standalone_test_output.txt + exit 1 + fi + rm standalone_test_output.txt + fi +} +trap show_batched_output EXIT # show the output on exit + +for i in "${!parametrizations_arr[@]}"; do + parametrization=${parametrizations_arr[$i]} + prefix="$((i+1))/${#parametrizations_arr[@]}" + + echo "$prefix: Running $parametrization" + # execute the test in the background + # redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them + # output to std{out,err} because the outputs would be garbled together + python3 ${defaults} "$parametrization" &>> standalone_test_output.txt & + # save the PID in an array + pids[${i}]=$! + # add row to the final report + report+="Ran\t$parametrization\n" + + if ((($i + 1) % $test_batch_size == 0)); then + # wait for running tests + for pid in ${pids[*]}; do wait $pid; done + unset pids # empty the array + show_batched_output + fi +done +# wait for leftover tests +for pid in ${pids[*]}; do wait $pid; done +show_batched_output + +# echo test report +printf '=%.s' {1..80} +printf "\n$report" +printf '=%.s' {1..80} +printf '\n' diff --git a/tests/test_adapter.py b/tests/test_adapter.py index f21ccaf..27e5163 100644 --- a/tests/test_adapter.py +++ b/tests/test_adapter.py @@ -1,4 +1,4 @@ -import sys +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. from contextlib import redirect_stdout from dataclasses import asdict from io import StringIO @@ -6,14 +6,17 @@ import pytest import torch +from conftest import RunIf from lightning import Fabric +from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE, BitsandbytesPrecision +from lightning.fabric.wrappers import _FabricOptimizer def test_config_identical(): import lit_gpt.adapter as gpt_adapter import lit_gpt.model as gpt - name = "pythia-70m" + name = "pythia-14m" base_config = asdict(gpt.Config.from_name(name)) adapter_config = asdict(gpt_adapter.Config.from_name(name)) del adapter_config["adapter_prompt_length"] @@ -30,7 +33,7 @@ def test_adapter_filter(tmp_path): from lit_gpt.adapter import GPT, adapter_filter fabric = Fabric(devices=1) - model = GPT.from_name("pythia-70m", n_layer=4) + model = GPT.from_name("pythia-14m", n_layer=4) save_path = tmp_path / "model.pth" fabric.save(save_path, {"model": model}, filter={"model": adapter_filter}) saved = torch.load(save_path)["model"] @@ -46,13 +49,7 @@ def test_adapter_filter(tmp_path): def test_adapter_script(tmp_path, fake_checkpoint_dir, monkeypatch): import finetune.adapter as module - - module.gradient_accumulation_iters = 1 - module.save_interval = 2 - module.eval_interval = 2 - module.eval_iters = 2 - module.eval_max_new_tokens = 1 - module.max_iters = 6 + from lit_gpt.args import EvalArgs, IOArgs, TrainArgs data = [ {"input_ids": torch.tensor([0, 1, 2]), "labels": torch.tensor([1, 2, 3])}, @@ -66,8 +63,7 @@ def test_adapter_script(tmp_path, fake_checkpoint_dir, monkeypatch): model_config = dict(block_size=128, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8, adapter_start_layer=0) monkeypatch.setitem(name_to_config, "tmp", model_config) - monkeypatch.setattr(module, "lazy_load", Mock()) - monkeypatch.setattr(module.GPT, "load_state_dict", Mock()) + monkeypatch.setattr(module, "load_checkpoint", Mock()) tokenizer_mock = Mock() tokenizer_mock.return_value = tokenizer_mock @@ -76,19 +72,26 @@ def test_adapter_script(tmp_path, fake_checkpoint_dir, monkeypatch): stdout = StringIO() with redirect_stdout(stdout): - module.setup(data_dir=tmp_path, 
checkpoint_dir=fake_checkpoint_dir, out_dir=tmp_path, precision="32-true") + module.setup( + io=IOArgs( + train_data_dir=tmp_path, val_data_dir=tmp_path, checkpoint_dir=fake_checkpoint_dir, out_dir=tmp_path + ), + precision="32-true", + train=TrainArgs(global_batch_size=1, save_interval=2, epochs=1, epoch_size=6, micro_batch_size=1), + eval=EvalArgs(interval=2, max_iters=2, max_new_tokens=1), + ) assert {p.name for p in tmp_path.glob("*.pth")} == { - "iter-000001-ckpt.pth", - "iter-000003-ckpt.pth", - "iter-000005-ckpt.pth", + "iter-000002-ckpt.pth", + "iter-000004-ckpt.pth", + "iter-000006-ckpt.pth", "lit_model_adapter_finetuned.pth", } assert (tmp_path / "version_0" / "metrics.csv").is_file() logs = stdout.getvalue() - assert logs.count("optimizer.step") == module.max_iters - assert logs.count("val loss") == module.max_iters // module.eval_interval + assert logs.count("optimizer.step") == 6 + assert logs.count("val loss") == 3 assert "of trainable parameters: 168" in logs @@ -106,17 +109,17 @@ def test_adapter_gpt_init_weights(): assert (param == 0).all() -@pytest.mark.skipif(sys.platform in ("win32", "darwin"), reason="torch.compile not supported on this platform") +@RunIf(dynamo=True) @torch.inference_mode() def test_adapter_compile(): from lit_gpt.adapter import GPT - model = GPT.from_name("pythia-70m", n_layer=3) + model = GPT.from_name("pythia-14m", n_layer=3) x = torch.randint(model.config.vocab_size, size=(2, model.config.block_size), dtype=torch.int64) from torch._dynamo.backends import debugging - explanation = torch._dynamo.explain(model, x) + explanation = torch._dynamo.explain(model)(x) assert isinstance(explanation, debugging.ExplainOutput) assert explanation.graph_count == 1 assert explanation.graph_break_count == 0 @@ -124,7 +127,112 @@ def test_adapter_compile(): model = GPT(model.config) model.set_kv_cache(2) input_pos = torch.arange(model.config.block_size) - explanation = torch._dynamo.explain(model, x, input_pos) + explanation = torch._dynamo.explain(model)(x, input_pos) assert isinstance(explanation, debugging.ExplainOutput) assert explanation.graph_count == 1 assert explanation.graph_break_count == 0 + + +@RunIf(min_cuda_gpus=1) +def test_adapter_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir): + from lit_gpt.args import IOArgs + + if not _BITSANDBYTES_AVAILABLE: + pytest.skip("BNB not available") + + from bitsandbytes.optim import PagedAdamW + + import finetune.adapter as module + + data = [] + torch.save(data, tmp_path / "train.pt") + torch.save(data, tmp_path / "test.pt") + + from lit_gpt.config import name_to_config + + model_config = dict( + block_size=128, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8, adapter_start_layer=0, bias=True + ) + monkeypatch.setitem(name_to_config, "tmp", model_config) + + monkeypatch.setattr(module, "load_checkpoint", Mock()) + train_mock = Mock() + monkeypatch.setattr(module, "fit", train_mock) + + stdout = StringIO() + with redirect_stdout(stdout): + module.setup( + precision="16-true", + quantize="bnb.nf4-dq", + io=IOArgs( + train_data_dir=tmp_path, val_data_dir=tmp_path, checkpoint_dir=fake_checkpoint_dir, out_dir=tmp_path + ), + ) + + args, kwargs = train_mock.call_args + fabric, model, optimizer, *_ = args + assert isinstance(fabric.strategy.precision, BitsandbytesPrecision) + assert isinstance(optimizer, _FabricOptimizer) + assert isinstance(optimizer._optimizer, PagedAdamW) + + dtype_to_name = {"torch.uint8": set(), "torch.float16": set()} + for name, layer in model.named_parameters(): + name = 
name[len("_forward_module.") :] + dtype_to_name[str(layer.dtype)].add(name) + assert dtype_to_name == { + "torch.float16": { + "transformer.wte.weight", + "transformer.h.0.norm_1.weight", + "transformer.h.0.norm_1.bias", + "transformer.h.0.attn.gating_factor", + "transformer.h.0.attn.attn.bias", + "transformer.h.0.attn.proj.bias", + "transformer.h.0.attn.adapter_wte.weight", + "transformer.h.0.norm_2.weight", + "transformer.h.0.norm_2.bias", + "transformer.h.0.mlp.fc.bias", + "transformer.h.0.mlp.proj.bias", + "transformer.h.1.norm_1.weight", + "transformer.h.1.norm_1.bias", + "transformer.h.1.attn.gating_factor", + "transformer.h.1.attn.attn.bias", + "transformer.h.1.attn.proj.bias", + "transformer.h.1.attn.adapter_wte.weight", + "transformer.h.1.norm_2.weight", + "transformer.h.1.norm_2.bias", + "transformer.h.1.mlp.fc.bias", + "transformer.h.1.mlp.proj.bias", + "transformer.ln_f.weight", + "transformer.ln_f.bias", + }, + "torch.uint8": { + "lm_head.weight", + "transformer.h.0.attn.attn.weight", + "transformer.h.0.attn.proj.weight", + "transformer.h.0.mlp.fc.weight", + "transformer.h.0.mlp.proj.weight", + "transformer.h.1.attn.attn.weight", + "transformer.h.1.attn.proj.weight", + "transformer.h.1.mlp.fc.weight", + "transformer.h.1.mlp.proj.weight", + }, + } + + assert {p.name for p in tmp_path.glob("*.pth")} == {"lit_model_adapter_finetuned.pth"} + state_dict = torch.load(tmp_path / "lit_model_adapter_finetuned.pth") + assert len(state_dict) == 1 + dtype_to_name = {"torch.float16": set()} + for name, layer in state_dict["model"].items(): + dtype_to_name[str(layer.dtype)].add(name) + assert dtype_to_name == { + "torch.float16": { + "transformer.h.0.attn.adapter_wte.weight", + "transformer.h.0.attn.gating_factor", + "transformer.h.1.attn.adapter_wte.weight", + "transformer.h.1.attn.gating_factor", + } + } + + logs = stdout.getvalue() + assert "of trainable parameters: 168" in logs + assert "of non trainable parameters: 1,888" in logs diff --git a/tests/test_adapter_v2.py b/tests/test_adapter_v2.py index bc549ec..f0e0423 100644 --- a/tests/test_adapter_v2.py +++ b/tests/test_adapter_v2.py @@ -1,18 +1,30 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ import sys from contextlib import redirect_stdout from io import StringIO +from pathlib import Path from unittest.mock import Mock import pytest import torch +from conftest import RunIf from lightning import Fabric +from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE, BitsandbytesPrecision +from lightning.fabric.wrappers import _FabricOptimizer + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +import lit_gpt.config as config_module def test_config_identical(): import lit_gpt.adapter_v2 as gpt_adapter import lit_gpt.model as gpt - name = "pythia-70m" + name = "pythia-14m" with Fabric(accelerator="cpu").init_module(empty_init=True): base_model = gpt.GPT.from_name(name) adapter_model = gpt_adapter.GPT.from_name(name) @@ -27,7 +39,7 @@ def test_adapter_v2_filter(tmp_path): from lit_gpt.adapter_v2 import GPT, adapter_filter fabric = Fabric(devices=1) - model = GPT.from_name("pythia-70m", n_layer=3) + model = GPT.from_name("pythia-14m", n_layer=3) save_path = tmp_path / "model.pth" fabric.save(save_path, {"model": model}, filter={"model": adapter_filter}) saved = torch.load(save_path)["model"] @@ -61,13 +73,7 @@ def test_adapter_v2_filter(tmp_path): def test_adapter_v2_script(tmp_path, fake_checkpoint_dir, monkeypatch): import finetune.adapter_v2 as module - - module.gradient_accumulation_iters = 1 - module.save_interval = 2 - module.eval_interval = 2 - module.eval_iters = 2 - module.eval_max_new_tokens = 1 - module.max_iters = 6 + from lit_gpt.args import EvalArgs, IOArgs, TrainArgs data = [ {"input_ids": torch.tensor([0, 1, 2]), "labels": torch.tensor([1, 2, 3])}, @@ -81,8 +87,7 @@ def test_adapter_v2_script(tmp_path, fake_checkpoint_dir, monkeypatch): model_config = dict(block_size=128, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8, adapter_start_layer=0) monkeypatch.setitem(name_to_config, "tmp", model_config) - monkeypatch.setattr(module, "lazy_load", Mock()) - monkeypatch.setattr(module.GPT, "load_state_dict", Mock()) + monkeypatch.setattr(module, "load_checkpoint", Mock()) tokenizer_mock = Mock() tokenizer_mock.return_value = tokenizer_mock @@ -91,19 +96,26 @@ def test_adapter_v2_script(tmp_path, fake_checkpoint_dir, monkeypatch): stdout = StringIO() with redirect_stdout(stdout): - module.setup(data_dir=tmp_path, checkpoint_dir=fake_checkpoint_dir, out_dir=tmp_path, precision="32-true") + module.setup( + io=IOArgs( + train_data_dir=tmp_path, val_data_dir=tmp_path, checkpoint_dir=fake_checkpoint_dir, out_dir=tmp_path + ), + precision="32-true", + train=TrainArgs(global_batch_size=1, save_interval=2, epochs=1, epoch_size=6, micro_batch_size=1), + eval=EvalArgs(interval=2, max_iters=2, max_new_tokens=1), + ) assert {p.name for p in tmp_path.glob("*.pth")} == { - "iter-000001-ckpt.pth", - "iter-000003-ckpt.pth", - "iter-000005-ckpt.pth", + "iter-000002-ckpt.pth", + "iter-000004-ckpt.pth", + "iter-000006-ckpt.pth", "lit_model_adapter_finetuned.pth", } assert (tmp_path / "version_0" / "metrics.csv").is_file() logs = stdout.getvalue() - assert logs.count("optimizer.step") == module.max_iters - assert logs.count("val loss") == module.max_iters // module.eval_interval + assert logs.count("optimizer.step") == 6 + assert logs.count("val loss") == 3 assert "of trainable parameters: 552" in logs @@ -121,31 +133,33 @@ def test_adapter_v2_gpt_init_weights(): assert (param == 0).all() -def test_base_model_can_be_adapter_v2_loaded(): +@pytest.mark.parametrize("name", [c["name"] for c in 
config_module.configs]) +def test_base_model_can_be_adapter_v2_loaded(name): from lit_gpt.adapter_v2 import GPT as AdapterV2GPT from lit_gpt.adapter_v2 import adapter_filter from lit_gpt.model import GPT as BaseGPT - base_model = BaseGPT.from_name("pythia-70m", bias=True, n_layer=2) + kwargs = {"n_layer": 2, "n_head": 8, "n_embd": 16, "padded_vocab_size": 32} + base_model = BaseGPT.from_name(name, **kwargs) base_model_state_dict = base_model.state_dict() - lora_model = AdapterV2GPT.from_name("pythia-70m", bias=True, n_layer=2, adapter_start_layer=0) + lora_model = AdapterV2GPT.from_name(name, **kwargs, adapter_start_layer=0) keys = lora_model.load_state_dict(base_model_state_dict, strict=False) assert not keys.unexpected_keys for k in keys.missing_keys: assert adapter_filter(k, None) -@pytest.mark.skipif(sys.platform in ("win32", "darwin"), reason="torch.compile not supported on this platform") +@RunIf(dynamo=True) @torch.inference_mode() def test_adapter_v2_compile(): from lit_gpt.adapter_v2 import GPT - model = GPT.from_name("pythia-70m", n_layer=3) + model = GPT.from_name("pythia-14m", n_layer=3) x = torch.randint(model.config.vocab_size, size=(2, model.config.block_size), dtype=torch.int64) from torch._dynamo.backends import debugging - explanation = torch._dynamo.explain(model, x) + explanation = torch._dynamo.explain(model)(x) assert isinstance(explanation, debugging.ExplainOutput) assert explanation.graph_count == 1 assert explanation.graph_break_count == 0 @@ -153,7 +167,208 @@ def test_adapter_v2_compile(): model = GPT(model.config) model.set_kv_cache(2) input_pos = torch.arange(model.config.block_size) - explanation = torch._dynamo.explain(model, x, input_pos) + explanation = torch._dynamo.explain(model)(x, input_pos) assert isinstance(explanation, debugging.ExplainOutput) assert explanation.graph_count == 1 assert explanation.graph_break_count == 0 + + +@torch.inference_mode() +def test_against_hf_mixtral(): + from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM + + from lit_gpt.adapter_v2 import GPT, Config + from scripts.convert_hf_checkpoint import copy_weights_hf_llama + + device = torch.device("cpu") + dtype = torch.float32 + ours_config = Config.from_name( + "Mixtral-8x7B-Instruct-v0.1", + padded_vocab_size=10000, + n_layer=2, + n_embd=32, + n_head=8, + n_query_groups=2, + intermediate_size=86, + n_expert=4, + ) + T = 5 + theirs_config = MixtralConfig( + vocab_size=ours_config.padded_vocab_size, + hidden_size=ours_config.n_embd, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + intermediate_size=ours_config.intermediate_size, + max_position_embeddings=T, + rms_norm_eps=ours_config.norm_eps, + num_key_value_heads=ours_config.n_query_groups, + rope_theta=ours_config.rope_base, + num_local_experts=ours_config.n_expert, + ) + assert ours_config.intermediate_size == theirs_config.intermediate_size + + theirs_model = MixtralForCausalLM(theirs_config).to(device) + theirs_state_dict = theirs_model.state_dict() + state_dict = {} + copy_weights_hf_llama(ours_config, {}, state_dict, theirs_state_dict) + ours_model = GPT(ours_config).to(device) + # strict=False because missing keys due to adapter weights not contained in state dict + ours_model.load_state_dict(state_dict, strict=False) + + # test end to end + x = torch.tensor([[9856, 23, 491, 1536, 304], [23, 345, 65, 123, 321]], dtype=torch.int32, device=device) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits 
to float + torch.testing.assert_close(ours_y, theirs_y) + + +@RunIf(min_cuda_gpus=1) +def test_adapter_v2_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir): + from lit_gpt.args import IOArgs + + if not _BITSANDBYTES_AVAILABLE: + pytest.skip("BNB not available") + + from bitsandbytes.optim import PagedAdamW + + import finetune.adapter_v2 as module + + data = [] + torch.save(data, tmp_path / "train.pt") + torch.save(data, tmp_path / "test.pt") + + from lit_gpt.config import name_to_config + + model_config = dict( + block_size=128, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8, adapter_start_layer=0, bias=True + ) + monkeypatch.setitem(name_to_config, "tmp", model_config) + + monkeypatch.setattr(module, "load_checkpoint", Mock()) + train_mock = Mock() + monkeypatch.setattr(module, "fit", train_mock) + + stdout = StringIO() + with redirect_stdout(stdout): + module.setup( + precision="16-true", + quantize="bnb.nf4-dq", + io=IOArgs( + train_data_dir=tmp_path, val_data_dir=tmp_path, checkpoint_dir=fake_checkpoint_dir, out_dir=tmp_path + ), + ) + + args, kwargs = train_mock.call_args + fabric, model, optimizer, *_ = args + assert isinstance(fabric.strategy.precision, BitsandbytesPrecision) + assert isinstance(optimizer, _FabricOptimizer) + assert isinstance(optimizer._optimizer, PagedAdamW) + + dtype_to_name = {"torch.uint8": set(), "torch.float16": set()} + for name, layer in model.named_parameters(): + name = name[len("_forward_module.") :] + dtype_to_name[str(layer.dtype)].add(name) + assert dtype_to_name == { + "torch.uint8": { + "transformer.h.0.mlp.fc.linear.weight", + "transformer.h.1.mlp.proj.linear.weight", + "transformer.h.1.attn.attn.linear.weight", + "transformer.h.0.attn.proj.linear.weight", + "lm_head.linear.weight", + "transformer.h.1.attn.proj.linear.weight", + "transformer.h.0.mlp.proj.linear.weight", + "transformer.h.0.attn.attn.linear.weight", + "transformer.h.1.mlp.fc.linear.weight", + }, + "torch.float16": { + "transformer.h.1.attn.attn.adapter_bias", + "transformer.h.1.mlp.proj.adapter_bias", + "transformer.h.0.attn.attn.adapter_bias", + "transformer.h.0.norm_1.bias", + "transformer.h.0.attn.attn.linear.bias", + "transformer.h.1.attn.adapter_wte.weight", + "transformer.ln_f.weight", + "transformer.h.0.mlp.fc.linear.bias", + "transformer.h.0.mlp.proj.linear.bias", + "transformer.h.1.mlp.fc.linear.bias", + "transformer.h.0.attn.proj.adapter_scale", + "transformer.h.0.attn.attn.adapter_scale", + "transformer.h.1.norm_2.bias", + "transformer.h.1.attn.proj.adapter_scale", + "transformer.h.0.norm_2.bias", + "transformer.h.0.mlp.fc.adapter_scale", + "transformer.h.0.attn.proj.linear.bias", + "transformer.h.1.attn.proj.linear.bias", + "transformer.h.1.norm_1.bias", + "transformer.h.0.norm_1.weight", + "transformer.h.1.attn.proj.adapter_bias", + "transformer.h.0.mlp.proj.adapter_scale", + "transformer.h.0.mlp.proj.adapter_bias", + "transformer.h.1.mlp.fc.adapter_bias", + "transformer.h.1.mlp.proj.adapter_scale", + "transformer.h.1.attn.gating_factor", + "transformer.h.1.norm_1.weight", + "transformer.ln_f.bias", + "transformer.h.0.mlp.fc.adapter_bias", + "lm_head.adapter_scale", + "lm_head.adapter_bias", + "transformer.h.1.norm_2.weight", + "transformer.h.0.attn.adapter_wte.weight", + "transformer.h.1.attn.attn.adapter_scale", + "transformer.h.1.mlp.fc.adapter_scale", + "transformer.h.1.attn.attn.linear.bias", + "transformer.wte.weight", + "transformer.h.0.norm_2.weight", + "transformer.h.1.mlp.proj.linear.bias", + "transformer.h.0.attn.gating_factor", + 
"transformer.h.0.attn.proj.adapter_bias", + }, + } + + assert {p.name for p in tmp_path.glob("*.pth")} == {"lit_model_adapter_finetuned.pth"} + state_dict = torch.load(tmp_path / "lit_model_adapter_finetuned.pth") + assert len(state_dict) == 1 + dtype_to_name = {"torch.float16": set()} + for name, layer in state_dict["model"].items(): + dtype_to_name[str(layer.dtype)].add(name) + assert dtype_to_name == { + "torch.float16": { + "transformer.h.1.attn.adapter_wte.weight", + "transformer.h.1.attn.proj.adapter_bias", + "transformer.h.1.mlp.fc.adapter_scale", + "lm_head.adapter_bias", + "transformer.h.0.mlp.proj.adapter_scale", + "transformer.ln_f.bias", + "lm_head.adapter_scale", + "transformer.h.1.norm_2.weight", + "transformer.h.0.attn.attn.adapter_scale", + "transformer.h.0.mlp.proj.adapter_bias", + "transformer.h.0.attn.gating_factor", + "transformer.h.1.norm_1.bias", + "transformer.h.1.mlp.fc.adapter_bias", + "transformer.h.1.mlp.proj.adapter_scale", + "transformer.h.0.mlp.fc.adapter_scale", + "transformer.h.1.attn.attn.adapter_bias", + "transformer.h.0.norm_2.weight", + "transformer.h.1.norm_2.bias", + "transformer.h.0.norm_1.weight", + "transformer.h.0.attn.proj.adapter_scale", + "transformer.h.1.mlp.proj.adapter_bias", + "transformer.h.0.attn.attn.adapter_bias", + "transformer.h.0.attn.adapter_wte.weight", + "transformer.ln_f.weight", + "transformer.h.1.attn.gating_factor", + "transformer.h.0.mlp.fc.adapter_bias", + "transformer.h.1.attn.proj.adapter_scale", + "transformer.h.0.attn.proj.adapter_bias", + "transformer.h.0.norm_1.bias", + "transformer.h.0.norm_2.bias", + "transformer.h.1.norm_1.weight", + "transformer.h.1.attn.attn.adapter_scale", + } + } + + logs = stdout.getvalue() + assert "of trainable parameters: 552" in logs + assert "of non trainable parameters: 1,808" in logs diff --git a/tests/test_chat.py b/tests/test_chat.py index ec7315a..a8fe346 100644 --- a/tests/test_chat.py +++ b/tests/test_chat.py @@ -1,8 +1,13 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ +import json import subprocess import sys +from contextlib import redirect_stderr, redirect_stdout +from io import StringIO from itertools import repeat from pathlib import Path -from unittest.mock import MagicMock +from unittest.mock import ANY, MagicMock, Mock, call, patch import pytest import torch @@ -14,33 +19,103 @@ (repeat(1), (), [1] * 8), ([1, 2, 3, 0], ([0],), [1, 2, 3]), ([1, 2, 3, 0], ([9], [2, 4], [1, 2, 3, 0]), []), - ([1, 2, 3, 0, 0], ([0, 0, 0], [0, 0]), [1, 2, [3]]), + ([1, 2, 3, 0, 0], ([0, 0, 0], [0, 0]), [1, 2, 3]), + ([3, 1, 2], ([1, 2], [3]), []), + ([1, 2, 3, 0, 3, 2, 1, 0], ([4, 3, 2, 1], [2, 4]), [1, 2, 3, 0, 3, 2, 1, 0]), ], ) -def test_generate(generated, stop_tokens, expected): +def test_generate(monkeypatch, generated, stop_tokens, expected): import chat.base as chat + import generate.base as generate input_idx = torch.tensor([5, 3]) max_returned_tokens = len(input_idx) + 8 model = MagicMock() model.config.block_size = 100 model.max_seq_length = 100 - - original_multinomial = torch.multinomial it = iter(generated) def multinomial(*_, **__): out = next(it) return torch.tensor([out]) - chat.torch.multinomial = multinomial + monkeypatch.setattr(generate, "multinomial_num_samples_1", multinomial) actual = chat.generate(model, input_idx, max_returned_tokens, stop_tokens=stop_tokens) actual = list(actual) - chat.torch.multinomial = original_multinomial - for t in actual: - assert t.dtype == torch.long - assert [t.tolist() for t in actual] == expected + assert len(actual) == len(expected) + if not actual: + assert actual == expected + else: + for t in actual: + assert t.dtype == torch.long + assert torch.cat(actual).tolist() == expected + + +@pytest.mark.parametrize("tokenizer_backend", ["huggingface", "sentencepiece"]) +def test_decode(tokenizer_backend): + from lightning.fabric import Fabric + + import chat.base as chat + + class Tokenizer: + backend = tokenizer_backend + id2token = {1: "foo ", 2: "bar ", 3: "baz "} + + def decode(self, tensor: torch.Tensor) -> str: + tensor = [tensor] if tensor.ndim == 0 else tensor + return "".join(self.id2token[int(value)] for value in tensor) + + tokenizer_mock = Tokenizer() + + fabric = Fabric(devices=1, accelerator="cpu") + + token_stream = torch.tensor([3, 2, 1]) + out, err = StringIO(), StringIO() + with redirect_stdout(out), redirect_stderr(err): + chat.decode(fabric, tokenizer_mock, token_stream) + + assert out.getvalue() == "baz bar foo " + + +@patch("chat.base.input") +@pytest.mark.parametrize("stop_iteration", [KeyboardInterrupt, ""]) +def test_main(mocked_input, stop_iteration, fake_checkpoint_dir, monkeypatch, tensor_like): + import chat.base as chat + + # these values will be iteratively provided for each `input()` call + mocked_input.side_effect = ["Hello", stop_iteration] + + config_path = fake_checkpoint_dir / "lit_config.json" + config = {"block_size": 128, "vocab_size": 50, "n_layer": 2, "n_head": 4, "n_embd": 8, "rotary_percentage": 1} + config_path.write_text(json.dumps(config)) + + load_mock = Mock() + load_mock.return_value = load_mock + monkeypatch.setattr(chat, "load_checkpoint", load_mock) + tokenizer_mock = Mock() + tokenizer_mock.return_value.backend = "sentencepiece" + tokenizer_mock.return_value.encode.return_value = torch.tensor([1, 2, 3]) + tokenizer_mock.return_value.decode.return_value = "foo bar baz" + monkeypatch.setattr(chat, "Tokenizer", tokenizer_mock) + generate_mock = Mock() + generate_mock.return_value = torch.tensor([3, 2, 1]) + monkeypatch.setattr(chat, "generate", generate_mock) + + 
out, err = StringIO(), StringIO() + with redirect_stdout(out), redirect_stderr(err): + chat.main(temperature=2.0, top_k=2, checkpoint_dir=fake_checkpoint_dir) + + # decoding is done per each generated item + assert len(tokenizer_mock.return_value.decode.mock_calls) == generate_mock.return_value.numel() + assert torch.allclose(tokenizer_mock.return_value.decode.call_args[0][0], generate_mock.return_value) + assert generate_mock.mock_calls == [ + call(ANY, tensor_like, 128, temperature=2.0, top_k=2, stop_tokens=([tokenizer_mock.return_value.eos_id],)) + ] + # # only the generated result is printed to stdout + assert out.getvalue() == ">> Reply: foo bar baz\n" + + assert "'padded_vocab_size': 512, 'n_layer': 2, 'n_head': 4" in err.getvalue() def test_cli(): diff --git a/tests/test_ci.py b/tests/test_ci.py new file mode 100644 index 0000000..04a6542 --- /dev/null +++ b/tests/test_ci.py @@ -0,0 +1,19 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +# this file is just to validate on the CI logs that these tests were run +from conftest import RunIf + + +@RunIf(min_cuda_gpus=1) +def test_runif_min_cuda_gpus(): + assert True + + +@RunIf(min_cuda_gpus=1, standalone=True) +def test_runif_min_cuda_gpus_standalone(): + assert True + + +@RunIf(standalone=True) +def test_runif_standalone(): + assert True diff --git a/tests/test_config.py b/tests/test_config.py index 409931e..d1b7bd9 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,26 +1,42 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import json +import sys +from pathlib import Path + +import pytest + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +import lit_gpt.config as config_module def test_config(): from lit_gpt import Config config = Config() + assert config.name == "" assert config.block_size == 4096 config = Config(block_size=2048) assert config.block_size == 2048 - config = Config.from_name("pythia-70m") - assert config.block_size == 2048 + config = Config.from_name("pythia-14m") + assert config.block_size == 512 - config = Config.from_name("pythia-70m", block_size=4096) + config = Config.from_name("pythia-14m", block_size=4096) assert config.block_size == 4096 + config = Config(hf_config={"name": "pythia-14m"}) + assert config.name == "pythia-14m" + def test_legacy_args(tmp_path): from lit_gpt import Config - config = Config.from_name("pythia-70m", condense_ratio=2) + config = Config.from_name("pythia-14m", condense_ratio=2) assert not hasattr(config, "condense_ratio") assert config.rope_condense_ratio == 2 @@ -34,3 +50,89 @@ def test_legacy_args(tmp_path): config = Config.from_json(json_path, condense_ratio=2) assert not hasattr(config, "condense_ratio") assert config.rope_condense_ratio == 2 + + +def test_from_hf_name(): + from lit_gpt import Config + + # by short-hand name + config0 = Config.from_name("tiny-llama-1.1b") + # or by huggingface hub repo name + config1 = Config.from_name("TinyLlama-1.1B-intermediate-step-1431k-3T") + assert config0 == config1 + + +def test_hf_config_from_json(tmp_path): + """Test for backward compatibility with older configs that didn't have the `hf_config` field.""" + from lit_gpt import Config + + legacy_config = {"name": "falcon-40b", "org": "tiiuae"} + with open(tmp_path / "config.json", "w") as file: + json.dump(legacy_config, file) + new_config = Config.from_json(tmp_path / "config.json") + assert new_config.name == 
"falcon-40b" + assert not hasattr(new_config, "org") + assert new_config.hf_config["org"] == "tiiuae" + assert new_config.hf_config["name"] == "falcon-40b" + + new_config = Config.from_json(tmp_path / "config.json", org="new-org") + assert new_config.hf_config["org"] == "new-org" + + +@pytest.mark.parametrize("config", config_module.configs, ids=[c["name"] for c in config_module.configs]) +def test_short_and_hf_names_are_equal_unless_on_purpose(config): + from lit_gpt import Config + + # by short-hand name + config0 = Config.from_name(config["name"]) + # or by huggingface hub repo name + config1 = Config.from_name(config["hf_config"]["name"]) + assert config0.name == config1.name + + +def test_nonexisting_name(): + from lit_gpt import Config + + with pytest.raises(ValueError, match="not a supported"): + Config.from_name("foobar") + + +def test_from_checkpoint(tmp_path): + from lit_gpt import Config + + # 1. Neither `lit_config.py` nor matching config exists. + with pytest.raises(FileNotFoundError, match="neither 'lit_config.json' nor matching config exists"): + Config.from_checkpoint(tmp_path / "non_existing_checkpoint") + + # 2. If `lit_config.py` doesn't exists, but there is a matching config in `lit_gpt/config.py`. + config = Config.from_checkpoint(tmp_path / "pythia-14m") + assert config.name == "pythia-14m" + assert config.block_size == 512 + assert config.n_layer == 6 + + # 3. If only `lit_config.py` exists. + config_data = {"name": "pythia-14m", "block_size": 24, "n_layer": 2} + with open(tmp_path / "lit_config.json", "w") as file: + json.dump(config_data, file) + config = Config.from_checkpoint(tmp_path) + assert config.name == "pythia-14m" + assert config.block_size == 24 + assert config.n_layer == 2 + + # 4. Both `lit_config.py` and a matching config exist, but `lit_config.py` supersedes matching config + (tmp_path / "pythia-14m").mkdir() + with open(tmp_path / "pythia-14m/lit_config.json", "w") as file: + json.dump(config_data, file) + config = Config.from_checkpoint(tmp_path / "pythia-14m") + assert config.name == "pythia-14m" + assert config.block_size == 24 + assert config.n_layer == 2 + + +@pytest.mark.parametrize("head_size", [None, 128]) +def test_head_size(head_size): + from lit_gpt import Config + + config = Config(head_size) + + assert config.head_size == head_size or config.n_embd // config.n_head diff --git a/tests/test_convert_hf_checkpoint.py b/tests/test_convert_hf_checkpoint.py index abaf7b9..97d42b4 100644 --- a/tests/test_convert_hf_checkpoint.py +++ b/tests/test_convert_hf_checkpoint.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ from unittest import mock import pytest @@ -95,6 +97,7 @@ def test_llama2_70b_conversion(): "transformer.h.5.attn.proj.weight": (8192, 8192), "transformer.h.5.mlp.fc_1.weight": (28672, 8192), "transformer.wte.weight": (32000, 8192), + "lm_head.weight": (32000, 8192), # due to weight tying lm_head is in the converted weights } @@ -102,12 +105,12 @@ def test_convert_hf_checkpoint(tmp_path): from scripts.convert_hf_checkpoint import convert_hf_checkpoint with pytest.raises(ValueError, match="to contain .bin"): - convert_hf_checkpoint(checkpoint_dir=tmp_path, model_name="pythia-70m") + convert_hf_checkpoint(checkpoint_dir=tmp_path, model_name="pythia-14m") bin_file = tmp_path / "foo.bin" bin_file.touch() with mock.patch("scripts.convert_hf_checkpoint.lazy_load") as load: - convert_hf_checkpoint(checkpoint_dir=tmp_path, model_name="pythia-70m") + convert_hf_checkpoint(checkpoint_dir=tmp_path, model_name="pythia-14m") load.assert_called_with(bin_file) assert {p.name for p in tmp_path.glob("*")} == {"foo.bin", "lit_config.json", "lit_model.pth"} diff --git a/tests/test_convert_lit_checkpoint.py b/tests/test_convert_lit_checkpoint.py index c7d94ca..ce21f17 100644 --- a/tests/test_convert_lit_checkpoint.py +++ b/tests/test_convert_lit_checkpoint.py @@ -1,5 +1,6 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import json -import operator import os from dataclasses import asdict from pathlib import Path @@ -8,7 +9,7 @@ import pytest import torch -from lightning_utilities import compare_version +from conftest import RunIf wd = Path(__file__).parent.parent.absolute() @@ -115,18 +116,7 @@ def test_against_original_gpt_neox(): @torch.inference_mode() @pytest.mark.parametrize( - "ours_kwargs", - [ - {"name": "Llama-2-7b-hf"}, - pytest.param( - {"name": "CodeLlama-7b-hf"}, - marks=pytest.mark.skipif( - compare_version("transformers", operator.lt, "4.33.0", use_base_version=True), - reason="requires rope_theta", - ), - ), - {"name": "Llama-2-70b-chat-hf"}, - ], + "ours_kwargs", [{"name": "Llama-2-7b-hf"}, {"name": "CodeLlama-7b-hf"}, {"name": "Llama-2-70b-chat-hf"}] ) def test_against_hf_llama2(ours_kwargs): from transformers.models.llama.configuration_llama import LlamaConfig @@ -146,7 +136,7 @@ def test_against_hf_llama2(ours_kwargs): num_hidden_layers=ours_config.n_layer, intermediate_size=ours_config.intermediate_size, max_position_embeddings=T, - rms_norm_eps=1e-5, + rms_norm_eps=ours_config.norm_eps, num_query_value_heads=ours_config.n_query_groups, rope_theta=ours_config.rope_base, ) @@ -166,6 +156,52 @@ def test_against_hf_llama2(ours_kwargs): torch.testing.assert_close(ours_y, theirs_y) +@torch.inference_mode() +def test_against_mixtral(): + from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM + + from lit_gpt import GPT, Config + from scripts.convert_lit_checkpoint import copy_weights_llama + + ours_config = Config.from_name( + "Mixtral-8x7B-Instruct-v0.1", + padded_vocab_size=10000, + n_layer=2, + n_embd=32, + n_head=8, + n_query_groups=2, + intermediate_size=86, + n_expert=4, + ) + T = 5 + theirs_config = MixtralConfig( + vocab_size=ours_config.padded_vocab_size, + hidden_size=ours_config.n_embd, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + intermediate_size=ours_config.intermediate_size, + max_position_embeddings=T, + rms_norm_eps=ours_config.norm_eps, + num_key_value_heads=ours_config.n_query_groups, + rope_theta=ours_config.rope_base, + num_local_experts=ours_config.n_expert, + ) + assert 
ours_config.intermediate_size == theirs_config.intermediate_size + + ours_model = GPT(ours_config) + ours_state_dict = ours_model.state_dict() + theirs_state_dict = {} + copy_weights_llama(ours_config, theirs_state_dict, ours_state_dict) + theirs_model = MixtralForCausalLM(theirs_config) + theirs_model.load_state_dict(theirs_state_dict) + + # test end to end + x = torch.tensor([[9856, 23, 491, 1536, 304], [23, 345, 65, 123, 321]], dtype=torch.int32) + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"] + torch.testing.assert_close(ours_y, theirs_y) + + @torch.inference_mode() def test_against_original_open_llama_3b(): from transformers.models.llama.configuration_llama import LlamaConfig @@ -201,35 +237,92 @@ def test_against_original_open_llama_3b(): @torch.inference_mode() -def test_against_hf_phi(): - file_path = wd / "tests" / "original_phi_1_5.py" - url = "https://gist.githubusercontent.com/carmocca/8ec003d9e0d2fdb09ea92941cd0985b4/raw/2ba35c28824d4f4d5dce14f9588a80067cb6ae7f/original_phi_1_5.py" - if not file_path.is_file(): - urlretrieve(url=url, filename=file_path) +def test_against_hf_phi_1_5(): + workdir = wd / "tests" / "reference_models" + workdir.mkdir(parents=True, exist_ok=True) + file_paths = [workdir / "original_phi_1_5.py", workdir / "configuration_phi.py"] + urls = [ + "https://huggingface.co/microsoft/phi-1_5/raw/main/modeling_phi.py", + "https://huggingface.co/microsoft/phi-1_5/raw/main/configuration_phi.py", + ] + for file_path, url in zip(file_paths, urls): + if not file_path.is_file(): + urlretrieve(url=url, filename=file_path) from lit_gpt import GPT, Config from scripts.convert_lit_checkpoint import copy_weights_phi - from tests.original_phi_1_5 import MixFormerSequentialConfig, MixFormerSequentialForCausalLM + from tests.reference_models.configuration_phi import PhiConfig + from tests.reference_models.original_phi_1_5 import PhiForCausalLM ours_config = Config.from_name( "phi-1_5", padded_vocab_size=10000, n_layer=2, n_head=4, n_embd=256, rotary_percentage=0.5 ) T = 5 - theirs_config = MixFormerSequentialConfig( - n_positions=ours_config.block_size, - n_embd=ours_config.n_embd, - n_head=ours_config.n_head, - n_layer=ours_config.n_layer, - rotary_dim=ours_config.rope_n_elem, - architecture={"block_cls": "parallel", "mixer": {}, "mlp": {"mlp_cls": "mlp"}}, + theirs_config = PhiConfig( + vocab_size=ours_config.padded_vocab_size, + max_position_embeddings=ours_config.block_size, + hidden_size=ours_config.n_embd, + intermediate_size=ours_config.intermediate_size, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + partial_rotary_factor=ours_config.rotary_percentage, + ) + + ours_model = GPT(ours_config) + ours_state_dict = ours_model.state_dict() + theirs_state_dict = {} + copy_weights_phi(ours_config, theirs_state_dict, ours_state_dict) + theirs_model = PhiForCausalLM(theirs_config) + # strict=False because we don't save the rotary embeddings inv frequency + keys = theirs_model.load_state_dict(theirs_state_dict, strict=False) + assert not keys.unexpected_keys + assert all("inv_freq" in k for k in keys.missing_keys) + + # test end to end + x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"] + torch.testing.assert_close(ours_y, theirs_y) + + +@torch.inference_mode() +def test_against_hf_phi_2(): + workdir = wd / "tests" / "reference_models" + workdir.mkdir(parents=True, exist_ok=True) + file_paths = [workdir / 
"original_phi_2.py", workdir / "configuration_phi.py"] + urls = [ + "https://huggingface.co/microsoft/phi-2/raw/main/modeling_phi.py", + "https://huggingface.co/microsoft/phi-2/raw/main/configuration_phi.py", + ] + for file_path, url in zip(file_paths, urls): + if not file_path.is_file(): + urlretrieve(url=url, filename=file_path) + + from lit_gpt import GPT, Config + from scripts.convert_lit_checkpoint import copy_weights_phi + from tests.reference_models.configuration_phi import PhiConfig + from tests.reference_models.original_phi_2 import PhiForCausalLM + + ours_config = Config.from_name( + "phi-2", padded_vocab_size=10000, n_layer=2, n_head=4, n_embd=256, rotary_percentage=0.5 + ) + T = 5 + theirs_config = PhiConfig( + vocab_size=ours_config.padded_vocab_size, + max_position_embeddings=ours_config.block_size, + hidden_size=ours_config.n_embd, + intermediate_size=ours_config.intermediate_size, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + partial_rotary_factor=ours_config.rotary_percentage, ) - theirs_config.vocab_size = ours_config.padded_vocab_size ours_model = GPT(ours_config) ours_state_dict = ours_model.state_dict() theirs_state_dict = {} copy_weights_phi(ours_config, theirs_state_dict, ours_state_dict) - theirs_model = MixFormerSequentialForCausalLM(theirs_config) + theirs_model = PhiForCausalLM(theirs_config) # strict=False because we don't save the rotary embeddings inv frequency keys = theirs_model.load_state_dict(theirs_state_dict, strict=False) assert not keys.unexpected_keys @@ -243,6 +336,105 @@ def test_against_hf_phi(): torch.testing.assert_close(ours_y, theirs_y) +@torch.inference_mode() +def test_against_original_stablelm_zephyr_3b(): + from transformers import AutoConfig, AutoModelForCausalLM + + from lit_gpt import GPT, Config + from scripts.convert_lit_checkpoint import copy_weights_llama + + T = 5 + ours_config = Config.from_name("stablelm-zephyr-3b", n_layer=2, n_head=16, n_embd=32, intermediate_size=86) + theirs_config = AutoConfig.from_pretrained( + "stabilityai/stablelm-zephyr-3b", + trust_remote_code=True, + num_hidden_layers=ours_config.n_layer, + num_attention_heads=ours_config.n_head, + num_key_value_heads=ours_config.n_head, + hidden_size=ours_config.n_embd, + intermediate_size=ours_config.intermediate_size, + max_position_embeddings=T, + ) + assert ours_config.intermediate_size == theirs_config.intermediate_size + + ours_model = GPT(ours_config) + ours_state_dict = ours_model.state_dict() + theirs_state_dict = {} + copy_weights_llama(ours_config, theirs_state_dict, ours_state_dict) + theirs_model = AutoModelForCausalLM.from_config(theirs_config, trust_remote_code=True) + theirs_model.load_state_dict(theirs_state_dict) + + # test end to end + x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"] + torch.testing.assert_close(ours_y, theirs_y) + + +@torch.inference_mode() +@pytest.mark.parametrize("model_name", ["gemma-2b", "gemma-7b"]) +@pytest.mark.parametrize( + ("device", "dtype"), + [ + (torch.device("cpu"), torch.float32), + pytest.param( + torch.device("cuda"), + torch.float16, + marks=[ + # the reference does softmax upscaled to fp32 during attention. 
additionally, the final layernorm input + # is slightly different + pytest.mark.xfail(raises=AssertionError, strict=False), + RunIf(min_cuda_gpus=1), + ], + ), + ], +) +def test_against_original_gemma(model_name, device, dtype): + from transformers.models.gemma.configuration_gemma import GemmaConfig + from transformers.models.gemma.modeling_gemma import GemmaForCausalLM + + from lit_gpt import GPT, Config + from scripts.convert_lit_checkpoint import copy_weights_llama + + torch.set_default_dtype(dtype) + + T = 5 + ours_config = Config.from_name(model_name, n_layer=2, n_head=16, n_embd=32, intermediate_size=86) + theirs_config = GemmaConfig( + vocab_size=ours_config.padded_vocab_size, + hidden_size=ours_config.n_embd, + head_dim=ours_config.head_size, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + intermediate_size=ours_config.intermediate_size, + max_position_embeddings=T, + rms_norm_eps=ours_config.norm_eps, + num_key_value_heads=ours_config.n_query_groups, + rope_theta=ours_config.rope_base, + attention_bias=ours_config.bias, + tie_word_embeddings=True, + hidden_act="gelu_pytorch_tanh", + ) + assert ours_config.intermediate_size == theirs_config.intermediate_size + + ours_model = GPT(ours_config).to(device) + # tie weights + ours_model.lm_head.weight = ours_model.transformer.wte.weight + ours_state_dict = ours_model.state_dict() + theirs_state_dict = {} + copy_weights_llama(ours_config, theirs_state_dict, ours_state_dict, untie_weights=True) + theirs_model = GemmaForCausalLM(theirs_config).to(device) + theirs_model.load_state_dict(theirs_state_dict, strict=False) + + # test end to end + x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32, device=device) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float + torch.testing.assert_close(ours_y, theirs_y) + + def test_check_conversion_supported_adapter(): from scripts.convert_lit_checkpoint import check_conversion_supported diff --git a/tests/test_convert_pretrained_checkpoint.py b/tests/test_convert_pretrained_checkpoint.py new file mode 100644 index 0000000..4d8cadb --- /dev/null +++ b/tests/test_convert_pretrained_checkpoint.py @@ -0,0 +1,33 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
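For reference, the converter test that follows checks two behaviours of convert_checkpoint: only the "model" entry of the pretraining checkpoint is kept, and the `_orig_mod.` prefix that torch.compile adds to parameter names is stripped. A minimal sketch of that key normalization (illustrative only; the real script also copies the config and tokenizer files alongside the weights):

import torch


def normalize_state_dict(checkpoint: dict) -> dict:
    # Keep only the model weights and drop the `_orig_mod.` prefix added by `torch.compile`.
    prefix = "_orig_mod."
    return {
        (name[len(prefix):] if name.startswith(prefix) else name): tensor
        for name, tensor in checkpoint["model"].items()
    }


checkpoint = {
    "model": {"some.module.weight": torch.rand(2, 2), "_orig_mod.some.other.module.weight": torch.rand(2, 2)},
    "the_optimizer": "optimizer_state",
    "other": 1,
}
assert sorted(normalize_state_dict(checkpoint)) == ["some.module.weight", "some.other.module.weight"]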
+ +import os + +import torch + + +def test_convert_pretrained_checkpoint(tmp_path): + from scripts.convert_pretrained_checkpoint import convert_checkpoint + + # Pretend we made a checkpoint from pretraining + pretrained_checkpoint = { + "model": {"some.module.weight": torch.rand(2, 2), "_orig_mod.some.other.module.weight": torch.rand(2, 2)}, + "the_optimizer": "optimizer_state", + "other": 1, + } + torch.save(pretrained_checkpoint, tmp_path / "pretrained.pth") + + # Make a fake tokenizer config file + llama_checkpoint_folder = tmp_path / "checkpoints" / "meta-llama" / "Llama-2-7b-hf" + llama_checkpoint_folder.mkdir(parents=True) + (llama_checkpoint_folder / "tokenizer_config.json").touch() + + convert_checkpoint( + checkpoint_file=(tmp_path / "pretrained.pth"), + tokenizer_dir=llama_checkpoint_folder, + config_name="tiny-llama-1.1b", + output_dir=(tmp_path / "converted"), + ) + + assert set(os.listdir(tmp_path / "converted")) == {"lit_model.pth", "lit_config.json", "tokenizer_config.json"} + converted_checkpoint = torch.load(tmp_path / "converted" / "lit_model.pth") + assert list(converted_checkpoint.keys()) == ["some.module.weight", "some.other.module.weight"] diff --git a/tests/test_full.py b/tests/test_full.py index 1a735e0..f543c27 100644 --- a/tests/test_full.py +++ b/tests/test_full.py @@ -1,19 +1,18 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +import os from contextlib import redirect_stdout from io import StringIO +from unittest import mock from unittest.mock import Mock import torch +@mock.patch.dict(os.environ, {"LT_ACCELERATOR": "cpu"}) def test_full_script(tmp_path, fake_checkpoint_dir, monkeypatch): import finetune.full as module - - module.gradient_accumulation_iters = 1 - module.save_interval = 2 - module.eval_interval = 2 - module.eval_iters = 2 - module.eval_max_new_tokens = 1 - module.max_iters = 6 + from lit_gpt.args import EvalArgs, IOArgs, TrainArgs data = [ {"input_ids": torch.tensor([0, 1, 2]), "labels": torch.tensor([1, 2, 3])}, @@ -35,17 +34,24 @@ def test_full_script(tmp_path, fake_checkpoint_dir, monkeypatch): stdout = StringIO() with redirect_stdout(stdout): - module.setup(data_dir=tmp_path, checkpoint_dir=fake_checkpoint_dir, out_dir=tmp_path, precision="32-true") + module.setup( + io=IOArgs( + train_data_dir=tmp_path, val_data_dir=tmp_path, checkpoint_dir=fake_checkpoint_dir, out_dir=tmp_path + ), + precision="32-true", + train=TrainArgs(global_batch_size=1, save_interval=2, epochs=1, epoch_size=6, micro_batch_size=1), + eval=EvalArgs(interval=2, max_iters=2, max_new_tokens=1), + ) assert {p.name for p in tmp_path.glob("*.pth")} == { - "iter-000001-ckpt.pth", - "iter-000003-ckpt.pth", - "iter-000005-ckpt.pth", + "step-000002.pth", + "step-000004.pth", + "step-000006.pth", "lit_model_finetuned.pth", } assert (tmp_path / "version_0" / "metrics.csv").is_file() logs = stdout.getvalue() - assert logs.count("optimizer.step") == module.max_iters - assert logs.count("val loss") == module.max_iters // module.eval_interval + assert logs.count("optimizer.step") == 6 + assert logs.count("val loss") == 3 assert "of trainable parameters: 1,888" in logs diff --git a/tests/test_generate.py b/tests/test_generate.py index 5d59c4b..504e3ee 100644 --- a/tests/test_generate.py +++ b/tests/test_generate.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
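For reference, the generation tests below now patch `generate.base.multinomial_num_samples_1` and exercise `sample`, which is greedy at temperature 0 and otherwise draws from a temperature-scaled, optionally top-k-filtered distribution over the last position's logits. A generic sketch of that sampling step under the batch-size-1 assumption (the exact behaviour of generate.base.sample may differ in details):

from typing import Optional

import torch


def sample_next_token(logits: torch.Tensor, temperature: float = 1.0, top_k: Optional[int] = None) -> torch.Tensor:
    # logits: (batch, time, vocab); batch size 1 for now, sample from the last time step
    logits = logits[0, -1]
    if temperature == 0.0:
        return torch.argmax(logits, dim=-1, keepdim=True)  # greedy decoding
    if top_k is not None:
        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
        logits = torch.where(logits < v[-1], torch.full_like(logits, float("-inf")), logits)
    probs = torch.softmax(logits / temperature, dim=-1)
    return torch.multinomial(probs, num_samples=1)


# greedy pick on the first batch entry of the logits used in test_sample below
logits = torch.tensor([[[24.0, 4.0, 98.0, 77.0, 47.0], [65.0, 70.0, 32.0, 67.0, 24.0], [92.0, 32.0, 88.0, 36.0, 62.0]]])
assert sample_next_token(logits, temperature=0.0).tolist() == [0]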
+ import json import subprocess import sys @@ -28,14 +30,13 @@ def test_generate(max_seq_length): max_new_tokens = 20 multinomial_results = [] - original_multinomial = torch.multinomial def multinomial(*args, **kwargs): - out = original_multinomial(*args, **kwargs) + out = torch.multinomial(*args, **kwargs, num_samples=1) multinomial_results.append(out) return out - with mock.patch("torch.multinomial", multinomial): + with mock.patch("generate.base.multinomial_num_samples_1", multinomial): out = generate.generate(model, input_idx, T + max_new_tokens, top_k=4) assert out.size(0) == T + max_new_tokens @@ -72,7 +73,11 @@ def test_main(fake_checkpoint_dir, monkeypatch, tensor_like): assert len(tokenizer_mock.return_value.decode.mock_calls) == num_samples assert torch.allclose(tokenizer_mock.return_value.decode.call_args[0][0], generate_mock.return_value) - assert generate_mock.mock_calls == [call(ANY, tensor_like, 53, temperature=2.0, top_k=2)] * num_samples + assert ( + generate_mock.mock_calls + == [call(ANY, tensor_like, 53, temperature=2.0, top_k=2, eos_id=tokenizer_mock.return_value.eos_id)] + * num_samples + ) # only the generated result is printed to stdout assert out.getvalue() == "foo bar baz\n" * num_samples @@ -84,3 +89,21 @@ def test_cli(): output = subprocess.check_output([sys.executable, cli_path, "-h"]) output = str(output.decode()) assert "Generates text samples" in output + + +@pytest.mark.parametrize("temperature", (0.0, 1.0, 0.5)) +def test_sample(temperature): + from generate.base import sample + + # shape: 2x3x5 + logits = torch.tensor( + [ + [[24, 4, 98, 77, 47], [65, 70, 32, 67, 24], [92, 32, 88, 36, 62]], + [[85, 79, 57, 68, 50], [89, 46, 72, 45, 32], [68, 96, 68, 24, 36]], + ] + ) + token = sample(logits, temperature=temperature) + + assert token.shape == (1,) + # sample is batch size 1 only for now - this should be [0, 1] once batched generation is supported + assert token.tolist() == [0] diff --git a/tests/test_generate_adapter.py b/tests/test_generate_adapter.py index 368dc4f..d296a68 100644 --- a/tests/test_generate_adapter.py +++ b/tests/test_generate_adapter.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import json import subprocess import sys @@ -42,7 +44,7 @@ def test_main(fake_checkpoint_dir, monkeypatch, version, tensor_like): # only the generated result is printed to stdout assert out.getvalue() == "foo bar baz\n" * num_samples - assert "'padded_vocab_size': 512, 'n_layer': 2, 'n_head': 4, 'n_embd': 8" in err.getvalue() + assert "'padded_vocab_size': 512, 'n_layer': 2, 'n_head': 4, 'head_size': 2, 'n_embd': 8" in err.getvalue() @pytest.mark.parametrize("version", ("", "_v2")) diff --git a/tests/test_generate_lora.py b/tests/test_generate_lora.py index 73498b6..f9ce72d 100644 --- a/tests/test_generate_lora.py +++ b/tests/test_generate_lora.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
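For reference, the new tests/test_generate_sequentially.py further below distributes transformer blocks over devices in contiguous chunks; its parametrized expectations for layer_to_device reduce to `device = block_index // chunk_size` with `chunk_size = n_layer // devices`. A tiny sketch of just that arithmetic (the real helper walks the instantiated module tree rather than computing indices):

def blocks_to_devices(n_layer: int, devices: int) -> dict:
    # Contiguous chunking: block i lives on device i // chunk_size.
    chunk_size = n_layer // devices
    return {f"transformer.h.{i}": i // chunk_size for i in range(n_layer)}


# matches the (n_layer=6, devices=3) expectation in test_layer_to_device
assert blocks_to_devices(6, 3) == {
    "transformer.h.0": 0, "transformer.h.1": 0,
    "transformer.h.2": 1, "transformer.h.3": 1,
    "transformer.h.4": 2, "transformer.h.5": 2,
}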
+ import json import subprocess import sys @@ -47,21 +49,7 @@ def test_main(fake_checkpoint_dir, monkeypatch, tensor_like): # only the generated result is printed to stdout assert out.getvalue() == "foo bar baz\n" * num_samples - assert "'padded_vocab_size': 512, 'n_layer': 2, 'n_head': 4, 'n_embd': 8" in err.getvalue() - - -def test_lora_variables_exist(): - import generate.lora as generate - - for lora_argument in ("r", "alpha", "dropout", "query", "key", "value", "projection", "mlp", "head"): - assert getattr(generate, f"lora_{lora_argument}", None) is not None - - -def test_lora_is_enabled(): - import generate.lora as generate - - lora_arguments = ("query", "key", "value", "projection", "mlp", "head") - assert any(getattr(generate, f"lora_{lora_argument}") for lora_argument in lora_arguments) + assert "'padded_vocab_size': 512, 'n_layer': 2, 'n_head': 4, 'head_size': 2, 'n_embd': 8" in err.getvalue() def test_cli(): diff --git a/tests/test_generate_sequentially.py b/tests/test_generate_sequentially.py new file mode 100644 index 0000000..984445e --- /dev/null +++ b/tests/test_generate_sequentially.py @@ -0,0 +1,310 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +import itertools +import json +import subprocess +import sys +from collections import defaultdict +from dataclasses import asdict +from pathlib import Path +from re import escape + +import pytest +import torch +from conftest import RunIf +from lightning import Fabric + + +@pytest.mark.parametrize( + ("n_layer", "devices", "expected"), + [ + (6, 2, {0: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1}), + (6, 3, {0: 0, 1: 0, 2: 1, 3: 1, 4: 2, 5: 2}), + (6, 1, {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0}), + ], +) +def test_layer_to_device(n_layer, devices, expected): + from generate.sequentially import layer_to_device + from lit_gpt.model import GPT, Block + + with torch.device("meta"): + model = GPT.from_name("pythia-14m", n_layer=n_layer) + + actual = layer_to_device(model, Block, chunk_size=n_layer // devices) + expected = {f"transformer.h.{i}": v for i, v in expected.items()} + assert actual == expected + + +def path_to_device(model): + return {k: str(v.device) for k, v in itertools.chain(model.named_parameters(), model.named_buffers())} + + +def test_replace_device(): + from generate.sequentially import replace_device + + class Submodule(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer("foo", torch.tensor(1, device="cpu")) + self.register_buffer("bar", torch.tensor(1, device="cpu")) + + class MyModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.modules = torch.nn.ModuleDict( + { + "module1": torch.nn.Linear(1, 1, bias=True, device="meta"), + "module2": torch.nn.Linear(1, 1, bias=False, device="cpu"), + } + ) + self.submodule = Submodule() + + model = MyModel() + assert path_to_device(model) == { + "modules.module1.bias": "meta", + "modules.module1.weight": "meta", + "modules.module2.weight": "cpu", + "submodule.bar": "cpu", + "submodule.foo": "cpu", + } + model = replace_device(model, torch.device("cpu"), torch.device("meta")) + assert path_to_device(model) == { + "modules.module1.bias": "meta", + "modules.module1.weight": "meta", + "modules.module2.weight": "meta", + "submodule.bar": "meta", + "submodule.foo": "meta", + } + + model = MyModel() + model.submodule.bar = model.submodule.bar.to("meta") + with pytest.raises( + ValueError, + match=escape("multiple devices: {'submodule.foo': device(type='cpu'), 'submodule.bar': device(type='meta')}"), + ): + 
replace_device(model, torch.device("cpu"), torch.device("meta")) + + +def _test_model_1device(accelerator): + from generate.sequentially import sequential + from lit_gpt import GPT + + fabric = Fabric(accelerator=accelerator, devices=1) + with torch.device("meta"): + model = GPT.from_name("pythia-14m", n_layer=2) + model = sequential(model, fabric.device, 15, 1) + + device_str = str(fabric.device) + assert path_to_device(model) == { + "cos": device_str, + "sin": device_str, + "lm_head.weight": device_str, + "transformer.h.0.attn.attn.bias": device_str, + "transformer.h.0.attn.attn.weight": device_str, + "transformer.h.0.attn.proj.bias": device_str, + "transformer.h.0.attn.proj.weight": device_str, + "transformer.h.0.mlp.fc.bias": device_str, + "transformer.h.0.mlp.fc.weight": device_str, + "transformer.h.0.mlp.proj.bias": device_str, + "transformer.h.0.mlp.proj.weight": device_str, + "transformer.h.0.norm_1.bias": device_str, + "transformer.h.0.norm_1.weight": device_str, + "transformer.h.0.norm_2.bias": device_str, + "transformer.h.0.norm_2.weight": device_str, + "transformer.h.0.attn.kv_cache.k": device_str, + "transformer.h.0.attn.kv_cache.v": device_str, + "transformer.h.1.attn.attn.bias": device_str, + "transformer.h.1.attn.attn.weight": device_str, + "transformer.h.1.attn.proj.bias": device_str, + "transformer.h.1.attn.proj.weight": device_str, + "transformer.h.1.mlp.fc.bias": device_str, + "transformer.h.1.mlp.fc.weight": device_str, + "transformer.h.1.mlp.proj.bias": device_str, + "transformer.h.1.mlp.proj.weight": device_str, + "transformer.h.1.norm_1.bias": device_str, + "transformer.h.1.norm_1.weight": device_str, + "transformer.h.1.norm_2.bias": device_str, + "transformer.h.1.norm_2.weight": device_str, + "transformer.h.1.attn.kv_cache.k": device_str, + "transformer.h.1.attn.kv_cache.v": device_str, + "transformer.ln_f.bias": device_str, + "transformer.ln_f.weight": device_str, + "transformer.wte.weight": device_str, + } + assert model.max_seq_length == 15 + + +@RunIf(min_cuda_gpus=1) +def test_model_1device_cuda(): + _test_model_1device("cuda") + + +def test_model_1device_cpu(): + _test_model_1device("cpu") + + +def find_forward_hooks(module): + mapping = defaultdict(list) + for name, submodule in module.named_modules(): + for hook in submodule._forward_pre_hooks.values(): + hook_data = ("forward_pre_hook", hook.func.__name__, hook.args, hook.keywords) + mapping[name].append(hook_data) + for hook in submodule._forward_hooks.values(): + hook_data = ("forward_hook", hook.func.__name__, hook.args, hook.keywords) + mapping[name].append(hook_data) + return dict(mapping) + + +@RunIf(min_cuda_gpus=2) +def test_model_forward_hooks(): + from generate.sequentially import sequential + from lit_gpt import GPT + + fabric = Fabric(accelerator="cuda", devices=1) + with torch.device("meta"): + model = GPT.from_name("pythia-14m") # 6 layers + model = sequential(model, fabric.device, max_seq_length=15, devices=2) + + hooks = find_forward_hooks(model) + actual = path_to_device(model) + assert actual == { + "lm_head.weight": "cuda:0", + "transformer.wte.weight": "cuda:0", + "transformer.h.0.norm_1.weight": "cuda:0", + "transformer.h.0.norm_1.bias": "cuda:0", + "transformer.h.0.attn.attn.weight": "cuda:0", + "transformer.h.0.attn.attn.bias": "cuda:0", + "transformer.h.0.attn.proj.weight": "cuda:0", + "transformer.h.0.attn.proj.bias": "cuda:0", + "transformer.h.0.norm_2.weight": "cuda:0", + "transformer.h.0.norm_2.bias": "cuda:0", + "transformer.h.0.mlp.fc.weight": "cuda:0", + 
"transformer.h.0.mlp.fc.bias": "cuda:0", + "transformer.h.0.mlp.proj.weight": "cuda:0", + "transformer.h.0.mlp.proj.bias": "cuda:0", + "transformer.h.1.norm_1.weight": "cuda:0", + "transformer.h.1.norm_1.bias": "cuda:0", + "transformer.h.1.attn.attn.weight": "cuda:0", + "transformer.h.1.attn.attn.bias": "cuda:0", + "transformer.h.1.attn.proj.weight": "cuda:0", + "transformer.h.1.attn.proj.bias": "cuda:0", + "transformer.h.1.norm_2.weight": "cuda:0", + "transformer.h.1.norm_2.bias": "cuda:0", + "transformer.h.1.mlp.fc.weight": "cuda:0", + "transformer.h.1.mlp.fc.bias": "cuda:0", + "transformer.h.1.mlp.proj.weight": "cuda:0", + "transformer.h.1.mlp.proj.bias": "cuda:0", + "transformer.h.2.norm_1.weight": "cuda:0", + "transformer.h.2.norm_1.bias": "cuda:0", + "transformer.h.2.attn.attn.weight": "cuda:0", + "transformer.h.2.attn.attn.bias": "cuda:0", + "transformer.h.2.attn.proj.weight": "cuda:0", + "transformer.h.2.attn.proj.bias": "cuda:0", + "transformer.h.2.norm_2.weight": "cuda:0", + "transformer.h.2.norm_2.bias": "cuda:0", + "transformer.h.2.mlp.fc.weight": "cuda:0", + "transformer.h.2.mlp.fc.bias": "cuda:0", + "transformer.h.2.mlp.proj.weight": "cuda:0", + "transformer.h.2.mlp.proj.bias": "cuda:0", + "transformer.h.3.norm_1.weight": "cuda:1", + "transformer.h.3.norm_1.bias": "cuda:1", + "transformer.h.3.attn.attn.weight": "cuda:1", + "transformer.h.3.attn.attn.bias": "cuda:1", + "transformer.h.3.attn.proj.weight": "cuda:1", + "transformer.h.3.attn.proj.bias": "cuda:1", + "transformer.h.3.norm_2.weight": "cuda:1", + "transformer.h.3.norm_2.bias": "cuda:1", + "transformer.h.3.mlp.fc.weight": "cuda:1", + "transformer.h.3.mlp.fc.bias": "cuda:1", + "transformer.h.3.mlp.proj.weight": "cuda:1", + "transformer.h.3.mlp.proj.bias": "cuda:1", + "transformer.h.4.norm_1.weight": "cuda:1", + "transformer.h.4.norm_1.bias": "cuda:1", + "transformer.h.4.attn.attn.weight": "cuda:1", + "transformer.h.4.attn.attn.bias": "cuda:1", + "transformer.h.4.attn.proj.weight": "cuda:1", + "transformer.h.4.attn.proj.bias": "cuda:1", + "transformer.h.4.norm_2.weight": "cuda:1", + "transformer.h.4.norm_2.bias": "cuda:1", + "transformer.h.4.mlp.fc.weight": "cuda:1", + "transformer.h.4.mlp.fc.bias": "cuda:1", + "transformer.h.4.mlp.proj.weight": "cuda:1", + "transformer.h.4.mlp.proj.bias": "cuda:1", + "transformer.h.5.norm_1.weight": "cuda:1", + "transformer.h.5.norm_1.bias": "cuda:1", + "transformer.h.5.attn.attn.weight": "cuda:1", + "transformer.h.5.attn.attn.bias": "cuda:1", + "transformer.h.5.attn.proj.weight": "cuda:1", + "transformer.h.5.attn.proj.bias": "cuda:1", + "transformer.h.5.norm_2.weight": "cuda:1", + "transformer.h.5.norm_2.bias": "cuda:1", + "transformer.h.5.mlp.fc.weight": "cuda:1", + "transformer.h.5.mlp.fc.bias": "cuda:1", + "transformer.h.5.mlp.proj.weight": "cuda:1", + "transformer.h.5.mlp.proj.bias": "cuda:1", + "transformer.ln_f.weight": "cuda:0", + "transformer.ln_f.bias": "cuda:0", + "cos": "cuda:0", + "sin": "cuda:0", + "transformer.h.0.attn.kv_cache.k": "cuda:0", + "transformer.h.0.attn.kv_cache.v": "cuda:0", + "transformer.h.1.attn.kv_cache.k": "cuda:0", + "transformer.h.1.attn.kv_cache.v": "cuda:0", + "transformer.h.2.attn.kv_cache.k": "cuda:0", + "transformer.h.2.attn.kv_cache.v": "cuda:0", + "transformer.h.3.attn.kv_cache.k": "cuda:1", + "transformer.h.3.attn.kv_cache.v": "cuda:1", + "transformer.h.4.attn.kv_cache.k": "cuda:1", + "transformer.h.4.attn.kv_cache.v": "cuda:1", + "transformer.h.5.attn.kv_cache.k": "cuda:1", + "transformer.h.5.attn.kv_cache.v": "cuda:1", + } + assert hooks == { 
+ "transformer.h.3": [("forward_pre_hook", "move_block_input", (torch.device(type="cuda", index=1),), {})], + "transformer.h.4": [("forward_pre_hook", "move_block_input", (torch.device(type="cuda", index=1),), {})], + "transformer.h.5": [ + ("forward_pre_hook", "move_block_input", (torch.device(type="cuda", index=1),), {}), + ("forward_hook", "move_block_output", (torch.device(type="cuda", index=0),), {}), + ], + } + + +root = Path(__file__).parent.parent.resolve() + + +@RunIf(min_cuda_gpus=2) +def test_base_with_sequentially(tmp_path): + from lit_gpt import GPT, Config + from scripts.download import download_from_hub + + # download the tokenizer + download_from_hub(repo_id="EleutherAI/pythia-14m", tokenizer_only=True, checkpoint_dir=tmp_path) + checkpoint_dir = tmp_path / "EleutherAI/pythia-14m" + # save the config + config = Config.from_name("pythia-14m") + (checkpoint_dir / "lit_config.json").write_text(json.dumps(asdict(config))) + # create a state dict to load from + torch.save(GPT(config).state_dict(), checkpoint_dir / "lit_model.pth") + + args = [ + "--num_samples=1", + "--max_new_tokens=10", + "--precision=16-true", + "--temperature=0.0", + f"--checkpoint_dir={str(checkpoint_dir)}", + ] + env = {"CUDA_VISIBLE_DEVICES": "0,1"} + base_stdout = subprocess.check_output([sys.executable, root / "generate/base.py", *args], env=env).decode() + sequential_stdout = subprocess.check_output( + [sys.executable, root / "generate/sequentially.py", *args], env=env + ).decode() + + assert base_stdout.startswith("What food do llamas eat?") + assert base_stdout == sequential_stdout + + +def test_cli(): + cli_path = root / "generate" / "sequentially.py" + output = subprocess.check_output([sys.executable, cli_path, "-h"]) + output = str(output.decode()) + assert "Generates text samples" in output diff --git a/tests/test_generate_tp.py b/tests/test_generate_tp.py new file mode 100644 index 0000000..0125b51 --- /dev/null +++ b/tests/test_generate_tp.py @@ -0,0 +1,141 @@ +import json +import subprocess +import sys +from dataclasses import asdict, replace +from pathlib import Path +from unittest.mock import Mock + +import pytest +import torch +from conftest import RunIf +from test_generate_sequentially import find_forward_hooks + + +def test_tensor_parallel_linear(): + from generate.tp import tensor_parallel_linear + + fabric = Mock() + fabric.world_size = 4 + fabric.global_rank = 2 + + def get_linear(bias=True): + linear = torch.nn.Linear(8, 8, bias=bias) + linear.weight.data = torch.arange(64, dtype=torch.float32).reshape(8, 8) + if bias: + linear.bias.data = torch.arange(8, dtype=torch.float32) + return linear + + linear = get_linear() + tensor_parallel_linear(fabric, linear, "colwise") + expected = torch.arange(32, 48, dtype=torch.float32).reshape(2, 8) + torch.testing.assert_close(linear.weight, expected) + expected = torch.arange(4, 6, dtype=torch.float32) + torch.testing.assert_close(linear.bias, expected) + + linear = get_linear(bias=False) + tensor_parallel_linear(fabric, linear, "rowwise") + expected = torch.arange(4, 62, 8, dtype=torch.float32).reshape(8, 1) + expected = torch.cat([expected, expected + 1], dim=1) + torch.testing.assert_close(linear.weight, expected) + assert linear.bias is None + + +@pytest.mark.parametrize( + ("name", "expected"), + [ + ( + "Llama-2-70b-hf", + { + "transformer.h.0.attn": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.0.mlp": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.1.attn": [("forward_hook", "all_reduce_output", 
(8,), {})], + "transformer.h.1.mlp": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.2.attn": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.2.mlp": [("forward_hook", "all_reduce_output", (8,), {})], + }, + ), + ( + "falcon-180B", + { + "transformer.h.0.attn": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.0.mlp": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.1.attn": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.1.mlp": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.2.attn": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.2.mlp": [("forward_hook", "all_reduce_output", (8,), {})], + }, + ), + ( + "Mixtral-8x7B-v0.1", + { + "transformer.h.0.attn": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.0.mlp.experts.0": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.0.mlp.experts.1": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.1.attn": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.1.mlp.experts.0": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.1.mlp.experts.1": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.2.attn": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.2.mlp.experts.0": [("forward_hook", "all_reduce_output", (8,), {})], + "transformer.h.2.mlp.experts.1": [("forward_hook", "all_reduce_output", (8,), {})], + }, + ), + ], +) +def test_tensor_parallel_llama(name, expected): + from generate.tp import tensor_parallel + from lit_gpt import GPT + + fabric = Mock() + fabric.world_size = 8 + fabric.global_rank = 1 + + with torch.device("meta"): + model = GPT.from_name(name, n_layer=3, n_expert=2) + config = replace(model.config) # make a copy + + model = tensor_parallel(fabric, model) + + hooks = find_forward_hooks(model) + assert hooks == expected + + assert model.config.n_embd * 8 == config.n_embd + assert model.config.n_head * 8 == config.n_head + assert model.config.n_query_groups * 8 == config.n_query_groups + + +root = Path(__file__).parent.parent.resolve() + + +@RunIf(min_cuda_gpus=2) +def test_tp(tmp_path): + from lit_gpt import GPT, Config + from scripts.download import download_from_hub + + # download the tokenizer + download_from_hub(repo_id="EleutherAI/pythia-14m", tokenizer_only=True, checkpoint_dir=tmp_path) + checkpoint_dir = tmp_path / "EleutherAI/pythia-14m" + # save the config + config = Config.from_name("pythia-14m") + (checkpoint_dir / "lit_config.json").write_text(json.dumps(asdict(config))) + # create a state dict to load from + torch.save(GPT(config).state_dict(), checkpoint_dir / "lit_model.pth") + + args = [ + "--num_samples=1", + "--max_new_tokens=10", + "--precision=16-true", + "--temperature=0.0", + f"--checkpoint_dir={str(checkpoint_dir)}", + ] + env = {"CUDA_VISIBLE_DEVICES": "0,1"} + tp_stdout = subprocess.check_output([sys.executable, root / "generate/tp.py", *args], env=env).decode() + + # there is some unaccounted randomness so cannot compare the output with that of `generate/base.py` + assert tp_stdout.startswith("What food do llamas eat?") + + +def test_cli(): + cli_path = root / "generate" / "tp.py" + output = subprocess.check_output([sys.executable, cli_path, "-h"]) + output = str(output.decode()) + assert "Generates text samples" in output diff --git a/tests/test_gptq.py b/tests/test_gptq.py deleted file mode 100644 index 5ab0f9f..0000000 --- a/tests/test_gptq.py +++ 
/dev/null @@ -1,21 +0,0 @@ -import lightning as L -import pytest -import torch - - -def test_gptq_blockwise_quantization(): - from quantize.gptq import _TRITON_AVAILABLE - - if not _TRITON_AVAILABLE: - pytest.skip(str(_TRITON_AVAILABLE)) - - from lit_gpt import GPT - - fabric = L.Fabric(devices=1) - with fabric.init_module(empty_init=False): - model = GPT.from_name("pythia-70m", n_layer=2) - x = torch.randint(0, 10, (2, model.config.block_size)) - - from quantize.gptq import blockwise_quantization - - blockwise_quantization(model, x, fabric.device) diff --git a/tests/test_lm_eval_harness.py b/tests/test_lm_eval_harness.py new file mode 100644 index 0000000..7f4ca74 --- /dev/null +++ b/tests/test_lm_eval_harness.py @@ -0,0 +1,91 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + +import json +import subprocess +import sys +from pathlib import Path +from unittest.mock import ANY, Mock + +import datasets +import pytest +from lightning import Fabric + + +@pytest.mark.xfail( + raises=(datasets.builder.DatasetGenerationError, NotImplementedError), + strict=False, + match="Loading a dataset cached in a LocalFileSystem is not supported", +) +def test_run_eval(tmp_path, float_like): + from eval.lm_eval_harness import EvalHarnessBase + from lit_gpt.model import GPT + from lit_gpt.tokenizer import Tokenizer + from scripts.download import download_from_hub + + fabric = Fabric(devices=1) + with fabric.init_module(): + model = GPT.from_name("pythia-14m") + download_from_hub(repo_id="EleutherAI/pythia-14m", tokenizer_only=True, checkpoint_dir=tmp_path) + tokenizer = Tokenizer(tmp_path / "EleutherAI/pythia-14m") + + eval_harness = EvalHarnessBase(fabric, model, tokenizer, 1) + results = eval_harness.run_eval( + eval_tasks=["truthfulqa_mc", "hellaswag", "coqa"], limit=2, bootstrap_iters=2, num_fewshot=0, no_cache=True + ) + assert results == { + "config": { + "batch_size": 1, + "bootstrap_iters": 2, + "device": ANY, + "limit": 2, + "model": "pythia-14m", + "no_cache": True, + "num_fewshot": 0, + }, + "results": { + "hellaswag": { + "acc": float_like, + "acc_norm": float_like, + "acc_norm_stderr": float_like, + "acc_stderr": float_like, + }, + "coqa": {"f1": float_like, "f1_stderr": float_like, "em": float_like, "em_stderr": float_like}, + "truthfulqa_mc": {"mc1": float_like, "mc1_stderr": float_like, "mc2": float_like, "mc2_stderr": float_like}, + }, + "versions": {"hellaswag": 0, "coqa": 1, "truthfulqa_mc": 1}, + } + + +def test_eval_script(tmp_path, fake_checkpoint_dir, monkeypatch): + import eval.lm_eval_harness as module + + model_config = dict(block_size=128, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8) + with open(fake_checkpoint_dir / "lit_config.json", "w") as fp: + json.dump(model_config, fp) + monkeypatch.setattr(module, "load_checkpoint", Mock()) + + tokenizer_mock = Mock() + monkeypatch.setattr(module, "Tokenizer", tokenizer_mock) + + run_eval_mock = Mock() + run_eval_mock.return_value = {"foo": "test"} + monkeypatch.setattr(module.EvalHarnessBase, "run_eval", run_eval_mock) + + output_folder = tmp_path / "output" + assert not output_folder.exists() + + module.run_eval_harness( + checkpoint_dir=fake_checkpoint_dir, precision="32-true", save_filepath=(output_folder / "results.json") + ) + + run_eval_mock.assert_called_once_with( + ["arc_challenge", "piqa", "hellaswag", "hendrycksTest-*"], 0, None, 100000, True + ) + assert (output_folder / "results.json").read_text() == '{"foo": "test"}' + + +def test_cli(): + cli_path = Path(__file__).parent.parent / 
"eval" / "lm_eval_harness.py" + output = subprocess.check_output([sys.executable, cli_path, "-h"]) + output = str(output.decode()) + assert "run_eval_harness" in output diff --git a/tests/test_lora.py b/tests/test_lora.py index 16365fd..88fe72c 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -1,12 +1,24 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import sys from contextlib import redirect_stdout from io import StringIO from itertools import product +from pathlib import Path from unittest.mock import Mock import pytest import torch +from conftest import RunIf from lightning import Fabric +from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE, BitsandbytesPrecision +from lightning.fabric.wrappers import _FabricOptimizer + +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) + +import lit_gpt.config as config_module def test_lora_layer_replacement(): @@ -40,9 +52,10 @@ def test_lora_merge(): ) model = GPT(config) model.train() + attn_proj = model.transformer.h[0].attn.proj - initial_weight = model.transformer.h[0].attn.proj.linear.weight.clone() - assert torch.equal(model.transformer.h[0].attn.proj.linear.weight, initial_weight) + initial_weight = attn_proj.linear.weight.clone() + assert torch.equal(attn_proj.linear.weight, initial_weight) # perform an update to the LoRA weights mark_only_lora_as_trainable(model) @@ -52,21 +65,18 @@ def test_lora_merge(): optimizer.step() optimizer.zero_grad() # the weight remains unchanged (only lora A and B change) - assert torch.equal(model.transformer.h[0].attn.proj.linear.weight, initial_weight) + assert torch.equal(attn_proj.linear.weight, initial_weight) # calling merge() multiple times in a row should not merge multiple times merge_lora_weights(model) - assert model.transformer.h[0].attn.attn.merged - weight_after = model.transformer.h[0].attn.proj.linear.weight.clone() + assert attn_proj.merged + weight_after = attn_proj.linear.weight.clone() merge_lora_weights(model) merge_lora_weights(model) - assert torch.equal(model.transformer.h[0].attn.proj.linear.weight, weight_after) + assert torch.equal(attn_proj.linear.weight, weight_after) # check that `W_after = W_initial + (A x B)` - a = model.transformer.h[0].attn.proj.lora_A - b = model.transformer.h[0].attn.proj.lora_B - scaling = model.transformer.h[0].attn.proj.scaling - delta_w = (b @ a) * scaling + delta_w = attn_proj.get_lora_AB() torch.testing.assert_close(weight_after, initial_weight + delta_w) @@ -89,41 +99,71 @@ def test_lora_mqa_gqa(): assert config.n_query_groups == config.n_head model = GPT(config) attn = model.transformer.h[0].attn.attn + for p in attn.linear.parameters(): + torch.nn.init.zeros_(p) + torch.nn.init.ones_(attn.lora_B) + lora_ind = [0, 1, 6, 7, 12, 13, 18, 19, 4, 5, 10, 11, 16, 17, 22, 23] assert attn.linear.weight.shape == (24, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (16, 2) - assert attn.lora_ind == [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] + assert attn.lora_ind == lora_ind x = torch.randint(0, 8, size=(3, 5, 16), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 24) + bsz, ctx_len, in_dim = 2, 30, 8 + x_in = torch.randn(bsz, ctx_len, in_dim) + out = attn(x_in) + non_lora_ind = list(set(range(24)).difference(lora_ind)) + assert torch.count_nonzero(out[:, :, lora_ind]) == bsz * ctx_len * len(lora_ind) + assert torch.count_nonzero(out[:, :, non_lora_ind]) == 0 # MQA 
config.n_query_groups = 1 model = GPT(config) attn = model.transformer.h[0].attn.attn + for p in attn.linear.parameters(): + torch.nn.init.zeros_(p) + torch.nn.init.ones_(attn.lora_B) + lora_ind = [0, 1, 2, 3, 4, 5, 6, 7, 10, 11] assert attn.linear.weight.shape == (12, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (10, 2) - assert attn.lora_ind == [0, 1, 2, 3, 4, 5, 6, 7, 10, 11] + assert attn.lora_ind == lora_ind x = torch.randint(0, 8, size=(3, 5, 10), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 12) + bsz, ctx_len, in_dim = 2, 30, 8 + x_in = torch.randn(bsz, ctx_len, in_dim) + out = attn(x_in) + non_lora_ind = list(set(range(12)).difference(lora_ind)) + assert torch.count_nonzero(out[:, :, lora_ind]) == bsz * ctx_len * len(lora_ind) + assert torch.count_nonzero(out[:, :, non_lora_ind]) == 0 # GQA config.n_query_groups = 2 model = GPT(config) attn = model.transformer.h[0].attn.attn + for p in attn.linear.parameters(): + torch.nn.init.zeros_(p) + torch.nn.init.ones_(attn.lora_B) + lora_ind = [0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 14, 15] assert attn.linear.weight.shape == (16, 8) assert attn.lora_A.shape == (4, 8) assert attn.lora_B.shape == (12, 2) - assert attn.lora_ind == [0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15] + assert attn.lora_ind == lora_ind x = torch.randint(0, 8, size=(3, 5, 12), dtype=torch.int64) assert attn.zero_pad(x).shape == (3, 5, 16) + bsz, ctx_len, in_dim = 2, 30, 8 + x_in = torch.randn(bsz, ctx_len, in_dim) + out = attn(x_in) + non_lora_ind = list(set(range(16)).difference(lora_ind)) + assert torch.count_nonzero(out[:, :, lora_ind]) == bsz * ctx_len * len(lora_ind) + assert torch.count_nonzero(out[:, :, non_lora_ind]) == 0 def test_lora_filter(tmp_path): from lit_gpt.lora import GPT, lora_filter fabric = Fabric(devices=1) - model = GPT.from_name("pythia-70m", n_layer=3, r=1, to_query=True, to_value=True) + model = GPT.from_name("pythia-14m", n_layer=3, r=1, to_query=True, to_value=True) save_path = tmp_path / "model.pth" fabric.save(save_path, {"model": model}, filter={"model": lora_filter}) saved = torch.load(save_path)["model"] @@ -141,13 +181,7 @@ def test_lora_filter(tmp_path): def test_lora_script(tmp_path, fake_checkpoint_dir, monkeypatch): import finetune.lora as module - - module.gradient_accumulation_iters = 1 - module.save_interval = 2 - module.eval_interval = 2 - module.eval_iters = 2 - module.eval_max_new_tokens = 1 - module.max_iters = 6 + from lit_gpt.args import EvalArgs, IOArgs, TrainArgs data = [ {"input_ids": torch.tensor([0, 1, 2]), "labels": torch.tensor([1, 2, 3])}, @@ -169,19 +203,26 @@ def test_lora_script(tmp_path, fake_checkpoint_dir, monkeypatch): stdout = StringIO() with redirect_stdout(stdout): - module.setup(data_dir=tmp_path, checkpoint_dir=fake_checkpoint_dir, out_dir=tmp_path, precision="32-true") + module.setup( + io=IOArgs( + train_data_dir=tmp_path, val_data_dir=tmp_path, checkpoint_dir=fake_checkpoint_dir, out_dir=tmp_path + ), + precision="32-true", + train=TrainArgs(global_batch_size=1, save_interval=2, epochs=1, epoch_size=6, micro_batch_size=1), + eval=EvalArgs(interval=2, max_iters=2, max_new_tokens=1), + ) assert {p.name for p in tmp_path.glob("*.pth")} == { - "iter-000001-ckpt.pth", - "iter-000003-ckpt.pth", - "iter-000005-ckpt.pth", + "iter-000002-ckpt.pth", + "iter-000004-ckpt.pth", + "iter-000006-ckpt.pth", "lit_model_lora_finetuned.pth", } assert (tmp_path / "version_0" / "metrics.csv").is_file() logs = stdout.getvalue() - assert logs.count("optimizer.step") == module.max_iters - assert 
logs.count("val loss") == module.max_iters // module.eval_interval + assert logs.count("optimizer.step") == 6 + assert logs.count("val loss") == 3 assert "of trainable parameters: 512" in logs @@ -351,14 +392,13 @@ def test_lora_qkv_linear_weights_merged_status(rank, enable_lora, expected_merge assert layer.merged == expected_merged -@pytest.mark.skipif(not torch.cuda.is_available(), reason="8bit requires CUDA") -# platform dependent cuda issue: libbitsandbytes_cpu.so: undefined symbol: cquantize_blockwise_fp16_nf4 -@pytest.mark.xfail(raises=AttributeError, strict=False) -def test_lora_merge_with_quantize(): +@RunIf(min_cuda_gpus=1) +def test_lora_merge_with_bitsandbytes(): from lightning.fabric.plugins.precision.bitsandbytes import _BITSANDBYTES_AVAILABLE, BitsandbytesPrecision if not _BITSANDBYTES_AVAILABLE: pytest.skip("BNB not available") + import bitsandbytes as bnb from lit_gpt.lora import GPT, Config, mark_only_lora_as_trainable, merge_lora_weights @@ -376,14 +416,7 @@ def test_lora_merge_with_quantize(): to_projection=True, ) fabric = Fabric(devices=1, plugins=BitsandbytesPrecision("nf4", dtype=torch.bfloat16, ignore_modules={"lm_head"})) - with fabric.init_module(empty_init=False): - model = GPT(config) - model.apply(model._init_weights) - - attn_proj = model.transformer.h[0].attn.proj - assert model.lm_head.linear.weight.dtype is torch.bfloat16 - assert attn_proj.linear.weight.dtype is torch.bfloat16 - + model = GPT(config) mark_only_lora_as_trainable(model) from bitsandbytes.optim import PagedAdamW @@ -393,10 +426,12 @@ def test_lora_merge_with_quantize(): model.train() + attn_proj = model.transformer.h[0].attn.proj initial_weight = attn_proj.linear.weight.clone() + initial_weight_kwargs = attn_proj.linear.weight.__dict__ # this was skipped - assert model.lm_head.linear.weight.dtype is torch.bfloat16 + assert model.lm_head.linear.weight.dtype is torch.float32 assert attn_proj.linear.weight.dtype is torch.uint8 # perform an update to the LoRA weights @@ -417,8 +452,16 @@ def test_lora_merge_with_quantize(): assert torch.equal(attn_proj.linear.weight, weight_after) # check that `W_after = W_initial + (A x B)` - delta_w = (attn_proj.lora_B @ attn_proj.lora_A) * attn_proj.scaling - torch.testing.assert_close(weight_after, initial_weight + delta_w) + delta_w = attn_proj.get_lora_AB() + # dequantize initial weight and sum with delta_w + initial_weight_data = ( + bnb.functional.dequantize_4bit(initial_weight.data, initial_weight_kwargs["quant_state"]) + delta_w + ) + # quantize again + initial_weight_data = bnb.nn.Params4bit( + initial_weight_data.to("cpu"), requires_grad=False, **initial_weight_kwargs + ).to(initial_weight.device) + torch.testing.assert_close(weight_after, initial_weight_data) def test_lora_gpt_init_weights(): @@ -435,24 +478,17 @@ def test_lora_gpt_init_weights(): assert (param == 0).all() -def test_base_model_can_be_lora_loaded(): +@pytest.mark.parametrize("name", [c["name"] for c in config_module.configs]) +def test_base_model_can_be_lora_loaded(name): from lit_gpt.lora import GPT as LoRAGPT from lit_gpt.lora import lora_filter from lit_gpt.model import GPT as BaseGPT - base_model = BaseGPT.from_name("pythia-70m", bias=True, n_layer=2) + kwargs = {"n_layer": 2, "n_head": 8, "n_embd": 16, "padded_vocab_size": 32} + base_model = BaseGPT.from_name(name, **kwargs) base_model_state_dict = base_model.state_dict() lora_model = LoRAGPT.from_name( - "pythia-70m", - bias=True, - n_layer=2, - r=1, - to_query=True, - to_key=True, - to_value=True, - to_projection=True, - 
to_mlp=True, - to_head=True, + name, **kwargs, r=1, to_query=True, to_key=True, to_value=True, to_projection=True, to_mlp=True, to_head=True ) keys = lora_model.load_state_dict(base_model_state_dict, strict=False) assert not keys.unexpected_keys @@ -460,13 +496,13 @@ def test_base_model_can_be_lora_loaded(): assert lora_filter(k, None) -@pytest.mark.skipif(sys.platform in ("win32", "darwin"), reason="torch.compile not supported on this platform") +@RunIf(dynamo=True) @torch.inference_mode() def test_lora_compile(): from lit_gpt.lora import GPT model = GPT.from_name( - "pythia-70m", + "pythia-14m", n_layer=3, r=8, alpha=8, @@ -482,7 +518,7 @@ def test_lora_compile(): from torch._dynamo.backends import debugging - explanation = torch._dynamo.explain(model, x) + explanation = torch._dynamo.explain(model)(x) assert isinstance(explanation, debugging.ExplainOutput) assert explanation.graph_count == 1 assert explanation.graph_break_count == 0 @@ -490,7 +526,172 @@ def test_lora_compile(): model = GPT(model.config) model.set_kv_cache(2) input_pos = torch.arange(model.config.block_size) - explanation = torch._dynamo.explain(model, x, input_pos) + explanation = torch._dynamo.explain(model)(x, input_pos) assert isinstance(explanation, debugging.ExplainOutput) assert explanation.graph_count == 1 assert explanation.graph_break_count == 0 + + +@torch.inference_mode() +def test_against_hf_mixtral(): + from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM + + from lit_gpt.lora import GPT, Config + from scripts.convert_hf_checkpoint import copy_weights_hf_llama + + device = torch.device("cpu") + dtype = torch.float32 + ours_config = Config.from_name( + "Mixtral-8x7B-Instruct-v0.1", + padded_vocab_size=10000, + n_layer=2, + n_embd=32, + n_head=8, + n_query_groups=2, + intermediate_size=86, + n_expert=4, + ) + T = 5 + theirs_config = MixtralConfig( + vocab_size=ours_config.padded_vocab_size, + hidden_size=ours_config.n_embd, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + intermediate_size=ours_config.intermediate_size, + max_position_embeddings=T, + rms_norm_eps=ours_config.norm_eps, + num_key_value_heads=ours_config.n_query_groups, + rope_theta=ours_config.rope_base, + num_local_experts=ours_config.n_expert, + ) + assert ours_config.intermediate_size == theirs_config.intermediate_size + + theirs_model = MixtralForCausalLM(theirs_config).to(device) + theirs_state_dict = theirs_model.state_dict() + state_dict = {} + copy_weights_hf_llama(ours_config, {}, state_dict, theirs_state_dict) + ours_model = GPT(ours_config).to(device) + ours_model.load_state_dict(state_dict) + + # test end to end + x = torch.tensor([[9856, 23, 491, 1536, 304], [23, 345, 65, 123, 321]], dtype=torch.int32, device=device) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float + torch.testing.assert_close(ours_y, theirs_y) + + +@RunIf(min_cuda_gpus=1) +def test_lora_bitsandbytes(monkeypatch, tmp_path, fake_checkpoint_dir): + from lit_gpt.args import IOArgs + + if not _BITSANDBYTES_AVAILABLE: + pytest.skip("BNB not available") + + from bitsandbytes.optim import PagedAdamW + + import finetune.lora as module + + data = [] + torch.save(data, tmp_path / "train.pt") + torch.save(data, tmp_path / "test.pt") + + from lit_gpt.config import name_to_config + + model_config = dict( + block_size=128, + n_layer=2, + n_embd=8, + n_head=4, + padded_vocab_size=8, + bias=True, + r=8, + alpha=8, + dropout=0.1, + to_query=True, 
+ to_value=True, + to_projection=True, + ) + monkeypatch.setitem(name_to_config, "tmp", model_config) + + monkeypatch.setattr(module, "load_checkpoint", Mock()) + train_mock = Mock() + monkeypatch.setattr(module, "fit", train_mock) + + stdout = StringIO() + with redirect_stdout(stdout): + module.setup( + io=IOArgs( + train_data_dir=tmp_path, val_data_dir=tmp_path, checkpoint_dir=fake_checkpoint_dir, out_dir=tmp_path + ), + precision="16-true", + quantize="bnb.nf4-dq", + ) + + args, kwargs = train_mock.call_args + fabric, model, optimizer, *_ = args + assert isinstance(fabric.strategy.precision, BitsandbytesPrecision) + assert isinstance(optimizer, _FabricOptimizer) + assert isinstance(optimizer._optimizer, PagedAdamW) + + dtype_to_name = {"torch.uint8": set(), "torch.float16": set()} + for name, layer in model.named_parameters(): + name = name[len("_forward_module.") :] + dtype_to_name[str(layer.dtype)].add(name) + assert dtype_to_name == { + "torch.uint8": { + "transformer.h.0.attn.attn.linear.weight", + "transformer.h.0.attn.proj.linear.weight", + "transformer.h.0.mlp.fc.linear.weight", + "transformer.h.1.mlp.proj.linear.weight", + "transformer.h.0.mlp.proj.linear.weight", + "transformer.h.1.attn.attn.linear.weight", + "lm_head.linear.weight", + "transformer.h.1.attn.proj.linear.weight", + "transformer.h.1.mlp.fc.linear.weight", + }, + "torch.float16": { + "transformer.h.0.attn.attn.lora_B", + "transformer.h.0.norm_2.weight", + "transformer.wte.weight", + "transformer.h.1.mlp.fc.linear.bias", + "transformer.ln_f.bias", + "transformer.h.1.attn.attn.lora_B", + "transformer.h.1.attn.proj.linear.bias", + "transformer.h.1.norm_1.weight", + "transformer.h.1.attn.attn.linear.bias", + "transformer.h.1.attn.attn.lora_A", + "transformer.h.1.norm_1.bias", + "transformer.h.1.norm_2.bias", + "transformer.h.0.attn.proj.linear.bias", + "transformer.h.0.norm_1.bias", + "transformer.h.0.mlp.proj.linear.bias", + "transformer.h.0.mlp.fc.linear.bias", + "transformer.h.0.norm_2.bias", + "transformer.ln_f.weight", + "transformer.h.0.attn.attn.lora_A", + "transformer.h.1.norm_2.weight", + "transformer.h.1.mlp.proj.linear.bias", + "transformer.h.0.norm_1.weight", + "transformer.h.0.attn.attn.linear.bias", + }, + } + + assert {p.name for p in tmp_path.glob("*.pth")} == {"lit_model_lora_finetuned.pth"} + state_dict = torch.load(tmp_path / "lit_model_lora_finetuned.pth") + assert len(state_dict) == 1 + dtype_to_name = {"torch.float16": set()} + for name, layer in state_dict["model"].items(): + dtype_to_name[str(layer.dtype)].add(name) + assert dtype_to_name == { + "torch.float16": { + "transformer.h.1.attn.attn.lora_A", + "transformer.h.0.attn.attn.lora_A", + "transformer.h.0.attn.attn.lora_B", + "transformer.h.1.attn.attn.lora_B", + } + } + + logs = stdout.getvalue() + assert "of trainable parameters: 512" in logs + assert "of non trainable parameters: 1,888" in logs diff --git a/tests/test_merge_lora.py b/tests/test_merge_lora.py index 574a585..649ab1f 100644 --- a/tests/test_merge_lora.py +++ b/tests/test_merge_lora.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ import json import os @@ -14,11 +16,11 @@ def test_merge_lora(tmp_path, fake_checkpoint_dir): config = dict(block_size=128, padded_vocab_size=256, n_layer=3, n_head=8, n_embd=16) with open(fake_checkpoint_dir / "lit_config.json", "w") as fp: json.dump(config, fp) - base_model = GPT.from_name("pythia-70m", **config) + base_model = GPT.from_name("pythia-14m", **config) state_dict = base_model.state_dict() assert len(state_dict) == 40 torch.save(state_dict, fake_checkpoint_dir / "lit_model.pth") - lora_model = LoRAGPT.from_name("pythia-70m", **config, r=8, alpha=16, dropout=0.05, to_query=True, to_value=True) + lora_model = LoRAGPT.from_name("pythia-14m", **config, r=8, alpha=16, dropout=0.05, to_query=True, to_value=True) state_dict = {k: v for k, v in lora_model.state_dict().items() if lora_filter(k, v)} assert len(state_dict) == 6 lora_path = tmp_path / "lora" diff --git a/tests/test_model.py b/tests/test_model.py index 0f02ddd..3a7d5bc 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1,19 +1,22 @@ -import operator +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import sys +from functools import partial from pathlib import Path from urllib.request import urlretrieve import pytest import torch -from lightning_utilities.core.imports import compare_version - -wd = Path(__file__).parent.parent.absolute() +from conftest import RunIf +from lightning import Fabric +from lightning.fabric.utilities.imports import _IS_WINDOWS +from lightning.fabric.utilities.init import _materialize_meta_tensors +# support running without installing as a package +wd = Path(__file__).parent.parent.resolve() +sys.path.append(str(wd)) -@pytest.fixture(autouse=True) -def restore_default_dtype(): - # just in case - torch.set_default_dtype(torch.float32) +import lit_gpt.config as config_module @torch.inference_mode() @@ -21,7 +24,6 @@ def restore_default_dtype(): @pytest.mark.parametrize("batch_size", (1, 3)) @pytest.mark.parametrize("n_embd", (16, 32)) @pytest.mark.parametrize("parallel_residual", (False, True)) -@pytest.mark.parametrize("kv_cache", (False, True)) @pytest.mark.parametrize( ("device", "dtype"), [ @@ -30,20 +32,31 @@ def restore_default_dtype(): torch.device("cuda"), torch.float16, marks=[ - pytest.mark.xfail(raises=AssertionError, strict=True), - pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA"), + # the reference does softmax upscaled to fp32 during attention. 
additionally, the final layernorm input + # is slightly different + pytest.mark.xfail(raises=AssertionError, strict=False), + RunIf(min_cuda_gpus=1), ], ), ], ) -def test_against_gpt_neox_model(rotary_pct, batch_size, n_embd, parallel_residual, kv_cache, device, dtype) -> None: +def test_against_gpt_neox_model(rotary_pct, batch_size, n_embd, parallel_residual, device, dtype) -> None: from transformers import GPTNeoXConfig, GPTNeoXForCausalLM from lit_gpt import GPT, Config from scripts.convert_hf_checkpoint import copy_weights_gpt_neox - batch_size = 3 - ours_config = Config(block_size=64, vocab_size=100, n_layer=4, n_head=8, n_embd=n_embd) + torch.set_default_dtype(dtype) + + ours_config = Config( + block_size=64, + vocab_size=100, + n_layer=4, + n_head=8, + n_embd=n_embd, + rotary_percentage=rotary_pct, + parallel_residual=parallel_residual, + ) assert ours_config.padded_vocab_size == 512 theirs_config = GPTNeoXConfig( hidden_act="gelu", @@ -52,21 +65,19 @@ def test_against_gpt_neox_model(rotary_pct, batch_size, n_embd, parallel_residua num_hidden_layers=ours_config.n_layer, initializer_range=0.02, intermediate_size=ours_config.intermediate_size, - layer_norm_eps=1e-05, + layer_norm_eps=ours_config.norm_eps, max_position_embeddings=ours_config.block_size, rotary_emb_base=10000, rotary_pct=ours_config.rotary_percentage, vocab_size=ours_config.padded_vocab_size, use_parallel_residual=ours_config.parallel_residual, - use_cache=kv_cache, - torch_dtype=dtype, ) state_dict = {} theirs_model = GPTNeoXForCausalLM(theirs_config).to(device) # load the hf initialization into our model copy_weights_gpt_neox(state_dict, theirs_model.state_dict()) - ours_model = GPT(ours_config).to(device, dtype) + ours_model = GPT(ours_config).to(device) ours_model.load_state_dict(state_dict) token_sample = torch.randint( @@ -74,7 +85,7 @@ def test_against_gpt_neox_model(rotary_pct, batch_size, n_embd, parallel_residua ) theirs = theirs_model(token_sample)["logits"] - ours = ours_model(token_sample).float() # HF converts logits to float + ours = ours_model(token_sample) torch.testing.assert_close(ours, theirs) @@ -94,18 +105,22 @@ def test_against_gpt_neox_model(rotary_pct, batch_size, n_embd, parallel_residua torch.device("cuda"), torch.float16, marks=[ - pytest.mark.xfail(raises=AssertionError, strict=True), - pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA"), + # the reference does softmax upscaled to fp32 during attention. 
additionally, the final layernorm input + # is slightly different + pytest.mark.xfail(raises=AssertionError, strict=False), + RunIf(min_cuda_gpus=1), ], ), ], ) -def test_against_original_falcon_180b(kwargs, device, dtype): +def test_against_hf_falcon(kwargs, device, dtype): from transformers.models.falcon import FalconConfig, FalconForCausalLM from lit_gpt import GPT, Config from scripts.convert_hf_checkpoint import copy_weights_falcon + torch.set_default_dtype(dtype) + ours_config = Config.from_name(**kwargs) theirs_config = FalconConfig( hidden_size=ours_config.n_embd, @@ -116,19 +131,18 @@ def test_against_original_falcon_180b(kwargs, device, dtype): vocab_size=ours_config.padded_vocab_size, bias=ours_config.bias, new_decoder_architecture=True, - torch_dtype=dtype, ) theirs_model = FalconForCausalLM(theirs_config).to(device) theirs_state_dict = theirs_model.state_dict() state_dict = {} copy_weights_falcon(kwargs["name"], state_dict, theirs_state_dict) - ours_model = GPT(ours_config).to(device, dtype) + ours_model = GPT(ours_config).to(device) ours_model.load_state_dict(state_dict) # test end to end x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32, device=device) - ours_y = ours_model(x).float() # HF converts logits to float + ours_y = ours_model(x) theirs_y = theirs_model(x)["logits"] torch.testing.assert_close(ours_y, theirs_y) @@ -142,8 +156,10 @@ def test_against_original_falcon_180b(kwargs, device, dtype): torch.device("cuda"), torch.float16, marks=[ - pytest.mark.xfail(raises=AssertionError, strict=True), - pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA"), + # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input + # is slightly different + pytest.mark.xfail(raises=AssertionError, strict=False), + RunIf(min_cuda_gpus=1), ], ), ], @@ -155,6 +171,8 @@ def test_against_original_open_llama_3b(device, dtype): from lit_gpt import GPT, Config from scripts.convert_hf_checkpoint import copy_weights_hf_llama + torch.set_default_dtype(dtype) + ours_config = Config.from_name("open_llama_3b", n_layer=2, n_head=8, n_embd=32, intermediate_size=86) T = 5 theirs_config = LlamaConfig( @@ -163,7 +181,6 @@ def test_against_original_open_llama_3b(device, dtype): num_hidden_layers=ours_config.n_layer, intermediate_size=ours_config.intermediate_size, max_position_embeddings=T, - torch_dtype=dtype, ) assert ours_config.intermediate_size == theirs_config.intermediate_size @@ -171,31 +188,21 @@ def test_against_original_open_llama_3b(device, dtype): theirs_state_dict = theirs_model.state_dict() state_dict = {} copy_weights_hf_llama(ours_config, {}, state_dict, theirs_state_dict) - ours_model = GPT(ours_config).to(device, dtype) + ours_model = GPT(ours_config).to(device) ours_model.load_state_dict(state_dict) # test end to end x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32, device=device) assert x.size(1) == T - ours_y = ours_model(x).float() # HF converts logits to float - theirs_y = theirs_model(x)["logits"] + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float torch.testing.assert_close(ours_y, theirs_y) @torch.inference_mode() @pytest.mark.parametrize( "ours_kwargs", - [ - {"name": "Llama-2-7b-hf"}, - pytest.param( - {"name": "CodeLlama-7b-hf"}, - marks=pytest.mark.skipif( - compare_version("transformers", operator.lt, "4.33.0", use_base_version=True), - reason="requires rope_theta", - ), - ), - {"name": "Llama-2-70b-chat-hf", "n_query_groups": 1}, - ], + 
[{"name": "Llama-2-7b-hf"}, {"name": "CodeLlama-7b-hf"}, {"name": "Llama-2-70b-chat-hf", "n_query_groups": 1}], ) @pytest.mark.parametrize( ("device", "dtype"), @@ -205,8 +212,10 @@ def test_against_original_open_llama_3b(device, dtype): torch.device("cuda"), torch.float16, marks=[ - pytest.mark.xfail(raises=AssertionError, strict=True), - pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA"), + # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input + # is slightly different + pytest.mark.xfail(raises=AssertionError, strict=False), + RunIf(min_cuda_gpus=1), ], ), ], @@ -218,6 +227,8 @@ def test_against_hf_llama2(ours_kwargs, device, dtype): from lit_gpt import GPT, Config from scripts.convert_hf_checkpoint import copy_weights_hf_llama + torch.set_default_dtype(dtype) + ours_config = Config.from_name( padded_vocab_size=10000, n_layer=2, n_head=8, n_embd=32, intermediate_size=86, **ours_kwargs ) @@ -229,11 +240,10 @@ def test_against_hf_llama2(ours_kwargs, device, dtype): num_hidden_layers=ours_config.n_layer, intermediate_size=ours_config.intermediate_size, max_position_embeddings=T, - rms_norm_eps=1e-5, + rms_norm_eps=ours_config.norm_eps, num_key_value_heads=ours_config.n_query_groups, rope_theta=ours_config.rope_base, attention_bias=ours_config.bias, - torch_dtype=dtype, ) assert ours_config.intermediate_size == theirs_config.intermediate_size @@ -241,14 +251,14 @@ def test_against_hf_llama2(ours_kwargs, device, dtype): theirs_state_dict = theirs_model.state_dict() state_dict = {} copy_weights_hf_llama(ours_config, {}, state_dict, theirs_state_dict) - ours_model = GPT(ours_config).to(device, dtype) + ours_model = GPT(ours_config).to(device) ours_model.load_state_dict(state_dict) # test end to end x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32, device=device) assert x.size(1) == T - ours_y = ours_model(x).float() # HF converts logits to float - theirs_y = theirs_model(x)["logits"] + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float torch.testing.assert_close(ours_y, theirs_y) @@ -260,57 +270,123 @@ def test_against_hf_llama2(ours_kwargs, device, dtype): pytest.param( torch.device("cuda"), torch.float16, - marks=[ - pytest.mark.xfail(raises=AssertionError, strict=True), - pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA"), - ], + marks=[pytest.mark.xfail(raises=AssertionError, strict=False), RunIf(min_cuda_gpus=1)], ), ], ) -def test_against_hf_phi(device, dtype): - file_path = wd / "tests" / "original_phi_1_5.py" - url = "https://gist.githubusercontent.com/carmocca/8ec003d9e0d2fdb09ea92941cd0985b4/raw/2ba35c28824d4f4d5dce14f9588a80067cb6ae7f/original_phi_1_5.py" - if not file_path.is_file(): - urlretrieve(url=url, filename=file_path) +def test_against_hf_phi_1_5(device, dtype): + workdir = wd / "tests" / "reference_models" + workdir.mkdir(parents=True, exist_ok=True) + file_paths = [workdir / "original_phi_1_5.py", workdir / "configuration_phi.py"] + urls = [ + "https://huggingface.co/microsoft/phi-1_5/raw/main/modeling_phi.py", + "https://huggingface.co/microsoft/phi-1_5/raw/main/configuration_phi.py", + ] + for file_path, url in zip(file_paths, urls): + if not file_path.is_file(): + urlretrieve(url=url, filename=file_path) + + from reference_models.configuration_phi import PhiConfig + from reference_models.original_phi_1_5 import PhiForCausalLM from lit_gpt import GPT, Config from scripts.convert_hf_checkpoint import copy_weights_phi - from 
tests.original_phi_1_5 import MixFormerSequentialConfig, MixFormerSequentialForCausalLM + + torch.set_default_dtype(dtype) ours_config = Config.from_name( "phi-1_5", padded_vocab_size=10000, n_layer=2, n_head=4, n_embd=256, rotary_percentage=0.5 ) T = 5 - theirs_config = MixFormerSequentialConfig( - n_positions=ours_config.block_size, - n_embd=ours_config.n_embd, - n_head=ours_config.n_head, - n_layer=ours_config.n_layer, - rotary_dim=ours_config.rope_n_elem, - architecture={"block_cls": "parallel", "mixer": {}, "mlp": {"mlp_cls": "mlp"}}, + theirs_config = PhiConfig( + vocab_size=ours_config.padded_vocab_size, + max_position_embeddings=ours_config.block_size, + hidden_size=ours_config.n_embd, + intermediate_size=ours_config.intermediate_size, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + partial_rotary_factor=ours_config.rotary_percentage, torch_dtype=dtype, ) - theirs_config.vocab_size = ours_config.padded_vocab_size - theirs_model = MixFormerSequentialForCausalLM(theirs_config).to(device) + theirs_model = PhiForCausalLM(theirs_config).to(device) theirs_state_dict = theirs_model.state_dict() state_dict = {} - copy_weights_phi(ours_config, state_dict, theirs_state_dict) - ours_model = GPT(ours_config).to(device, dtype) + copy_weights_phi(ours_config, {}, state_dict, theirs_state_dict) + ours_model = GPT(ours_config).to(device) ours_model.load_state_dict(state_dict) # test end to end x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32, device=device) assert x.size(1) == T - ours_y = ours_model(x).float() # HF converts logits to float - theirs_y = theirs_model(x)["logits"] + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float torch.testing.assert_close(ours_y, theirs_y) @torch.inference_mode() -@pytest.mark.skipif( - compare_version("transformers", operator.lt, "4.33.4", use_base_version=True), reason="requires mistral" +@pytest.mark.parametrize( + ("device", "dtype"), + [ + (torch.device("cpu"), torch.float32), + pytest.param( + torch.device("cuda"), + torch.float16, + marks=[pytest.mark.xfail(raises=AssertionError, strict=False), RunIf(min_cuda_gpus=1)], + ), + ], ) +def test_against_hf_phi_2(device, dtype): + workdir = wd / "tests" / "reference_models" + workdir.mkdir(parents=True, exist_ok=True) + file_paths = [workdir / "original_phi_2.py", workdir / "configuration_phi.py"] + urls = [ + "https://huggingface.co/microsoft/phi-2/raw/main/modeling_phi.py", + "https://huggingface.co/microsoft/phi-2/raw/main/configuration_phi.py", + ] + for file_path, url in zip(file_paths, urls): + if not file_path.is_file(): + urlretrieve(url=url, filename=file_path) + + from reference_models.configuration_phi import PhiConfig + from reference_models.original_phi_2 import PhiForCausalLM + + from lit_gpt import GPT, Config + from scripts.convert_hf_checkpoint import copy_weights_phi + + torch.set_default_dtype(dtype) + + ours_config = Config.from_name( + "phi-2", padded_vocab_size=10000, n_layer=2, n_head=4, n_embd=256, rotary_percentage=0.5 + ) + T = 5 + theirs_config = PhiConfig( + vocab_size=ours_config.padded_vocab_size, + max_position_embeddings=ours_config.block_size, + hidden_size=ours_config.n_embd, + intermediate_size=ours_config.intermediate_size, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + partial_rotary_factor=ours_config.rotary_percentage, + torch_dtype=dtype, + ) + + theirs_model = PhiForCausalLM(theirs_config).to(device) + theirs_state_dict = 
theirs_model.state_dict() + state_dict = {} + copy_weights_phi(ours_config, {}, state_dict, theirs_state_dict) + ours_model = GPT(ours_config).to(device) + ours_model.load_state_dict(state_dict) + + # test end to end + x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32, device=device) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float + torch.testing.assert_close(ours_y, theirs_y) + + +@torch.inference_mode() @pytest.mark.parametrize( ("device", "dtype"), [ @@ -319,8 +395,10 @@ def test_against_hf_phi(device, dtype): torch.device("cuda"), torch.float16, marks=[ - pytest.mark.xfail(raises=AssertionError, strict=True), - pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA"), + # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input + # is slightly different + pytest.mark.xfail(raises=AssertionError, strict=False), + RunIf(min_cuda_gpus=1), ], ), ], @@ -332,6 +410,8 @@ def test_against_hf_mistral(device, dtype): from lit_gpt import GPT, Config from scripts.convert_hf_checkpoint import copy_weights_hf_llama + torch.set_default_dtype(dtype) + ours_config = Config.from_name( "Mistral-7B-Instruct-v0.1", padded_vocab_size=10000, @@ -349,10 +429,9 @@ def test_against_hf_mistral(device, dtype): num_hidden_layers=ours_config.n_layer, intermediate_size=ours_config.intermediate_size, max_position_embeddings=T, - rms_norm_eps=1e-5, + rms_norm_eps=ours_config.norm_eps, num_key_value_heads=ours_config.n_query_groups, rope_theta=ours_config.rope_base, - torch_dtype=dtype, ) assert ours_config.intermediate_size == theirs_config.intermediate_size @@ -360,28 +439,195 @@ def test_against_hf_mistral(device, dtype): theirs_state_dict = theirs_model.state_dict() state_dict = {} copy_weights_hf_llama(ours_config, {}, state_dict, theirs_state_dict) - ours_model = GPT(ours_config).to(device, dtype) + ours_model = GPT(ours_config).to(device) ours_model.load_state_dict(state_dict) # test end to end x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32, device=device) assert x.size(1) == T - ours_y = ours_model(x).float() - theirs_y = theirs_model(x)["logits"] + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float + torch.testing.assert_close(ours_y, theirs_y) + + +@torch.inference_mode() +def test_against_hf_mixtral(): + from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM + + from lit_gpt import GPT, Config + from scripts.convert_hf_checkpoint import copy_weights_hf_llama + + device = torch.device("cpu") + dtype = torch.float32 + ours_config = Config.from_name( + "Mixtral-8x7B-Instruct-v0.1", + padded_vocab_size=10000, + n_layer=2, + n_embd=32, + n_head=8, + n_query_groups=2, + intermediate_size=86, + n_expert=4, + ) + T = 5 + theirs_config = MixtralConfig( + vocab_size=ours_config.padded_vocab_size, + hidden_size=ours_config.n_embd, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + intermediate_size=ours_config.intermediate_size, + max_position_embeddings=T, + rms_norm_eps=ours_config.norm_eps, + num_key_value_heads=ours_config.n_query_groups, + rope_theta=ours_config.rope_base, + num_local_experts=ours_config.n_expert, + ) + assert ours_config.intermediate_size == theirs_config.intermediate_size + + theirs_model = MixtralForCausalLM(theirs_config).to(device) + theirs_state_dict = theirs_model.state_dict() + state_dict = {} + 
copy_weights_hf_llama(ours_config, {}, state_dict, theirs_state_dict) + ours_model = GPT(ours_config).to(device) + ours_model.load_state_dict(state_dict) + + # test end to end + x = torch.tensor([[9856, 23, 491, 1536, 304], [23, 345, 65, 123, 321]], dtype=torch.int32, device=device) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float + torch.testing.assert_close(ours_y, theirs_y) + + +@torch.inference_mode() +@pytest.mark.parametrize( + ("device", "dtype"), + [ + (torch.device("cpu"), torch.float32), + pytest.param( + torch.device("cuda"), + torch.float16, + marks=[ + # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input + # is slightly different + pytest.mark.xfail(raises=AssertionError, strict=False), + RunIf(min_cuda_gpus=1), + ], + ), + ], +) +def test_against_original_stablelm_zephyr_3b(device, dtype): + from transformers import AutoConfig, AutoModelForCausalLM + + from lit_gpt import GPT, Config + from scripts.convert_hf_checkpoint import copy_weights_hf_llama + + torch.set_default_dtype(dtype) + + T = 5 + ours_config = Config.from_name("stablelm-zephyr-3b", n_layer=2, n_head=16, n_embd=32, intermediate_size=86) + theirs_config = AutoConfig.from_pretrained( + "stabilityai/stablelm-zephyr-3b", + trust_remote_code=True, + num_hidden_layers=ours_config.n_layer, + num_attention_heads=ours_config.n_head, + num_key_value_heads=ours_config.n_head, + hidden_size=ours_config.n_embd, + intermediate_size=ours_config.intermediate_size, + max_position_embeddings=T, + torch_dtype=dtype, + ) + assert ours_config.intermediate_size == theirs_config.intermediate_size + + theirs_model = AutoModelForCausalLM.from_config(theirs_config, trust_remote_code=True).to(device) + theirs_state_dict = theirs_model.state_dict() + state_dict = {} + copy_weights_hf_llama(ours_config, {}, state_dict, theirs_state_dict) + ours_model = GPT(ours_config).to(device) + ours_model.load_state_dict(state_dict) + + # test end to end + x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32, device=device) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float torch.testing.assert_close(ours_y, theirs_y) -@pytest.mark.skipif(sys.platform in ("win32", "darwin"), reason="torch.compile not supported on this platform") +@torch.inference_mode() +@pytest.mark.parametrize("model_name", ["gemma-2b", "gemma-7b"]) +@pytest.mark.parametrize( + ("device", "dtype"), + [ + (torch.device("cpu"), torch.float32), + pytest.param( + torch.device("cuda"), + torch.float16, + marks=[ + # the reference does softmax upscaled to fp32 during attention. 
additionally, the final layernorm input + # is slightly different + pytest.mark.xfail(raises=AssertionError, strict=False), + RunIf(min_cuda_gpus=1), + ], + ), + ], +) +def test_against_original_gemma(model_name, device, dtype): + from transformers.models.gemma.configuration_gemma import GemmaConfig + from transformers.models.gemma.modeling_gemma import GemmaForCausalLM + + from lit_gpt import GPT, Config + from scripts.convert_hf_checkpoint import copy_weights_hf_llama + + torch.set_default_dtype(dtype) + + T = 5 + ours_config = Config.from_name(model_name, n_layer=2, n_head=16, n_embd=32, intermediate_size=86) + theirs_config = GemmaConfig( + vocab_size=ours_config.padded_vocab_size, + hidden_size=ours_config.n_embd, + head_dim=ours_config.head_size, + num_attention_heads=ours_config.n_head, + num_hidden_layers=ours_config.n_layer, + intermediate_size=ours_config.intermediate_size, + max_position_embeddings=T, + rms_norm_eps=ours_config.norm_eps, + num_key_value_heads=ours_config.n_query_groups, + rope_theta=ours_config.rope_base, + attention_bias=ours_config.bias, + tie_word_embeddings=True, + hidden_act="gelu_pytorch_tanh", + ) + assert ours_config.intermediate_size == theirs_config.intermediate_size + + theirs_model = GemmaForCausalLM(theirs_config).to(device) + theirs_state_dict = theirs_model.state_dict() + # Gemma weights are shipped without `lm_head.weight` + theirs_state_dict.pop("lm_head.weight") + state_dict = {} + copy_weights_hf_llama(ours_config, {}, state_dict, theirs_state_dict) + ours_model = GPT(ours_config).to(device) + ours_model.load_state_dict(state_dict) + + # test end to end + x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32, device=device) + assert x.size(1) == T + ours_y = ours_model(x) + theirs_y = theirs_model(x)["logits"].to(dtype) # HF converts logits to float + torch.testing.assert_close(ours_y, theirs_y) + + +@RunIf(dynamo=True) @torch.inference_mode() def test_model_compile(): from lit_gpt import GPT - model = GPT.from_name("pythia-70m", n_layer=3) + model = GPT.from_name("pythia-14m", n_layer=3) x = torch.randint(model.config.vocab_size, size=(2, model.config.block_size), dtype=torch.int64) from torch._dynamo.backends import debugging - explanation = torch._dynamo.explain(model, x) + explanation = torch._dynamo.explain(model)(x) assert isinstance(explanation, debugging.ExplainOutput) assert explanation.graph_count == 1 assert explanation.graph_break_count == 0 @@ -389,7 +635,7 @@ def test_model_compile(): model = GPT(model.config) model.set_kv_cache(2) input_pos = torch.arange(model.config.block_size) - explanation = torch._dynamo.explain(model, x, input_pos) + explanation = torch._dynamo.explain(model)(x, input_pos) assert isinstance(explanation, debugging.ExplainOutput) assert explanation.graph_count == 1 assert explanation.graph_break_count == 0 @@ -436,10 +682,166 @@ def generate(logits): def test_model_kv_cache_amp(): from lit_gpt.model import GPT, Config - config = Config.from_name("pythia-70m", n_layer=2) + config = Config.from_name("pythia-14m", n_layer=2) model = GPT(config) encoded = torch.arange(45) model.set_kv_cache(batch_size=1) with torch.autocast("cpu", torch.bfloat16): output = model(encoded.unsqueeze(0), encoded) assert output.dtype is torch.bfloat16 + + +# https://github.com/pytorch/pytorch/blob/ad3572a5d/torch/testing/_internal/common_cuda.py#L31-L34 +SUPPORTS_FLASH_ATTENTION = ( + torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0) and not _IS_WINDOWS +) + + +@RunIf(min_cuda_gpus=1) 
+@pytest.mark.parametrize("config", config_module.configs, ids=[c["name"] for c in config_module.configs]) +@torch.inference_mode() +def test_sdpa_choice(config): + from torch.backends.cuda import ( + SDPAParams, + SDPBackend, + can_use_efficient_attention, + can_use_flash_attention, + flash_sdp_enabled, + math_sdp_enabled, + mem_efficient_sdp_enabled, + ) + + from lit_gpt import GPT + + torch.set_default_dtype(torch.float16) + + def assert_sdpa_backend(original_fn, q, k, v, mask): + params = SDPAParams(q, k, v, mask, 0.0, True) + if expected is SDPBackend.FLASH_ATTENTION: + assert flash_sdp_enabled() + assert can_use_flash_attention(params, True) + elif expected is SDPBackend.EFFICIENT_ATTENTION: + assert mem_efficient_sdp_enabled() + assert can_use_efficient_attention(params, True) + elif expected is SDPBackend.MATH: + assert math_sdp_enabled() + else: + raise NotImplementedError + return original_fn(q, k, v, mask) + + config["n_layer"] = 1 + config = config_module.Config(**config) + + try: + with torch.device("cuda"): + model = GPT(config) + x = torch.randint(0, 10, (2, 16), dtype=torch.int32) + except torch.cuda.OutOfMemoryError: + # best effort, if the GPU can load it + pytest.xfail() + + for h in model.transformer.h: + h.attn.scaled_dot_product_attention = partial(assert_sdpa_backend, h.attn.scaled_dot_product_attention) + + if SUPPORTS_FLASH_ATTENTION: + expected = SDPBackend.FLASH_ATTENTION + with torch.backends.cuda.sdp_kernel(enable_mem_efficient=False): + model(x) + + expected = SDPBackend.EFFICIENT_ATTENTION if config.head_size % 8 == 0 else SDPBackend.MATH + with torch.backends.cuda.sdp_kernel(enable_flash=False): + model(x) + + +@RunIf(min_cuda_gpus=1) +@pytest.mark.parametrize("config", config_module.configs, ids=[c["name"] for c in config_module.configs]) +@torch.inference_mode() +def test_sdpa_choice_kv_cache(config): + from torch.backends.cuda import ( + SDPAParams, + SDPBackend, + can_use_efficient_attention, + can_use_flash_attention, + flash_sdp_enabled, + math_sdp_enabled, + mem_efficient_sdp_enabled, + ) + + from lit_gpt import GPT + + torch.set_default_dtype(torch.float16) + + def assert_sdpa_backend(original_fn, q, k, v, mask): + params = SDPAParams(q, k, v, mask, 0.0, True) + if expected is SDPBackend.FLASH_ATTENTION: + assert flash_sdp_enabled() + assert can_use_flash_attention(params, True) + elif expected is SDPBackend.EFFICIENT_ATTENTION: + assert mem_efficient_sdp_enabled() + assert can_use_efficient_attention(params, True) + elif expected is SDPBackend.MATH: + assert math_sdp_enabled() + else: + raise NotImplementedError + return original_fn(q, k, v, mask) + + config["n_layer"] = 1 + config = config_module.Config(**config) + + try: + with torch.device("cuda"): + model = GPT(config) + model.max_seq_length = 1 + model.set_kv_cache(2) + x = torch.randint(0, 10, (2, 1), dtype=torch.int32) + input_pos = torch.tensor([0], dtype=torch.long) + except torch.cuda.OutOfMemoryError: + # best effort, if the GPU can load it + pytest.xfail() + + for h in model.transformer.h: + h.attn.scaled_dot_product_attention = partial(assert_sdpa_backend, h.attn.scaled_dot_product_attention) + + if SUPPORTS_FLASH_ATTENTION: + # flash attention does not support an attention mask + expected = SDPBackend.MATH + with torch.backends.cuda.sdp_kernel(enable_mem_efficient=False): + model(x, input_pos) + + expected = ( + SDPBackend.EFFICIENT_ATTENTION if config.head_size % 8 == 0 and config.n_query_groups != 1 else SDPBackend.MATH + ) + with torch.backends.cuda.sdp_kernel(enable_flash=False): + 
model(x, input_pos) + + +@RunIf(min_cuda_gpus=2, standalone=True) +def test_rope_init_under_fsdp(): + """Check that the rope cache is properly intialized""" + from lit_gpt import GPT + + fabric = Fabric(devices=2, strategy="fsdp", accelerator="cuda") + fabric.launch() + + with fabric.init_module(empty_init=True): + model = GPT.from_name("pythia-14m", n_layer=1) + assert model.cos.device.type == "meta" + assert model.sin.device.type == "meta" + + model = fabric.setup(model) + assert model.cos.device.type == "cuda" + assert model.sin.device.type == "cuda" + cos, sin = model.rope_cache(device=fabric.device) + torch.testing.assert_close(model.cos, cos) + torch.testing.assert_close(model.sin, sin) + + +@RunIf(min_cuda_gpus=1) +def test_reset_parameters_device(): + from lit_gpt import GPT + + with torch.device("meta"): + model = GPT.from_name("pythia-14m", n_layer=1) + _materialize_meta_tensors(model, torch.device("cuda")) + model.reset_parameters() + assert model.cos.device.type == "cuda" diff --git a/tests/test_packed_dataset.py b/tests/test_packed_dataset.py index 10814c7..3691448 100644 --- a/tests/test_packed_dataset.py +++ b/tests/test_packed_dataset.py @@ -1,25 +1,25 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import os from unittest.mock import MagicMock -import requests +import pytest from torch.utils.data import IterableDataset - -def maybe_get_file(url, file_path): - if not file_path.exists(): - with open(file_path, "w", encoding="utf-8") as f: - f.write(requests.get(url).text) +from scripts.prepare_alpaca import download_if_missing def test_packed_dataset(tmp_path): tmp_path.mkdir(parents=True, exist_ok=True) vocabulary_path = tmp_path / "tokenizer.json" - maybe_get_file("https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json", vocabulary_path) + download_if_missing( + vocabulary_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json" + ) tokenizer_path = tmp_path / "tokenizer_config.json" - maybe_get_file( - "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json", tokenizer_path + download_if_missing( + tokenizer_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json" ) from lit_gpt import Tokenizer @@ -127,7 +127,7 @@ def __iter__(self): return iter(range(self._start, self._end)) -def test_combined_dataset(tmp_path): +def test_combined_dataset(): from lit_gpt.packed_dataset import CombinedDataset dataset1 = SimpleDataset(0, 10) @@ -187,3 +187,21 @@ def test_sharded_packed_dataset(monkeypatch): # world_size = 3, rank = 2 (dataset size not cleanly divisible by world size) iter(PackedDataset(filenames=filenames, n_chunks=2, block_size=2, num_processes=3, process_rank=2)) assert dataset_iterator_mock.call_args[1]["filenames"] == ["2", "5", "8"] + + +@pytest.mark.parametrize( + ("weights", "expected"), + [ + ([1], [1]), + ([2], [1]), + ([2, 0.5], [0.8, 0.2]), + ([1, 1, 1], [1 / 3, 1 / 3, 1 / 3]), + ([0.3, 0, 0], [1.0, 0, 0]), + (None, [0.5, 0.5]), + ], +) +def test_combined_dataset_normalizes_weights(weights, expected): + from lit_gpt.packed_dataset import CombinedDataset + + combined_dataset = CombinedDataset([[1], [2, 3]], weights=weights, seed=1) + assert combined_dataset._weights == expected diff --git a/tests/test_prepare_csv.py b/tests/test_prepare_csv.py index a8a2aff..5d8c3dc 100644 --- a/tests/test_prepare_csv.py +++ b/tests/test_prepare_csv.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. 
Licensed under the Apache License 2.0, see LICENSE file. + import json import subprocess import sys @@ -34,59 +36,17 @@ def test_prepare_csv(tmp_path, fake_checkpoint_dir): train_calls, test_calls = save_mock.mock_calls assert train_calls == call( [ - { - "instruction": "Add", - "input": "2+2", - "output": "4", - "input_ids": ANY, - "input_ids_no_response": ANY, - "labels": ANY, - }, - { - "instruction": "Divide", - "input": "10/2", - "output": "5", - "input_ids": ANY, - "input_ids_no_response": ANY, - "labels": ANY, - }, - { - "instruction": "Multiply", - "input": "6*4", - "output": "24", - "input_ids": ANY, - "input_ids_no_response": ANY, - "labels": ANY, - }, + {"instruction": "Add", "input": "2+2", "output": "4", "input_ids": ANY, "labels": ANY}, + {"instruction": "Divide", "input": "10/2", "output": "5", "input_ids": ANY, "labels": ANY}, + {"instruction": "Multiply", "input": "6*4", "output": "24", "input_ids": ANY, "labels": ANY}, ], tmp_path / "train.pt", ) assert test_calls == call( [ - { - "instruction": "Exponentiate", - "input": "2^3", - "output": "8", - "input_ids": ANY, - "input_ids_no_response": ANY, - "labels": ANY, - }, - { - "instruction": "Subtract", - "input": "5-3", - "output": "2", - "input_ids": ANY, - "input_ids_no_response": ANY, - "labels": ANY, - }, - { - "instruction": "Square root", - "input": "√9", - "output": "3", - "input_ids": ANY, - "input_ids_no_response": ANY, - "labels": ANY, - }, + {"instruction": "Exponentiate", "input": "2^3", "output": "8", "input_ids": ANY, "labels": ANY}, + {"instruction": "Subtract", "input": "5-3", "output": "2", "input_ids": ANY, "labels": ANY}, + {"instruction": "Square root", "input": "√9", "output": "3", "input_ids": ANY, "labels": ANY}, ], tmp_path / "test.pt", ) diff --git a/tests/test_prepare_redpajama.py b/tests/test_prepare_redpajama.py index 3a91f88..059e47e 100644 --- a/tests/test_prepare_redpajama.py +++ b/tests/test_prepare_redpajama.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ import json import os import subprocess @@ -5,21 +7,17 @@ from pathlib import Path from unittest import mock -import requests - - -def maybe_get_file(url, file_path): - if not file_path.exists(): - with open(file_path, "w", encoding="utf-8") as f: - f.write(requests.get(url).text) +from scripts.prepare_alpaca import download_if_missing def test_prepare_sample(tmp_path): vocabulary_path = tmp_path / "tokenizer.json" - maybe_get_file("https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json", vocabulary_path) + download_if_missing( + vocabulary_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json" + ) tokenizer_path = tmp_path / "tokenizer_config.json" - maybe_get_file( - "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json", tokenizer_path + download_if_missing( + tokenizer_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json" ) with open(tmp_path / "lit_config.json", "w") as f: json.dump({"block_size": 2048}, f) @@ -64,10 +62,12 @@ def test_prepare_sample(tmp_path): def test_prepare_full(tmp_path): vocabulary_path = tmp_path / "tokenizer.json" - maybe_get_file("https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json", vocabulary_path) + download_if_missing( + vocabulary_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer.json" + ) tokenizer_path = tmp_path / "tokenizer_config.json" - maybe_get_file( - "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json", tokenizer_path + download_if_missing( + tokenizer_path, "https://huggingface.co/stabilityai/stablelm-base-alpha-3b/raw/main/tokenizer_config.json" ) with open(tmp_path / "lit_config.json", "w") as f: json.dump({"block_size": 2048}, f) diff --git a/tests/test_pretrain_tinyllama.py b/tests/test_pretrain_tinyllama.py new file mode 100644 index 0000000..f781f1d --- /dev/null +++ b/tests/test_pretrain_tinyllama.py @@ -0,0 +1,50 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ +import os +from contextlib import redirect_stdout +from io import StringIO +from unittest import mock +from unittest.mock import Mock + +import torch +from conftest import RunIf +from torch.utils.data import DataLoader + + +@RunIf(min_cuda_gpus=2, standalone=True) +# Set CUDA_VISIBLE_DEVICES for FSDP hybrid-shard, if fewer GPUs are used than are available +@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) +def test_pretrain_tiny_llama(tmp_path, monkeypatch): + import pretrain.tinyllama as module + from lit_gpt.args import EvalArgs, IOArgs, TrainArgs + from lit_gpt.config import Config + + model_config = Config(block_size=2, n_layer=2, n_embd=8, n_head=4, padded_vocab_size=8) + + dataset = torch.tensor([[0, 1, 2], [3, 4, 5], [0, 1, 2]]) + dataloader = DataLoader(dataset) + module.get_dataloaders = Mock(return_value=(dataloader, dataloader)) + + stdout = StringIO() + with redirect_stdout(stdout): + module.setup( + devices=2, + model=model_config, + io=IOArgs(out_dir=tmp_path, train_data_dir=None), + train=TrainArgs(global_batch_size=2, max_tokens=16, save_interval=1, micro_batch_size=1, max_norm=1.0), + eval=EvalArgs(interval=1, max_iters=1), + ) + + if torch.distributed.get_rank() == 0: + # tmp_path is not the same across all ranks, run assert only on rank 0 + assert {p.name for p in tmp_path.glob("*.pth")} == { + "step-00000001.pth", + "step-00000002.pth", + "step-00000003.pth", + "step-00000004.pth", + } + # logs only appear on rank 0 + logs = stdout.getvalue() + assert logs.count("optimizer.step") == 4 + assert logs.count("val loss") == 4 + assert "Total parameters: 1,888" in logs diff --git a/tests/test_rope.py b/tests/test_rope.py index aa8af44..20f2dc5 100644 --- a/tests/test_rope.py +++ b/tests/test_rope.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import torch @@ -13,7 +15,7 @@ def test_rope(): position_ids = torch.arange(seq_len).unsqueeze(0) theirs = GPTNeoXRotaryEmbedding(head_size, seq_len) - ours_cos_cached, ours_sin_cached = build_rope_cache(seq_len, head_size, device=x.device, dtype=x.dtype) + ours_cos_cached, ours_sin_cached = build_rope_cache(seq_len, head_size, device=x.device) # their rope cache has 2 added dimensions and the cos/sin is duplicated torch.testing.assert_close(ours_cos_cached, theirs.cos_cached.squeeze()) torch.testing.assert_close(ours_sin_cached, theirs.sin_cached.squeeze()) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index fdd9220..c65e7c3 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ import os import sys from pathlib import Path @@ -13,7 +15,7 @@ import lit_gpt.config as config_module -@pytest.mark.parametrize("config", config_module.configs, ids=[c["name"] for c in config_module.configs]) +@pytest.mark.parametrize("config", config_module.configs, ids=[c["hf_config"]["name"] for c in config_module.configs]) def test_tokenizer_against_hf(config): from lit_gpt.tokenizer import Tokenizer @@ -21,11 +23,11 @@ def test_tokenizer_against_hf(config): config = config_module.Config(**config) - repo_id = f"{config.org}/{config.name}" + repo_id = f"{config.hf_config['org']}/{config.hf_config['name']}" cache_dir = Path("/tmp/tokenizer_test_cache") # create a checkpoint directory that points to the HF files - checkpoint_dir = cache_dir / "ligpt" / config.org / config.name + checkpoint_dir = cache_dir / "litgpt" / config.hf_config["org"] / config.hf_config["name"] if not checkpoint_dir.exists(): file_to_cache = {} for file in ("tokenizer.json", "generation_config.json", "tokenizer.model", "tokenizer_config.json"): @@ -48,7 +50,11 @@ def test_tokenizer_against_hf(config): ) ours = Tokenizer(checkpoint_dir) - assert ours.vocab_size == theirs.vocab_size + if config.name.startswith("CodeLlama-70b-Instruct"): + # TODO: the HF tokenizer returns 1 less token for this model. why? + assert ours.vocab_size == theirs.vocab_size + 1 + else: + assert ours.vocab_size == theirs.vocab_size assert ours.vocab_size == config.vocab_size if config.name.startswith("falcon") or config.name.startswith("stablecode"): @@ -68,5 +74,17 @@ def test_tokenizer_against_hf(config): prompt = "Hello, readers of this test!" actual = ours.encode(prompt) expected = theirs.encode(prompt) - assert actual.tolist() == expected + if config.name.startswith("CodeLlama-70b"): + # TODO: there's a encoding difference with this model. why? note that the decoding is equal + # "Hello": 10994, "▁Hello": 15043 + assert [15043 if t == 10994 else t for t in actual.tolist()] == expected + else: + assert actual.tolist() == expected assert ours.decode(actual) == theirs.decode(expected, skip_special_tokens=True) + + +def test_tokenizer_input_validation(): + from lit_gpt.tokenizer import Tokenizer + + with pytest.raises(NotADirectoryError, match="The checkpoint directory does not exist"): + Tokenizer("cocofruit") diff --git a/tests/test_utils.py b/tests/test_utils.py index 8618da1..048e153 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,11 +1,14 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import os -import sys from contextlib import redirect_stderr from io import StringIO import pytest import torch import torch.nn.functional as F +from conftest import RunIf +from lightning import Fabric def test_find_multiple(): @@ -20,7 +23,8 @@ def test_find_multiple(): assert find_multiple(50254, 512) == 50688 -@pytest.mark.skipif(sys.platform == "win32", reason="match fails on windows. why did they have to use backslashes?") +# match fails on windows. why did they have to use backslashes? 
+@RunIf(skip_windows=True) def test_check_valid_checkpoint_dir(tmp_path): from lit_gpt.utils import check_valid_checkpoint_dir @@ -92,7 +96,8 @@ def test_incremental_write(tmp_path): @pytest.mark.parametrize("B", (1, 2)) -def test_chunked_cross_entropy(B): +@pytest.mark.parametrize("ignore_index", (None, -1, -2, -100)) +def test_chunked_cross_entropy(ignore_index, B): from lit_gpt.utils import chunked_cross_entropy V = 50 @@ -100,23 +105,32 @@ def test_chunked_cross_entropy(B): regular_logits = torch.randn(B, T, V) targets = torch.randint(0, V, (B, T)) - baseline_loss = F.cross_entropy(regular_logits.reshape(-1, regular_logits.size(-1)), targets.reshape(-1)) - regular_loss = chunked_cross_entropy(regular_logits, targets, chunk_size=0) + if ignore_index is not None: + targets[:, [1, 4, 10, 19]] = ignore_index + + baseline_loss = F.cross_entropy( + regular_logits.reshape(-1, regular_logits.size(-1)), + targets.reshape(-1), + ignore_index=(ignore_index if ignore_index is not None else -100), + ) + + ignore_index = ignore_index if ignore_index is not None else -1 + regular_loss = chunked_cross_entropy(regular_logits, targets, chunk_size=0, ignore_index=ignore_index) assert torch.equal(baseline_loss, regular_loss) assert regular_loss.numel() == 1 - chunked_loss = chunked_cross_entropy(regular_logits, targets, chunk_size=10) + chunked_loss = chunked_cross_entropy(regular_logits, targets, chunk_size=10, ignore_index=ignore_index) torch.testing.assert_close(chunked_loss, regular_loss) torch.testing.assert_close(chunked_loss, baseline_loss) logit_chunk_size = 6 assert T % logit_chunk_size != 0 # ensure leftover chunked_logits = list(regular_logits.split(logit_chunk_size, dim=1)) - chunked_loss = chunked_cross_entropy(chunked_logits, targets, chunk_size=0) + chunked_loss = chunked_cross_entropy(chunked_logits, targets, chunk_size=0, ignore_index=ignore_index) torch.testing.assert_close(chunked_loss, regular_loss) torch.testing.assert_close(chunked_loss, baseline_loss) - chunked_loss = chunked_cross_entropy(chunked_logits, targets, chunk_size=10) + chunked_loss = chunked_cross_entropy(chunked_logits, targets, chunk_size=10, ignore_index=ignore_index) torch.testing.assert_close(chunked_loss, regular_loss) torch.testing.assert_close(chunked_loss, baseline_loss) @@ -134,3 +148,43 @@ def test_num_parameters(): assert num_parameters(model) == 6 assert num_parameters(model, requires_grad=True) == 4 assert num_parameters(model, requires_grad=False) == 2 + + +@RunIf(min_cuda_gpus=1) +@pytest.mark.parametrize("mode", ["nf4", "nf4-dq", "fp4", "fp4-dq", "int8", "int8-training"]) +@pytest.mark.skip("To be fixed") +def test_num_parameters_bitsandbytes(mode): + from lightning.fabric.plugins import BitsandbytesPrecision + + from lit_gpt import GPT + from lit_gpt.utils import num_parameters + + plugin = BitsandbytesPrecision(mode=mode) + fabric = Fabric(plugins=plugin, accelerator="cuda", devices=1) + + model = torch.nn.Linear(10, 10) + model = fabric.setup(model) + assert num_parameters(model) == 110 + + with fabric.init_module(empty_init=True): + model = GPT.from_name("pythia-14m") + assert num_parameters(model) == 14067712 + + +def test_cycle_iterator(): + from lit_gpt.utils import CycleIterator + + iterator = CycleIterator([]) + with pytest.raises(StopIteration): + next(iterator) + + iterator = CycleIterator(range(3)) + assert iterator.epoch == 0 + assert next(iterator) == 0 + assert iterator.epoch == 0 + assert next(iterator) == 1 + assert iterator.epoch == 0 + assert next(iterator) == 2 + assert iterator.epoch 
== 0 + assert next(iterator) == 0 + assert iterator.epoch == 1 diff --git a/tutorials/convert_lit_models.md b/tutorials/convert_lit_models.md index 971df9a..b436776 100644 --- a/tutorials/convert_lit_models.md +++ b/tutorials/convert_lit_models.md @@ -1,24 +1,144 @@ -## Converting Lit-GPT weights to HuggingFace Transformers +## Converting Lit-GPT weights to Hugging Face Transformers -Lit-GPT weights need to be converted to a format that HuggingFace understands with a [conversion script](../scripts/convert_lit_checkpoint.py) before our scripts can run. +Lit-GPT weights need to be converted to a format that Hugging Face understands with a [conversion script](../scripts/convert_lit_checkpoint.py) before our scripts can run. -We provide a helpful script to convert models Lit-GPT models back to their equivalent HuggingFace Transformers format: +We provide a helpful script to convert Lit-GPT models back to their equivalent Hugging Face Transformers format: ```sh python scripts/convert_lit_checkpoint.py \ - --checkpoint_path path/to/litgpt/model.pth \ - --output_path where/to/save/the/converted.ckpt \ - --config_path path/to/litgpt/config.json + --checkpoint_path checkpoints/repo_id/lit_model.pth \ + --output_path output_path/converted.pth \ + --config_path checkpoints/repo_id/config.json ``` These paths are just placeholders; you will need to customize them based on which finetuning or pretraining script you ran and its configuration. +### Loading converted Lit-GPT checkpoints into transformers -Please note that if you want to convert a model that has been fine-tuned using an adapter like LoRA, these weights should be [merged](../scripts/merge_lora.py) to the checkpoint prior to converting. + +If you want to load the converted checkpoints into a `transformers` model, please make sure you copied the original `config.json` file into the folder that contains the `converted.pth` file saved via `--output_path` above. + +For example, + +```bash +cp checkpoints/repo_id/config.json output_path/config.json +``` + +Then, you can load the checkpoint file in a Python session as follows: + +```python +import torch +from transformers import AutoModel + + +state_dict = torch.load("output_path/converted.pth") +model = AutoModel.from_pretrained( + "output_path/", local_files_only=True, state_dict=state_dict +) +``` + +Alternatively, you can also load the model without copying the `config.json` file as follows: + +```python +model = AutoModel.from_pretrained("online_repo_id", state_dict=state_dict) +``` + + + +### Merging LoRA weights + +Please note that if you want to convert a model that has been fine-tuned using an adapter like LoRA, these weights should be [merged](../scripts/merge_lora.py) to the checkpoint prior to converting. ```sh python scripts/merge_lora.py \ - --checkpoint_dir path/to/litgpt/model.pth \ + --checkpoint_dir checkpoints/repo_id \ --lora_path path/to/litgpt/lora_finetuned.pth \ - --out_dir where/to/save/the/merged.ckpt + --out_dir output_path/merged.ckpt +``` +
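+Once the LoRA weights are merged, the resulting checkpoint can be converted with the same `convert_lit_checkpoint.py` command shown at the top of this page. The snippet below is only a sketch that reuses the placeholder paths from the commands above and assumes the merge step wrote a `lit_model.pth` file into the `--out_dir` directory (as in the end-to-end tutorial that follows); adjust the paths to match your run:
+
+```sh
+# placeholder paths: the merged lit_model.pth file name inside --out_dir is an assumption
+python scripts/convert_lit_checkpoint.py \
+    --checkpoint_path output_path/merged.ckpt/lit_model.pth \
+    --output_path output_path/converted.pth \
+    --config_path checkpoints/repo_id/config.json
+```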
+
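+As a quick smoke test of a converted checkpoint, you can load it with a causal-LM head and generate a few tokens. This is a minimal sketch rather than a required step: it assumes the `converted.pth` file and the copied `config.json` from the sections above, `online_repo_id` is the same tokenizer placeholder used earlier, and whether `from_pretrained` accepts the `state_dict` shortcut in this way may depend on your `transformers` version:
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# load the converted Lit-GPT weights, following the loading example above
+state_dict = torch.load("output_path/converted.pth")
+model = AutoModelForCausalLM.from_pretrained(
+    "output_path/", local_files_only=True, state_dict=state_dict
+)
+
+# placeholder repo id: use the tokenizer that matches the original checkpoint
+tokenizer = AutoTokenizer.from_pretrained("online_repo_id")
+
+inputs = tokenizer("Hello, my name is", return_tensors="pt")
+with torch.inference_mode():
+    output = model.generate(**inputs, max_new_tokens=20)
+print(tokenizer.decode(output[0]))
+```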
+ +# A finetuning and conversion tutorial + +This section contains a reproducible example for finetuning a Lit-GPT model and converting it back into a Hugging Face `transformers` model. + +1. Download a model of interest: + +For convenience, we first specify an environment variable (optional) to avoid copying and pasting the whole path: + +```bash +export repo_id=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +``` + +Instead of using TinyLlama, you can replace the `repo_id` target with any other model repository +specifier that is currently supported by Lit-GPT. You can get a list of supported repository specifiers +by running `scripts/download.py` without any additional arguments. + +Then, we download the model we specified via `$repo_id` above: + +```bash +python scripts/download.py --repo_id $repo_id +``` + +2. Convert the model into the Lit-GPT format: + +```bash +python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/$repo_id +``` + +3. Prepare a dataset for finetuning: + +```bash +python scripts/prepare_alpaca.py \ + --checkpoint_dir checkpoints/$repo_id \ + --destination_path data/alpaca +``` + +4. Finetune the model: + + +```bash +export finetuned_dir=out/lit-finetuned-model + +python finetune/lora.py \ + --io.checkpoint_dir checkpoints/$repo_id \ + --io.train_data_dir data/alpaca \ + --io.val_data_dir data/alpaca \ + --train.epochs 1 \ + --io.out_dir $finetuned_dir +``` + +5. Merge LoRA weights: + +Note that this step only applies if the model was finetuned with `lora.py` above and not when `full.py` was used for finetuning. + +```bash +python scripts/merge_lora.py \ + --checkpoint_dir checkpoints/$repo_id \ + --lora_path $finetuned_dir/lit_model_lora_finetuned.pth \ + --out_dir $finetuned_dir/merged/ +``` + + +6. Convert the finetuned model back into the HF format: + +```bash +python scripts/convert_lit_checkpoint.py \ + --checkpoint_path $finetuned_dir/merged/lit_model.pth \ + --output_path out/hf-tinyllama/converted_model.pth \ + --config_path checkpoints/$repo_id/lit_config.json +``` + +(If you used `full.py` instead of `lora.py` to finetune your model, +replace `$finetuned_dir/merged/lit_model.pth` with `$finetuned_dir/lit_model_finetuned.pth`.) + + +7. Load the model into a `transformers` model: + +```python +import torch +from transformers import AutoModel + +state_dict = torch.load('out/hf-tinyllama/converted_model.pth') +model = AutoModel.from_pretrained("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", state_dict=state_dict) ``` diff --git a/tutorials/download_code_llama.md b/tutorials/download_code_llama.md index 672c9e2..fe6d5a3 100644 --- a/tutorials/download_code_llama.md +++ b/tutorials/download_code_llama.md @@ -20,20 +20,23 @@ which will print ```text codellama/CodeLlama-7b-hf -codellama/CodeLlama-7b-Python-hf -codellama/CodeLlama-7b-Instruct-hf codellama/CodeLlama-13b-hf -codellama/CodeLlama-13b-Python-hf -codellama/CodeLlama-13b-Instruct-hf codellama/CodeLlama-34b-hf +codellama/CodeLlama-70b-hf +codellama/CodeLlama-7b-Python-hf +codellama/CodeLlama-13b-Python-hf codellama/CodeLlama-34b-Python-hf +codellama/CodeLlama-70b-Python-hf +codellama/CodeLlama-7b-Instruct-hf +codellama/CodeLlama-13b-Instruct-hf codellama/CodeLlama-34b-Instruct-hf +codellama/CodeLlama-70b-Instruct-hf ``` In order to use a specific checkpoint, for instance [CodeLlama-7b-Python-hf](https://huggingface.co/codellama/CodeLlama-7b-Python-hf), download the weights and convert the checkpoint to the lit-gpt format.
```bash -pip install huggingface_hub +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' python scripts/download.py --repo_id codellama/CodeLlama-7b-Python-hf diff --git a/tutorials/download_dolly.md b/tutorials/download_dolly.md new file mode 100644 index 0000000..901e923 --- /dev/null +++ b/tutorials/download_dolly.md @@ -0,0 +1,45 @@ +## Download [Dolly](https://github.com/databrickslabs/dolly) weights + +Databricks’ [Dolly](https://huggingface.co/databricks/dolly-v2-12b) is an instruction-following large language model trained on the Databricks machine learning platform +that is licensed for commercial use. Based on `pythia-12b`, Dolly is trained on ~15k instruction/response fine tuning records +[`databricks-dolly-15k`](https://huggingface.co/datasets/databricks/databricks-dolly-15k) generated +by Databricks employees in capability domains from the InstructGPT paper, including brainstorming, classification, closed QA, generation, +information extraction, open QA and summarization. `dolly-v2-12b` is not a state-of-the-art model, but does exhibit surprisingly +high quality instruction following behavior not characteristic of the foundation model on which it is based. + +For detailed info on the models, their training, and their behavior, please see the [Dolly repository](https://github.com/databrickslabs/dolly). + +To see all the available checkpoints for Dolly, run: + +```bash +python scripts/download.py | grep dolly +``` + +which will print + +```text +databricks/dolly-v2-3b +databricks/dolly-v2-7b +databricks/dolly-v2-12b +``` + +In order to use a specific Dolly checkpoint, for instance [dolly-v2-3b](https://huggingface.co/databricks/dolly-v2-3b), download the weights and convert the checkpoint to the lit-gpt format: + +```bash +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' + +python scripts/download.py --repo_id databricks/dolly-v2-3b + +python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/databricks/dolly-v2-3b +``` + +By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM +or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. + +You're done! To execute the model just run: + +```bash +pip install tokenizers + +python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/databricks/dolly-v2-3b +``` diff --git a/tutorials/download_falcon.md b/tutorials/download_falcon.md index 60dd001..1f9903f 100644 --- a/tutorials/download_falcon.md +++ b/tutorials/download_falcon.md @@ -26,7 +26,7 @@ tiiuae/falcon-180B-chat In order to use a specific Falcon checkpoint, for instance [falcon-7b](https://huggingface.co/tiiuae/falcon-7b), download the weights and convert the checkpoint to the lit-gpt format: ```bash -pip install huggingface_hub +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' python scripts/download.py --repo_id tiiuae/falcon-7b diff --git a/tutorials/download_freewilly_2.md b/tutorials/download_freewilly_2.md index 32ebba7..43f057b 100644 --- a/tutorials/download_freewilly_2.md +++ b/tutorials/download_freewilly_2.md @@ -5,7 +5,7 @@ Stability AI announced FreeWilly inspired by the methodology pioneered by Micros FreeWilly2 leverages the Llama 2 70B foundation model to reach a performance that compares favorably with GPT-3.5 for some tasks. 
```bash -pip install huggingface_hub +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' python scripts/download.py --repo_id stabilityai/FreeWilly2 diff --git a/tutorials/download_function_calling_llama_2.md b/tutorials/download_function_calling_llama_2.md new file mode 100644 index 0000000..f86799e --- /dev/null +++ b/tutorials/download_function_calling_llama_2.md @@ -0,0 +1,32 @@ +## Download [Function Calling Llama 2](https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2) weights + +Llama-7B with function calling is licensed according to the Meta Community license. + +Function calling Llama extends the Hugging Face Llama 2 models with function calling capabilities. +The model responds with a structured JSON argument containing the function name and arguments. + +In order to use the checkpoint, download the weights and convert the checkpoint to the lit-gpt format. + +```bash +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' + +python scripts/download.py --repo_id Trelis/Llama-2-7b-chat-hf-function-calling-v2 --from_safetensors true + +python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/Trelis/Llama-2-7b-chat-hf-function-calling-v2 +``` + +By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM +or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. + +You're done! To execute the model just run: + +```bash +pip install sentencepiece + +python chat/base.py --checkpoint_dir checkpoints/Trelis/Llama-2-7b-chat-hf-function-calling-v2 +``` +It is strongly recommended to visit the model [repository](https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2) to learn how to format the prompt. + +The chat script implements a generic use case with a single function defined; feel free to adapt it to your needs, for instance to make HTTP requests with the model outputs. + +Have fun! diff --git a/tutorials/download_gemma.md b/tutorials/download_gemma.md new file mode 100644 index 0000000..f740538 --- /dev/null +++ b/tutorials/download_gemma.md @@ -0,0 +1,45 @@ +## Download [Gemma](https://blog.google/technology/developers/gemma-open-models/) weights + +Google developed and publicly released the Gemma large language models (LLMs), a collection of pretrained models in 2B and 7B parameter sizes that are based on the Gemini architecture. + +For more information, please see the [technical report](https://storage.googleapis.com/deepmind-media/gemma/gemma-report.pdf). + + +To see all the available checkpoints, run: + +```bash +python scripts/download.py | grep gemma +``` + +which will print + +```text +google/gemma-7b +google/gemma-2b +google/gemma-7b-it +google/gemma-2b-it +``` + +In the list above, `gemma-2b` and `gemma-7b` are the pretrained models, and `gemma-2b-it` and `gemma-7b-it` are the instruction-finetuned models. + +In order to use a specific checkpoint, for instance [gemma-2b](https://huggingface.co/google/gemma-2b), download the weights and convert the checkpoint to the lit-gpt format. + +This requires that you've been granted access to the weights on the HuggingFace hub. You can do so by following the steps at . +After access is granted, you can find your HF hub token in . 
+ +```bash +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' + +python scripts/download.py --repo_id google/gemma-2b --access_token your_hf_token --from_safetensors true + +python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/google/gemma-2b +``` + +By default, the `convert_hf_checkpoint` step will use the data type of the HF checkpoint's parameters. In cases where RAM +or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. + +You're done! To execute the model just run: + +```bash +python chat/base.py --checkpoint_dir checkpoints/google/gemma-2b +``` diff --git a/tutorials/download_llama_2.md b/tutorials/download_llama_2.md index 977ca9d..7b5d45b 100644 --- a/tutorials/download_llama_2.md +++ b/tutorials/download_llama_2.md @@ -33,7 +33,7 @@ This requires that you've been granted access to the weights on the HuggingFace After access is granted, you can find your HF hub token in . ```bash -pip install huggingface_hub +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' python scripts/download.py --repo_id meta-llama/Llama-2-7b-chat-hf --access_token your_hf_token diff --git a/tutorials/download_longchat.md b/tutorials/download_longchat.md index 69ad861..aa2f6e6 100644 --- a/tutorials/download_longchat.md +++ b/tutorials/download_longchat.md @@ -19,7 +19,7 @@ lmsys/longchat-13b-16k In order to use a specific checkpoint, for instance [longchat-7b-16k](https://huggingface.co/lmsys/longchat-7b-16k), download the weights and convert the checkpoint to the lit-gpt format: ```bash -pip install huggingface_hub +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' python scripts/download.py --repo_id lmsys/longchat-7b-16k diff --git a/tutorials/download_mistral.md b/tutorials/download_mistral.md index 2f64baa..4cf200d 100644 --- a/tutorials/download_mistral.md +++ b/tutorials/download_mistral.md @@ -1,5 +1,7 @@ ## Download [Mistral](https://mistral.ai) weights +### Mistral + [Mistral 7B](https://mistral.ai/news/announcing-mistral-7b) is Apache 2.0 licensed and can be used without restrictions. It: * Outperforms Llama 2 13B on all benchmarks @@ -11,14 +13,30 @@ Details about the data used to train the model or training procedure have not been made public. 
-In order to use the Mistral 7B model checkpoint, which requires about 14 Gb of disk space, download the weights and convert the checkpoint to the lit-gpt format: +To see all the available checkpoints, run: ```bash -pip install huggingface_hub +python scripts/download.py | grep -E 'Mistral|Mixtral' +``` -python scripts/download.py --repo_id mistralai/Mistral-7B-Instruct-v0.1 +which will print -python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/mistralai/Mistral-7B-Instruct-v0.1 +```text +mistralai/Mistral-7B-v0.1 +mistralai/Mistral-7B-Instruct-v0.1 +mistralai/Mixtral-8x7B-v0.1 +mistralai/Mixtral-8x7B-Instruct-v0.1 +mistralai/Mistral-7B-Instruct-v0.2 +``` + +In order to use the Mistral 7B model checkpoint, which requires about 14 GB of disk space, download the weights and convert the checkpoint to the lit-gpt format: + +```bash +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' + +python scripts/download.py --repo_id mistralai/Mistral-7B-Instruct-v0.2 + +python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/mistralai/Mistral-7B-Instruct-v0.2 ``` You're done! To execute the model just run: @@ -26,5 +44,33 @@ You're done! To execute the model just run: ```bash pip install sentencepiece -python chat/base.py --checkpoint_dir checkpoints/mistralai/Mistral-7B-Instruct-v0.1 +python chat/base.py --checkpoint_dir checkpoints/mistralai/Mistral-7B-Instruct-v0.2 ``` + +### Mixtral + +[Mixtral 8x7B](https://mistral.ai/news/mixtral-of-experts) is a pretrained generative Sparse Mixture of Experts model based on Mistral 7B. +Mixtral 8x7B outperforms Llama 2 70B on most benchmarks tested. + +Details about the data used to train the model or training procedure have not been made public. + +In order to use the Mixtral 8x7B model checkpoint, which requires about 94 GB of disk space, download the weights and convert the checkpoint to the lit-gpt format: + +```bash +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' + +python scripts/download.py --repo_id mistralai/Mixtral-8x7B-Instruct-v0.1 --from_safetensors true + +python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/mistralai/Mixtral-8x7B-Instruct-v0.1 +``` + +Due to the size of the model, currently only the multi-device sequential generation script can handle it. + +```bash +pip install sentencepiece + +python generate/sequentially.py --checkpoint_dir checkpoints/mistralai/Mixtral-8x7B-Instruct-v0.1 +``` + +You will need enough devices (2, 4, or 8) whose combined memory is higher than 94 GB to fit the model in memory. +Please check out [this section](inference.md#run-a-large-model-on-multiple-smaller-devices) for more information about this script. 
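Before launching the sequential generation script for Mixtral, it can be worth confirming that the combined memory of your visible GPUs actually exceeds the roughly 94 GB needed for the weights. The sketch below only sums the device memory PyTorch reports and ignores activations and the KV cache, so treat it as a rough lower bound.

```python
import torch

REQUIRED_GB = 94  # approximate size of the Mixtral 8x7B weights mentioned above

total_gb = 0.0
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    gb = props.total_memory / 1024**3
    total_gb += gb
    print(f"cuda:{i} ({props.name}): {gb:.1f} GB")

print(f"Combined GPU memory: {total_gb:.1f} GB")
if total_gb < REQUIRED_GB:
    print("Probably not enough memory; expose more devices or try --quantize bnb.nf4-dq")
```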
diff --git a/tutorials/download_openllama.md b/tutorials/download_openllama.md index 5fa3f8e..009fd4a 100644 --- a/tutorials/download_openllama.md +++ b/tutorials/download_openllama.md @@ -21,7 +21,7 @@ openlm-research/open_llama_13b In order to use a specific OpenLLaMA checkpoint, for instance [open_llama_3b](https://huggingface.co/openlm-research/open_llama_3b), download the weights and convert the checkpoint to the lit-gpt format: ```bash -pip install huggingface_hub +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' python scripts/download.py --repo_id openlm-research/open_llama_3b diff --git a/tutorials/download_phi.md b/tutorials/download_phi.md new file mode 100644 index 0000000..31f7e19 --- /dev/null +++ b/tutorials/download_phi.md @@ -0,0 +1,77 @@ +## Download [phi](https://arxiv.org/abs/2309.05463) weights + +### Phi 2 + +Microsoft Research [released](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/) Phi 2, which is a 2.7 billion parameter model trained on "textbook-quality" data with knowledge distillation from Phi 1.5. The model achieves sota results among base LLMs with less than 13B parameters and matches or outperforms models up to 25x larger on complex benchmarks, e.g. it achieves better performance compared to 25x larger Llama-2-70B model on multi-step reasoning tasks, i.e., coding and math. Phi 2 was trained on 1.4T tokens and has not undergone any RLHF alignment nor has it been instruct fine-tuned. Phi 2 shares the same architecture with Phi 1.5 and has context length of 2048 tokens. +The model weights are released under [*Microsoft Research license*](https://huggingface.co/microsoft/phi-2#license). + +To download the model weights and convert them to the lit-gpt format, run + +```bash +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' + +python scripts/download.py --repo_id microsoft/phi-2 --from_safetensors True + +python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/microsoft/phi-2 +``` + +> [!WARNING] +> Phi-2 used [dropout](https://huggingface.co/microsoft/phi-2/blob/cb2f453/config.json#L26) during training which we don't model, so training will not be equal. + +Inference the model in instruct mode: + +```bash +python chat/base.py --checkpoint_dir checkpoints/microsoft/phi-2 +``` +```text +>> Prompt: Write a detailed analogy between mathematics and a lighthouse. +>> Reply: Mathematics is like a lighthouse. Mathematics provides a method to guide us through the sometimes chaotic and confusing waters of life. It provides a structured approach to problems which can help us find our way and provide direction. Just as a lighthouse keeps watch over the sea, mathematics can provide us with the tools to try and make sense of the world. Furthermore, just as a lighthouse keeps a watchful eye on the horizon, mathematics can help us reach our goals by showing us the way. +``` + +> [!NOTE] +> In order to obtain appropriate answers, you may need to tweak the [input prompt](https://github.com/Lightning-AI/lit-gpt/blob/74b8df0c3f07fc31d9d1a49e870a1f7955329ad8/chat/base.py#L359). E.g. we found out that if using `"Instruct:{prompt}\nOutput:\n"` instead of `"Instruct:{prompt}\nOutput:"` the model generates longer answers in some cases. + +Free generation mode: +```bash +python generate/base.py --prompt "Alice: I don't know why, I'm struggling to maintain focus while studying. 
Any suggestions?\nBob:" --checkpoint_dir checkpoints/microsoft/phi-2 +``` +which yields +```text +Alice: I don't know why, I'm struggling to maintain focus while studying. Any suggestions? +Bob: Well, one possible reason could be stress. Have you been feeling overwhelmed lately? +Alice: Yes, I've been juggling multiple deadlines and it's been quite taxing. +Carol: Stress can definitely impact your ability to concentrate. Maybe you need +``` + +### Phi 1.5 + +A team at Microsoft Research has made available Phi 1.5, which is a 1.3 billion parameter model optimized for common sense reasoning in natural language, showing performance on par with models 5x its size, especially in grade-school mathematics and basic coding. This model retains characteristics of larger LLMs, and significant improvement was noted in reducing toxic and biased generations by avoiding web data. It's also worth highlighting that while this model performs well on language understanding and common sense reasoning tasks, it is a base model that has not undergone any supervised instruction finetuning or finetuning with RLHF. + +The model was trained the same data sources (7B tokens) as its [phi-1](https://arxiv.org/abs/2306.11644) predecessor, which includes + +- a Python code subset from [The Stack](https://arxiv.org/abs/2211.15533) v1.2 +- Q&A texts from [StackOverflow](https://archive.org/download/stackexchange) +- code from DeepMind [code_contests](https://github.com/deepmind/code_contests) +- synthetic Python textbooks and exercises generated by [gpt-3.5-turbo-0301](https://platform.openai.com/docs/models/gpt-3-5) + +In addition, to create phi-1.5, the authors included additional textbook-quality synthetic text (roughly 20B tokens) in natural language, which was created using the [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) approach. + +The model weights are released under a [*Microsoft Research license*](https://huggingface.co/microsoft/phi-1_5/blob/main/README.md#license). + +In order to use the phi-1.5 model checkpoint, which requires about 3 Gb of disk space, download the weights and convert the checkpoint to the lit-gpt format: + +```bash +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' + +python scripts/download.py --repo_id microsoft/phi-1_5 + +python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/microsoft/phi-1_5 +``` + +You're done! To execute the model just run: + +```bash +pip install tokenizers + +python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/microsoft/phi-1_5 +``` diff --git a/tutorials/download_phi15.md b/tutorials/download_phi15.md deleted file mode 100644 index e85f8ac..0000000 --- a/tutorials/download_phi15.md +++ /dev/null @@ -1,34 +0,0 @@ -## Download [phi-1.5](https://arxiv.org/abs/2309.05463) weights - -A team at Microsoft Research has made available Phi 1.5, which is a 1.3 billion parameter model optimized for common sense reasoning in natural language, showing performance on par with models 5x its size, especially in grade-school mathematics and basic coding. This model retains characteristics of larger LLMs, and significant improvement was noted in reducing toxic and biased generations by avoiding web data. It's also worth highlighting that while this model performs well on language understanding and common sense reasoning tasks, it is a base model that has not undergone any supervised instruction finetuning or finetuning with RLHF. 
- -The model was trained the same data sources (7B tokens) as its [phi-1](https://arxiv.org/abs/2306.11644) predecessor, which includes - -- a Python code subset from [The Stack](https://arxiv.org/abs/2211.15533) v1.2 -- Q&A texts from [StackOverflow](https://archive.org/download/stackexchange) -- code from DeepMind [code_contests](https://github.com/deepmind/code_contests) -- synthetic Python textbooks and exercises generated by [gpt-3.5-turbo-0301](https://platform.openai.com/docs/models/gpt-3-5) - -In addition, to create phi-1.5, the authors included additional textbook-quality synthetic text (roughly 20B tokens) in natural language, which was created using the [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) approach. - - -The model weights are released under a [*Microsoft Research license*](https://huggingface.co/microsoft/phi-1_5/blob/main/README.md#license). - - -In order to use the phi-1.5 model checkpoint, which requires about 3 Gb of disk space, download the weights and convert the checkpoint to the lit-gpt format: - -```bash -pip install huggingface_hub - -python scripts/download.py --repo_id microsoft/phi-1_5 - -python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/microsoft/phi-1_5 -``` - -You're done! To execute the model just run: - -```bash -pip install tokenizers - -python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/microsoft/phi-1_5 -``` diff --git a/tutorials/download_pythia.md b/tutorials/download_pythia.md index 9183e69..7124fd3 100644 --- a/tutorials/download_pythia.md +++ b/tutorials/download_pythia.md @@ -3,7 +3,7 @@ EleutherAI's project Pythia combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers. Weights are released under the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). For detailed info on the models, their training, and their behavior, please see the [Pythia repository](https://github.com/EleutherAI/pythia). -It includes a suite of 8 checkpoints (weights) on 2 different datasets: [The Pile](https://pile.eleuther.ai/), as well as The Pile with deduplication applied. +It includes a suite of 8 checkpoints (weights) on 2 different datasets: [The Pile](https://pile.eleuther.ai/), as well as The Pile with deduplication applied. In addition there are two small models that come only in non-deduplicated form: `Pythia-14m` and `Pythia-31m`. 
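If you prefer to query the Hub from Python rather than piping `scripts/download.py` through `grep` as shown next, a minimal sketch using `huggingface_hub` (assuming a reasonably recent version of the library) looks like this; note that it lists every matching repository on the Hub, including checkpoints that Lit-GPT does not support.

```python
from huggingface_hub import list_models

# List EleutherAI repositories whose name contains "pythia".
for model in list_models(author="EleutherAI", search="pythia"):
    print(model.id)
```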
To see all the available checkpoints for Pythia, run: @@ -14,6 +14,8 @@ python scripts/download.py | grep pythia which will print ```text +EleutherAI/pythia-14m +EleutherAI/pythia-31m EleutherAI/pythia-70m EleutherAI/pythia-160m EleutherAI/pythia-410m @@ -35,7 +37,7 @@ EleutherAI/pythia-12b-deduped In order to use a specific Pythia checkpoint, for instance [pythia-1b](https://huggingface.co/EleutherAI/pythia-1b), download the weights and convert the checkpoint to the lit-gpt format: ```bash -pip install huggingface_hub +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' python scripts/download.py --repo_id EleutherAI/pythia-1b diff --git a/tutorials/download_redpajama_incite.md b/tutorials/download_redpajama_incite.md index f065455..4bb7c91 100644 --- a/tutorials/download_redpajama_incite.md +++ b/tutorials/download_redpajama_incite.md @@ -27,7 +27,7 @@ togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1 In order to use a specific RedPajama-INCITE checkpoint, for instance [RedPajama-INCITE-Base-3B-v1](https://huggingface.co/togethercomputer/RedPajama-INCITE-Base-3B-v1), download the weights and convert the checkpoint to the lit-gpt format: ```bash -pip install huggingface_hub +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' python scripts/download.py --repo_id togethercomputer/RedPajama-INCITE-Base-3B-v1 diff --git a/tutorials/download_stablecode.md b/tutorials/download_stablecode.md new file mode 100644 index 0000000..19c06f8 --- /dev/null +++ b/tutorials/download_stablecode.md @@ -0,0 +1,54 @@ +## Download [StableCode](https://huggingface.co/collections/stabilityai/stable-code-64f9dfb4ebc8a1be0a3f7650) weights + +StableCode is a suite of 4 developer assistant models. + +Each one of them is a decoder-only code completion model with 3 billion parameters, pre-trained on a diverse collection of programming languages that ranked highest in the 2023 StackOverflow developer survey. + +For more info on the models, please visit the [StableCode repository](https://huggingface.co/collections/stabilityai/stable-code-64f9dfb4ebc8a1be0a3f7650). + +------ + +To see all the available checkpoints for StableCode, run: + +```bash +python scripts/download.py | grep -E "stable-?code" +``` + +which will print: + +```text +stabilityai/stablecode-completion-alpha-3b +stabilityai/stablecode-completion-alpha-3b-4k +stabilityai/stablecode-instruct-alpha-3b +stabilityai/stable-code-3b +``` + +In order to use a specific StableCode checkpoint, for instance [stable-code-3b](https://huggingface.co/stabilityai/stable-code-3b), download the weights and convert the checkpoint to the Lit-GPT format: + +```bash +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' + +export repo_id=stabilityai/stable-code-3b +python scripts/download.py --repo_id $repo_id --from_safetensors=True +python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/$repo_id +``` + +> [!NOTE] +> `stablecode-completion-alpha-3b` is shipped in PyTorch .bin format, thus set `--from_safetensors=False`. + +By default, the `convert_hf_checkpoint` step will use the data type of the HF checkpoint's parameters. In cases where RAM +or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. + +You're done! 
To execute the model just run: + +```bash +pip install tokenizers + +python generate/base.py --prompt "Write in Python a softmax function. Be concise." --checkpoint_dir checkpoints/$repo_id +``` + +Or you can run the model in an interactive mode: + +```bash +python chat/base.py --checkpoint_dir checkpoints/$repo_id +``` diff --git a/tutorials/download_stablelm.md b/tutorials/download_stablelm.md index b36976e..6904843 100644 --- a/tutorials/download_stablelm.md +++ b/tutorials/download_stablelm.md @@ -1,8 +1,6 @@ ## Download [StableLM](https://github.com/Stability-AI/StableLM) weights -StableLM is a family of generative language models trained by StabilityAI, trained on a dataset derived from [The Pile](https://pile.eleuther.ai/) but 3x larger, for a total of 1.5 trillion tokens. Weights are released under the [CC-BY-SA license](https://creativecommons.org/licenses/by-sa/4.0). - -For more info on the models, please see the [StableLM repository](https://github.com/EleutherAI/pythia). 3B and a 7B checkpoints have been released, both after pre-training and after instruction tuning, using a combination of Stanford's Alpaca, Nomic-AI's gpt4all, RyokoAI's ShareGPT52K datasets, Databricks labs' Dolly, and Anthropic's HH. +StableLM is a family of generative language models trained by StabilityAI. To see all the available checkpoints for StableLM, run: @@ -10,26 +8,45 @@ To see all the available checkpoints for StableLM, run: python scripts/download.py | grep stablelm ``` -which will print +which will print: ```text stabilityai/stablelm-base-alpha-3b stabilityai/stablelm-base-alpha-7b stabilityai/stablelm-tuned-alpha-3b stabilityai/stablelm-tuned-alpha-7b +stabilityai/stablelm-3b-4e1t +stabilityai/stablelm-zephyr-3b ``` -In order to use a specific StableLM checkpoint, for instance [stablelm-base-alpha-3b](http://huggingface.co/stabilityai/stablelm-base-alpha-3b), download the weights and convert the checkpoint to the lit-gpt format: +> [!Important] +> `stablelm-base-alpha-(3,7)b` and `stablelm-tuned-alpha-(3,7)b` are deprecated and are no longer in the StableLM collection. Last time these models were updated in April 2023. Consider using `stablelm-3b-4e1t` (base model) or `stablelm-zephyr-3b` (instruct fine-tuned). -```bash -pip install huggingface_hub +### StableLM-3B-4E1T + +StableLM-3B-4E1T is a 3 billion (3B) parameter language model pre-trained under the multi-epoch regime to study the impact of repeated tokens on downstream performance. + +Building on past achievements, StabilityAI underwent training on 1 trillion tokens for 4 epochs, as recommended by Muennighoff et al. (2023) in their study "Scaling Data-Constrained Language Models." They noted that training with repeated data over 4 epochs has minimal impact on loss compared to using unique data. Additionally, insights from "Go smol or go home" (De Vries, 2023) guided the choice of token count. The research suggests that a 2.96B model trained on 2.85 trillion tokens can achieve a loss similar to a compute-optimized 9.87B language model. +More info can be found on [GitHub](https://github.com/Stability-AI/StableLM?tab=readme-ov-file#stablelm-3b-4e1t). + +### StableLM Zephyr 3B + +Lightweight LLM, preference tuned for instruction following and Q&A-type tasks. This model is an extension of the pre-existing StableLM 3B-4e1t model and is inspired by the Zephyr 7B model from HuggingFace. 
With StableLM Zephyr's 3 billion parameters, this model efficiently caters to a wide range of text generation needs, from simple queries to complex instructional contexts on edge devices. +More details can be found in the [announcement](https://stability.ai/news/stablelm-zephyr-3b-stability-llm). + +### Usage -python scripts/download.py --repo_id stabilityai/stablelm-base-alpha-3b +In order to use a specific StableLM checkpoint, for instance [StableLM Zephyr 3B](https://huggingface.co/stabilityai/stablelm-zephyr-3b), download the weights and convert the checkpoint to the Lit-GPT format: -python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b +```bash +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' + +export repo_id=stabilityai/stablelm-zephyr-3b +python scripts/download.py --repo_id $repo_id --from_safetensors=True +python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/$repo_id ``` -By default, the convert_hf_checkpoint step will use the data type of the HF checkpoint's parameters. In cases where RAM +By default, the `convert_hf_checkpoint` step will use the data type of the HF checkpoint's parameters. In cases where RAM or disk size is constrained, it might be useful to pass `--dtype bfloat16` to convert all parameters into this smaller precision before continuing. You're done! To execute the model just run: @@ -37,5 +54,11 @@ You're done! To execute the model just run: ```bash pip install tokenizers -python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b +python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/$repo_id +``` + +Or you can run the model in an interactive mode: + +```bash +python chat/base.py --checkpoint_dir checkpoints/$repo_id ``` diff --git a/tutorials/download_tinyllama.md b/tutorials/download_tinyllama.md new file mode 100644 index 0000000..77e2db9 --- /dev/null +++ b/tutorials/download_tinyllama.md @@ -0,0 +1,68 @@ +## Download TinyLlama weights + +[TinyLlama 1.1B](https://github.com/jzhang38/TinyLlama/) is Apache 2.0 licensed and can be used without restrictions. +It is still in development and at the time of writing this, checkpoints for the model trained up to 2T tokens are available. +The target is to train it for ~3 epochs on 3T tokens total. For more details on the schedule and progress of the pretraining, see the official [README](https://github.com/jzhang38/TinyLlama/tree/main). + +There are two version of TinyLlama available: a base one and a fine-tuned "Chat" version. 
+To see all available versions, run: + +```bash +python scripts/download.py | grep TinyLlama +``` + +which will print + +```text +TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +TinyLlama/TinyLlama-1.1B-Chat-v1.0 +``` + +In order to use a specific checkpoint, for instance [TinyLlama 1.1B base model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T), which requires about 5 GB of disk space, download the weights and convert the checkpoint to the lit-gpt format: + +```bash +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' + +python scripts/download.py --repo_id TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T + +python scripts/convert_hf_checkpoint.py \ + --checkpoint_dir checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +``` + +----- + +With the `Chat` version of the model, the download and conversion procedures are slightly different. +As this version of the model is stored in `safetensor` format, to download it an additional flag is required: + +```bash +python scripts/download.py --repo_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --from_safetensors=True +``` + +The model is shipped in `bfloat16` format, so if your hardware doesn't support it, you can provide `dtype` argument during model conversion. For example we can convert the weights into `float32` format: + +```bash +python scripts/convert_hf_checkpoint.py \ + --checkpoint_dir checkpoints/TinyLlama/TinyLlama-1.1B-Chat-v1.0 --dtype=float32 +``` + +----- + +You're done! To execute the model just run: + +```bash +pip install sentencepiece + +# base version +python chat/base.py --checkpoint_dir checkpoints/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T + +# or + +# chat version +python chat/base.py --checkpoint_dir checkpoints/TinyLlama/TinyLlama-1.1B-Chat-v1.0 +``` + +To improve the response from Chat version you can also provide these args (as in the [model card](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0)): + +```bash +python chat/base.py --checkpoint_dir checkpoints/TinyLlama/TinyLlama-1.1B-Chat-v1.0 --top_k=50 --temperature=0.7 +``` diff --git a/tutorials/download_vicuna.md b/tutorials/download_vicuna.md index a583a4e..6b274ae 100644 --- a/tutorials/download_vicuna.md +++ b/tutorials/download_vicuna.md @@ -23,7 +23,7 @@ lmsys/vicuna-13b-v1.5-16k In order to use a specific Vicuna checkpoint, for instance [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5), download the weights and convert the checkpoint to the lit-gpt format: ```bash -pip install huggingface_hub +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' python scripts/download.py --repo_id lmsys/vicuna-7b-v1.5 diff --git a/tutorials/evaluation.md b/tutorials/evaluation.md index 2b3bdc5..e1dc463 100644 --- a/tutorials/evaluation.md +++ b/tutorials/evaluation.md @@ -22,7 +22,6 @@ Use the following command to evaluate Lit-GPT models on all tasks in Eleuther AI python eval/lm_eval_harness.py \ --checkpoint_dir "checkpoints/meta-llama/Llama-2-7b-hf" \ --precision "bf16-true" \ - --batch_size 4 \ --save_filepath "results.json" ``` @@ -33,7 +32,6 @@ python eval/lm_eval_harness.py \ --checkpoint_dir "checkpoints/meta-llama/Llama-2-7b-hf" \ --eval_tasks "[truthfulqa_mc,hellaswag]" \ --precision "bf16-true" \ - --batch_size 4 \ --save_filepath "results.json" ``` @@ -45,7 +43,7 @@ A list of supported tasks can be found [here](https://github.com/EleutherAI/lm-e The above command can be used to evaluate models that are saved 
via a single checkpoint file. This includes downloaded checkpoints and base models finetuned via the full and adapter finetuning scripts. -For LoRA-finetuned models, you need to first merge the LoRA weights with the original checkpoint file as described in the [Merging LoRA Weights](https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/finetune_lora.md#merging-lora-weights) section of the LoRA finetuning documentation. +For LoRA-finetuned models, you need to first merge the LoRA weights with the original checkpoint file as described in the [Merging LoRA Weights](finetune_lora.md#merging-lora-weights) section of the LoRA finetuning documentation.   @@ -60,7 +58,6 @@ For LoRA-finetuned models, you need to first merge the LoRA weights with the ori --checkpoint_dir "checkpoints/meta-llama/Llama-2-7b-hf" \ --precision "bf16-true" \ --eval_tasks "[hendrycksTest*]" \ - --batch_size 4 \ --num_fewshot 5 \ --save_filepath "results.json" ``` diff --git a/tutorials/finetune_adapter.md b/tutorials/finetune_adapter.md index 7fb8937..97de63f 100644 --- a/tutorials/finetune_adapter.md +++ b/tutorials/finetune_adapter.md @@ -27,48 +27,63 @@ For more information about dataset preparation, also see the [prepare_dataset.md ## Running the finetuning ```bash -python finetune/adapter.py --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b +python finetune/adapter.py --io.checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b ``` or for Adapter V2 ```bash -python finetune/adapter_v2.py --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b +python finetune/adapter_v2.py --io.checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b ``` The finetuning requires at least one GPU with ~12 GB memory. -You can speed up training by setting the `devices` variable in the script to utilize more GPUs if available. +You can speed up training by passing the `devices` argument to the script to utilize more GPUs if available. Depending on the available GPU memory, you can also tune the `micro_batch_size` parameter to utilize the GPU efficiently. -To fit Adapter V2 to 12GB memory set micro_batch_size = 2. +To fit Adapter V2 to 12GB memory set `--micro_batch_size 2`. For example, the following settings will let you finetune the model in under 1 hour: -```python -devices = 4 -micro_batch_size = 4 +```bash +--devices 4 --train.micro_batch_size 4 ``` This script will save checkpoints periodically to the `out_dir` directory. If you are finetuning different models or on your own dataset, you can specify an output directory with your preferred name: ```bash -python finetune/adapter.py --out_dir out/adapter/my-model-finetuned +python finetune/adapter.py --io.out_dir out/adapter/my-model-finetuned ``` or for Adapter V2 ```bash -python finetune/adapter_v2.py --out_dir out/adapter_v2/my-model-finetuned +python finetune/adapter_v2.py --io.out_dir out/adapter_v2/my-model-finetuned ``` If your GPU does not support `bfloat16`, you can pass the `--precision 32-true` argument. For instance, to fine-tune on MPS (the GPU on modern Macs), you can run ```bash -python finetune/adapter.py --out_dir out/adapter/my-model-finetuned --precision 32-true +python finetune/adapter.py --io.out_dir out/adapter/my-model-finetuned --precision 32-true ``` Note that `mps` as the accelerator will be picked up automatically by Fabric when running on a modern Mac. 
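If you are unsure whether your hardware supports `bfloat16`, a small check with plain PyTorch APIs can help you decide whether to pass `--precision 32-true`. This is only a sketch that follows the guidance above (full precision on MPS and on GPUs without bf16 support):

```python
import torch

# Suggest a --precision flag based on the available hardware.
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    precision = "bf16-true"
else:
    # Covers MPS (modern Macs), CPUs, and GPUs without bfloat16 support.
    precision = "32-true"

print(f"Suggested flag: --precision {precision}")
```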
+### Quantization + +Optionally, finetuning using quantization can be enabled via the `--quantize` flag, for example using the 4-bit NormalFloat data type: + +```bash +python finetune/adapter.py --quantize "bnb.nf4" +``` + +or using adapter_v2 with double-quantization: + +```bash +python finetune/adapter_v2.py --quantize "bnb.nf4-dq" +``` + +For additional benchmarks and resource requirements, please see the [Resource Tables](resource-tables.md). + ## Test the model You can test the finetuned model with your own instructions by running: @@ -131,12 +146,7 @@ With only a few modifications, you can prepare and train on your own instruction ```bash python finetune/adapter.py \ - --data_dir data/mydata/ \ - --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b \ - --out_dir data/mydata-finetuned + --io.train_data_dir data/mydata --io.val_data_dir data/mydata/ \ + --io.checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b \ + --io.out_dir data/mydata-finetuned ``` - -## Troubleshooting - -If you run into a CUDA error "Expected is_sm80 to be true, but got false", uncomment the line -`torch.backends.cuda.enable_flash_sdp(False)` in the finetune script (see ). diff --git a/tutorials/finetune_full.md b/tutorials/finetune_full.md index e74807b..f6109d8 100644 --- a/tutorials/finetune_full.md +++ b/tutorials/finetune_full.md @@ -21,7 +21,7 @@ For more information about dataset preparation, also see the [prepare_dataset.md ## Running the finetuning ```bash -python finetune/full.py --checkpoint_dir checkpoints/tiiuae/falcon-7b +python finetune/full.py --io.checkpoint_dir checkpoints/tiiuae/falcon-7b ``` Finetuning the falcon-7b model requires at least 8 GPUs with ~40 GB memory each. @@ -32,14 +32,14 @@ Depending on the available GPU memory, you can also tune the `micro_batch_size` This script will save checkpoints periodically to the `out_dir` directory. If you are finetuning different models or on your own dataset, you can specify an output directory with your preferred name: ```bash -python finetune/full.py --out_dir out/full/my-model-finetuned +python finetune/full.py --io.out_dir out/full/my-model-finetuned ``` If your GPU does not support `bfloat16`, you can pass the `--precision 32-true` argument. For instance, to fine-tune on MPS (the GPU on modern Macs), you can run ```bash -python finetune/full.py --out_dir out/full/my-model-finetuned --precision 32-true +python finetune/full.py --io.out_dir out/full/my-model-finetuned --precision 32-true ``` Note that `mps` as the accelerator will be picked up automatically by Fabric when running on a modern Mac. @@ -51,7 +51,8 @@ You can test the finetuned model with your own instructions by running: ```bash python generate/full.py \ --prompt "Recommend a movie to watch on the weekend." 
\ - --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b + --checkpoint_dir checkpoints/tiiuae/falcon-7b \ + --finetuned_path out/full/my-model-finetuned/lit_model_finetuned.pth ``` Output: @@ -98,12 +99,7 @@ With only a few modifications, you can prepare and train on your own instruction ```bash python finetune/full.py \ - --data_dir data/mydata/ \ - --checkpoint_dir checkpoints/tiiuae/falcon-7b \ - --out_dir data/mydata-finetuned + --io.train_data_dir data/mydata --io.val_data_dir data/mydata/ \ + --io.checkpoint_dir checkpoints/tiiuae/falcon-7b \ + --io.out_dir data/mydata-finetuned ``` - -## Troubleshooting - -If you run into a CUDA error "Expected is_sm80 to be true, but got false", uncomment the line -`torch.backends.cuda.enable_flash_sdp(False)` in the finetune script (see ). diff --git a/tutorials/finetune_lora.md b/tutorials/finetune_lora.md index 37a0b91..0b858f2 100644 --- a/tutorials/finetune_lora.md +++ b/tutorials/finetune_lora.md @@ -21,7 +21,8 @@ The steps here only need to be done once: 3. Download the data and generate the instruction tuning dataset: ```bash -python scripts/prepare_alpaca.py --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b +python scripts/prepare_alpaca.py \ + --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b ``` or [prepare your own dataset](#tune-on-your-dataset). @@ -57,23 +58,25 @@ and optionally with double-quantization: python finetune/lora.py --quantize "bnb.nf4-dq" ``` -The table below lists a comparison with different settings on a StableLM 3B model finetuned with LoRA on Alpaca for 5,000 iterations using a microbatch size of 4: +The table below lists a comparison with different settings on a StableLM 3B model finetuned with LoRA on Alpaca for 1,000 iterations using a microbatch size of 1: -| Settings | Training Memory | Training Time | Loss | Inference Memory | -|---------------------------------------------|-----------------|---------------|--------|------------------| -| Default (bf16-mixed) | 34.57 GB | 591.78s | 0.9207 | 21.43 GB | -| --precision bf16-true | 16.93 GB | 592.14s | 0.9180 | 7.30 GB | -| --precision bf16-true --quantize bnb.nf4 | 15.28 GB | 802.02s | 0.9408 | 3.20 GB | -| --precision bf16-true --quantize bnb.nf4-dq | 15.12 GB | 802.94s | 0.9384 | 3.04 GB | +| Settings | Training Memory | Training Time | Inference Memory | +|---------------------------------------------|-----------------|---------------|-------------------| +| Default (bf16-mixed) | 26.92 GB | 1.34 min | 21.43 GB | +| --precision bf16-true | 9.69 GB | 1.24 min | 7.30 GB | +| --precision bf16-true --quantize bnb.nf4 | 6.35 GB | 1.82 min | 3.20 GB | +| --precision bf16-true --quantize bnb.nf4-dq | 6.19 GB | 1.87 min | 3.04 GB | -The advantages of QLoRA-style quantization are more pronounced in larger models, such as Llama 2 7B. The table below summarizes the results for Llama 2 7B on Alpaca for 5,000 iterations using a microbatch size of 1: +The advantages of QLoRA-style quantization are more pronounced in larger models, such as Llama 2 7B. 
The table below summarizes the results for Llama 2 7B on Alpaca for 1,000 iterations using a microbatch size of 1: -| Settings | Training Memory | Training Time | Loss | Inference Memory | -|---------------------------------------------|------------------|---------------|--------|------------------| -| Default (bf16-mixed) | OutOfMemoryError | N/A | N/A | 40.21 GB | -| --precision bf16-true | 21.30 GB | 876.30s | 0.8696 | 13.52 GB | -| --precision bf16-true --quantize bnb.nf4 | 14.14 GB | 1089.79s | 1.0130 | 4.57 GB | -| --precision bf16-true --quantize bnb.nf4-dq | 13.84 GB | 1135.86s | 1.0124 | 4.26 GB | +| Settings | Training Memory | Training Time | Inference Memory | +|---------------------------------------------|------------------|---------------|------------------| +| Default (bf16-mixed) | OutOfMemoryError | N/A | 40.21 GB | +| --precision bf16-true | 21.30 GB | 2.36 min | 13.52 GB | +| --precision bf16-true --quantize bnb.nf4 | 14.14 GB | 3.68 min | 4.57 GB | +| --precision bf16-true --quantize bnb.nf4-dq | 13.84 GB | 3.83 min | 4.26 GB | + +For additional benchmarks and resource requirements, please see the [Resource Tables](resource-tables.md).   @@ -82,7 +85,8 @@ The advantages of QLoRA-style quantization are more pronounced in larger models, You can test the finetuned model with your own instructions by running: ```bash -python generate/lora.py --prompt "Recommend a movie to watch on the weekend." +python generate/lora.py \ + --prompt "Recommend a movie to watch on the weekend." ``` Output: @@ -124,13 +128,16 @@ With only a few modifications, you can prepare and train on your own instruction 4. Run the script to generate the preprocessed, tokenized train-val split: ```bash - python scripts/prepare_mydata.py --destination_path data/mydata/ + python scripts/prepare_mydata.py \ + --destination_path data/mydata/ ``` 5. Run `finetune/lora.py` by passing in the location of your data (and optionally other parameters): ```bash - python finetune/lora.py --data_dir data/mydata/ --out_dir out/myexperiment + python finetune/lora.py \ + --io.train_data_dir data/mydata --io.val_data_dir data/mydata/ \ + --out_dir out/myexperiment ```   @@ -146,9 +153,9 @@ Let's assume we finetuned a model using LoRA as follows: ```bash python finetune/lora.py \ - --checkpoint_dir "checkpoints/stabilityai/stablelm-base-alpha-3b/" \ - --data_dir "data/alpaca" \ - --out_dir "out/lora_weights/stablelm-base-alpha-3b/" + --io.checkpoint_dir "checkpoints/stabilityai/stablelm-base-alpha-3b/" \ + --io.train_data_dir data/mydata --io.val_data_dir data/mydata/ \ + --io.out_dir "out/lora_weights/stablelm-base-alpha-3b/" ``` Then, we can merge the LoRA weights with the checkpoint model using the `merge_lora.py` script as shown below: @@ -160,10 +167,10 @@ python scripts/merge_lora.py \ --out_dir "out/lora_merged/stablelm-base-alpha-3b/" ``` -> [!Note] +> [!IMPORTANT] > If you changed the LoRA hyperparameters (`lora_r`, `lora_key`, etc.) in the -> `finetune/lora.py` script, it is important to update the hyperparameter configuration -> in the `scripts/merge_lora.py` script accordingly. Otherwise, you will encounter size +> `finetune/lora.py` script, it is important to pass the same hyperparameter configuration +> to the `scripts/merge_lora.py` script accordingly. Otherwise, you will encounter size > mismatch errors upon merging. After merging, we can use the `base.py` file for inference using the new checkpoint file. 
Note that if your new checkpoint directory is different from the original checkpoint directory, we also have to copy over the tokenizer and config files: @@ -189,15 +196,7 @@ Similarly, you can evaluate the model using the `eval/lm_eval_harness.py` script ```bash python eval/lm_eval_harness.py \ - --checkpoint_dir "out/lora_merged/stablelm-base-alpha-3b/" \ - --precision "bf16-true" \ - --batch_size 4 \ - --save_filepath "results.json" + --checkpoint_dir "out/lora_merged/stablelm-base-alpha-3b/" \ + --precision "bf16-true" \ + --save_filepath "results.json" ``` - -  - -## Troubleshooting - -If you run into a CUDA error "Expected is_sm80 to be true, but got false", uncomment the line -`torch.backends.cuda.enable_flash_sdp(False)` in the script below (see ). diff --git a/tutorials/inference.md b/tutorials/inference.md index bb21cef..3d10173 100644 --- a/tutorials/inference.md +++ b/tutorials/inference.md @@ -33,23 +33,92 @@ Check out our [quantization tutorial](quantize.md). ## Run a large model on multiple smaller devices -You can also use the Fully-Sharded Data Parallel (FSDP) distributed strategy to leverage multiple devices to perform inference. This will allow you to run models that wouldn't fit in a single card by sharding them across several. +We offer two scripts to leverage multiple devices for inference. -For instance, `falcon-40b` would require ~80 GB of GPU memory to run on a single device. We can instead run it on 4 A100 40GB GPUs: +### [`generate/sequentially.py`](../generate/sequentially.py) + +Allows you to run models that wouldn't fit in a single card by partitioning the transformer blocks across all your devices and running them sequentially. + +For instance, `meta-llama/Llama-2-70b-chat-hf` would require ~140 GB of GPU memory to load on a single device, plus the memory for activations. +With 80 transformer layers, we could partition them across 8, 5, 4, or 2 devices. ```shell -python generate/base.py --checkpoint_dir checkpoints/tiiuae/falcon-40b --strategy fsdp --devices 4 +python generate/sequentially.py \ + --checkpoint_dir checkpoints/meta-llama/Llama-2-70b-chat-hf \ + --max_new_tokens 256 \ + --num_samples 2 ``` -Which will take 32 GB of memory, and run at 0.37 tokens/sec. +Using A100 40GB GPUs, we need to use at least 4. You can control the number of devices by setting the `CUDA_VISIBLE_DEVICES=` environment variable. + +| Devices | Max GPU RAM | Token/sec | +|---------|-------------|-----------| +| 2 | OOM | - | +| 4 | 35.64 GB | 7.55 | +| 5 | 28.72 GB | 7.49 | +| 8 | 18.35 GB | 7.47 | -Or to reduce the memory requirements even further, you can try using CPU offloading. For that, you will need to manually edit the `cpu_offload=False` parameter in the file and set it to `True`. +Note that the memory usage will also depend on the `max_new_tokens` value used. -Now we can run it on just 2 devices. +The script also supports quantization, using 4-bit precision, we can now use 2 GPUs ```shell -python generate/base.py --checkpoint_dir checkpoints/tiiuae/falcon-40b --strategy fsdp --devices 2 +python generate/sequentially.py \ + --checkpoint_dir checkpoints/meta-llama/Llama-2-70b-chat-hf \ + --max_new_tokens 256 \ + --num_samples 2 \ + --quantize bnb.nf4-dq ``` -taking 13 GB of memory but running at 0.12 tokens/sec on 2 A100 40GB GPUs. -Smaller devices like 3090s (24 GB) can also fit it with this technique. 
+| Devices | Max GPU RAM | Token/sec | +|---------|-------------|-----------| +| 2 | 20.00 GB | 8.63 | +| 4 | 10.80 GB | 8.23 | +| 5 | 8.96 GB | 8.10 | +| 8 | 6.23 GB | 8.18 | + +Smaller devices can also be used to run inference with this technique. + +### [`generate/tp.py`](../generate/tp.py) + +Uses tensor parallelism (TP) to run models that wouldn't fit in a single card by sharding the MLP and Attention QKV linear layers across all your devices. + +For instance, `meta-llama/Llama-2-70b-chat-hf` would require ~140 GB of GPU memory to load on a single device, plus the memory for activations. +The requirement is that the intermediate size (for the MLP) and the QKV size (for attention) is divisible by the number of devices. +With an intermediate size of 28672, we can use 2, 4, 7, or 8 devices. With a QKV size of 10240 we can use 2, 4, 5, or 8 devices. +Since the script is configured to shard both, the intersection is used: we can only use 2, 4, or 8 devices. + +```shell +python generate/tp.py \ + --checkpoint_dir checkpoints/meta-llama/Llama-2-70b-chat-hf \ + --max_new_tokens 256 \ + --num_samples 2 +``` + +Using A100 40GB GPUs, we need to use at least 4. You can control the number of devices by setting the `CUDA_VISIBLE_DEVICES=` environment variable. + +| Devices | Max GPU RAM | Token/sec | +|---------|-------------|-----------| +| 2 | OOM | - | +| 4 | 35.46 GB | 9.33 | +| 8 | 18.19 GB | 8.61 | + +Note that the memory usage will also depend on the `max_new_tokens` value used. + +The script also supports quantization, using 4-bit precision, we can now use 2 GPUs + +```shell +python generate/tp.py \ + --checkpoint_dir checkpoints/meta-llama/Llama-2-70b-chat-hf \ + --max_new_tokens 256 \ + --num_samples 2 \ + --quantize bnb.nf4-dq +``` + +| Devices | Max GPU RAM | Token/sec | +|---------|-------------|-----------| +| 2 | 19.79 GB | 6.72 | +| 4 | 10.73 GB | 6.48 | +| 8 | 6.15 GB | 6.20 | + +Smaller devices can also be used to run inference with this technique. diff --git a/tutorials/neurips_challenge_quickstart.md b/tutorials/neurips_challenge_quickstart.md deleted file mode 100644 index 62289aa..0000000 --- a/tutorials/neurips_challenge_quickstart.md +++ /dev/null @@ -1,220 +0,0 @@ -# NeurIPS 2023 LLM Efficiency Challenge Quickstart Guide - - - -The [NeurIPS 2023 Efficiency Challenge](https://llm-efficiency-challenge.github.io/) is a competition focused on training **1 LLM for 24 hours on 1 GPU** – the team with the best LLM gets to present their results at NeurIPS 2023. - -This quick start guide is a short starter guide illustrating the main steps to get started with Lit-GPT, which was selected as the competition's official starter kit. - - - -  - -## Competition Facts - - -  - -**Permitted GPUs:** - -- 1x A100 (40 GB RAM); -- 1x RTX 4090 (24 GB RAM). - -  - -**Permitted models:** - -- All transformer-based LLM base models that are not finetuned yet. - -The subset of Lit-GPT models supported in this competition is listed in the table below. -These don't include models that have been finetuned or otherwise aligned, as per the rules of the challenge. - -  - -| Models in Lit-GPT | Reference | -| ------------------------- | ------------------------------------------------------------ | -| Meta AI Llama 2 Base | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) | -| TII UAE Falcon Base | [TII 2023](https://falconllm.tii.ae/) | -| OpenLM Research OpenLLaMA | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) | -| EleutherAI Pythia | [Biderman et al. 
2023](https://arxiv.org/abs/2304.01373) | -| StabilityAI StableLM Base | [Stability AI 2023](https://github.com/Stability-AI/StableLM) | - -  - -**Permitted datasets** - -Any open-source dataset is allowed. Originally, [per competition rules](https://llm-efficiency-challenge.github.io/challenge), datasets that utilize "generated content" from other LLMs were not permitted. However, the rules were recently softened to also allow LLM-generated datasets if those datasets are made available and if it is not against the usage restrictions and guidelines of the LLM. If you plan to use a specific dataset that is not explicitely listed on the [challenge website](https://llm-efficiency-challenge.github.io/challenge) or want to use LLM-generated data, it is recommended to reach out to the organizers and confirm that this is in line with the competition rules. - -Examples of permitted datasets are the following: - -- [Databricks-Dolly-15](https://huggingface.co/datasets/databricks/databricks-dolly-15k) -- [OpenAssistant Conversations Dataset (oasst1)](https://huggingface.co/datasets/OpenAssistant/oasst1) -- [The Flan Collection](https://github.com/google-research/FLAN/tree/main/flan/v2) - -You are allowed to create your own datasets if they are made -publicly accessible under an open-source license, and they are not generated from other LLMs (even open-source ones). - -Helpful competition rules relevant to the dataset choice: - -- The maximum prompt/completion length the models are expected to handle is 2048 tokens. -- The evaluation will be on English texts only. - -  - -**Submission deadline** - -- October 25, 2023 ([Please check](https://llm-efficiency-challenge.github.io/dates) official website in case of updates.) - -  - -## Lit-GPT Setup - -Use the following steps to set up the Lit-GPT repository on your machine. - -```shell -git clone https://github.com/Lightning-AI/lit-gpt -cd lit-gpt -pip install -r requirements.txt tokenizers sentencepiece huggingface_hub -``` - -  - -## Downloading Model Checkpoints - -This section explains how to download the StableLM 3B Base model, one of the smallest models supported in Lit-GPT (an even smaller, supported model is Pythia, which starts at 70M parameters). The downloaded and converted checkpoints will occupy approximately 28 Gb of disk space. - -```bash -python scripts/download.py \ - --repo_id stabilityai/stablelm-base-alpha-3b - -python scripts/convert_hf_checkpoint.py \ - --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b -``` - -While StableLM 3B Base is useful as a first starter model to set things up, you may want to use the more capable Falcon 7B or Llama 2 7B/13B models later. See the [`download_*`](https://github.com/Lightning-AI/lit-gpt/tree/main/tutorials) tutorials in Lit-GPT to download other model checkpoints. - -After downloading and converting the model checkpoint, you can test the model via the following command: - -```bash -python generate/base.py \ - --prompt "LLM efficiency competitions are fun, because" \ - --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b -``` - -  - -## Downloading and Preparing Datasets - -The following command will download and preprocess the Dolly15k dataset for the StableLM 3B Base model: - -```bash -python scripts/prepare_dolly.py \ - --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b \ - --destination_path data/dolly-stablelm3b -``` - -> [!NOTE] -> The preprocessed dataset is specific to the StableLM 3B model. 
If you use a different model like Falcon or Llama 2 later, you'll need to process the dataset with that model checkpoint directory. This is because each model uses a different tokenizer. - -  - -## Finetuning - -[Low-rank Adaptation (LoRA)](https://lightning.ai/pages/community/tutorial/lora-llm/) is a good choice for a first finetuning run. The Dolly dataset has ~15k samples, and the finetuning might take half an hour. - -To accelerate this for testing purposes, edit the [./finetune/lora.py](https://github.com/Lightning-AI/lit-gpt/blob/main/finetune/lora.py) script and change `max_iters = 50000` to `max_iters = 500` at the top of the file. - -> [!NOTE] -> The Dolly dataset has a relatively long context length, which could result in out-of-memory issues. The maximum context length that is used for the evaluation, [according to the official competition rules](https://llm-efficiency-challenge.github.io/question), is 2,048 tokens. Hence, it's highly recommended to prepare the dataset with a fixed max length, for example, `python scripts/prepare_dolly.py --max_seq_length 2048`. - -The following command finetunes the model: - -```bash -CUDA_VISIBLE_DEVICES=2 python finetune/lora.py \ - --data_dir data/dolly-stablelm3b \ - --checkpoint_dir "checkpoints/stabilityai/stablelm-base-alpha-3b" \ - --out_dir "out/stablelm3b/dolly/lora/experiment1" \ - --precision "bf16-true" -``` - -With 500 iterations, this takes approximately 1-2 min on an A100 and uses 26.30 GB GPU memory. - -If you are using an RTX 4090, change `micro_batch_size=4` to `micro_batch_size=1` so that the model will only use 12.01 GB of memory. - -(More finetuning settings are explained [here](https://lightning.ai/pages/community/tutorial/neurips2023-llm-efficiency-guide/#toc10).) - -  - -## Local Evaluation - -The official Lit-GPT competition will use a small subset of HELM tasks for model evaluation, which includes BigBench (general), MMLU (knowledge), TruthfulQA (knowledge and harm in a multiple choice format), CNN/DailyMail (news summarization), GSM8K (math), and BBQ (bias). - -HELM is currently also being integrated into Lit-GPT to evaluate LLMs before submission. - -However, a tool with a more convenient interface is Eleuther AI's [Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), which contains some tasks, for example, BigBench, TruthfulQA, and GSM8k, that overlap with HELM. We can set up the Evaluation Harness as follows: - -```bash -pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@master -``` - -And then we can use it via the following command: - -```bash -python eval/lm_eval_harness.py \ - --checkpoint_dir "checkpoints/stabilityai/stablelm-base-alpha-3b" \ - --precision "bf16-true" \ - --eval_tasks "[truthfulqa_mc,gsm8k]" \ - --batch_size 4 \ - --save_filepath "results-stablelm-3b.json" -``` - -(You can find a full task list in the task table [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md).) 
- -To evaluate a LoRA-finetuned model, you need to first merge the LoRA weights with the base model to create a new checkpoint file: - - -```bash -python scripts/merge_lora.py \ - --checkpoint_dir "checkpoints/stabilityai/stablelm-base-alpha-3b/" \ - --lora_path "out/stablelm3b/dolly/lora/experiment1/lit_model_lora_finetuned.pth" \ - --out_dir "out/lora_merged/stablelm-base-alpha-3b/" -``` - -```bash -cp checkpoints/stabilityai/stablelm-base-alpha-3b/*.json \ -out/lora_merged/stablelm-base-alpha-3b/ -``` - -For more information on LoRA weight merging, please see the -[Merging LoRA Weights](https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/finetune_lora.md#merging-lora-weights) -section of the LoRA finetuning documentation. - -After merging the weights, we can use the `lm_eval_harness.py` similar to before with the only difference that we now use the new -checkpoint folder containing the merged LoRA model: - -```bash -python eval/lm_eval_harness.py \ - --checkpoint_dir "out/lora_merged/stablelm-base-alpha-3b" \ - --precision "bf16-true" \ - --eval_tasks "[truthfulqa_mc,gsm8k]" \ - --batch_size 4 \ - --save_filepath "results-stablelm-3b.json" -``` - -  - -## Submission - -You will be required to submit a Docker image for the submission itself. Fortunately, the organizers have a GitHub repository with the exact steps [here](https://github.com/llm-efficiency-challenge/neurips_llm_efficiency_challenge) and a toy-submission setup guide to test your model locally before submission. - - -  - -## Additional Information & Resources - -- [The official NeurIPS 2023 LLM Efficiency Challenge competition website](https://llm-efficiency-challenge.github.io/) -- A more extensive guide, including environment setup tips: [The NeurIPS 2023 LLM Efficiency Challenge Starter Guide](https://lightning.ai/pages/community/tutorial/neurips2023-llm-efficiency-guide) -- [Official competition Discord](https://discord.com/login?redirect_to=%2Fchannels%2F1077906959069626439%2F1134560480795570186) and [Lightning AI + Lit-GPT Discord](https://discord.com/invite/MWAEvnC5fU) -- LoRA vs Adapter vs Adapter v2 comparison in Lit-GPT using Falcon 7B: [Finetuning Falcon LLMs More Efficiently With LoRA and Adapters](https://lightning.ai/pages/community/finetuning-falcon-efficiently/) -- [Dealing with out-of-memory (OOM) errors in Lit-GPT](https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/oom.md) -- Introduction to Fabric (an API to access more advanced PyTorch features used in Lit-GPT) and memory saving tips: [Optimizing Memory Usage for Training LLMs and Vision Transformers in PyTorch](https://lightning.ai/pages/community/tutorial/pytorch-memory-vit-llm/) diff --git a/tutorials/oom.md b/tutorials/oom.md index f59c4d7..951a5ed 100644 --- a/tutorials/oom.md +++ b/tutorials/oom.md @@ -14,7 +14,7 @@ Here's a few things you can try: ### Reduce the micro batch size -Adjust the `micro_batch_size = ...` variable in the fine-tuning and pretraining scripts. This variable determines the number of samples loaded per iteration. +Adjust the `--train.micro_batch_size` argument in the fine-tuning and pretraining scripts. This variable determines the number of samples loaded per iteration. A smaller value will simply load fewer samples simultaneously. The minimum value is 1. @@ -23,16 +23,18 @@ Experiment with different micro batch sizes to find a balance between memory con ### Reduce the model's context length The context length (`block_size` in the code) plays a significant role in running models with attention. 
-* The pretraining scripts are configured to use the full context length of the model to train. -* The finetuning scripts are configured to use the longest sample length of the training data to avoid allocating unnecessary memory (`max_seq_length` in the code). - If that's longer than the model's context length, an error is raised. If you try to run a batch that is longer than this, an error is raised. + +* The pretraining scripts are configured to use the full context length of the model to train. +* The finetuning scripts are configured to use the longest sample length of the training data to avoid allocating unnecessary memory (`--train.max_seq_length` argument). + If that's longer than the model's context length, an error is raised. If you try to run a batch that is longer than this, an error is raised. However, your hardware may not support such large context lengths. Here's what you can do: + * For the pretraining scripts, you can simply reduce the `Config(block_size=...)` value. -* For the finetuning scripts, you can trim the length of the samples in your dataset. +* For the finetuning scripts, you can trim the length of the samples in your dataset. Most of the `scripts/prepare_*.py` scripts expose a `--max_seq_length=...` argument. This might also be useful in cases where sample lengths are highly unbalanced, as the presence of a single very long sample would incur a larger memory usage for all other - shorter samples. For example, the median length of the samples in Alpaca is 110 tokens. Truncating the Alpaca dataset to 256 max tokens reduces the memory requirements of a Falcon 7B model from 23.52 GB to 15.73 GB. For more information about the dataset truncation, please see the *Truncating datasets* section in the the [prepare_datasets.md](prepare_datasets.md) tutorial. + shorter samples. For example, the median length of the samples in Alpaca is 110 tokens. Truncating the Alpaca dataset to 256 max tokens reduces the memory requirements of a Falcon 7B model from 23.52 GB to 15.73 GB. For more information about the dataset truncation, please see the *Truncating datasets* section in the [prepare_datasets.md](prepare_datasets.md) tutorial. Keep in mind that reducing the context length will affect the modelling performance on text sequences longer than the limit. @@ -48,7 +50,7 @@ Mixed precision training (`16-mixed`, `bf16-mixed`) provides better stability bu ### Do sharding across multiple GPUs For exceptionally large models, the aforementioned techniques might still not suffice. If you have multiple GPUs available, -you can trade off memory for speed by changing the `devices = 1` argument in the scripts. Enabling this option enables a parallelism technique (FSDP), sharding the memory across different GPUs. +you can trade off memory for speed by changing the `--devices 1` argument in the scripts. Enabling this option enables a parallelism technique (FSDP), sharding the memory across different GPUs. The default configuration already uses activation checkpointing, but you can enable CPU offloading by changing the `cpu_offload=False` argument in the scripts. 
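+
+For readers who want to see how these pieces fit together outside of the bundled scripts, below is a minimal sketch of a sharded setup with Lightning Fabric. It assumes a recent Lightning release whose `FSDPStrategy` accepts the `auto_wrap_policy`, `activation_checkpointing_policy`, and `cpu_offload` arguments shown here; treat it as an illustration, not a drop-in replacement for the finetuning or pretraining scripts.
+
+```python
+import lightning as L
+from lightning.fabric.strategies import FSDPStrategy
+
+from lit_gpt.model import GPT, Block, Config
+
+# Illustration only: shard the transformer blocks across GPUs with FSDP,
+# checkpoint their activations, and optionally offload parameters to CPU.
+strategy = FSDPStrategy(
+    auto_wrap_policy={Block},
+    activation_checkpointing_policy={Block},
+    cpu_offload=True,  # slower, but lowers peak GPU memory further
+)
+fabric = L.Fabric(devices=4, strategy=strategy, precision="bf16-true")
+fabric.launch()
+
+with fabric.init_module(empty_init=True):
+    model = GPT(Config.from_name("falcon-7b"))
+model = fabric.setup(model)
+```
+
+Offloading parameters to CPU reduces peak GPU memory at the cost of slower optimizer steps, and the scripts leave it disabled by default.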
diff --git a/tutorials/prepare_dataset.md b/tutorials/prepare_dataset.md index bf7e846..71895e3 100644 --- a/tutorials/prepare_dataset.md +++ b/tutorials/prepare_dataset.md @@ -2,16 +2,16 @@ Below is a table of all datasets that are currently supported in Lit-GPT: - | Name | Task | Size | Reference Repo | Paper / Blog | Data License | |--------------|-------------|---------------------|-----------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Alpaca | Finetuning | 51,759 samples | [URL](https://github.com/tatsu-lab/stanford_alpaca) | [URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) | Attribution-NonCommercial 4.0 International, [ URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) | +| Alpaca | Finetuning | 51,759 samples | [URL](https://github.com/tatsu-lab/stanford_alpaca) | [URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) | Attribution-NonCommercial 4.0 International, [URL](https://crfm.stanford.edu/2023/03/13/alpaca.html) | | Alpaca Libre | Finetuning | 55,370 samples | [URL](https://github.com/mobarski/alpaca-libre) | - | CC0/MIT, [URL](https://github.com/mobarski/alpaca-libre) | | Dolly | Finetuning | 15,011 samples | [URL](https://github.com/databrickslabs/dolly/tree/master/data) | [URL](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) | CC-BY-SA, [URL](https://github.com/databrickslabs/dolly#model-overview) | -| LongForm | Finetuning | 23,652 samples | [URL](https://github.com/akoksal/LongForm) | [URL](https://arxiv.org/abs/2304.08460) | No information provided and subset-dependent, [URL](https://github.com/akoksal/LongForm) | +| FLAN | Finetuning | 1,753,240 samples | [UR](https://huggingface.co/datasets/Muennighoff/flan) | [URL](https://blog.research.google/2023/02/the-flan-collection-advancing-open.html) | Subset dependent | +| LongForm | Finetuning | 23,652 samples | [URL](https://github.com/akoksal/LongForm) | [URL](https://arxiv.org/abs/2304.08460) | No information provided and subset-dependent, [URL](https://github.com/akoksal/LongForm) | | LIMA | Finetuning | 1,084 samples | [URL](https://huggingface.co/datasets/GAIR/lima) | [URL](https://arxiv.org/abs/2305.11206) | "If the source data of LIMA has a stricter license than CC BY-NC-SA, the LIMA dataset follows the same. 
Otherwise, it follows the CC BY-NC-SA license", [URL](https://huggingface.co/datasets/GAIR/lima#license) | | OpenWeb Text | Pretraining | 8,013,769 documents | [URL](https://github.com/jcpeterson/openwebtext) | [URL](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) | Unspecified | -| RedPajama | Pretraining | 1.2 T tokens | [URL](https://github.com/togethercomputer/RedPajama-Data) | [URL](https://together.ai/blog/redpajama-models-v1) | Subset-dependent, [URL](https://github.com/togethercomputer/RedPajama-Data#license) | | | +| RedPajama | Pretraining | 1.2 T tokens | [URL](https://github.com/togethercomputer/RedPajama-Data) | [URL](https://together.ai/blog/redpajama-models-v1) | Subset-dependent, [URL](https://github.com/togethercomputer/RedPajama-Data#license) |   @@ -32,7 +32,6 @@ The steps here only need to be done once before preparing the finetuning dataset   - The Alpaca dataset consists of 52,000 instructions and demonstrations produced by OpenAI's text-davinci-003 engine. This data is used in instruction-tuning, helping improve the performance of language models to follow instructions. In its development, the creators leveraged the data generation methodology from the [Self-Instruct framework](https://github.com/yizhongw/self-instruct). @@ -91,8 +90,6 @@ python scripts/prepare_alpaca.py \ --max_seq_length 256 ``` - -   ### Dolly @@ -139,7 +136,7 @@ The more detailed dataset composition is as follows based on a table taken from | | BEA-GEC | 1,203 | | | Enron | 372 | | **Total** | | 27,739 | -| | | | +| | | | | **Train** | | 23,652 | | **Validation** | | 2,042 | | **Test** | | 2,045 | @@ -188,6 +185,30 @@ python scripts/prepare_dolly.py \ --max_seq_length 512 ``` +  + +### FLAN + +FLAN is a collection of several datset subsets by Google. In particular, the provided script in Lit-GPT loads the subsets from +[here](https://huggingface.co/datasets/Muennighoff/flan). + +By default, all subsets (1,386,050 samples) and validations sets (367,190 subsets) are combined into a single dataset: + +```bash +python scripts/prepare_flan.py \ + --checkpoint_dir "checkpoints/tiiuae/falcon-7b" +``` + +However, you can also select individual subsets via comma-separated strings as follows: + + +```bash +python scripts/prepare_flan.py \ + --checkpoint_dir "checkpoints/tiiuae/falcon-7b" \ + --subsets "aeslc_10templates,ag_news_subset_10templates,anli_r1_10templates" +``` + +You can find a list of all 66 supported subsets [here](https://huggingface.co/datasets/Muennighoff/flan).   @@ -197,15 +218,15 @@ After preparing the dataset, you can finetune the model using the [`finetune/*.p ```bash python finetune/lora.py - --checkpoint_dir "checkpoints/tiiuae/falcon-7b" \ - --data_dir "data/alpaca_libre" \ - --out_dir "out/lora/alpaca" + --io.checkpoint_dir "checkpoints/tiiuae/falcon-7b" \ + --io.train_data_dir "data/alpaca_libre" --io.val_data_dir "data/alpaca_libre" \ + --io.out_dir "out/lora/alpaca" ``` -Please read the [tutorials/finetune_*.md](../tutorials) documents for more information about finetuning models. +Please read the [tutorials/finetune_*.md](.) documents for more information about finetuning models. > [!IMPORTANT] -> Make sure that the `prepare_*.py` and `finetune/*.py` scripts use the same model checkpoint specified via `--checkpoint_dir`. +> Make sure that the `prepare_*.py` and `finetune/*.py` scripts use the same model checkpoint specified via `checkpoint_dir`. 
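+
+Before launching a run, it can also be worth peeking at what the `prepare_*.py` script wrote to disk, for example to check how long the tokenized samples are. The sketch below is a rough illustration only: the `train.pt` layout (a list of dicts holding an `input_ids` tensor per sample) is an assumption and may differ between versions.
+
+```python
+import torch
+
+# Rough sanity check: load the prepared training split and look at sample lengths.
+# The file layout and the "input_ids" key are assumptions; adjust them to your version.
+train_set = torch.load("data/alpaca_libre/train.pt")
+print(f"{len(train_set)} training samples")
+
+lengths = sorted(len(sample["input_ids"]) for sample in train_set)
+print(f"median length: {lengths[len(lengths) // 2]} tokens, longest: {lengths[-1]} tokens")
+```
+
+Knowing the longest sample length is also helpful when deciding whether to cap the sequence length, as discussed in the note below.
+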
> [!IMPORTANT] > By default, the maximum sequence length is obtained from the model configuration file. In case you run into out-of-memory errors, especially in the cases of LIMA and Dolly, @@ -217,7 +238,7 @@ Please read the [tutorials/finetune_*.md](../tutorials) documents for more infor The models in Lit-GPT expect datasets for instruction finetuning in the following format: -``` +```text [ { "instruction": "Write a limerick about a @@ -237,7 +258,8 @@ The models in Lit-GPT expect datasets for instruction finetuning in the followin }, ] ``` -(Note that epending on the task, the `"input"` text can be an empty string, as shown above.) + +(Note that depending on the task, the `"input"` text can be an empty string, as shown above.) Custom datasets can be prepared by either creating a new `scripts/prepare_dataset.py` script or reading the dataset from a CSV file. @@ -259,6 +281,7 @@ Before you finetune, prepare the dataset using the `prepare_csv.py` script: ```bash python scripts/prepare_csv.py --csv_path path/to/the/file.csv ``` + You can also customize the dataset generation by using these additional parameters - `destination_path`: The folder where the binary data will be saved. By default, it is saved inside `data/csv` @@ -273,22 +296,23 @@ You can also customize the dataset generation by using these additional paramete - `ignore_index`: Mask out all the tokens after this index when preparing the dataset. -To use the the settings described above, you can add the respective command line arguments when calling `prepare_csv.py` as shown in the example below: +To use the settings described above, you can add the respective command line arguments when calling `prepare_csv.py` as shown in the example below: ```bash python scripts/prepare_csv.py --csv_path test_data.csv \ ---destination_path data/csv \ ---checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b \ ---test_split_fraction 0.1 \ ---seed 42 \ ---mask_inputs false \ ---ignore_index -1 + --destination_path data/csv \ + --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b \ + --test_split_fraction 0.1 \ + --seed 42 \ + --mask_inputs false \ + --ignore_index -1 ``` -Replace `test_data.csv` with your CSV path and the other additional parameters accordingly. Executing the command above will save `train.pt` and `test.pt` on your disk at the `destination_path`. Now you can use the prepared data to [finetune your model](https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/finetune_lora.md#running-the-finetuning). + +Replace `test_data.csv` with your CSV path and the other additional parameters accordingly. Executing the command above will save `train.pt` and `test.pt` on your disk at the `destination_path`. Now you can use the prepared data to [finetune your model](finetune_lora.md#running-the-finetuning).   
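+
+If you are assembling such a CSV programmatically, the sketch below writes a file in the expected shape. The `instruction`/`input`/`output` column names mirror the JSON fields shown earlier and are an assumption here; double-check them against `scripts/prepare_csv.py` in your version.
+
+```python
+import csv
+
+# Hypothetical example rows; the column names are assumed to match prepare_csv.py.
+rows = [
+    {
+        "instruction": "Write a limerick about a pelican.",
+        "input": "",
+        "output": "There once was a pelican grand, ...",
+    },
+    {
+        "instruction": "Translate the sentence into French.",
+        "input": "The weather is nice today.",
+        "output": "Il fait beau aujourd'hui.",
+    },
+]
+
+with open("test_data.csv", "w", newline="", encoding="utf-8") as f:
+    writer = csv.DictWriter(f, fieldnames=["instruction", "input", "output"])
+    writer.writeheader()
+    writer.writerows(rows)
+```
+
+The resulting file can then be passed to `scripts/prepare_csv.py` via `--csv_path` exactly as in the command above.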
-### Preparing Custom Datasets Using a Dataset Prepration Script +### Preparing Custom Datasets Using a Dataset Preparation Script If you don't have a CSV file following the format described in the previous section, the easiest way to prepare a new dataset is to copy and modify one of the existing dataset preparation scripts: @@ -299,11 +323,8 @@ These scripts may look intimidating at first glance since they include code for In [`scripts/prepare_lima.py`](https://github.com/Lightning-AI/lit-gpt/blob/main/scripts/prepare_lima.py), the [line 26](https://github.com/Lightning-AI/lit-gpt/blob/98fad263a62e5e57821de817bdd5e316abfb34d4/scripts/prepare_lima.py#L26) references the HF repo ID, and the lines [50-53](https://github.com/Lightning-AI/lit-gpt/blob/98fad263a62e5e57821de817bdd5e316abfb34d4/scripts/prepare_lima.py#L50-L53) save the dataset as `train_data`. Here, `train_data` is a list that contains the instruction examples in the format mentioned above. - In [`scripts/prepare_alpaca.py`](https://github.com/Lightning-AI/lit-gpt/blob/main/scripts/prepare_alpaca.py), you only need to modify [lines 24-25](https://github.com/Lightning-AI/lit-gpt/blob/98fad263a62e5e57821de817bdd5e316abfb34d4/scripts/prepare_alpaca.py#L24-L25) for the file name and URL, assuming the JSON file you are working with has the same format as the [Alpaca JSON file](https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json). - -   ## Preparing Pretraining Datasets @@ -311,4 +332,4 @@ In [`scripts/prepare_alpaca.py`](https://github.com/Lightning-AI/lit-gpt/blob/ma In addition to the finetuning dataset described above, Lit-GPT also supports several datasets for pretraining. The pretraining datasets are described in more detail in the following separate tutorial documents: - [Pretrain Llama 2 on OpenWebText](./pretrain_openwebtext.md) -- [Pretrain Llama 2 on RedPajama](./pretrain_redpajama.md) \ No newline at end of file +- [Pretrain Llama 2 on RedPajama](./pretrain_redpajama.md) diff --git a/tutorials/pretrain_openwebtext.md b/tutorials/pretrain_openwebtext.md index 6dd1288..5dbf036 100644 --- a/tutorials/pretrain_openwebtext.md +++ b/tutorials/pretrain_openwebtext.md @@ -6,10 +6,8 @@ This tutorial will walk you through setting up the OpenWebText dataset and launc [OpenWebText](https://github.com/jcpeterson/openwebtext) is an open-source reproduction of OpenAI's unreleased WebText training dataset, which was originally used to train GPT-2. The version that is used here consists of 8M documents and is loaded via the `load_dataset("openwebtext", ...)` function from the [datasets](https://github.com/huggingface/datasets) Python package. [Please refer to the website hosting the dataset](https://huggingface.co/datasets/Skylion007/openwebtext) for the licensing information. - ## Prepare OpenWebText for training - In order to start pretraining lit-gpt on it, you need to read, tokenize, and write the data in binary format. To prepare the dataset with the Llama 2 tokenizer, run @@ -19,20 +17,17 @@ pip install datasets python scripts/prepare_openwebtext.py \ --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-hf/ \ - --destination_path data/lit-openwebtext + --destination_path data/openwebtext ``` The script will take about 15 min to run. - ## Pretraining Running the pretraining script with its default settings requires at least 4 GPUs with 40GB+ each. (However, alternatively, you can train a smaller Pythia-70m on 1 GPU, more information about that further below). 
```bash
-python pretrain/openwebtext.py \
-    --devices 4 \
-    --train_data_dir data/lit-openwebtext
+python pretrain/openwebtext.py --devices 4
```

The script will save checkpoints periodically to the folder `out/`.
@@ -42,25 +37,17 @@ By default, the `pretrain/openwebtext.py` script will pretrain the Llama 2 7B mo

You can easily change the size of the model by passing a different string to the model name variable

-```python
-model_name = "Llama-2-7b-hf"
+```shell
+--model_name "Llama-2-7b-hf"
```

at the top of this script.

-The currently supported model names are contained in the [config.py](https://github.com/Lightning-AI/lit-gpt/lit_gpt/config.py) file.
-You can
+The currently supported model names are contained in the [config.py](https://github.com/Lightning-AI/lit-gpt/lit_gpt/config.py) file.
+You can

1) either search this file for lines containing "name =",
-2) run `python scripts/download.py` without additional command line arguments,
-3) or obtain the list of all supported models programmatically, as follows:
-
-```python
-from lit_gpt.config import configs
-
-for conf in configs:
-    print(conf["name"])
-```
+2) or run `python scripts/download.py` without additional command line arguments.

Keep in mind that the original LLaMA training for the 7B model required 83k A100 80GB hours (on a bigger dataset).
However, for full pretraining on OpenWebText, you'll likely still need access to a cluster.
@@ -72,9 +59,9 @@ to launch the script across machines:

- [Barebones cluster](https://lightning.ai/docs/fabric/stable/guide/multi_node/barebones.html)
- [MPI](https://lightning.ai/docs/fabric/stable/guide/multi_node/other.html)

-The [script contains several configurations and hyperparameters](https://github.com/Lightning-AI/lit-gpt/blob/main/pretrain/redpajama.py#L23-L45) you can tweak.
+The script exposes several hyperparameters you can tweak through the command line.

-For instance, `micro_batch_size` should be adjusted so the process will use the available
+For instance, `--train.micro_batch_size` should be adjusted so the process will use the available
GPU memory. For more tips to avoid out-of-memory issues, please also see the more detailed
[Dealing with out-of-memory (OOM) errors](oom.md) guide.
@@ -84,14 +71,9 @@ call a logging client library like `wandb` directly.

## Training a smaller model on a single GPU

-To train a smaller Pythia 70M model on a single GPU, you can modify the `pretrain/openwebtext.py` file to use the following settings:
-
+To train a smaller Pythia 70M model on a single GPU, you can pass the `--model_name "pythia-70m"` argument.

-```python
-model_name = "pythia-70m"
-```
-
-(Please see the the `download_*` scripts in the [../tutorials](../tutorials) for more information on downloading model checkpoints for different models.)
+(Please see the `download_*` scripts in the [tutorials](.) for more information on downloading model checkpoints for different models.)
Also, before you start training, note that you will need to prepare the dataset specifically for this model since it may use a different tokenizer: @@ -100,19 +82,5 @@ python scripts/prepare_openwebtext.py \ --checkpoint_dir checkpoints/EleutherAI/pythia-70m/ \ --destination_path data/lit-openwebtext -python pretrain/openwebtext.py \ - --devices 4 \ - --train_data_dir data/lit-openwebtext -``` - -## Using the PyTorch Lightning `Trainer` - -The `pretrain/openwebtext.py` used and discussed above uses Lightning Fabric, which is an open-source library for accessing more advanced PyTorch features conveniently (for example, mixed-precision training, multi-GPU training like FSDP, and more). - -The PyTorch Lightning Trainer, which shares the same accelerator code with Fabric, offers additional features, such as more advanced checkpointing and logging. If you prefer using the PyTorch Lightning Trainer, you can use the alternative `pretrain/openwebtext_trainer.py` script: - -```bash -python pretrain/openwebtext_trainer.py \ - --devices 4 \ - --train_data_dir data/lit-openwebtext +python pretrain/openwebtext.py --devices 4 ``` diff --git a/tutorials/pretrain_redpajama.md b/tutorials/pretrain_redpajama.md index 3b03f6d..ca969c1 100644 --- a/tutorials/pretrain_redpajama.md +++ b/tutorials/pretrain_redpajama.md @@ -30,7 +30,7 @@ You can download the data using git lfs: ```bash # Make sure you have git-lfs installed (https://git-lfs.com): -git lfs install +sudo apt install git-lfs ``` ```bash @@ -51,11 +51,12 @@ on it, you need to read, tokenize, and write the data in binary chunks. This wil streaming dataset that comes with lit-gpt. You will need to have the tokenizer config available: ```bash -pip install huggingface_hub sentencepiece +pip install 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' sentencepiece python scripts/download.py \ --repo_id meta-llama/Llama-2-7b-chat-hf \ - --access_token your_hf_token + --access_token your_hf_token \ + --tokenizer_only true ``` Then, run @@ -90,7 +91,7 @@ Running the pretraining script with its default settings requires at least 4 GPU ```bash python pretrain/redpajama.py \ --devices 4 \ - --train_data_dir data/lit-redpajama + --io.train_data_dir data/lit-redpajama ``` For running on the sample dataset: @@ -98,7 +99,7 @@ For running on the sample dataset: ```bash python pretrain/redpajama.py \ --devices 4 \ - --train_data_dir data/lit-redpajama-sample + --io.train_data_dir data/lit-redpajama-sample ``` The script will save checkpoints periodically to the folder `out/`. @@ -108,8 +109,8 @@ By default, the `pretrain/redpajama.py` script will pretrain the Llama 2 7B mode You can easily change the size of the model by passing a different string to the model name variable -```python -model_name = "Llama-2-7b-hf" +```shell +--model_name "Llama-2-7b-hf" ``` at the top of this script. @@ -130,7 +131,7 @@ to launch the script across machines: - [Barebones cluster](https://lightning.ai/docs/fabric/stable/guide/multi_node/barebones.html) - [MPI](https://lightning.ai/docs/fabric/stable/guide/multi_node/other.html) -The [script contains several configurations and hyperparameters](https://github.com/Lightning-AI/lit-gpt/blob/main/pretrain/openwebtext.py#L23-L46) you can tweak. +The exposes several hyperparameters you can tweak through the command line. For instance, `micro_batch_size` should be adjusted so the process will use the available GPU memory. 
For more tips to avoid out-of-memory issues, please also see the more detailed diff --git a/tutorials/pretrain_tinyllama.md b/tutorials/pretrain_tinyllama.md new file mode 100644 index 0000000..a22a331 --- /dev/null +++ b/tutorials/pretrain_tinyllama.md @@ -0,0 +1,173 @@ +# Pretrain TinyLlama + +This tutorial will walk you through pretraining [TinyLlama](https://github.com/jzhang38/TinyLlama/). + +> [!TIP] +> To get started with zero setup, clone the [TinyLlama studio on Lightning AI](https://lightning.ai/lightning-ai/studios/llm-pretrain-tinyllama-1-1b). + +## What's TinyLlama? + +[TinyLlama](https://github.com/jzhang38/TinyLlama/) is architecturally the same as Meta AI's LLama 2, but only has 1.1B parameters and is instead trained on multiple epochs on a mix of [SlimPajama](https://huggingface.co/datasets/cerebras/SlimPajama-627B) and [Starcoder](https://huggingface.co/datasets/bigcode/starcoderdata) datasets. + +Here is a quick fact sheet: + +| Name | Description | +|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Parameters | 1.1B | +| Model Size | Layers: 22, Heads: 32, Query Groups: 4, Embedding Size: 2048, Intermediate Size: 5632 | +| Sequence Length | 2048 | +| Learning Rate | 4e-4 | +| Learning Rate Schedule | Cosine with 2000 warmup steps | +| Training Data | [SlimPajama](https://huggingface.co/datasets/cerebras/slimpajama-627b) (893 GB), [Starcoder](https://huggingface.co/datasets/bigcode/starcoderdata) (290 GB) | +| Combined Dataset Size | Around 950B tokens | +| Total Tokens During Training | 3 trillion (3 epochs) | +| Time to complete training | ~ 4 weeks with 64 A100 GPUs | +| Model FLOPs Utilization (MFU) | 52% | + +(this table was sourced from the author's [README](https://github.com/jzhang38/TinyLlama/)) + +## Download datasets + +You can download the data using git lfs: + +```bash +# Make sure you have git-lfs installed (https://git-lfs.com): +sudo apt install git-lfs +``` + +```bash +git clone https://huggingface.co/datasets/cerebras/slimpajama-627b data/slimpajama-raw +git clone https://huggingface.co/datasets/bigcode/starcoderdata data/starcoderdata-raw +``` + +Around 1.2 TB of disk space is required to store both datasets. + +## Prepare the datasets for training + +In order to start pretraining lit-gpt on it, you need to read, tokenize, and write the data in binary chunks. This will leverage our `lightning.data` optimization pipeline and streaming dataset that comes with Lightning. + +First, install additional dependencies for preprocessing: + +```bash +pip install 'lightning[data]' torchmetrics tensorboard sentencepiece zstandard pandas pyarrow 'huggingface_hub[hf_transfer] @ git+https://github.com/huggingface/huggingface_hub' +``` + +You will need to have the tokenizer config available: + +```bash +python scripts/download.py \ + --repo_id meta-llama/Llama-2-7b-hf \ + --access_token your_hf_token \ + --tokenizer_only true +``` + +Then, run the preprocessing script for each dataset and split. +You will require **1.1 TB** of disk space for Starcoder and **2.5** TB of space for the SlimPajama dataset. 
+ +**Starcoder:** + +```bash +python scripts/prepare_starcoder.py \ + --input_dir data/starcoderdata-raw \ + --output_dir data/starcoder \ + --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf +``` + +**SlimPajama:** + +```bash +python scripts/prepare_slimpajama.py \ + --input_dir data/slimpajama-raw/validation \ + --output_dir data/slimpajama/val \ + --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf + +python scripts/prepare_slimpajama.py \ + --input_dir data/slimpajama-raw/test \ + --output_dir data/slimpajama/test \ + --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf + +python scripts/prepare_slimpajama.py \ + --input_dir data/slimpajama-raw/train \ + --output_dir data/slimpajama/train \ + --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf +``` + +If you want to run on a small slice of the datasets first, pass the flag `--fast_dev_run=true` to the commands above. +In the above we are assuming that you will be using the same tokenizer as used in LlaMA/TinyLlama, but any trained [SentencePiece](https://github.com/google/sentencepiece) tokenizer with a 32000 vocabulary size will do here. + +## Pretraining + +Running the pretraining script with its default settings requires at least 8 A100 GPUs. + +```bash +python pretrain/tinyllama.py +``` + +The script will save checkpoints periodically to the folder `out/`. +By default, the `pretrain/tinyllama.py` script will pretrain the model with FSDP in +`bfloat16` mixed precision and gradient accumulation. + +Note that the `pretrain/tinyllama.py` is not actually a model-specific training script, so feel free to change +the configuration and size by passing a different string to the model name argument, for example: + +```shell +python pretrain/tinyllama.py --model.name Gemma-2b +``` + +The currently supported model names are contained in the [config.py](https://github.com/Lightning-AI/lit-gpt/lit_gpt/config.py) file. +You can + +1) either search this file for lines containing "name =", +2) or run `python scripts/download.py` without additional command line arguments + +Keep in mind that training with a single machine will take weeks. To speed up the process, you'll need access to a cluster. +Once you're in a cluster, you can follow [these instructions](https://lightning.ai/docs/fabric/stable/fundamentals/launch.html#launch-on-a-cluster) +to launch the script across machines: + +- [Lightning AI](https://lightning.ai/docs/fabric/stable/guide/multi_node/cloud.html) +- [SLURM cluster](https://lightning.ai/docs/fabric/stable/guide/multi_node/slurm.html) +- [Barebones cluster](https://lightning.ai/docs/fabric/stable/guide/multi_node/barebones.html) +- [MPI](https://lightning.ai/docs/fabric/stable/guide/multi_node/other.html) + +The script exposes several hyperparameters you can tweak through the command line. + +For instance, `--train.micro_batch_size` should be adjusted so the process will use the available +GPU memory. For more tips to avoid out-of-memory issues, please also see the more detailed +[Dealing with out-of-memory (OOM) errors](oom.md) guide. + +Last, logging is kept minimal in the script, but for long-running experiments we recommend switching to a proper experiment tracker. +As an example, we included WandB (set `use_wandb=True`) to show how you can integrate any experiment tracking framework. +For reference, [here are the loss curves for our reproduction](https://api.wandb.ai/links/awaelchli/y7pzdpwy). + +## Resume training + +The checkpoints saved during pretraining contain all the information to resume if needed. 
+Simply rerun the script with the `--resume` argument: + +```bash +python pretrain/tinyllama.py --resume out/tiny-llama-1.1b/step-00060500.pth +``` + +## Export checkpoints + +After training is completed, you can convert the checkpoint to a format that can be loaded for evaluation, inference, finetuning etc. + +```bash +python scripts/convert_pretrained_checkpoint.py \ + --checkpoint_file out/tiny-llama-1.1b/step-00060500.pth \ + --tokenizer_dir checkpoints/meta-llama/Llama-2-7b-hf \ + --config_name tiny-llama-1.1b \ + --output_dir checkpoints/lit-tiny-llama-1.1b +``` + +After conversion, the output folder will contain these files: +``` +checkpoints/lit-tiny-llama-1.1b +β”œβ”€β”€ lit_config.json +β”œβ”€β”€ lit_model.pth +β”œβ”€β”€ tokenizer_config.json +β”œβ”€β”€ tokenizer.json +└── tokenizer.model +``` + +You can then use this checkpoint folder to run [evaluation](evaluation.md), [inference](inference.md), [finetuning](finetune_lora.md) or [process the checkpoint further](convert_lit_models.md). diff --git a/tutorials/quantize.md b/tutorials/quantize.md index 03974bd..c0d611a 100644 --- a/tutorials/quantize.md +++ b/tutorials/quantize.md @@ -32,14 +32,14 @@ Memory used: 14.50 GB To reduce the memory requirements further, Lit-GPT supports several quantization techniques, which are shown below. -> [!NOTE] +> [!TIP] > Most quantization examples below also use the `--precision bf16-true` setting explained above. If your GPU does not support `bfloat16`, you can change it to `--precision 16-true`. ## `bnb.nf4` Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check out the [paper](https://arxiv.org/abs/2305.14314v1) to learn more about how it works. -> [!NOTE] +> [!IMPORTANT] > `bitsandbytes` only supports `CUDA` devices and the `Linux` operating system. > Windows users should use [WSL2](https://learn.microsoft.com/en-us/windows/ai/directml/gpu-cuda-in-wsl). @@ -113,29 +113,3 @@ python generate/base.py --quantize bnb.int8 --checkpoint_dir checkpoints/tiiuae/ Time for inference 1: 20.22 sec total, 12.66 tokens/sec Memory used: 8.70 GB ``` - -## `gptq.int4` - -Check out the [paper](https://arxiv.org/abs/2210.17323) to learn more about how it works. - -This technique needs a conversion of the weights first: - -```bash -pip install datasets - -python quantize/gptq.py --precision bf16-true --checkpoint_dir checkpoints/tiiuae/falcon-7b -... -Time for quantization: 850.25 sec total -Memory used: 23.68 GB -``` - -It is important to note that this conversion step required a considerable amount of memory (higher than regular inference) and may take a long time, depending on the size of the model. - -generation then works as usual with `--quantize gptq.int4` which will load the newly quantized checkpoint file: - -```bash -python generate/base.py --quantize gptq.int4 --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision 32-true --max_new_tokens 256 -... -Time for inference 1: 34.35 sec total, 7.45 tokens/sec -Memory used: 5.05 GB -``` diff --git a/tutorials/resource-tables.md b/tutorials/resource-tables.md new file mode 100644 index 0000000..e9cde2a --- /dev/null +++ b/tutorials/resource-tables.md @@ -0,0 +1,158 @@ +# Resource Tables + +- Last updated: 10/20/2023 +- Lit-GPT version: commit 8641822 +- Hardware: NVIDIA A100-SXM4-40GB +- OS: Ubuntu 22.04.3 LTS (x86_64) +- Nvidia driver version: 525.125.06 +- Relevant libraries + - PyTorch 2.1.0+cu121 + - Bitsandbytes 0.41.1 + +This document provides an overview and examples of hardware requirements when running models in Lit-GPT. 
+ +For additional tips on lowering the GPU memory footprint, please also see the [Dealing with out-of-memory (OOM) errors](oom.md) document. + +All experiments were run using 16-bit brain floating point precision (`--precision bf16-true`). If your GPU does not support brain floating point precision, you can use regular 16-bit floating point precision (`--precision 16-true`). + +All experiments were conducted using the Alpaca dataset with its default length. Note that due to different tokenizers being used by the different models, the number of tokens in the longest training example differs based on the model: + +- phi1.5: 1044 tokens +- StableLM Alpha: 1034 tokens +- Llama 2: 1304 tokens +- Falcon 1079 tokens + +Note that the number of tokens in the training set does not affect the supported context width (block size) of the models, which is as follows: + +- phi1.5: 2048 tokens +- StableLM 3B Alpha: 4096 tokens +- Llama 2: 4048 tokens +- Falcon: 2048 tokens +- CodeLlama 13B: 16384 tokens + +  + +## Finetuning with LoRA on 1 GPU + +The following experiments were conducted on 1xA100 with a minibatch size of 128 using the `finetune/lora.py` script. + +| Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | +|-------|----------------|--------------|-----------------|----------------------|-------------|--------------------| +| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 4.82 GB | 1.62 min | +| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 3.78 GB | 1.77 min | +| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 3.72 GB | 1.87 min | +| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 6.76 GB | 1.65 min | +| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 10.68 GB | 1.70 min | +| | | | | | | | +| 3 B | StableLM Alpha | None | 1 | 2,097,152 | 9.69 GB | 1.24 min | +| 3 B | StableLM Alpha | bnb.nf4 | 1 | 2,097,152 | 6.35 GB | 1.82 min | +| 3 B | StableLM Alpha | bnb.nf4-dq | 1 | 2,097,152 | 6.19 GB | 1.87 min | +| 3 B | StableLM Alpha | None | 2 | 2,097,152 | 12.10 GB | 1.33 min | +| 3 B | StableLM Alpha | None | 4 | 2,097,152 | 16.92 GB | 1.50 min | +| | | | | | | | +| 7 B | Llama 2 | None | 1 | 4,194,304 | 21.30 GB | 2.36 min | +| 7 B | Llama 2 | bnb.nf4 | 1 | 4,194,304 | 14.14 GB | 3.68 min | +| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,194,304 | 13.84 GB | 3.83 min | +| 7 B | Llama 2 | None | 2 | 4,194,304 | 29.07 GB | 2.52 min | +| 7 B | Llama 2 | None | 4 | 4,194,304 | OOM | - | +| | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 38.12 GB | 3.19 min | +| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 23.14 GB | 6.38 min | +| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 22.55 GB | 6.55 min | +| 13 B | Llama 2 | None | 2 | 6,553,600 | OOM | - | +| 13 B | Llama 2 | None | 4 | 6,553,600 | OOM | - | +| | | | | | | | +| 40 B | Falcon | None | 1 | 12,042,240 | OOM | - | +| 40 B | Falcon | bnb.nf4 | 1 | 12,042,240 | OOM | - | +| 40 B | Falcon | bnb.nf4-dq | 1 | 12,042,240 | OOM | - | + +  + +## Finetuning with Adapter on 1 GPU + +The following experiments were conducted on 1xA100 with a minibatch size of 128 using the `finetune/adapter.py` script. 
+ +| Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | +|------|----------------|--------------|-----------------|----------------------|-------------|--------------------| +| 3 B | StableLM Alpha | None | 1 | 573,888 | 9.10 GB | 0.74 min | +| 3 B | StableLM Alpha | bnb.nf4 | 1 | 573,888 | 5.65 GB | 1.38 min | +| 3 B | StableLM Alpha | bnb.nf4-dq | 1 | 573,888 | 5.48 GB | 1.46 min | +| | | | | | | | +| 7 B | Llama 2 | None | 1 | 1,229,760 | 19.98 GB | 1.50 min | +| 7 B | Llama 2 | bnb.nf4 | 1 | 1,229,760 | 12.68 GB | 2.93 min | +| 7 B | Llama 2 | bnb.nf4-dq | 1 | 1,229,760 | 12.38 GB | 3.00 min | + +The same config, but using the `finetune/adapter_v2.py` script. + +| Size | Model | Quantization | Microbatch size | Trainable parameters | Max GPU RAM | Time 1k iterations | +|------|----------------|--------------|-----------------|----------------------|-------------|--------------------| +| 3 B | StableLM Alpha | None | 1 | 2,125,248 | 10.71 GB | 0.87 min | +| 3 B | StableLM Alpha | bnb.nf4 | 1 | 2,125,248 | 7.41 GB | 1.59 min | +| 3 B | StableLM Alpha | bnb.nf4-dq | 1 | 2,125,248 | 7.25 GB | 1.62 min | +| | | | | | | | +| 7 B | Llama 2 | None | 1 | 4,279,744 | 25.51 GB | 1.81 min | +| 7 B | Llama 2 | bnb.nf4 | 1 | 4,279,744 | 18.30 GB | 3.23 min | +| 7 B | Llama 2 | bnb.nf4-dq | 1 | 4,279,744 | 17.98 GB | 3.32 min | + +  + +## Finetuning with LoRA on Multiple GPUs + +The following experiments were conducted on multiple A100 GPUs with a minibatch size of 128 using the `finetune/lora.py` script. + +| Size | Model | Quantization | Microbatch size | Trainable parameters | GPU | Max GPU RAM | Time 1k iterations | +|-------|----------------|--------------|-----------------|----------------------|----------|-------------|--------------------| +| 1.3 B | phi-1.5 | None | 1 | 1,572,864 | 2 x A100 | 4.86 GB | 3.81 min | +| 1.3 B | phi-1.5 | bnb.nf4 | 1 | 1,572,864 | 2 x A100 | N/A | - | +| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 | 1,572,864 | 2 x A100 | N/A | - | +| 1.3 B | phi-1.5 | None | 2 | 1,572,864 | 2 x A100 | 5.05 GB | 3.63 min | +| 1.3 B | phi-1.5 | None | 4 | 1,572,864 | 2 x A100 | 5.88 GB | 3.64 min | +| | | | | | | | | +| 3 B | StableLM Alpha | None | 1 | 2,097,152 | 2 x A100 | 12.75 GB | 2.92 min | +| 3 B | StableLM Alpha | None | 2 | 2,097,152 | 2 x A100 | 12.94 GB | 3.06 min | +| 3 B | StableLM Alpha | None | 4 | 2,097,152 | 2 x A100 | 13.45 GB | 3.86 min | +| | | | | | | | - | +| 7 B | Llama 2 | None | 1 | 4,194,304 | 2 x A100 | 22.18 GB | 5.93 min | +| 7 B | Llama 2 | None | 2 | 4,194,304 | 2 x A100 | 22.47 GB | 6.48 min | +| 7 B | Llama 2 | None | 4 | 4,194,304 | 2 x A100 | 23.39 GB | 8.66 min | +| | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 2 x A100 | OOM | - | +| 13 B | Llama 2 | bnb.nf4 | 1 | 6,553,600 | 2 x A100 | N/A | - | +| 13 B | Llama 2 | bnb.nf4-dq | 1 | 6,553,600 | 2 x A100 | N/A | - | +| | | | | | | | | +| 13 B | Llama 2 | None | 1 | 6,553,600 | 4 x A100 | 35.57 GB | 10.25 min | +| 40 B | Falcon | None | 1 | 12,042,240 | 4 x A100 | OOM | - | + +  + +## Single-GPU Inference + +| Size | Model | Quantization | GPU | Max GPU RAM | Token/sec | +|-------|----------------|--------------|----------|-------------------------------------------|-----------| +| 1.3 B | phi-1.5 | None | 1 x A100 | 2.86 GB | 42.56 | +| 1.3 B | phi-1.5 | bnb.nf4 | 1 x A100 | 1.39 GB | 22.89 | +| 1.3 B | phi-1.5 | bnb.nf4-dq | 1 x A100 | 1.33 GB | 22.75 | +| | | | | | | +| 3 B | StableLM Alpha | None | 1 x A100 | 7.30 GB | 49.01 | +| 3 B | 
StableLM Alpha | bnb.nf4 | 1 x A100 | 3.20 GB | 29.04 | +| 3 B | StableLM Alpha | bnb.nf4-dq | 1 x A100 | 3.04 GB | 27.15 | +| | | | | | | +| 7 B | Llama 2 | None | 1 x A100 | 13.52 GB | 30.97 | +| 7 B | Llama 2 | bnb.nf4 | 1 x A100 | 4.57 GB | 19.98 | +| 7 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 4.26 GB | 17.3 | +| | | | | | | +| 13 B | Llama 2 | None | 1 x A100 | 26.21 GB | 24.82 | +| 13 B | Llama 2 | bnb.nf4 | 1 x A100 | 8.32 GB | 16.73 | +| 13 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 7.72 GB | 14.43 | +| | | | | | | +| 34 B | CodeLlama | None | 1 x A100 | OOM | - | +| 34 B | CodeLlama | bnb.nf4 | 1 x A100 | 20.52 GB | 14.32 | +| 34 B | CodeLlama | bnb.nf4-dq | 1 x A100 | 18.95 GB | 12.37 | +| | | | | | | +| 40 B | Falcon | None | 1 x A100 | OOM | - | +| 40 B | Falcon | bnb.nf4 | 1 x A100 | 26.55 GB | 13.25 | +| 40 B | Falcon | bnb.nf4-dq | 1 x A100 | 24.63 GB | 11.64 | +| | | | | | | +| 70 B | Llama 2 | None | 1 x A100 | OOM | - | +| 70 B | Llama 2 | bnb.nf4 | 1 x A100 | CUDA error: CUBLAS_STATUS_NOT_INITIALIZED | - | +| 70 B | Llama 2 | bnb.nf4-dq | 1 x A100 | 37.21 GB | 7.97 | diff --git a/xla/finetune/adapter.py b/xla/finetune/adapter.py index cac41fe..d6371b3 100644 --- a/xla/finetune/adapter.py +++ b/xla/finetune/adapter.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import os import sys import time @@ -10,16 +12,15 @@ from lightning.fabric.accelerators import XLAAccelerator from lightning.fabric.loggers import CSVLogger from lightning.fabric.strategies import XLAFSDPStrategy +from lightning.fabric.utilities import ThroughputMonitor, measure_flops # support running without installing as a package wd = Path(__file__).parent.parent.parent.resolve() sys.path.append(str(wd)) from lit_gpt.adapter import GPT, Block, Config, adapter_filter, mark_only_adapter_as_trainable -from lit_gpt.speed_monitor import SpeedMonitorFabric as SpeedMonitor -from lit_gpt.speed_monitor import estimate_flops, measure_flops from lit_gpt.tokenizer import Tokenizer -from lit_gpt.utils import check_valid_checkpoint_dir, chunked_cross_entropy, lazy_load, num_parameters +from lit_gpt.utils import check_valid_checkpoint_dir, chunked_cross_entropy, estimate_flops, lazy_load, num_parameters from scripts.prepare_alpaca import generate_prompt from xla.generate.base import generate from xla.utils import rank_print, sequential_load_and_fsdp_wrap @@ -74,8 +75,6 @@ def setup( def main(fabric: L.Fabric, data_dir: Path, checkpoint_dir: Path, out_dir: Path) -> None: check_valid_checkpoint_dir(checkpoint_dir) - speed_monitor = SpeedMonitor(fabric, window_size=50, time_unit="seconds") - fabric.seed_everything(1337) # same seed for every process to init model (FSDP) if fabric.global_rank == 0: @@ -111,7 +110,7 @@ def main(fabric: L.Fabric, data_dir: Path, checkpoint_dir: Path, out_dir: Path) fabric.seed_everything(1337 + fabric.global_rank) train_time = time.perf_counter() - train(fabric, model, optimizer, train_data, val_data, checkpoint_dir, out_dir, speed_monitor) + train(fabric, model, optimizer, train_data, val_data, checkpoint_dir, out_dir) rank_print(fabric, f"Training time: {(time.perf_counter()-train_time):.2f}s") # Save the final checkpoint at the end of training @@ -127,7 +126,6 @@ def train( val_data: List[Dict], checkpoint_dir: Path, out_dir: Path, - speed_monitor: SpeedMonitor, ) -> None: tokenizer = Tokenizer(checkpoint_dir) longest_seq_length = get_longest_seq_length(train_data) @@ -143,22 +141,24 @@ def train( mark_only_adapter_as_trainable(meta_model) # 
"estimated" is not as precise as "measured". Estimated is optimistic but widely used in the wild. # When comparing MFU or FLOP numbers with other projects that use estimated FLOPs, - # consider passing `SpeedMonitor(flops_per_batch=estimated_flops)` instead - estimated_flops = estimate_flops(meta_model) * micro_batch_size + # consider passing `flops_per_batch=estimated_flops` instead + estimated_flops = estimate_flops(meta_model, training=True) * micro_batch_size rank_print(fabric, f"Estimated TFLOPs: {estimated_flops * fabric.world_size / 1e12:.2f}") # this assumes that all samples have a fixed length equal to the longest sequence length # which is most likely false during finetuning x = torch.randint(0, 1, (micro_batch_size, longest_seq_length)) - measured_flops = measure_flops(meta_model, x) + forward_fn = lambda: meta_model(x) + loss_fn = lambda y: chunked_cross_entropy(y, x, chunk_size=0) + measured_flops = measure_flops(meta_model, forward_fn, loss_fn) rank_print(fabric, f"Measured TFLOPs: {measured_flops * fabric.world_size / 1e12:.2f}") del meta_model, x + throughput = ThroughputMonitor(fabric, window_size=50) step_count = 0 - total_lengths = 0 total_t0 = time.perf_counter() xm.mark_step() - for iter_num in range(max_iters): + for iter_num in range(1, max_iters + 1): if step_count <= warmup_steps: # linear warmup lr = learning_rate * step_count / warmup_steps @@ -169,7 +169,7 @@ def train( input_ids, targets = get_batch(fabric, train_data, longest_seq_length) - is_accumulating = (iter_num + 1) % gradient_accumulation_iters != 0 + is_accumulating = iter_num % gradient_accumulation_iters != 0 with fabric.no_backward_sync(model, enabled=is_accumulating): logits = model(input_ids, lm_head_chunk_size=128) xm.mark_step() @@ -186,17 +186,16 @@ def train( else: xm.mark_step() - t1 = time.perf_counter() - total_lengths += input_ids.size(1) - speed_monitor.on_train_batch_end( - (iter_num + 1) * micro_batch_size, - t1 - total_t0, - # this assumes that device FLOPs are the same and that all devices have the same batch size - fabric.world_size, - flops_per_batch=estimated_flops, - lengths=total_lengths, - ) if iter_num % log_interval == 0: + t1 = time.perf_counter() + throughput.update( + time=t1 - total_t0, + batches=iter_num, + samples=iter_num * micro_batch_size, + lengths=iter_num * micro_batch_size * longest_seq_length, + flops=measured_flops * log_interval, + ) + throughput.compute_and_log(step=iter_num) rank_print( fabric, f"iter {iter_num} step {step_count}:" @@ -209,7 +208,6 @@ def train( t0 = time.perf_counter() val_loss = validate(fabric, model, val_data, tokenizer, longest_seq_length) t1 = time.perf_counter() - t0 - speed_monitor.eval_end(t1) rank_print(fabric, f"step {iter_num}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f}ms") fabric.barrier() if not is_accumulating and step_count % save_interval == 0: diff --git a/xla/generate/adapter.py b/xla/generate/adapter.py index 6db674c..bac6c79 100644 --- a/xla/generate/adapter.py +++ b/xla/generate/adapter.py @@ -1,6 +1,9 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. 
+ import sys import time from pathlib import Path +from typing import Optional import lightning as L from lightning.fabric.accelerators import XLAAccelerator @@ -25,7 +28,7 @@ def setup( adapter_path: Path = Path("out/adapter/alpaca/lit_model_adapter_finetuned.pth"), checkpoint_dir: Path = Path("checkpoints/tiiuae/falcon-7b"), max_new_tokens: int = 100, - top_k: int = 200, + top_k: Optional[int] = 200, temperature: float = 0.8, precision: str = "bf16-true", ) -> None: @@ -58,7 +61,7 @@ def main( adapter_path: Path, checkpoint_dir: Path, max_new_tokens: int, - top_k: int, + top_k: Optional[int], temperature: float, ) -> None: check_valid_checkpoint_dir(checkpoint_dir) diff --git a/xla/generate/base.py b/xla/generate/base.py index a4cf2b0..db5a8ae 100644 --- a/xla/generate/base.py +++ b/xla/generate/base.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import sys import time from pathlib import Path @@ -96,7 +98,7 @@ def setup( *, num_samples: int = 1, max_new_tokens: int = 100, - top_k: int = 200, + top_k: Optional[int] = 200, temperature: float = 0.8, checkpoint_dir: Path = Path("checkpoints/tiiuae/falcon-7b"), precision: str = "bf16-true", @@ -124,7 +126,7 @@ def main( prompt: str, num_samples: int, max_new_tokens: int, - top_k: int, + top_k: Optional[int], temperature: float, checkpoint_dir: Path, ) -> None: diff --git a/xla/utils.py b/xla/utils.py index ee1abc3..64d7083 100644 --- a/xla/utils.py +++ b/xla/utils.py @@ -1,3 +1,5 @@ +# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. + import itertools from functools import partial from pathlib import Path