From f89fdd47b34c96789544a0c22aa22d711b64bbe9 Mon Sep 17 00:00:00 2001
From: joecummings
Date: Fri, 18 Oct 2024 12:38:36 -0700
Subject: [PATCH 01/19] [WIP] Quantization for generate_v2

---
 recipes/configs/llama2/generation_v2.yaml | 19 +++++---
 recipes/dev/generate_v2.py                | 53 +++++++++++++----------
 2 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/recipes/configs/llama2/generation_v2.yaml b/recipes/configs/llama2/generation_v2.yaml
index 7ce4e2c43d..1b9967829e 100644
--- a/recipes/configs/llama2/generation_v2.yaml
+++ b/recipes/configs/llama2/generation_v2.yaml
@@ -9,6 +9,11 @@
 # Model arguments
 model:
   _component_: torchtune.models.llama2.llama2_7b
+# You can uncomment the following lines to enable quantization for faster inference
+# and potentially lower VRAM
+# quantization_method:
+#   _component_: torchao.quantization.quant_api.int4_weight_only # int4 weight only is a good balance
+#   use_hqq: False
 
 # Transform arguments
 tokenizer:
@@ -27,16 +32,16 @@ checkpointer:
   output_dir: ./
   model_type: LLAMA2
 
-# Device
-device: cuda
-dtype: bf16
-seed: 1234
-log_level: INFO
-
 # Generation arguments
 prompt:
   system: You are a helpful and creative AI assistant.
   user: What is the capital of France?
-max_new_tokens: 200
+max_new_tokens: 500
 temperature: 0.6 # 0.8 and 0.6 are popular values to try
 top_k: 300
+
+# Device
+device: cuda
+dtype: bf16
+seed: 1234
+log_level: INFO
diff --git a/recipes/dev/generate_v2.py b/recipes/dev/generate_v2.py
index e63ea2dcb0..5688f86b01 100644
--- a/recipes/dev/generate_v2.py
+++ b/recipes/dev/generate_v2.py
@@ -17,6 +17,7 @@
 from torchtune.generation import sample
 
 from torchtune.modules.transforms import Transform
+from torchtune.training import compile_model
 
 
 class SingleTurnYAMLToMessages(Transform):
@@ -65,29 +66,37 @@ class InferenceRecipe:
 
     This *does not* currently support the following features:
         - torch.compile
-        - quantization through torchao
         - multi-GPU generation
        - batch generation
     """
 
     def __init__(self, cfg: DictConfig) -> None:
-        self._device = utils.get_device(device=cfg.device)
-        self._dtype = training.get_dtype(dtype=cfg.dtype, device=self._device)
-        self._logger = utils.get_logger(cfg.log_level)
+        self.device = utils.get_device(device=cfg.device)
+        self.dtype = training.get_dtype(dtype=cfg.dtype, device=self.device)
+        self.logger = utils.get_logger(cfg.log_level)
         training.set_seed(seed=cfg.seed)
 
     def setup(self, cfg: DictConfig) -> None:
         """Setup the model and transforms."""
-        # Load checkpointer and state_dict
+        # Load checkpointer
         _checkpointer = config.instantiate(cfg.checkpointer)
         _ckpt_dict = _checkpointer.load_checkpoint()
 
         # Instantiate model
-        with training.set_default_dtype(self._dtype), self._device:
+        with training.set_default_dtype(self.dtype), self.device:
             model = config.instantiate(cfg.model)
         model.load_state_dict(_ckpt_dict[training.MODEL_KEY])
+        self.logger.info(f"Model was initialized with precision {self.dtype}.")
+
+        # Quantize the model if specified
+        if cfg.get("quantization_method") is not None:
+            from torchao.quantization.quant_api import quantize_
+
+            quantization_method = config.instantiate(cfg.quantization_method)
+            compile_model(model)
+            quantize_(model, quantization_method, device=self.device)
+
         self.model = model
-        self._logger.info(f"Model was initialized with precision {self._dtype}.")
 
         # Instantiate transforms
         self.model_transform = config.instantiate(cfg.tokenizer)
@@ -105,13 +114,13 @@ def log_metrics(self, total_time: int, tokens_per_second: float) -> None:
                 for p in itertools.chain(self.model.parameters(), self.model.buffers())
             ]
         )
-        self._logger.info(
+        self.logger.info(
             f"Time for inference: {total_time:.02f} sec total, {tokens_per_second:.02f} tokens/sec"
         )
-        self._logger.info(
+        self.logger.info(
             f"Bandwidth achieved: {model_size * tokens_per_second / 1e9:.02f} GB/s"
         )
-        self._logger.info(
+        self.logger.info(
             f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB"
         )
 
@@ -128,10 +137,10 @@ def generate(self, cfg: DictConfig):
         total_response_length = seq_len + cfg.max_new_tokens
 
         # 3. Setup KV cache
-        with self._device:
+        with self.device:
             self.model.setup_caches(
                 batch_size=1,
-                dtype=self._dtype,
+                dtype=self.dtype,
                 encoder_max_seq_len=(
                     self.model_transform.image_seq_len if is_multimodal_input else None
                 ),
@@ -143,7 +152,7 @@ def generate(self, cfg: DictConfig):
             torch.ones(
                 size=(total_response_length, total_response_length),
                 dtype=torch.bool,
-                device=self._device,
+                device=self.device,
             )
         )
         input_pos = torch.arange(total_response_length)
@@ -155,20 +164,20 @@ def generate(self, cfg: DictConfig):
                 [model_inputs], pad_direction="left", pad_max_images=1
             )
             batch["encoder_mask"] = batch["encoder_mask"][:, :seq_len]
-            prompt = batch.pop("tokens").to(self._device)
+            prompt = batch.pop("tokens").to(self.device)
         else:
-            prompt = torch.tensor(
-                model_inputs["tokens"], device=self._device
-            ).unsqueeze(0)
+            prompt = torch.tensor(model_inputs["tokens"], device=self.device)[None, :]
         batch["mask"] = causal_mask[None, :seq_len]
         batch["input_pos"] = input_pos[None, :seq_len]
-        utils.batch_to_device(batch, self._device)
+        utils.batch_to_device(batch, self.device)
 
         # 6. Prefill step
         generated_tokens = []
         t0 = time.perf_counter()
         logits = self.model(prompt, **batch)[:, -1]
         token = sample(logits, temperature=cfg.temperature, top_k=cfg.top_k)
+        t1 = time.perf_counter()
+        self.logger.info(f"Time to generate first token: {t1 - t0:.02f} sec")
         generated_tokens.append(token.item())
 
         if is_multimodal_input:
@@ -192,15 +201,15 @@ def generate(self, cfg: DictConfig):
             generated_tokens.append(token.item())
             seq_len += 1
 
-        t = time.perf_counter() - t0
+        t2 = time.perf_counter() - t1
 
         # 8. Translate tokens back to text
         decoded = self.model_transform.decode(generated_tokens)
-        self._logger.info(f"\n\n{decoded}\n")
+        self.logger.info(f"\n{decoded}\n")
 
         # 9. Log metrics
-        tokens_per_second = len(generated_tokens) / t
-        self.log_metrics(total_time=t, tokens_per_second=tokens_per_second)
+        tokens_per_second = len(generated_tokens) / t2
+        self.log_metrics(total_time=t2, tokens_per_second=tokens_per_second)
 
 
 @config.parse

From 86b77841a6e68db3351d2e415b222c4304712c65 Mon Sep 17 00:00:00 2001
From: joecummings
Date: Fri, 18 Oct 2024 12:49:19 -0700
Subject: [PATCH 02/19] Update config

---
 recipes/configs/llama2/generation_v2.yaml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/recipes/configs/llama2/generation_v2.yaml b/recipes/configs/llama2/generation_v2.yaml
index 1b9967829e..07ee7830a9 100644
--- a/recipes/configs/llama2/generation_v2.yaml
+++ b/recipes/configs/llama2/generation_v2.yaml
@@ -9,11 +9,10 @@
 # Model arguments
 model:
   _component_: torchtune.models.llama2.llama2_7b
-# You can uncomment the following lines to enable quantization for faster inference
-# and potentially lower VRAM
+# You can uncomment the following lines to enable quantization for faster inference and potentially lower VRAM
 # quantization_method:
-#   _component_: torchao.quantization.quant_api.int4_weight_only # int4 weight only is a good balance
-#   use_hqq: False
+#   _component_: torchao.quantization.quant_api.int4_weight_only # int4_weight_only is a good balance of speed and memory
+#   use_hqq: False # Turn on for more accurate results
 
 # Transform arguments
 tokenizer:

From e006f78fc39a4d3c95a55febaca7a2ada4039c26 Mon Sep 17 00:00:00 2001
From: joecummings
Date: Fri, 18 Oct 2024 13:13:20 -0700
Subject: [PATCH 03/19] Add initial test for quantization

---
 tests/recipes/dev/test_generate_v2.py | 52 ++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/tests/recipes/dev/test_generate_v2.py b/tests/recipes/dev/test_generate_v2.py
index be3f995f58..6bac22110c 100644
--- a/tests/recipes/dev/test_generate_v2.py
+++ b/tests/recipes/dev/test_generate_v2.py
@@ -12,7 +12,12 @@
 
 from tests.common import TUNE_PATH
 from tests.recipes.utils import MODEL_TEST_CONFIGS, write_hf_ckpt_config
-from tests.test_utils import CKPT_MODEL_PATHS, mps_ignored_test, TOKENIZER_PATHS
+from tests.test_utils import (
+    CKPT_MODEL_PATHS,
+    gpu_test,
+    mps_ignored_test,
+    TOKENIZER_PATHS,
+)
 
 
 class TestGenerateV2:
@@ -62,6 +67,51 @@ def test_llama2_generate_results(self, caplog, monkeypatch, tmpdir):
         logs = caplog.text
         assert expected_output in logs
 
+    @pytest.mark.integration_test
+    @gpu_test(gpu_count=1)
+    def test_llama2_generate_with_quantization(self, caplog, monkeypatch, tmpdir):
+        ckpt = "llama2_tune"
+        ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
+        tokenizer_path = Path(TOKENIZER_PATHS["llama2"])
+        ckpt_dir = ckpt_path.parent
+
+        # Config file needed for model conversion.
+        write_hf_ckpt_config(ckpt_dir)
+
+        cmd = f"""
+        tune run dev/generate_v2 \
+            --config llama2/generation_v2 \
+            output_dir={tmpdir} \
+            checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
+            checkpointer.checkpoint_dir='{ckpt_dir}' \
+            checkpointer.checkpoint_files=[{ckpt_path}]\
+            checkpointer.output_dir={tmpdir} \
+            checkpointer.model_type=LLAMA2 \
+            tokenizer.path=/tmp/test-artifacts/tokenizer.model \
+            device=cuda \
+            dtype=bf16 \
+            max_new_tokens=10 \
+            seed=123 \
+            quantization_method._component_=torchao.quantization.quant_api.int4_weight_only \
+        """.split()
+
+        model_config = MODEL_TEST_CONFIGS["llama2"]
+        cmd = cmd + model_config
+
+        monkeypatch.setattr(sys, "argv", cmd)
+        with pytest.raises(SystemExit, match=""):
+            runpy.run_path(TUNE_PATH, run_name="__main__")
+
+        # this is gibberish b/c the model is random weights, but it's
+        # the expected value for what we currently have in V2
+        # this test should catch any changes to the generate recipe that affect output
+        expected_output = (
+            "Halfotherтература retir pushingroad Chem CURLorientationocation Stadium"
+        )
+
+        logs = caplog.text
+        assert expected_output in logs
+
     @pytest.mark.integration_test
     def test_llama2_fail_on_bad_input(self, capsys, monkeypatch, tmpdir):
         """Should fail when user passes in a bad input:

From eafd3b294c305f81c0aaba374ac27de4d59e26a3 Mon Sep 17 00:00:00 2001
From: joecummings
Date: Fri, 18 Oct 2024 14:06:48 -0700
Subject: [PATCH 04/19] Remove annoying logging errors due to atexit usage in _dynamo

---
 tests/conftest.py                     | 8 ++++++++
 tests/recipes/dev/test_generate_v2.py | 4 ++++
 2 files changed, 12 insertions(+)

diff --git a/tests/conftest.py b/tests/conftest.py
index 7618d393e0..bae7ab0ff1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import argparse
+import logging
 import os
 import uuid
 from pathlib import Path
@@ -18,6 +19,13 @@
 CACHE_ARTIFACTS_SCRIPT_PATH = root + "/tests/cache_artifacts.sh"
 
 
+def pytest_sessionfinish():
+    """
+    Register a hook to suppress logging errors after the session finishes.
+    """
+    logging.raiseExceptions = False
+
+
 def pytest_configure(config):
     """
     This hook runs before each pytest invocation. Its purpose is to handle optional fetching
diff --git a/tests/recipes/dev/test_generate_v2.py b/tests/recipes/dev/test_generate_v2.py
index 6bac22110c..208bfd281c 100644
--- a/tests/recipes/dev/test_generate_v2.py
+++ b/tests/recipes/dev/test_generate_v2.py
@@ -10,6 +10,8 @@
 
 import pytest
 
+import torch
+
 from tests.common import TUNE_PATH
 from tests.recipes.utils import MODEL_TEST_CONFIGS, write_hf_ckpt_config
 from tests.test_utils import (
@@ -109,6 +111,8 @@ def test_llama2_generate_with_quantization(self, caplog, monkeypatch, tmpdir):
             "Halfotherтература retir pushingroad Chem CURLorientationocation Stadium"
         )
 
+        torch._dynamo.reset()
+
         logs = caplog.text
         assert expected_output in logs

From 0575b6799e1f60dfe1a8ca03400d2094234feea6 Mon Sep 17 00:00:00 2001
From: joecummings
Date: Sat, 26 Oct 2024 07:36:24 -0700
Subject: [PATCH 05/19] Update comment for what is not supported

---
 recipes/dev/generate_v2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/dev/generate_v2.py b/recipes/dev/generate_v2.py
index 5688f86b01..1fb03594be 100644
--- a/recipes/dev/generate_v2.py
+++ b/recipes/dev/generate_v2.py
@@ -65,7 +65,7 @@ class InferenceRecipe:
     This works for text-only generation and image-text generation.
 
     This *does not* currently support the following features:
-        - torch.compile
+        - torch.compile for the prefill step
         - multi-GPU generation
         - batch generation
     """

From f318412139c5287bddd2161e5bbac4e76fdce4bb Mon Sep 17 00:00:00 2001
From: joecummings
Date: Tue, 29 Oct 2024 04:21:55 -0700
Subject: [PATCH 06/19] Don't claim that HQQ is better

---
 recipes/configs/llama2/generation_v2.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/configs/llama2/generation_v2.yaml b/recipes/configs/llama2/generation_v2.yaml
index 07ee7830a9..992f22373a 100644
--- a/recipes/configs/llama2/generation_v2.yaml
+++ b/recipes/configs/llama2/generation_v2.yaml
@@ -12,7 +12,7 @@ model:
 # You can uncomment the following lines to enable quantization for faster inference and potentially lower VRAM
 # quantization_method:
 #   _component_: torchao.quantization.quant_api.int4_weight_only # int4_weight_only is a good balance of speed and memory
-#   use_hqq: False # Turn on for more accurate results
+#   use_hqq: False # Turn on to use Half-Quadratic Quantization
 
 # Transform arguments
 tokenizer:

From 2faf50cd43e52739308ed778d7789e0e868e2873 Mon Sep 17 00:00:00 2001
From: joecummings
Date: Tue, 29 Oct 2024 05:50:00 -0700
Subject: [PATCH 07/19] Debugging code for CI

---
 tests/recipes/dev/test_generate_v2.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/recipes/dev/test_generate_v2.py b/tests/recipes/dev/test_generate_v2.py
index 208bfd281c..ca7aacf0ff 100644
--- a/tests/recipes/dev/test_generate_v2.py
+++ b/tests/recipes/dev/test_generate_v2.py
@@ -6,6 +6,7 @@
 
 import runpy
 import sys
+import os
 from pathlib import Path
 
 import pytest
@@ -77,6 +78,9 @@ def test_llama2_generate_with_quantization(self, caplog, monkeypatch, tmpdir):
         tokenizer_path = Path(TOKENIZER_PATHS["llama2"])
         ckpt_dir = ckpt_path.parent
 
+        # Debugging code - remove when done
+        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+
         # Config file needed for model conversion.
         write_hf_ckpt_config(ckpt_dir)

From 80bb4e3f950265da9237bb55461631c7ccbe3819 Mon Sep 17 00:00:00 2001
From: joecummings
Date: Tue, 29 Oct 2024 06:42:08 -0700
Subject: [PATCH 08/19] Try smaller but more powerful, newer G5 runner

---
 .github/workflows/gpu_test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml
index 67b4a0705a..c91e86b801 100644
--- a/.github/workflows/gpu_test.yaml
+++ b/.github/workflows/gpu_test.yaml
@@ -24,7 +24,7 @@ defaults:
 jobs:
   gpu_test:
     if: github.repository_owner == 'pytorch'
-    runs-on: linux.8xlarge.nvidia.gpu
+    runs-on: linux.g5.4xlarge.nvidia.gpu
     strategy:
       matrix:
         python-version: ['3.9', '3.10', '3.11']

From ff2ffbaf7acf946a9b25c5cc022e52c281a00849 Mon Sep 17 00:00:00 2001
From: joecummings
Date: Tue, 29 Oct 2024 06:44:26 -0700
Subject: [PATCH 09/19] Fix linting dummy

---
 tests/recipes/dev/test_generate_v2.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/recipes/dev/test_generate_v2.py b/tests/recipes/dev/test_generate_v2.py
index ca7aacf0ff..208bfd281c 100644
--- a/tests/recipes/dev/test_generate_v2.py
+++ b/tests/recipes/dev/test_generate_v2.py
@@ -6,7 +6,6 @@
 
 import runpy
 import sys
-import os
 from pathlib import Path
 
 import pytest
@@ -77,9 +77,6 @@ def test_llama2_generate_with_quantization(self, caplog, monkeypatch, tmpdir):
         tokenizer_path = Path(TOKENIZER_PATHS["llama2"])
         ckpt_dir = ckpt_path.parent
 
-        # Debugging code - remove when done
-        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
-
         # Config file needed for model conversion.
         write_hf_ckpt_config(ckpt_dir)

From 322c8024aad132715ce63ea3cae85e3c3be31af0 Mon Sep 17 00:00:00 2001
From: joecummings
Date: Tue, 29 Oct 2024 07:22:48 -0700
Subject: [PATCH 10/19] Specify 2.5.1 in runner

---
 .github/workflows/gpu_test.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml
index c91e86b801..b74aed05a0 100644
--- a/.github/workflows/gpu_test.yaml
+++ b/.github/workflows/gpu_test.yaml
@@ -49,7 +49,10 @@ jobs:
         run: python -m pip install --pre torch torchvision torchao --index-url https://download.pytorch.org/whl/nightly/cu121
       - name: Install torch stable
         if: ${{ matrix.torch-version == 'stable' }}
-        run: python -m pip install torch torchvision torchao
+        # Need to specify torch==2.5.1 to solve CuDNN error, but it's only available in RC cut right now
+        run: |
+            python -m pip install torch==2.5.1 --index-url https://download.pytorch/whl/test/cu121
+            python -m pip install torchvision torchao
       - name: Install remaining dependencies
         run: |
           python -m pip install -e ".[dev]"

From 24508fb08b8cf7b312f29e5515067df4bcbb8e27 Mon Sep 17 00:00:00 2001
From: joecummings
Date: Tue, 29 Oct 2024 07:32:37 -0700
Subject: [PATCH 11/19] Dumb typo

---
 .github/workflows/gpu_test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml
index b74aed05a0..94bce0d98c 100644
--- a/.github/workflows/gpu_test.yaml
+++ b/.github/workflows/gpu_test.yaml
@@ -51,7 +51,7 @@ jobs:
         if: ${{ matrix.torch-version == 'stable' }}
         # Need to specify torch==2.5.1 to solve CuDNN error, but it's only available in RC cut right now
         run: |
-            python -m pip install torch==2.5.1 --index-url https://download.pytorch/whl/test/cu121
+            python -m pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/test/cu121
             python -m pip install torchvision torchao
       - name: Install remaining dependencies
         run: |

From 67718b972eeede8a60b9dd170d09dfe63d2880e7 Mon Sep 17 00:00:00 2001
From: joecummings
Date: Tue, 29 Oct 2024 07:41:23 -0700
Subject: [PATCH 12/19] Hopefully fix formatting to pick up GPU tests

---
 .github/workflows/gpu_test.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml
index 94bce0d98c..36b01a4000 100644
--- a/.github/workflows/gpu_test.yaml
+++ b/.github/workflows/gpu_test.yaml
@@ -51,8 +51,8 @@ jobs:
         if: ${{ matrix.torch-version == 'stable' }}
         # Need to specify torch==2.5.1 to solve CuDNN error, but it's only available in RC cut right now
         run: |
-            python -m pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/test/cu121
-            python -m pip install torchvision torchao
+          python -m pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/test/cu121
+          python -m pip install torchvision torchao
       - name: Install remaining dependencies
         run: |
           python -m pip install -e ".[dev]"

From a864ef9ac514e7fe310f90c9749a1ab0bf42ed31 Mon Sep 17 00:00:00 2001
From: joecummings
Date: Tue, 29 Oct 2024 08:41:51 -0700
Subject: [PATCH 13/19] Switch order of these things b/c apparently torchvision installs stable

---
 .github/workflows/gpu_test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml
index 36b01a4000..1830bd0c61 100644
--- a/.github/workflows/gpu_test.yaml
+++ b/.github/workflows/gpu_test.yaml
@@ -51,8 +51,8 @@ jobs:
         if: ${{ matrix.torch-version == 'stable' }}
         # Need to specify torch==2.5.1 to solve CuDNN error, but it's only available in RC cut right now
         run: |
-          python -m pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/test/cu121
           python -m pip install torchvision torchao
+          python -m pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/test/cu121
       - name: Install remaining dependencies
         run: |
           python -m pip install -e ".[dev]"

From 31f64b584bc365b887b2703cd21dae29ef3be625 Mon Sep 17 00:00:00 2001
From: joecummings
Date: Wed, 30 Oct 2024 10:41:08 -0700
Subject: [PATCH 14/19] 2.5.1 was released so we're all good

---
 .github/workflows/gpu_test.yaml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml
index 1830bd0c61..d430ac398f 100644
--- a/.github/workflows/gpu_test.yaml
+++ b/.github/workflows/gpu_test.yaml
@@ -49,10 +49,7 @@ jobs:
         run: python -m pip install --pre torch torchvision torchao --index-url https://download.pytorch.org/whl/nightly/cu121
       - name: Install torch stable
         if: ${{ matrix.torch-version == 'stable' }}
-        # Need to specify torch==2.5.1 to solve CuDNN error, but it's only available in RC cut right now
-        run: |
-          python -m pip install torchvision torchao
-          python -m pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/test/cu121
+        run: python -m pip install torchvision torchao torch
       - name: Install remaining dependencies
         run: |
           python -m pip install -e ".[dev]"

From fc501ee045e3cad9de487c314989f9afaae43e11 Mon Sep 17 00:00:00 2001
From: joecummings
Date: Wed, 30 Oct 2024 11:20:46 -0700
Subject: [PATCH 15/19] Test resetting compile before test

---
 recipes/dev/generate_v2.py            | 3 ++-
 tests/recipes/dev/test_generate_v2.py | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/recipes/dev/generate_v2.py b/recipes/dev/generate_v2.py
index 1fb03594be..2f58663076 100644
--- a/recipes/dev/generate_v2.py
+++ b/recipes/dev/generate_v2.py
@@ -93,8 +93,9 @@ def setup(self, cfg: DictConfig) -> None:
             from torchao.quantization.quant_api import quantize_
 
             quantization_method = config.instantiate(cfg.quantization_method)
-            compile_model(model)
             quantize_(model, quantization_method, device=self.device)
+            # Compile for most speedup
+            compile_model(model)
 
         self.model = model
 
diff --git a/tests/recipes/dev/test_generate_v2.py b/tests/recipes/dev/test_generate_v2.py
index 208bfd281c..3506bffde5 100644
--- a/tests/recipes/dev/test_generate_v2.py
+++ b/tests/recipes/dev/test_generate_v2.py
@@ -77,6 +77,8 @@ def test_llama2_generate_with_quantization(self, caplog, monkeypatch, tmpdir):
         tokenizer_path = Path(TOKENIZER_PATHS["llama2"])
         ckpt_dir = ckpt_path.parent
 
+        torch._dynamo.reset()
+
         # Config file needed for model conversion.
         write_hf_ckpt_config(ckpt_dir)

From 3594697611214272a67163861acc2f193ce3604c Mon Sep 17 00:00:00 2001
From: joecummings
Date: Thu, 31 Oct 2024 07:32:35 -0700
Subject: [PATCH 16/19] Comment out LoRA just for testing

---
 tests/recipes/test_lora_finetune_single_device.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/recipes/test_lora_finetune_single_device.py b/tests/recipes/test_lora_finetune_single_device.py
index 80bc5dc072..05bcdd8a38 100644
--- a/tests/recipes/test_lora_finetune_single_device.py
+++ b/tests/recipes/test_lora_finetune_single_device.py
@@ -135,6 +135,7 @@ def test_loss_qlora(
         tmpdir,
         monkeypatch,
     ):
+        return True
         ckpt = "llama2_meta"
         ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
         ckpt_dir = ckpt_path.parent

From d03db9fd6de9695db61802d3172a5e85fbcd051c Mon Sep 17 00:00:00 2001
From: joecummings
Date: Thu, 31 Oct 2024 07:38:07 -0700
Subject: [PATCH 17/19] TORCHINDUCTOR DISABLE CACHES

---
 tests/recipes/dev/test_generate_v2.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/recipes/dev/test_generate_v2.py b/tests/recipes/dev/test_generate_v2.py
index 3506bffde5..eabd09a458 100644
--- a/tests/recipes/dev/test_generate_v2.py
+++ b/tests/recipes/dev/test_generate_v2.py
@@ -77,13 +77,12 @@ def test_llama2_generate_with_quantization(self, caplog, monkeypatch, tmpdir):
         tokenizer_path = Path(TOKENIZER_PATHS["llama2"])
         ckpt_dir = ckpt_path.parent
 
-        torch._dynamo.reset()
-
         # Config file needed for model conversion.
         write_hf_ckpt_config(ckpt_dir)
 
         cmd = f"""
-        tune run dev/generate_v2 \
+        TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 \
+            tune run dev/generate_v2 \
             --config llama2/generation_v2 \
             output_dir={tmpdir} \
             checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \

From ae2cf5b4c3e2f87ee5c065c716d97b070d909eae Mon Sep 17 00:00:00 2001
From: joecummings
Date: Thu, 31 Oct 2024 08:30:24 -0700
Subject: [PATCH 18/19] Set backend to eager

---
 tests/recipes/dev/test_generate_v2.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/recipes/dev/test_generate_v2.py b/tests/recipes/dev/test_generate_v2.py
index eabd09a458..032d6fb730 100644
--- a/tests/recipes/dev/test_generate_v2.py
+++ b/tests/recipes/dev/test_generate_v2.py
@@ -77,8 +77,7 @@ def test_llama2_generate_with_quantization(self, caplog, monkeypatch, tmpdir):
         write_hf_ckpt_config(ckpt_dir)
 
         cmd = f"""
-        TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 \
-            tune run dev/generate_v2 \
+        tune run dev/generate_v2 \
             --config llama2/generation_v2 \
             output_dir={tmpdir} \
             checkpointer=torchtune.training.FullModelTorchTuneCheckpointer \
@@ -100,6 +100,8 @@ def test_llama2_generate_with_quantization(self, caplog, monkeypatch, tmpdir):
         model_config = MODEL_TEST_CONFIGS["llama2"]
         cmd = cmd + model_config
 
+        import os
+        os.environ["TORCH_COMPILE_BACKEND"] = "eager"
         monkeypatch.setattr(sys, "argv", cmd)
         with pytest.raises(SystemExit, match=""):
             runpy.run_path(TUNE_PATH, run_name="__main__")

From 23c9bb7807237cfb4bd9eb9e50dfd2977d0efb6f Mon Sep 17 00:00:00 2001
From: joecummings
Date: Thu, 31 Oct 2024 08:51:58 -0700
Subject: [PATCH 19/19] Fix linting

---
 tests/recipes/dev/test_generate_v2.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/recipes/dev/test_generate_v2.py b/tests/recipes/dev/test_generate_v2.py
index 032d6fb730..dff82673cb 100644
--- a/tests/recipes/dev/test_generate_v2.py
+++ b/tests/recipes/dev/test_generate_v2.py
@@ -101,6 +101,7 @@ def test_llama2_generate_with_quantization(self, caplog, monkeypatch, tmpdir):
         cmd = cmd + model_config
 
         import os
+
         os.environ["TORCH_COMPILE_BACKEND"] = "eager"
         monkeypatch.setattr(sys, "argv", cmd)
         with pytest.raises(SystemExit, match=""):
             runpy.run_path(TUNE_PATH, run_name="__main__")
@@ -114,6 +115,7 @@ def test_llama2_generate_with_quantization(self, caplog, monkeypatch, tmpdir):
         )
 
         torch._dynamo.reset()
+        del os.environ["TORCH_COMPILE_BACKEND"]
 
         logs = caplog.text
         assert expected_output in logs