From c1f4faacd20a79ecfe69eafafd62ffb0f81a7476 Mon Sep 17 00:00:00 2001
From: Will Johnson
Date: Wed, 25 Sep 2024 10:18:59 -0400
Subject: [PATCH 1/5] cleanup: Move tokenizer_data_utils from /data to /utils,
 change imports

Signed-off-by: Will Johnson
---
 scripts/run_inference.py                       |  4 ++--
 tests/utils/test_embedding_resize.py           |  6 +++---
 tuning/data/__init__.py                        | 13 -------------
 tuning/sft_trainer.py                          |  4 ++--
 tuning/{data => utils}/tokenizer_data_utils.py |  0
 5 files changed, 7 insertions(+), 20 deletions(-)
 delete mode 100644 tuning/data/__init__.py
 rename tuning/{data => utils}/tokenizer_data_utils.py (100%)

diff --git a/scripts/run_inference.py b/scripts/run_inference.py
index 7e4465cac..de8462826 100644
--- a/scripts/run_inference.py
+++ b/scripts/run_inference.py
@@ -34,7 +34,7 @@
 import torch
 
 # Local
-from tuning.data import tokenizer_data_utils
+from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
 
 
 ### Utilities
@@ -219,7 +219,7 @@ def load(
         # where the model's layers are modified, in our case the embedding layer
         # is modified, so we resize the backbone model's embedding layer with our own
         # utility before passing it along to load the PEFT model.
-        tokenizer_data_utils.tokenizer_and_embedding_resize(
+        tokenizer_and_embedding_resize(
            {}, tokenizer=tokenizer, model=base_model
        )
        model = PeftModel.from_pretrained(
diff --git a/tests/utils/test_embedding_resize.py b/tests/utils/test_embedding_resize.py
index 9a72f397b..d531348d2 100644
--- a/tests/utils/test_embedding_resize.py
+++ b/tests/utils/test_embedding_resize.py
@@ -20,7 +20,7 @@
 import torch
 
 # Local
-from tuning.data import tokenizer_data_utils
+from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
 
 MODEL_NAME = "Maykeye/TinyLLama-v0"
 
@@ -46,11 +46,11 @@ def test_output_unaltered_across_embedding_resizes():
     model_not_resized = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
     model_resized = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
 
-    tokenizer_data_utils.tokenizer_and_embedding_resize(
+    tokenizer_and_embedding_resize(
         special_tokens_dict={}, tokenizer=tokenizer, model=model_resized, multiple_of=8
     )
 
-    tokenizer_data_utils.tokenizer_and_embedding_resize(
+    tokenizer_and_embedding_resize(
         special_tokens_dict={}, tokenizer=tokenizer, model=model_not_resized,
diff --git a/tuning/data/__init__.py b/tuning/data/__init__.py
deleted file mode 100644
index 38a9531ef..000000000
--- a/tuning/data/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright The FMS HF Tuning Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py
index beb894624..2f9bf0c98 100644
--- a/tuning/sft_trainer.py
+++ b/tuning/sft_trainer.py
@@ -53,9 +53,9 @@
     FileLoggingTrackerConfig,
     TrackerConfigFactory,
 )
-from tuning.data import tokenizer_data_utils
 from tuning.trackers.tracker_factory import FILE_LOGGING_TRACKER, get_tracker
 from tuning.trainercontroller import TrainerControllerCallback
+from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
 from tuning.utils.config_utils import get_hf_peft_config, get_json_config
 from tuning.utils.data_type_utils import get_torch_dtype
 from tuning.utils.error_logging import (
@@ -298,7 +298,7 @@ def train(
 
     # TODO: lower priority but understand if resizing impacts inference quality and why its needed.
     # It makes sense if we manipulate tokenizer that we also save it and provide it to inference.
-    tokenizer_data_utils.tokenizer_and_embedding_resize(
+    tokenizer_and_embedding_resize(
         special_tokens_dict=special_tokens_dict,
         tokenizer=tokenizer,
         model=model,
diff --git a/tuning/data/tokenizer_data_utils.py b/tuning/utils/tokenizer_data_utils.py
similarity index 100%
rename from tuning/data/tokenizer_data_utils.py
rename to tuning/utils/tokenizer_data_utils.py

From ac05a6d2a436bd4b23565f5ec1ac01d2ba7b44c4 Mon Sep 17 00:00:00 2001
From: Will Johnson
Date: Wed, 25 Sep 2024 11:02:25 -0400
Subject: [PATCH 2/5] tests: Add additional tests for test_embedding_resize to
 check resize with special tokens, resize multiple of. fmt

Signed-off-by: Will Johnson
---
 tests/utils/test_embedding_resize.py | 65 +++++++++++++++++++++++++++-
 tuning/sft_trainer.py                |  2 +-
 2 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/tests/utils/test_embedding_resize.py b/tests/utils/test_embedding_resize.py
index d531348d2..5f26addb9 100644
--- a/tests/utils/test_embedding_resize.py
+++ b/tests/utils/test_embedding_resize.py
@@ -23,6 +23,7 @@
 from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
 
 MODEL_NAME = "Maykeye/TinyLLama-v0"
+INPUT_TEXT = "### Text: @NortonSupport Thanks much.\n\n### Label:"
 
 
 def _inference(
@@ -41,7 +42,7 @@ def _inference(
 
 
 def test_output_unaltered_across_embedding_resizes():
-    input_text = "### Text: @NortonSupport Thanks much.\n\n### Label:"
+    input_text = INPUT_TEXT
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     model_not_resized = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
     model_resized = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
@@ -74,3 +75,65 @@ def test_output_unaltered_across_embedding_resizes():
     )
 
     assert output_from_model_not_resized == output_from_model_resized
+
+
+def test_resize_with_special_tokens():
+    input_text = INPUT_TEXT
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+    input_tokenizer_len = len(tokenizer.get_vocab())
+
+    special_tokens = {"sep_token": "<SEP>", "pad_token": "<PAD>"}
+    tokenizer_and_embedding_resize(
+        special_tokens_dict=special_tokens,
+        tokenizer=tokenizer,
+        model=model,
+        multiple_of=1,
+    )
+
+    assert "<SEP>" in tokenizer.get_vocab()
+    assert "<PAD>" in tokenizer.get_vocab()
+
+    output_tokenizer_len = len(tokenizer.get_vocab())
+
+    assert output_tokenizer_len == input_tokenizer_len + 2
+
+    output = _inference(
+        tokenizer=tokenizer, model=model, input_text=input_text, max_new_tokens=20
+    )
+    assert output is not None
+
+
+def test_no_resize_when_no_special_tokens():
+    input_text = INPUT_TEXT
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+    input_tokenizer_len = len(tokenizer.get_vocab())
+
+    tokenizer_and_embedding_resize(
+        special_tokens_dict={}, tokenizer=tokenizer, model=model, multiple_of=1
+    )
+
+    output_tokenizer_len = len(tokenizer.get_vocab())
+
+    assert input_tokenizer_len == output_tokenizer_len
+
+    output = _inference(
+        tokenizer=tokenizer, model=model, input_text=input_text, max_new_tokens=20
+    )
+
+    assert output is not None
+
+
+def test_resize_with_multiple_of():
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+    tokenizer_and_embedding_resize(
+        special_tokens_dict={}, tokenizer=tokenizer, model=model, multiple_of=8
+    )
+
+    assert model.get_input_embeddings().embedding_dim % 8 == 0
+    assert model.get_output_embeddings().out_features % 8 == 0
diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py
index 2f9bf0c98..be002e820 100644
--- a/tuning/sft_trainer.py
+++ b/tuning/sft_trainer.py
@@ -55,7 +55,6 @@
 )
 from tuning.trackers.tracker_factory import FILE_LOGGING_TRACKER, get_tracker
 from tuning.trainercontroller import TrainerControllerCallback
-from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
 from tuning.utils.config_utils import get_hf_peft_config, get_json_config
 from tuning.utils.data_type_utils import get_torch_dtype
 from tuning.utils.error_logging import (
@@ -70,6 +69,7 @@
     is_pretokenized_dataset,
     validate_data_args,
 )
+from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
 
 
 def train(

From d9dcd2c4444437b1534715169627883aed3be355 Mon Sep 17 00:00:00 2001
From: Will Johnson
Date: Fri, 4 Oct 2024 09:17:42 -0400
Subject: [PATCH 3/5] lint

Signed-off-by: Will Johnson
---
 tests/utils/test_tokenizer_data_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/utils/test_tokenizer_data_utils.py b/tests/utils/test_tokenizer_data_utils.py
index 118805100..1afd34d4d 100644
--- a/tests/utils/test_tokenizer_data_utils.py
+++ b/tests/utils/test_tokenizer_data_utils.py
@@ -7,7 +7,7 @@
 # Local
 # First party
-from tuning.data.tokenizer_data_utils import tokenizer_and_embedding_resize
+from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
 
 
 def test_tokenizer_and_embedding_resize_return_values():

From b8c4f9d5e90ffbb2b2a859d38cdccc25a5309548 Mon Sep 17 00:00:00 2001
From: Will Johnson
Date: Mon, 7 Oct 2024 12:43:22 -0400
Subject: [PATCH 4/5] fix: more thorough testing from output of function

Signed-off-by: Will Johnson
---
 tests/utils/test_embedding_resize.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/utils/test_embedding_resize.py b/tests/utils/test_embedding_resize.py
index 5f26addb9..8be327ef5 100644
--- a/tests/utils/test_embedding_resize.py
+++ b/tests/utils/test_embedding_resize.py
@@ -85,7 +85,7 @@ def test_resize_with_special_tokens():
     input_tokenizer_len = len(tokenizer.get_vocab())
 
     special_tokens = {"sep_token": "<SEP>", "pad_token": "<PAD>"}
-    tokenizer_and_embedding_resize(
+    resize_result = tokenizer_and_embedding_resize(
         special_tokens_dict=special_tokens,
         tokenizer=tokenizer,
         model=model,
@@ -98,6 +98,7 @@ def test_resize_with_special_tokens():
     output_tokenizer_len = len(tokenizer.get_vocab())
 
     assert output_tokenizer_len == input_tokenizer_len + 2
+    assert resize_result["num_new_tokens"] == output_tokenizer_len - input_tokenizer_len
 
     output = _inference(
         tokenizer=tokenizer, model=model, input_text=input_text, max_new_tokens=20
@@ -112,13 +113,14 @@ def test_no_resize_when_no_special_tokens():
     input_tokenizer_len = len(tokenizer.get_vocab())
 
-    tokenizer_and_embedding_resize(
+    resize_result = tokenizer_and_embedding_resize(
         special_tokens_dict={}, tokenizer=tokenizer, model=model, multiple_of=1
     )
 
     output_tokenizer_len = len(tokenizer.get_vocab())
 
     assert input_tokenizer_len == output_tokenizer_len
+    assert resize_result["num_new_tokens"] == 0
 
     output = _inference(
         tokenizer=tokenizer, model=model, input_text=input_text, max_new_tokens=20
@@ -131,9 +133,10 @@ def test_resize_with_multiple_of():
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
 
-    tokenizer_and_embedding_resize(
+    resize_result = tokenizer_and_embedding_resize(
         special_tokens_dict={}, tokenizer=tokenizer, model=model, multiple_of=8
     )
+    assert resize_result["new_embedding_size"] % 8 == 0
 
     assert model.get_input_embeddings().embedding_dim % 8 == 0
     assert model.get_output_embeddings().out_features % 8 == 0

From 998eff323ff543bfe30f66b0f9f2ae6d5e7a5e55 Mon Sep 17 00:00:00 2001
From: Will Johnson
Date: Tue, 8 Oct 2024 16:43:11 -0400
Subject: [PATCH 5/5] test: move assertion

Signed-off-by: Will Johnson
---
 tests/utils/test_embedding_resize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/utils/test_embedding_resize.py b/tests/utils/test_embedding_resize.py
index 8be327ef5..43c6adbd4 100644
--- a/tests/utils/test_embedding_resize.py
+++ b/tests/utils/test_embedding_resize.py
@@ -137,6 +137,6 @@ def test_resize_with_multiple_of():
         special_tokens_dict={}, tokenizer=tokenizer, model=model, multiple_of=8
     )
-    assert resize_result["new_embedding_size"] % 8 == 0
 
     assert model.get_input_embeddings().embedding_dim % 8 == 0
+    assert resize_result["new_embedding_size"] % 8 == 0
     assert model.get_output_embeddings().out_features % 8 == 0
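
Usage note (not part of the patches above): a minimal sketch of how the relocated
helper can be exercised once this series is applied. It relies only on what the
diffs themselves establish, namely the new import path from PATCH 1/5, the
special_tokens_dict / tokenizer / model / multiple_of parameters, the <SEP>/<PAD>
tokens used by the tests, and the "num_new_tokens" / "new_embedding_size" keys
asserted in PATCH 4/5 and 5/5. The full return type and any behaviour beyond that
are assumptions, so treat this as an illustration rather than documented API.

    # Illustrative sketch mirroring tests/utils/test_embedding_resize.py after this series.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Import path introduced by PATCH 1/5 (previously tuning.data.tokenizer_data_utils).
    from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize

    MODEL_NAME = "Maykeye/TinyLLama-v0"  # same small model the tests load

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

    # Add <SEP>/<PAD> and pad the embedding matrix to a multiple of 8, combining
    # what test_resize_with_special_tokens and test_resize_with_multiple_of cover.
    resize_result = tokenizer_and_embedding_resize(
        special_tokens_dict={"sep_token": "<SEP>", "pad_token": "<PAD>"},
        tokenizer=tokenizer,
        model=model,
        multiple_of=8,
    )

    # Sanity checks based on the assertions in this series; the dict-style return
    # value (and how padding to a multiple affects num_new_tokens) is inferred from
    # the tests above, not from separate documentation.
    assert "<SEP>" in tokenizer.get_vocab() and "<PAD>" in tokenizer.get_vocab()
    assert resize_result["num_new_tokens"] >= 2
    assert resize_result["new_embedding_size"] % 8 == 0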