Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: move tokenizer_data_utils with the rest of utils, add further unit testing. #348

Merged
merged 7 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions scripts/run_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
import torch

# Local
from tuning.data import tokenizer_data_utils
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize


### Utilities
Expand Down Expand Up @@ -219,7 +219,7 @@ def load(
# where the model's layers are modified, in our case the embedding layer
# is modified, so we resize the backbone model's embedding layer with our own
# utility before passing it along to load the PEFT model.
tokenizer_data_utils.tokenizer_and_embedding_resize(
tokenizer_and_embedding_resize(
{}, tokenizer=tokenizer, model=base_model
)
model = PeftModel.from_pretrained(
Expand Down
74 changes: 70 additions & 4 deletions tests/utils/test_embedding_resize.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@
import torch

# Local
from tuning.data import tokenizer_data_utils
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize

MODEL_NAME = "Maykeye/TinyLLama-v0"
INPUT_TEXT = "### Text: @NortonSupport Thanks much.\n\n### Label:"


def _inference(
Expand All @@ -41,16 +42,16 @@ def _inference(


def test_output_unaltered_across_embedding_resizes():
input_text = "### Text: @NortonSupport Thanks much.\n\n### Label:"
input_text = INPUT_TEXT
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model_not_resized = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model_resized = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

tokenizer_data_utils.tokenizer_and_embedding_resize(
tokenizer_and_embedding_resize(
special_tokens_dict={}, tokenizer=tokenizer, model=model_resized, multiple_of=8
)

tokenizer_data_utils.tokenizer_and_embedding_resize(
tokenizer_and_embedding_resize(
special_tokens_dict={},
tokenizer=tokenizer,
model=model_not_resized,
Expand All @@ -74,3 +75,68 @@ def test_output_unaltered_across_embedding_resizes():
)

assert output_from_model_not_resized == output_from_model_resized


def test_resize_with_special_tokens():
    """Resizing with a special-tokens dict should register each new token in the
    tokenizer vocab, report the added count in the returned dict, and leave the
    model usable for generation."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    vocab_size_before = len(tokenizer.get_vocab())

    resize_result = tokenizer_and_embedding_resize(
        special_tokens_dict={"sep_token": "<SEP>", "pad_token": "<PAD>"},
        tokenizer=tokenizer,
        model=model,
        multiple_of=1,
    )

    vocab = tokenizer.get_vocab()
    assert "<SEP>" in vocab
    assert "<PAD>" in vocab

    vocab_size_after = len(vocab)
    assert vocab_size_after == vocab_size_before + 2
    assert resize_result["num_new_tokens"] == vocab_size_after - vocab_size_before

    # Smoke-test that the model still generates after the resize.
    output = _inference(
        tokenizer=tokenizer, model=model, input_text=INPUT_TEXT, max_new_tokens=20
    )
    assert output is not None


def test_no_resize_when_no_special_tokens():
    """An empty special-tokens dict with multiple_of=1 must leave the tokenizer
    vocab untouched and report zero new tokens."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    vocab_size_before = len(tokenizer.get_vocab())

    resize_result = tokenizer_and_embedding_resize(
        special_tokens_dict={}, tokenizer=tokenizer, model=model, multiple_of=1
    )

    assert len(tokenizer.get_vocab()) == vocab_size_before
    assert resize_result["num_new_tokens"] == 0

    # The model should still generate after a no-op resize.
    output = _inference(
        tokenizer=tokenizer, model=model, input_text=INPUT_TEXT, max_new_tokens=20
    )
    assert output is not None


def test_resize_with_multiple_of():
    """With multiple_of=8 and no new tokens, the resize should still pad the
    vocab-sized dimensions (embedding rows, LM-head outputs, and the reported
    new embedding size) to a multiple of 8."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

    resize_result = tokenizer_and_embedding_resize(
        special_tokens_dict={}, tokenizer=tokenizer, model=model, multiple_of=8
    )

    # Fix: the resize pads the number of embedding rows (vocab size), not the
    # hidden dimension. `embedding_dim` is the hidden size and is divisible by 8
    # for virtually any transformer, so the previous assert on it was vacuous.
    assert model.get_input_embeddings().num_embeddings % 8 == 0
    assert resize_result["new_embedding_size"] % 8 == 0
    assert model.get_output_embeddings().out_features % 8 == 0
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is interesting — is `out_features` the same as the embedding size?
We can also test here that `resize_result["new_embedding_size"] % 8 == 0`, though.

2 changes: 1 addition & 1 deletion tests/utils/test_tokenizer_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

# Local
# First party
from tuning.data.tokenizer_data_utils import tokenizer_and_embedding_resize
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize


def test_tokenizer_and_embedding_resize_return_values():
Expand Down
13 changes: 0 additions & 13 deletions tuning/data/__init__.py

This file was deleted.

4 changes: 2 additions & 2 deletions tuning/sft_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@
FileLoggingTrackerConfig,
TrackerConfigFactory,
)
from tuning.data import tokenizer_data_utils
from tuning.trackers.tracker_factory import FILE_LOGGING_TRACKER, get_tracker
from tuning.trainercontroller import TrainerControllerCallback
from tuning.utils.config_utils import get_hf_peft_config, get_json_config
Expand All @@ -70,6 +69,7 @@
is_pretokenized_dataset,
validate_data_args,
)
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize


def train(
Expand Down Expand Up @@ -294,7 +294,7 @@ def train(

# TODO: lower priority but understand if resizing impacts inference quality and why its needed.
# It makes sense if we manipulate tokenizer that we also save it and provide it to inference.
added_tokens_dict = tokenizer_data_utils.tokenizer_and_embedding_resize(
added_tokens_dict = tokenizer_and_embedding_resize(
special_tokens_dict=special_tokens_dict,
tokenizer=tokenizer,
model=model,
Expand Down
Loading