refactor: move tokenizer_data_utils with the rest of utils, add further unit testing. #348
@@ -20,9 +20,10 @@
 import torch

 # Local
-from tuning.data import tokenizer_data_utils
+from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize

 MODEL_NAME = "Maykeye/TinyLLama-v0"
+INPUT_TEXT = "### Text: @NortonSupport Thanks much.\n\n### Label:"


 def _inference(
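The body of `_inference` falls outside this hunk; a plausible sketch of a helper with this signature (hypothetical, inferred only from how the tests call it, not the file's actual code) is:

```python
import torch

def _inference(tokenizer, model, input_text, max_new_tokens):
    # Tokenize the prompt, generate up to `max_new_tokens` tokens with the
    # default (greedy) decoding, and decode back to text for the assertions.
    inputs = tokenizer(input_text, return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
```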
@@ -41,16 +42,16 @@ def _inference(


 def test_output_unaltered_across_embedding_resizes():
-    input_text = "### Text: @NortonSupport Thanks much.\n\n### Label:"
+    input_text = INPUT_TEXT
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     model_not_resized = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
     model_resized = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

-    tokenizer_data_utils.tokenizer_and_embedding_resize(
+    tokenizer_and_embedding_resize(
         special_tokens_dict={}, tokenizer=tokenizer, model=model_resized, multiple_of=8
     )

-    tokenizer_data_utils.tokenizer_and_embedding_resize(
+    tokenizer_and_embedding_resize(
         special_tokens_dict={},
         tokenizer=tokenizer,
         model=model_not_resized,
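The diff view truncates this hunk mid-call. Judging from the variable names in the assert of the next hunk, the elided middle of the test presumably runs `_inference` on both models so their outputs can be compared. A hedged reconstruction (the `max_new_tokens` value is an assumption, not visible in this diff):

```python
# Hypothetical reconstruction of the elided lines: run the same prompt
# through the non-resized and resized models before comparing outputs.
output_from_model_not_resized = _inference(
    tokenizer=tokenizer,
    model=model_not_resized,
    input_text=input_text,
    max_new_tokens=50,  # assumed value
)
output_from_model_resized = _inference(
    tokenizer=tokenizer,
    model=model_resized,
    input_text=input_text,
    max_new_tokens=50,  # assumed value
)
```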
@@ -74,3 +75,68 @@ def test_output_unaltered_across_embedding_resizes():
     )

     assert output_from_model_not_resized == output_from_model_resized
+
+
+def test_resize_with_special_tokens():
+    input_text = INPUT_TEXT
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+    input_tokenizer_len = len(tokenizer.get_vocab())
+
+    special_tokens = {"sep_token": "<SEP>", "pad_token": "<PAD>"}
+    resize_result = tokenizer_and_embedding_resize(
+        special_tokens_dict=special_tokens,
+        tokenizer=tokenizer,
+        model=model,
+        multiple_of=1,
+    )
+
+    assert "<SEP>" in tokenizer.get_vocab()
+    assert "<PAD>" in tokenizer.get_vocab()
+
+    output_tokenizer_len = len(tokenizer.get_vocab())
+
+    assert output_tokenizer_len == input_tokenizer_len + 2
+    assert resize_result["num_new_tokens"] == output_tokenizer_len - input_tokenizer_len
+
+    output = _inference(
+        tokenizer=tokenizer, model=model, input_text=input_text, max_new_tokens=20
+    )
+    assert output is not None
+
+
+def test_no_resize_when_no_special_tokens():
+    input_text = INPUT_TEXT
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+    input_tokenizer_len = len(tokenizer.get_vocab())
+
+    resize_result = tokenizer_and_embedding_resize(
+        special_tokens_dict={}, tokenizer=tokenizer, model=model, multiple_of=1
+    )
+
+    output_tokenizer_len = len(tokenizer.get_vocab())
+
+    assert input_tokenizer_len == output_tokenizer_len
Reviewer: in addition, we can do another assert here:
+    assert resize_result["num_new_tokens"] == 0
+
+    output = _inference(
+        tokenizer=tokenizer, model=model, input_text=input_text, max_new_tokens=20
+    )
+
+    assert output is not None
+
+
+def test_resize_with_multiple_of():
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+    resize_result = tokenizer_and_embedding_resize(
+        special_tokens_dict={}, tokenizer=tokenizer, model=model, multiple_of=8
+    )
+
+    assert resize_result["new_embedding_size"] % 8 == 0
+    assert model.get_input_embeddings().num_embeddings % 8 == 0
Reviewer: do we need to make sure that the input embeddings have a size that is a multiple of 8?
Reviewer: Yes, we do. Please ignore my silly question.
+    assert model.get_output_embeddings().out_features % 8 == 0
Reviewer: this is interesting. Is …
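For context on the `multiple_of=8` assertions in this test: padding of this kind is usually a round-up of the vocabulary size to the nearest multiple, which keeps embedding shapes tensor-core friendly. A minimal sketch of the rounding, not the library's actual code:

```python
import math

def padded_vocab_size(vocab_size: int, multiple_of: int) -> int:
    # Round the vocabulary size up to the nearest multiple of `multiple_of`.
    return int(math.ceil(vocab_size / multiple_of) * multiple_of)

assert padded_vocab_size(32003, 8) == 32008
assert padded_vocab_size(32000, 8) == 32000  # already aligned: unchanged
```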
This file was deleted.
Reviewer: Because of the new return value of tokenizer_and_embedding_resize, I think we can test it even more thoroughly here using the returned value, which is {"num_new_tokens": num_new_tokens, "new_embedding_size": embedding_size}. We can assign the output above (resize_result = tokenizer_and_embedding_resize(...)) and then here, in addition to checking that the vocabulary grew by 2, we can assert on resize_result["num_new_tokens"] as well.
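Putting the review thread together, a minimal sketch of a resize helper with the return value described above could look like the following. This is an illustration of the technique under stated assumptions (padding to a multiple and mean-initializing the new rows), not necessarily the exact implementation in tuning/utils/tokenizer_data_utils.py:

```python
import math

def tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model, multiple_of=1):
    # Register any new special tokens with the tokenizer.
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    # Pad the embedding table up to the requested multiple.
    embedding_size = int(multiple_of * math.ceil(len(tokenizer) / multiple_of))
    num_new_tokens = num_new_tokens + embedding_size - len(tokenizer)
    model.resize_token_embeddings(embedding_size)

    if num_new_tokens > 0:
        # Initialize the new rows with the mean of the existing ones so
        # generation over the old vocabulary is minimally disturbed.
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data
        input_embeddings[-num_new_tokens:] = input_embeddings[:-num_new_tokens].mean(
            dim=0, keepdim=True
        )
        output_embeddings[-num_new_tokens:] = output_embeddings[:-num_new_tokens].mean(
            dim=0, keepdim=True
        )

    return {"num_new_tokens": num_new_tokens, "new_embedding_size": embedding_size}
```

With a helper of this shape, the assertions in the tests above follow directly: with multiple_of=1 and two special tokens, num_new_tokens is 2; with an empty special_tokens_dict and multiple_of=1, it is 0; and with multiple_of=8, new_embedding_size is always divisible by 8.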