
chore: merge set of changes for v2.1.0 #376

Merged
merged 9 commits on Oct 18, 2024
4 changes: 2 additions & 2 deletions .github/workflows/format.yml
@@ -16,9 +16,9 @@ name: Format
 
 on:
   push:
-    branches: [ "main" ]
+    branches: [ "main", "release" ]
   pull_request:
-    branches: [ "main" ]
+    branches: [ "main", "release" ]
 
 jobs:
   lint:

4 changes: 2 additions & 2 deletions .github/workflows/image.yaml
@@ -1,9 +1,9 @@
 name: Image
 on:
   push:
-    branches: [ "main" ]
+    branches: [ "main", "release" ]
   pull_request:
-    branches: [ "main" ]
+    branches: [ "main", "release" ]
 
 jobs:
   build:

4 changes: 2 additions & 2 deletions .github/workflows/test.yaml
@@ -1,9 +1,9 @@
 name: Test
 on:
   push:
-    branches: [ "main" ]
+    branches: [ "main", "release" ]
   pull_request:
-    branches: [ "main" ]
+    branches: [ "main", "release" ]
 
 jobs:
   build:

2 changes: 1 addition & 1 deletion CODEOWNERS
@@ -8,4 +8,4 @@
 # https://help.github.com/en/articles/about-code-owners
 #
 
-* @anhuong @Ssukriti @alex-jw-brooks
+* @anhuong @Ssukriti @aluu317 @fabianlim @kmehant

6 changes: 6 additions & 0 deletions build/Dockerfile
@@ -169,6 +169,12 @@ RUN mkdir /app && \
     chown -R $USER:0 /app /tmp && \
     chmod -R g+rwX /app /tmp
 
+# Set Triton environment variables for qLoRA
+ENV TRITON_HOME="/tmp/triton_home"
+ENV TRITON_DUMP_DIR="/tmp/triton_dump_dir"
+ENV TRITON_CACHE_DIR="/tmp/triton_cache_dir"
+ENV TRITON_OVERRIDE_DIR="/tmp/triton_override_dir"
+
 # Need a better way to address these hacks
 RUN if [[ "${ENABLE_AIM}" == "true" ]] ; then \
     touch /.aim_profile && \

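Note (illustrative, not part of this diff): a minimal sketch of how a container entrypoint could verify the Triton directories introduced above before training starts. Only the environment variable names come from the Dockerfile; the helper itself is hypothetical.

# Illustrative sketch only: create the Triton directories named in the Dockerfile
# if they are missing and confirm they are writable, so kernel compilation for
# qLoRA can cache its artifacts.
import os

TRITON_ENV_VARS = (
    "TRITON_HOME",
    "TRITON_DUMP_DIR",
    "TRITON_CACHE_DIR",
    "TRITON_OVERRIDE_DIR",
)

def ensure_triton_dirs() -> None:
    for name in TRITON_ENV_VARS:
        path = os.environ.get(name)
        if not path:
            continue  # unset variables fall back to Triton's own defaults
        os.makedirs(path, exist_ok=True)
        if not os.access(path, os.W_OK):
            raise PermissionError(f"{name}={path} is not writable")

if __name__ == "__main__":
    ensure_triton_dirs()
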
3 changes: 2 additions & 1 deletion fixtures/accelerate_fsdp_defaults.yaml
@@ -14,9 +14,10 @@ fsdp_config:
   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
 
   # this controls the FSDP pipelining
-  fsdp_backward_prefetch_policy: BACKWARD_PRE # set to BACKWARD_PRE for the most time-efficient pipeline
+  fsdp_backward_prefetch: BACKWARD_PRE # set to BACKWARD_PRE for the most time-efficient pipeline
                                        # but requires the most memory. BACKWARD_POST is the less
                                        # memory intensive option
+  fsdp_backward_prefetch_policy: BACKWARD_PRE # for backwards compatibility
 
   # setting this to true will increase forward memory by prefetching the next FSDP all-gather, while performing
   # the current forward pass.

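Note (illustrative, not part of this diff): the two prefetch modes discussed in the comments above correspond to torch's FSDP enum; a minimal sketch, assuming a torch build that ships the FSDP API.

# Illustrative sketch only: the YAML string values map onto torch's BackwardPrefetch enum.
from torch.distributed.fsdp import BackwardPrefetch

# BACKWARD_PRE prefetches the next all-gather before the current gradient
# computation (fastest pipeline, highest memory); BACKWARD_POST prefetches
# afterwards (lower memory, slower).
prefetch = BackwardPrefetch.BACKWARD_PRE
print(prefetch)
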
8 changes: 4 additions & 4 deletions pyproject.toml
@@ -27,14 +27,14 @@ classifiers=[
 ]
 dependencies = [
     "numpy>=1.26.4,<2.0",
-    "accelerate>=0.20.3,<0.34",
-    "transformers>4.41,<4.45",
-    "torch>=2.2.0,<3.0",
+    "accelerate>=0.20.3,!=0.34,<1.1",
+    "transformers>4.41,<4.50",
+    "torch>=2.2.0,<2.5",
     "sentencepiece>=0.1.99,<0.3",
     "tokenizers>=0.13.3,<1.0",
     "tqdm>=4.66.2,<5.0",
     "trl>=0.9.3,<1.0",
-    "peft>=0.8.0,<0.13",
+    "peft>=0.8.0,<0.14",
     "protobuf>=5.28.0,<6.0.0",
     "datasets>=2.15.0,<3.0",
     "simpleeval>=0.9.13,<1.0",

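Note (illustrative, not part of this diff): a quick sketch for checking an existing environment against the updated pins above, assuming the packaging library is installed (it ships with most Python toolchains).

# Illustrative sketch only: verify installed versions against the updated pins.
from importlib.metadata import PackageNotFoundError, version

from packaging.requirements import Requirement

UPDATED_PINS = [
    "accelerate>=0.20.3,!=0.34,<1.1",
    "transformers>4.41,<4.50",
    "torch>=2.2.0,<2.5",
    "peft>=0.8.0,<0.14",
]

for pin in UPDATED_PINS:
    req = Requirement(pin)
    try:
        installed = version(req.name)
    except PackageNotFoundError:
        print(f"{req.name}: not installed")
        continue
    ok = req.specifier.contains(installed, prereleases=True)
    print(f"{req.name} {installed}: {'ok' if ok else f'outside {req.specifier}'}")
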
4 changes: 2 additions & 2 deletions scripts/run_inference.py
@@ -34,7 +34,7 @@
 import torch
 
 # Local
-from tuning.data import tokenizer_data_utils
+from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
 
 
 ### Utilities
@@ -219,7 +219,7 @@ def load(
     # where the model's layers are modified, in our case the embedding layer
     # is modified, so we resize the backbone model's embedding layer with our own
     # utility before passing it along to load the PEFT model.
-    tokenizer_data_utils.tokenizer_and_embedding_resize(
+    tokenizer_and_embedding_resize(
         {}, tokenizer=tokenizer, model=base_model
     )
     model = PeftModel.from_pretrained(

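Note (illustrative, not part of this diff): with the helper now living in tuning.utils.tokenizer_data_utils, it can be imported and called directly. The signature and the returned keys below are taken from the tests in this PR; the rest is a sketch.

# Illustrative sketch only: direct use of the relocated helper.
from transformers import AutoModelForCausalLM, AutoTokenizer

from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize

model_name = "Maykeye/TinyLLama-v0"  # tiny model also used by the tests
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add a PAD token and pad the embedding matrix up to a multiple of 8.
result = tokenizer_and_embedding_resize(
    special_tokens_dict={"pad_token": "<PAD>"},
    tokenizer=tokenizer,
    model=model,
    multiple_of=8,
)
print(result["num_new_tokens"], result["new_embedding_size"])
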
2 changes: 1 addition & 1 deletion tests/build/dummy_job_config.json
@@ -5,7 +5,7 @@
     "dynamo_use_dynamic": true,
     "num_machines": 1,
     "main_process_port": 1234,
-    "fsdp_backward_prefetch_policy": "TRANSFORMER_BASED_WRAP",
+    "fsdp_backward_prefetch": "TRANSFORMER_BASED_WRAP",
     "fsdp_sharding_strategy": 1,
     "fsdp_state_dict_type": "FULL_STATE_DICT",
     "fsdp_cpu_ram_efficient_loading": true,

2 changes: 1 addition & 1 deletion tests/build/test_utils.py
@@ -44,7 +44,7 @@ def test_process_accelerate_launch_args(job_config):
     args = process_accelerate_launch_args(job_config)
     # json config values used
     assert args.use_fsdp is True
-    assert args.fsdp_backward_prefetch_policy == "TRANSFORMER_BASED_WRAP"
+    assert args.fsdp_backward_prefetch == "TRANSFORMER_BASED_WRAP"
     assert args.env == ["env1", "env2"]
     assert args.training_script == "tuning.sft_trainer"
     assert args.config_file == "fixtures/accelerate_fsdp_defaults.yaml"

74 changes: 70 additions & 4 deletions tests/utils/test_embedding_resize.py
@@ -20,9 +20,10 @@
 import torch
 
 # Local
-from tuning.data import tokenizer_data_utils
+from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
 
 MODEL_NAME = "Maykeye/TinyLLama-v0"
+INPUT_TEXT = "### Text: @NortonSupport Thanks much.\n\n### Label:"
 
 
 def _inference(
@@ -41,16 +42,16 @@ def _inference(
 
 
 def test_output_unaltered_across_embedding_resizes():
-    input_text = "### Text: @NortonSupport Thanks much.\n\n### Label:"
+    input_text = INPUT_TEXT
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     model_not_resized = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
     model_resized = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
 
-    tokenizer_data_utils.tokenizer_and_embedding_resize(
+    tokenizer_and_embedding_resize(
         special_tokens_dict={}, tokenizer=tokenizer, model=model_resized, multiple_of=8
     )
 
-    tokenizer_data_utils.tokenizer_and_embedding_resize(
+    tokenizer_and_embedding_resize(
         special_tokens_dict={},
         tokenizer=tokenizer,
         model=model_not_resized,
@@ -74,3 +75,68 @@ def test_output_unaltered_across_embedding_resizes():
     )
 
     assert output_from_model_not_resized == output_from_model_resized
+
+
+def test_resize_with_special_tokens():
+    input_text = INPUT_TEXT
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+    input_tokenizer_len = len(tokenizer.get_vocab())
+
+    special_tokens = {"sep_token": "<SEP>", "pad_token": "<PAD>"}
+    resize_result = tokenizer_and_embedding_resize(
+        special_tokens_dict=special_tokens,
+        tokenizer=tokenizer,
+        model=model,
+        multiple_of=1,
+    )
+
+    assert "<SEP>" in tokenizer.get_vocab()
+    assert "<PAD>" in tokenizer.get_vocab()
+
+    output_tokenizer_len = len(tokenizer.get_vocab())
+
+    assert output_tokenizer_len == input_tokenizer_len + 2
+    assert resize_result["num_new_tokens"] == output_tokenizer_len - input_tokenizer_len
+
+    output = _inference(
+        tokenizer=tokenizer, model=model, input_text=input_text, max_new_tokens=20
+    )
+    assert output is not None
+
+
+def test_no_resize_when_no_special_tokens():
+    input_text = INPUT_TEXT
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+    input_tokenizer_len = len(tokenizer.get_vocab())
+
+    resize_result = tokenizer_and_embedding_resize(
+        special_tokens_dict={}, tokenizer=tokenizer, model=model, multiple_of=1
+    )
+
+    output_tokenizer_len = len(tokenizer.get_vocab())
+
+    assert input_tokenizer_len == output_tokenizer_len
+    assert resize_result["num_new_tokens"] == 0
+
+    output = _inference(
+        tokenizer=tokenizer, model=model, input_text=input_text, max_new_tokens=20
+    )
+
+    assert output is not None
+
+
+def test_resize_with_multiple_of():
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+    resize_result = tokenizer_and_embedding_resize(
+        special_tokens_dict={}, tokenizer=tokenizer, model=model, multiple_of=8
+    )
+
+    assert model.get_input_embeddings().embedding_dim % 8 == 0
+    assert resize_result["new_embedding_size"] % 8 == 0
+    assert model.get_output_embeddings().out_features % 8 == 0

2 changes: 1 addition & 1 deletion tests/utils/test_tokenizer_data_utils.py
@@ -7,7 +7,7 @@
 
 # Local
 # First party
-from tuning.data.tokenizer_data_utils import tokenizer_and_embedding_resize
+from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
 
 
 def test_tokenizer_and_embedding_resize_return_values():

13 changes: 0 additions & 13 deletions tuning/data/__init__.py

This file was deleted.

7 changes: 5 additions & 2 deletions tuning/sft_trainer.py
@@ -53,7 +53,6 @@
     FileLoggingTrackerConfig,
     TrackerConfigFactory,
 )
-from tuning.data import tokenizer_data_utils
 from tuning.trackers.tracker_factory import FILE_LOGGING_TRACKER, get_tracker
 from tuning.trainercontroller import TrainerControllerCallback
 from tuning.utils.config_utils import get_hf_peft_config, get_json_config
@@ -70,6 +69,7 @@
     is_pretokenized_dataset,
     validate_data_args,
 )
+from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
 
 
 def train(
@@ -294,7 +294,7 @@ def train(
 
     # TODO: lower priority but understand if resizing impacts inference quality and why its needed.
     # It makes sense if we manipulate tokenizer that we also save it and provide it to inference.
-    added_tokens_dict = tokenizer_data_utils.tokenizer_and_embedding_resize(
+    added_tokens_dict = tokenizer_and_embedding_resize(
         special_tokens_dict=special_tokens_dict,
         tokenizer=tokenizer,
         model=model,
@@ -637,6 +637,9 @@ def main():
     combined_tracker_configs.file_logger_config = file_logger_config
     combined_tracker_configs.aim_config = aim_config
 
+    if training_args.output_dir:
+        os.makedirs(training_args.output_dir, exist_ok=True)
+        logger.info("using the output directory at %s", training_args.output_dir)
     try:
         trainer, additional_train_info = train(
             model_args=model_args,