Improvements to arguments and finetuning (#393)
abhishekkrthakur authored Dec 14, 2023
1 parent 3064377 commit baaf553
Showing 21 changed files with 532 additions and 569 deletions.
6 changes: 3 additions & 3 deletions Dockerfile
@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive \
TZ=UTC \
@@ -64,8 +64,8 @@ RUN conda create -p /app/env -y python=3.10
SHELL ["conda", "run","--no-capture-output", "-p","/app/env", "/bin/bash", "-c"]

RUN conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia && \
conda clean -ya
#conda install -c "nvidia/label/cuda-12.1.0" cuda-nvcc && conda clean -ya && \
conda clean -ya && \
conda install -c "nvidia/label/cuda-12.1.0" cuda-nvcc && conda clean -ya
#conda install -c "nvidia/label/cuda-12.1.0" cuda-toolkit && conda clean -ya

# install NGC CLI
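
The base image switches from the CUDA devel variant to the slimmer runtime variant, and nvcc is instead installed through conda's cuda-nvcc package (useful for extensions that compile CUDA kernels). A minimal sanity check for the built image might look like the sketch below; it is an illustration, not part of this commit.

# Hypothetical sanity check inside the built container (not part of this commit).
import shutil
import subprocess

import torch

nvcc_path = shutil.which("nvcc")  # provided by conda's cuda-nvcc in the runtime-based image
print("nvcc:", nvcc_path)
if nvcc_path:
    print(subprocess.run([nvcc_path, "--version"], capture_output=True, text=True).stdout)
print("CUDA available to PyTorch:", torch.cuda.is_available())
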
6 changes: 3 additions & 3 deletions requirements.txt
@@ -24,11 +24,11 @@ invisible-watermark==0.2.0
packaging==23.1
# latest versions
tensorboard
peft==0.6.2
peft==0.7.1
trl==0.7.4
tiktoken==0.5.1
transformers==4.35.2
accelerate==0.24.0
transformers==4.36.1
accelerate==0.25.0
diffusers==0.21.4
bitsandbytes==0.41.0
# extras
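
The pinned versions move to peft 0.7.1, transformers 4.36.1, and accelerate 0.25.0. A small helper like the following (not part of the repository) can confirm that an environment matches the new pins:

# Illustrative check that installed packages match the pins above (not part of this commit).
from importlib.metadata import PackageNotFoundError, version

expected = {
    "peft": "0.7.1",
    "trl": "0.7.4",
    "transformers": "4.36.1",
    "accelerate": "0.25.0",
}
for package, wanted in expected.items():
    try:
        installed = version(package)
    except PackageNotFoundError:
        installed = "not installed"
    status = "OK" if installed == wanted else f"MISMATCH (expected {wanted})"
    print(f"{package}: {installed} {status}")
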
22 changes: 13 additions & 9 deletions src/autotrain/app.py
@@ -75,16 +75,23 @@
PARAMS["llm"] = LLMTrainingParams(
target_modules="",
log="tensorboard",
fp16=True,
use_int4=True,
use_int8=False,
use_peft=True,
mixed_precision="fp16",
quantization="int4",
peft=True,
block_size=1024,
epochs=3,
).model_dump()

PARAMS["text-classification"] = TextClassificationParams().model_dump()
PARAMS["image-classification"] = ImageClassificationParams().model_dump()
PARAMS["text-classification"] = TextClassificationParams(
mixed_precision="fp16",
).model_dump()
PARAMS["image-classification"] = ImageClassificationParams(
mixed_precision="fp16",
).model_dump()
PARAMS["seq2seq"] = Seq2SeqParams(
mixed_precision="fp16",
).model_dump()
PARAMS["tabular"] = TabularParams().model_dump()
PARAMS["dreambooth"] = DreamBoothTrainingParams(
prompt="<enter your prompt here>",
num_steps=500,
@@ -94,9 +101,6 @@
gradient_accumulation=4,
lr=1e-4,
).model_dump()
PARAMS["seq2seq"] = Seq2SeqParams().model_dump()
PARAMS["tabular"] = TabularParams().model_dump()


app = FastAPI()
# app.mount("/css", StaticFiles(directory="css"), name="css")
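
The default parameter sets now use the consolidated argument names (mixed_precision, quantization, peft) in place of the old fp16/use_int4/use_int8/use_peft flags, and the seq2seq and tabular defaults are grouped with the other tasks. A sketch of the new style, using only the field names visible in this diff (anything beyond those values is illustrative):

# Sketch of the consolidated argument style; values beyond those shown in the diff are illustrative.
from autotrain.trainers.clm.params import LLMTrainingParams

llm_defaults = LLMTrainingParams(
    target_modules="",
    log="tensorboard",
    mixed_precision="fp16",  # replaces fp16=True
    quantization="int4",     # replaces use_int4=True / use_int8=False
    peft=True,               # replaces use_peft=True
    block_size=1024,
    epochs=3,
).model_dump()               # serialized to a plain dict, as stored in PARAMS["llm"]
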
182 changes: 11 additions & 171 deletions src/autotrain/app_utils.py
@@ -6,12 +6,13 @@

import psutil
import requests
import torch

from autotrain import config, logger
from autotrain.commands import launch_command
from autotrain.trainers.clm.params import LLMTrainingParams
from autotrain.trainers.dreambooth.params import DreamBoothTrainingParams
from autotrain.trainers.generic.params import GenericParams
from autotrain.trainers.image_classification.params import ImageClassificationParams
from autotrain.trainers.seq2seq.params import Seq2SeqParams
from autotrain.trainers.tabular.params import TabularParams
from autotrain.trainers.text_classification.params import TextClassificationParams
@@ -21,7 +22,6 @@ def get_process_status(pid):
try:
process = psutil.Process(pid)
proc_status = process.status()
logger.info(f"Process status: {proc_status}")
return proc_status
except psutil.NoSuchProcess:
logger.info(f"No process found with PID: {pid}")
@@ -108,188 +108,28 @@ def run_training(params, task_id, local=False):
logger.info(params)
if task_id == 9:
params = LLMTrainingParams(**params)
if not local:
params.project_name = "/tmp/model"
else:
params.project_name = os.path.join("output", params.project_name)
params.save(output_dir=params.project_name)
num_gpus = torch.cuda.device_count()
if num_gpus == 0:
raise ValueError("No GPU found. Please use a GPU instance.")
if num_gpus == 1:
cmd = [
"accelerate",
"launch",
"--num_machines",
"1",
"--num_processes",
"1",
]
else:
if params.use_int4 or params.use_int8 or (params.fp16 and params.use_peft):
cmd = [
"accelerate",
"launch",
"--multi_gpu",
"--num_machines",
"1",
"--num_processes",
]
cmd.append(str(num_gpus))
else:
cmd = [
"accelerate",
"launch",
"--use_deepspeed",
"--zero_stage",
"3",
"--offload_optimizer_device",
"cpu",
"--offload_param_device",
"cpu",
"--zero3_save_16bit_model",
"true",
]
cmd.append("--mixed_precision")
if params.fp16:
cmd.append("fp16")
else:
cmd.append("no")

cmd.extend(
[
"-m",
"autotrain.trainers.clm",
"--training_config",
os.path.join(params.project_name, "training_params.json"),
]
)
elif task_id == 28:
params = Seq2SeqParams(**params)
if not local:
params.project_name = "/tmp/model"
else:
params.project_name = os.path.join("output", params.project_name)
params.save(output_dir=params.project_name)
num_gpus = torch.cuda.device_count()
if num_gpus == 0:
raise ValueError("No GPU found. Please use a GPU instance.")
if num_gpus == 1:
cmd = [
"accelerate",
"launch",
"--num_machines",
"1",
"--num_processes",
"1",
]
else:
if params.use_int8 or (params.fp16 and params.use_peft):
cmd = [
"accelerate",
"launch",
"--multi_gpu",
"--num_machines",
"1",
"--num_processes",
]
cmd.append(str(num_gpus))
else:
cmd = [
"accelerate",
"launch",
"--use_deepspeed",
"--zero_stage",
"3",
"--offload_optimizer_device",
"cpu",
"--offload_param_device",
"cpu",
"--zero3_save_16bit_model",
"true",
]
cmd.append("--mixed_precision")
if params.fp16:
cmd.append("fp16")
else:
cmd.append("no")

cmd.extend(
[
"-m",
"autotrain.trainers.seq2seq",
"--training_config",
os.path.join(params.project_name, "training_params.json"),
]
)
elif task_id in (1, 2):
params = TextClassificationParams(**params)
if not local:
params.project_name = "/tmp/model"
else:
params.project_name = os.path.join("output", params.project_name)
params.save(output_dir=params.project_name)
cmd = ["accelerate", "launch", "--num_machines", "1", "--num_processes", "1"]
cmd.append("--mixed_precision")
if params.fp16:
cmd.append("fp16")
else:
cmd.append("no")

cmd.extend(
[
"-m",
"autotrain.trainers.text_classification",
"--training_config",
os.path.join(params.project_name, "training_params.json"),
]
)
elif task_id in (13, 14, 15, 16, 26):
params = TabularParams(**params)
if not local:
params.project_name = "/tmp/model"
else:
params.project_name = os.path.join("output", params.project_name)
params.save(output_dir=params.project_name)
cmd = [
"python",
"-m",
"autotrain.trainers.tabular",
"--training_config",
os.path.join(params.project_name, "training_params.json"),
]
elif task_id == 27:
params = GenericParams(**params)
if not local:
params.project_name = "/tmp/model"
else:
params.project_name = os.path.join("output", params.project_name)
params.save(output_dir=params.project_name)
cmd = [
"python",
"-m",
"autotrain.trainers.generic",
"--config",
os.path.join(params.project_name, "training_params.json"),
]
elif task_id == 25:
params = DreamBoothTrainingParams(**params)
if not local:
params.project_name = "/tmp/model"
else:
params.project_name = os.path.join("output", params.project_name)
params.save(output_dir=params.project_name)
cmd = [
"python",
"-m",
"autotrain.trainers.dreambooth",
"--training_config",
os.path.join(params.project_name, "training_params.json"),
]

elif task_id == 18:
params = ImageClassificationParams(**params)
else:
raise NotImplementedError

if not local:
params.project_name = "/tmp/model"
else:
params.project_name = os.path.join("output", params.project_name)
params.save(output_dir=params.project_name)
cmd = launch_command(params=params)

cmd = [str(c) for c in cmd]
logger.info(cmd)
env = os.environ.copy()
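
run_training now only resolves the task-specific params class and delegates command construction to launch_command from autotrain.commands, replacing the per-task accelerate/deepspeed command building removed above. Judging from the deleted single-GPU branch, the command for an LLM job would look roughly like the sketch below; the exact output of launch_command is an assumption, not verified here.

# Rough reconstruction of the single-GPU LLM command that launch_command is expected
# to return (assumption based on the removed code above; paths are placeholders).
import os
import subprocess

project_name = "output/my-llm-project"  # hypothetical project directory
cmd = [
    "accelerate", "launch",
    "--num_machines", "1",
    "--num_processes", "1",
    "--mixed_precision", "fp16",
    "-m", "autotrain.trainers.clm",
    "--training_config", os.path.join(project_name, "training_params.json"),
]
subprocess.Popen([str(c) for c in cmd], env=os.environ.copy())
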
40 changes: 30 additions & 10 deletions src/autotrain/backend.py
@@ -143,6 +143,17 @@ def _text_clf_munge_data(params, username):
return params.data_path


def _img_clf_munge_data(params, username):
train_data_path = f"{params.data_path}/{params.train_split}"
if params.valid_split is not None:
valid_data_path = f"{params.data_path}/{params.valid_split}"
else:
valid_data_path = None
if os.path.isdir(train_data_path) or (valid_data_path is not None and os.path.isdir(valid_data_path)):
raise Exception("Image classification is not yet supported for local datasets.")
return params.data_path


def _dreambooth_munge_data(params, username):
# check if params.image_path is a directory
if os.path.isdir(params.image_path):
@@ -219,6 +230,8 @@ def __post_init__(self):
self.task_id = 25
elif isinstance(self.params, Seq2SeqParams):
self.task_id = 28
elif isinstance(self.params, ImageClassificationParams):
self.task_id = 18
else:
raise NotImplementedError

@@ -255,6 +268,11 @@ def prepare(self):
data_path = _seq2seq_munge_data(self.params, self.username)
space_id = self._create_space()
return space_id
if isinstance(self.params, ImageClassificationParams):
self.task_id = 18
data_path = _img_clf_munge_data(self.params, self.username)
space_id = self._create_space()
return space_id
raise NotImplementedError

def _create_readme(self):
@@ -418,6 +436,7 @@ class NGCRunner:
job_name: str
env_vars: dict
backend: str
enable_diag: bool = False

def __post_init__(self):
self.ngc_ace = os.environ.get("NGC_ACE")
@@ -474,16 +493,17 @@ def create(self):
logger.error(ngc_config_process.stderr.read())
raise Exception("Failed to set NGC API key")

# ngc_diag_cmd = ["ngc", "diag", "all"]
# process = subprocess.run(ngc_diag_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
# output = process.stdout
# error = process.stderr
# if process.returncode != 0:
# logger.info("NGC DIAG ALL Error occurred:")
# logger.info(error)
# else:
# logger.info("NGC DIAG ALL output:")
# logger.info(output)
if self.enable_diag:
ngc_diag_cmd = ["ngc", "diag", "all"]
process = subprocess.run(ngc_diag_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
output = process.stdout
error = process.stderr
if process.returncode != 0:
logger.info("NGC DIAG ALL Error occurred:")
logger.info(error)
else:
logger.info("NGC DIAG ALL output:")
logger.info(output)

logger.info("Creating NGC Job")
subprocess.run(
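
NGCRunner gains an enable_diag flag (default False) that, when set, runs ngc diag all before the job is created, replacing the previously commented-out block. A hedged usage sketch follows; the field values are placeholders, not values taken from the repository.

# Illustrative only: enabling the new NGC diagnostics path. All values are placeholders.
from autotrain.backend import NGCRunner

runner = NGCRunner(
    job_name="autotrain-llm-job",     # hypothetical job name
    env_vars={"HF_TOKEN": "hf_xxx"},  # hypothetical environment passed to the job
    backend="dgx-a100",               # hypothetical NGC backend identifier
    enable_diag=True,                 # runs `ngc diag all` before job creation
)
runner.create()
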
