Skip to content

Commit

Permalink
Deepspeed (#387)
Browse files: browse the repository at this point in the history
  • Loading branch information
abhishekkrthakur authored Dec 12, 2023
1 parent fa11c28 commit 1102882
Show file tree
Hide file tree
Showing 10 changed files with 145 additions and 48 deletions.
3 changes: 2 additions & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ test.py
.vscode/
op*
op_*
.git
.git
*.db
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ output2/
logs/
op_*/
autotrain.db

*.db
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
8 changes: 5 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive \
TZ=UTC \
Expand Down Expand Up @@ -64,8 +64,9 @@ RUN conda create -p /app/env -y python=3.10
SHELL ["conda", "run","--no-capture-output", "-p","/app/env", "/bin/bash", "-c"]

RUN conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia && \
conda clean -ya && \
conda install -c "nvidia/label/cuda-12.1.0" cuda-nvcc && conda clean -ya
conda clean -ya
#conda install -c "nvidia/label/cuda-12.1.0" cuda-nvcc && conda clean -ya && \
#conda install -c "nvidia/label/cuda-12.1.0" cuda-toolkit && conda clean -ya

# install NGC CLI
RUN wget --content-disposition https://api.ngc.nvidia.com/v2/resources/nvidia/ngc-apps/ngc_cli/versions/3.34.1/files/ngccli_linux.zip -O ngccli_linux.zip && unzip ngccli_linux.zip && \
Expand All @@ -76,4 +77,5 @@ RUN pip install -e . && \
python -m nltk.downloader punkt && \
autotrain setup && \
pip install flash-attn && \
pip install deepspeed && \
pip cache purge
53 changes: 51 additions & 2 deletions src/autotrain/app_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import psutil
import requests
import torch

from autotrain import config, logger
from autotrain.trainers.clm.params import LLMTrainingParams
Expand Down Expand Up @@ -112,7 +113,31 @@ def run_training(params, task_id, local=False):
else:
params.project_name = os.path.join("output", params.project_name)
params.save(output_dir=params.project_name)
cmd = ["accelerate", "launch", "--num_machines", "1", "--num_processes", "1"]
num_gpus = torch.cuda.device_count()
if params.use_int4 or params.use_int8 or (params.fp16 and params.use_peft):
cmd = [
"accelerate",
"launch",
"--multi_gpu",
"--num_machines",
"1",
"--num_processes",
]
cmd.append(str(num_gpus))
else:
cmd = [
"accelerate",
"launch",
"--use_deepspeed",
"--zero_stage",
"3",
"--offload_optimizer_device",
"cpu",
"--offload_param_device",
"cpu",
"--zero3_save_16bit_model",
"true",
]
cmd.append("--mixed_precision")
if params.fp16:
cmd.append("fp16")
Expand All @@ -134,7 +159,31 @@ def run_training(params, task_id, local=False):
else:
params.project_name = os.path.join("output", params.project_name)
params.save(output_dir=params.project_name)
cmd = ["accelerate", "launch", "--num_machines", "1", "--num_processes", "1"]
num_gpus = torch.cuda.device_count()
if params.use_int8 or (params.fp16 and params.use_peft):
cmd = [
"accelerate",
"launch",
"--multi_gpu",
"--num_machines",
"1",
"--num_processes",
]
cmd.append(str(num_gpus))
else:
cmd = [
"accelerate",
"launch",
"--use_deepspeed",
"--zero_stage",
"3",
"--offload_optimizer_device",
"cpu",
"--offload_param_device",
"cpu",
"--zero3_save_16bit_model",
"true",
]
cmd.append("--mixed_precision")
if params.fp16:
cmd.append("fp16")
Expand Down
20 changes: 10 additions & 10 deletions src/autotrain/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,16 +474,16 @@ def create(self):
logger.error(ngc_config_process.stderr.read())
raise Exception("Failed to set NGC API key")

ngc_diag_cmd = ["ngc", "diag", "all"]
process = subprocess.run(ngc_diag_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
output = process.stdout
error = process.stderr
if process.returncode != 0:
logger.info("NGC DIAG ALL Error occurred:")
logger.info(error)
else:
logger.info("NGC DIAG ALL output:")
logger.info(output)
# ngc_diag_cmd = ["ngc", "diag", "all"]
# process = subprocess.run(ngc_diag_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
# output = process.stdout
# error = process.stderr
# if process.returncode != 0:
# logger.info("NGC DIAG ALL Error occurred:")
# logger.info(error)
# else:
# logger.info("NGC DIAG ALL output:")
# logger.info(output)

logger.info("Creating NGC Job")
subprocess.run(
Expand Down
33 changes: 24 additions & 9 deletions src/autotrain/cli/run_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,15 +549,30 @@ def run(self):
if self.num_gpus == 1:
train_llm(params)
else:
cmd = [
"accelerate",
"launch",
"--multi_gpu",
"--num_machines",
"1",
"--num_processes",
]
cmd.append(str(self.num_gpus))
if self.args.use_int4 or self.args.use_int8 or (self.args.fp16 and self.args.use_peft):
cmd = [
"accelerate",
"launch",
"--multi_gpu",
"--num_machines",
"1",
"--num_processes",
]
cmd.append(str(self.num_gpus))
else:
cmd = [
"accelerate",
"launch",
"--use_deepspeed",
"--zero_stage",
"3",
"--offload_optimizer_device",
"cpu",
"--offload_param_device",
"cpu",
"--zero3_save_16bit_model",
"true",
]
cmd.append("--mixed_precision")
if self.args.fp16:
cmd.append("fp16")
Expand Down
33 changes: 24 additions & 9 deletions src/autotrain/cli/run_seq2seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,15 +401,30 @@ def run(self):
if self.num_gpus == 1:
train_seq2seq(params)
else:
cmd = [
"accelerate",
"launch",
"--multi_gpu",
"--num_machines",
"1",
"--num_processes",
]
cmd.append(str(self.num_gpus))
if self.args.use_int8 or (self.args.fp16 and self.args.use_peft):
cmd = [
"accelerate",
"launch",
"--multi_gpu",
"--num_machines",
"1",
"--num_processes",
]
cmd.append(str(self.num_gpus))
else:
cmd = [
"accelerate",
"launch",
"--use_deepspeed",
"--zero_stage",
"3",
"--offload_optimizer_device",
"cpu",
"--offload_param_device",
"cpu",
"--zero3_save_16bit_model",
"true",
]
cmd.append("--mixed_precision")
if self.args.fp16:
cmd.append("fp16")
Expand Down
14 changes: 9 additions & 5 deletions src/autotrain/cli/run_text_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,12 +353,16 @@ def run(self):
cmd = [
"accelerate",
"launch",
"--multi_gpu",
"--num_machines",
"1",
"--num_processes",
"--use_deepspeed",
"--zero_stage",
"3",
"--offload_optimizer_device",
"cpu",
"--offload_param_device",
"cpu",
"--zero3_save_16bit_model",
"true",
]
cmd.append(str(self.num_gpus))
cmd.append("--mixed_precision")
if self.args.fp16:
cmd.append("fp16")
Expand Down
19 changes: 13 additions & 6 deletions src/autotrain/trainers/clm/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,33 +148,40 @@ def train(config):
bnb_4bit_use_double_quant=False,
)
config.fp16 = True
additional_kwargs = {
"device_map": {"": Accelerator().process_index} if torch.cuda.is_available() else None,
"torch_dtype": torch.float16,
}
elif config.use_int8:
bnb_config = BitsAndBytesConfig(load_in_8bit=config.use_int8)
config.fp16 = True
additional_kwargs = {
"device_map": {"": Accelerator().process_index} if torch.cuda.is_available() else None,
"torch_dtype": torch.float16,
}
else:
bnb_config = None
additional_kwargs = {}

if config.trainer == "reward":
model = AutoModelForSequenceClassification.from_pretrained(
config.model,
config=model_config,
token=config.token,
quantization_config=bnb_config,
torch_dtype=torch.float16,
device_map={"": Accelerator().process_index} if torch.cuda.is_available() else None,
trust_remote_code=True,
use_flash_attention_2=config.use_flash_attention_2,
**additional_kwargs,
)
else:
model = AutoModelForCausalLM.from_pretrained(
config.model,
config=model_config,
token=config.token,
quantization_config=bnb_config,
torch_dtype=torch.float16,
device_map={"": Accelerator().process_index} if torch.cuda.is_available() else None,
trust_remote_code=True,
use_flash_attention_2=config.use_flash_attention_2,
**additional_kwargs,
)
model_ref = None
else:
Expand Down Expand Up @@ -401,7 +408,7 @@ def train(config):
**trainer_args,
train_dataset=train_data,
eval_dataset=valid_data if config.valid_split is not None else None,
peft_config=peft_config,
peft_config=peft_config if config.use_peft else None,
tokenizer=tokenizer,
)
elif config.trainer == "dpo":
Expand Down Expand Up @@ -429,7 +436,7 @@ def train(config):
max_length=max_length,
max_prompt_length=max_prompt_length,
max_target_length=max_target_length,
peft_config=peft_config,
peft_config=peft_config if config.use_peft else None,
)
else:
raise ValueError(f"trainer `{config.trainer}` not supported")
Expand Down
8 changes: 6 additions & 2 deletions src/autotrain/trainers/seq2seq/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,17 +92,21 @@ def train(config):
if config.use_int8:
bnb_config = BitsAndBytesConfig(load_in_8bit=config.use_int8)
config.fp16 = True
additional_kwargs = {
"device_map": {"": Accelerator().process_index} if torch.cuda.is_available() else None,
"torch_dtype": torch.float16,
}
else:
bnb_config = None
additional_kwargs = {}

model = AutoModelForSeq2SeqLM.from_pretrained(
config.model,
config=model_config,
token=config.token,
quantization_config=bnb_config,
torch_dtype=torch.float16 if config.fp16 else None,
device_map={"": Accelerator().process_index} if torch.cuda.is_available() else None,
trust_remote_code=True,
**additional_kwargs,
)
else:
model = AutoModelForSeq2SeqLM.from_pretrained(
Expand Down

0 comments on commit 1102882

Please sign in to comment.