Skip to content

Commit

Permalink
added fix to keyerror due to empty output dict from OOM
Browse files Browse the repository at this point in the history
  • Loading branch information
achew010 committed May 24, 2024
1 parent b6f7519 commit 2ee8902
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions scripts/benchmarks/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ def extract_gpu_memory_metrics(output_metrics) -> Tuple[float]:
when `skip_memory_metrics` is set to `False` in transformers.TrainingArguments
This function is called only when `--skip_memory_metrics` exists in the experiment args
and is set to False. The memory key values are expected to be inside output_metrics.
and is set to False. The memory key values are expected to be inside output_metrics. If
output_metrics is empty, return peak=0 and usage=0
Returns
- gpu_peak value in Bytes
Expand All @@ -106,6 +107,9 @@ def extract_gpu_memory_metrics(output_metrics) -> Tuple[float]:
# we exclude the model loading stages for now, due to
# https://github.com/foundation-model-stack/fms-acceleration/issues/18
# we will re-enable the loading stages later on once this issue is addressed
if len(output_metrics.keys())<1:
return 0, 0

trainer_stage_order = [
(HF_TRAINER_LOG_GPU_STAGE_BEFORE_INIT, False),
(HF_TRAINER_LOG_GPU_STAGE_INIT, False),
Expand All @@ -117,7 +121,6 @@ def extract_gpu_memory_metrics(output_metrics) -> Tuple[float]:
for STAGE_NAME, include in trainer_stage_order:
delta_key = f"{STAGE_NAME}_{KEYWORD_ALLOC_DELTA}"
alloc_running_sum += output_metrics[delta_key] if delta_key in output_metrics else output_metrics[STAGE_NAME]

peak_delta = output_metrics.get(f"{STAGE_NAME}_{KEYWORD_PEAKED_DELTA}", 0)
if include:
list_of_alloc_running_sums.append(alloc_running_sum)
Expand Down

0 comments on commit 2ee8902

Please sign in to comment.