Merge pull request #23 from anandhu-eng/cm_readme_inference_update
added tips about batch size and dataset for nvidia llama2
arjunsuresh authored Sep 10, 2024
2 parents b30e51a + b8188c2 commit 3369b3c
Showing 1 changed file with 13 additions and 4 deletions.
17 changes: 13 additions & 4 deletions main.py
@@ -157,6 +157,9 @@ def mlperf_inference_implementation_readme(spaces, model, implementation):
content += f"{cur_space3}* `--docker_cm_repo=<Custom CM repo URL>`: to use a custom fork of cm4mlops repository inside the docker image\n\n"
content += f"{cur_space3}* `--docker_cache=no`: to not use docker cache during the image build\n"

if implementation.lower() == "nvidia":
content += f"{cur_space3}* `--gpu_name=<Name of the GPU>` : The GPUs with configured batch sizes in CM are `orin`, `rtx_4090`, `rtx_a6000`, `rtx_6000_ada`, `l4`, `t4`and `a100`.\n"

if device.lower() not in [ "cuda" ]:
content += f"{cur_space3}* `--docker_os=ubuntu`: ubuntu and rhel are supported. \n"
content += f"{cur_space3}* `--docker_os_version=20.04`: [20.04, 22.04] are supported for Ubuntu and [8, 9] for RHEL\n"
@@ -174,6 +177,8 @@ def mlperf_inference_implementation_readme(spaces, model, implementation):
run_suffix += f"{cur_space3}<summary> Please click here to see more options for the RUN command</summary>\n\n"
run_suffix += f"{cur_space3}* Use `--division=closed` to do a closed division submission which includes compliance runs\n\n"
run_suffix += f"{cur_space3}* Use `--rerun` to do a rerun even when a valid run exists\n"
if implementation.lower() == "nvidia":
run_suffix += f"{cur_space3}* `--gpu_name=<Name of the GPU>` : The GPU's with configured batch sizes in CM are `orin`, `rtx_4090`, `rtx_a6000`, `rtx_6000_ada`, `l4`, `t4`and `a100`.\n"
run_suffix += f"{cur_space3}</details>\n\n"

if "bert" in model.lower() and framework == "deepsparse":
@@ -316,9 +321,14 @@ def get_docker_info(spaces, model, implementation, device):
pre_space = pre_space + " "
pre_space += " "
#pre_space = " "
if implementation == "nvidia":
info += f"\n{pre_space}!!! tip\n\n"
info+= f"{pre_space} If ran with `--all_models=yes`, all the benchmark models of NVIDIA implementation could be run within the same container.\n\n"
info += f"\n{pre_space}!!! tip\n\n"
info+= f"{pre_space} - Batch size could be adjusted using `--batch_size=#`, where `#` is the desired batch size.\n\n"
if implementation.lower() == "nvidia":
info+= f"{pre_space} - Default batch size is assigned based on either [GPU memory](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1129) or the [specified GPU](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1370). Please click more option for *docker launch* or *run command* to see how to specify the GPU name.\n\n"
info+= f"{pre_space} - When run with `--all_models=yes`, all the benchmark models of NVIDIA implementation can be executed within the same container.\n\n"
if "llama2" in model.lower():
info+= f"{pre_space} - The dataset for NVIDIA's implementation of Llama2 is not publicly available. The user must fill [this](https://docs.google.com/forms/d/e/1FAIpQLSc_8VIvRmXM3I8KQaYnKf7gy27Z63BBoI_I1u02f4lw6rBp3g/viewform?pli=1&fbzx=-8842630989397184967) form and be verified as a MLCommons member to access the dataset.\n\n"
info+= f"{pre_space} - `PATH_TO_PICKE_FILE` should be replaced with path to the downloaded pickle file.\n\n"
return info

def get_readme_suffix(spaces, model, implementation):
@@ -341,7 +351,6 @@ def get_run_cmd_extra(f_pre_space, model, implementation, device, scenario, scen
f_pre_space += ""
if scenario == "Server" or (scenario == "All Scenarios" and "Server" in scenarios):
extra_content += f"{f_pre_space} * `<SERVER_TARGET_QPS>` must be determined manually. It is usually around 80% of the Offline QPS, but on some systems, it can drop below 50%. If a higher value is specified, the latency constraint will not be met, and the run will be considered invalid.\n"

if "gptj" in model and device == "cuda" and implementation == "reference":
extra_content += f"{f_pre_space} * `--precision=[float16|bfloat16]` can help run on GPUs with less RAM \n"
extra_content += f"{f_pre_space} * `--beam-size=1` Beam size of 4 is mandatory for a closed division submission but reducing the beam size can help in running the model on GPUs with lower device memory\n"
