diff --git a/main.py b/main.py
index c9e3e1b56..7ba058750 100755
--- a/main.py
+++ b/main.py
@@ -157,6 +157,9 @@ def mlperf_inference_implementation_readme(spaces, model, implementation):
                     content += f"{cur_space3}* `--docker_cm_repo=`: to use a custom fork of cm4mlops repository inside the docker image\n\n"
                     content += f"{cur_space3}* `--docker_cache=no`: to not use docker cache during the image build\n"
+                    if implementation.lower() == "nvidia":
+                        content += f"{cur_space3}* `--gpu_name=`: The GPUs with configured batch sizes in CM are `orin`, `rtx_4090`, `rtx_a6000`, `rtx_6000_ada`, `l4`, `t4` and `a100`.\n"
+
                     if device.lower() not in [ "cuda" ]:
                         content += f"{cur_space3}* `--docker_os=ubuntu`: ubuntu and rhel are supported. \n"
                         content += f"{cur_space3}* `--docker_os_version=20.04`: [20.04, 22.04] are supported for Ubuntu and [8, 9] for RHEL\n"

@@ -174,6 +177,8 @@ def mlperf_inference_implementation_readme(spaces, model, implementation):
                     run_suffix += f"{cur_space3} Please click here to see more options for the RUN command\n\n"
                     run_suffix += f"{cur_space3}* Use `--division=closed` to do a closed division submission which includes compliance runs\n\n"
                     run_suffix += f"{cur_space3}* Use `--rerun` to do a rerun even when a valid run exists\n"
+                    if implementation.lower() == "nvidia":
+                        run_suffix += f"{cur_space3}* `--gpu_name=`: The GPUs with configured batch sizes in CM are `orin`, `rtx_4090`, `rtx_a6000`, `rtx_6000_ada`, `l4`, `t4` and `a100`.\n"
                     run_suffix += f"{cur_space3}\n\n"

                     if "bert" in model.lower() and framework == "deepsparse":
@@ -316,9 +321,14 @@ def get_docker_info(spaces, model, implementation, device):
         pre_space = pre_space + " "
     pre_space += " "
     #pre_space = " "
-    if implementation == "nvidia":
-        info += f"\n{pre_space}!!! tip\n\n"
-        info += f"{pre_space} If ran with `--all_models=yes`, all the benchmark models of NVIDIA implementation could be run within the same container.\n\n"
+    info += f"\n{pre_space}!!! tip\n\n"
+    info += f"{pre_space} - Batch size can be adjusted using `--batch_size=#`, where `#` is the desired batch size.\n\n"
+    if implementation.lower() == "nvidia":
+        info += f"{pre_space} - Default batch size is assigned based on either [GPU memory](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1129) or the [specified GPU](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1370). Please click on 'More options' under *docker launch* or *run command* to see how to specify the GPU name.\n\n"
+        info += f"{pre_space} - When run with `--all_models=yes`, all the benchmark models of the NVIDIA implementation can be executed within the same container.\n\n"
+        if "llama2" in model.lower():
+            info += f"{pre_space} - The dataset for NVIDIA's implementation of Llama2 is not publicly available. The user must fill [this](https://docs.google.com/forms/d/e/1FAIpQLSc_8VIvRmXM3I8KQaYnKf7gy27Z63BBoI_I1u02f4lw6rBp3g/viewform?pli=1&fbzx=-8842630989397184967) form and be verified as an MLCommons member to access the dataset.\n\n"
+            info += f"{pre_space} - `PATH_TO_PICKLE_FILE` should be replaced with the path to the downloaded pickle file.\n\n"
     return info

 def get_readme_suffix(spaces, model, implementation):
@@ -341,7 +351,6 @@ def get_run_cmd_extra(f_pre_space, model, implementation, device, scenario, scen
     f_pre_space += ""
     if scenario == "Server" or (scenario == "All Scenarios" and "Server" in scenarios):
         extra_content += f"{f_pre_space} * `<SERVER_TARGET_QPS>` must be determined manually. It is usually around 80% of the Offline QPS, but on some systems, it can drop below 50%. If a higher value is specified, the latency constraint will not be met, and the run will be considered invalid.\n"
-
     if "gptj" in model and device == "cuda" and implementation == "reference":
         extra_content += f"{f_pre_space} * `--precision=[float16|bfloat16]` can help run on GPUs with less RAM \n"
         extra_content += f"{f_pre_space} * `--beam-size=1` Beam size of 4 is mandatory for a closed division submission but reducing the beam size can help in running the model on GPUs with lower device memory\n"
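
For reference, below is a minimal standalone sketch (not part of the patch) that mirrors the tip-rendering branch added to get_docker_info(), so the generated admonition can be previewed without running the full docs build. The helper name preview_docker_tip and the example arguments are hypothetical, and the links and Llama2 form text from the patch are abbreviated.

# Standalone sketch: mirrors the new tip-rendering logic in get_docker_info().
def preview_docker_tip(spaces, model, implementation):
    info = ""
    pre_space = ""
    for i in range(1, spaces):
        pre_space = pre_space + " "
    pre_space += " "

    # Base tip shown for every implementation.
    info += f"\n{pre_space}!!! tip\n\n"
    info += f"{pre_space}    - Batch size can be adjusted using `--batch_size=#`, where `#` is the desired batch size.\n\n"
    if implementation.lower() == "nvidia":
        # NVIDIA-specific notes; links from the patch are omitted in this sketch.
        info += f"{pre_space}    - Default batch size is assigned based on either GPU memory or the specified GPU.\n\n"
        info += f"{pre_space}    - When run with `--all_models=yes`, all the benchmark models of the NVIDIA implementation can be executed within the same container.\n\n"
        if "llama2" in model.lower():
            info += f"{pre_space}    - The dataset for NVIDIA's implementation of Llama2 is not publicly available.\n\n"
    return info

if __name__ == "__main__":
    # Preview the admonition that would be emitted for an NVIDIA Llama2 docs page.
    print(preview_docker_tip(4, "llama2-70b-99", "nvidia"))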