fix some benchmark issues
Signed-off-by: Yu Chin Fabian Lim <[email protected]>
fabianlim committed Jul 31, 2024
1 parent 50b9404 commit b04e2c0
Showing 3 changed files with 38 additions and 16 deletions.
11 changes: 9 additions & 2 deletions scripts/benchmarks/benchmark.py
@@ -90,6 +90,7 @@
 RESULT_FIELD_ALLOCATED_GPU_MEM = "mem_torch_mem_alloc_in_bytes"
 RESULT_FIELD_PEAK_ALLOCATED_GPU_MEM = "mem_peak_torch_mem_alloc_in_bytes"
 ERROR_MESSAGES = "error_messages"
+DRY_RUN_MESSAGE = "dry_run"


 def extract_gpu_memory_metrics(output_metrics) -> Tuple[float]:
@@ -366,14 +367,19 @@ def __init__(

     @property
     def is_completed(self):
+
         if not os.path.exists(self.results_filename):
             return False
         # otherwise open it and check for errors
         with open(self.results_filename) as f:
             results = json.load(f)

         # return complete only if no errors
-        return not ERROR_MESSAGES in results
+        # and is not a dry run
+        return (
+            not ERROR_MESSAGES in results and
+            results.get(DRY_RUN_MESSAGE, False) == False
+        )

     def run(
         self,
@@ -558,7 +564,8 @@ def _dummy(*args, **kwargs):
     def get_experiment_final_metrics(
         self, final_metrics_keys: List[str] = ["train_loss", "train_runtime"]
     ):
-        return {}
+        # will insert a special dry run key
+        return {DRY_RUN_MESSAGE: True}

     def maybe_get_experiment_error_traceback(self):
         return None
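For readers unfamiliar with the benchmark harness, the effect of the two changes above is that a dry run now writes a marker key into its results file, and is_completed refuses to treat such a file as a finished experiment. A minimal, self-contained sketch of that behaviour (the file name and contents below are hypothetical, not taken from the commit):

import json
import os

DRY_RUN_MESSAGE = "dry_run"
ERROR_MESSAGES = "error_messages"

def is_completed(results_filename: str) -> bool:
    # no results file yet -> not completed
    if not os.path.exists(results_filename):
        return False
    with open(results_filename) as f:
        results = json.load(f)
    # completed only if there were no errors and it was not a dry run
    return ERROR_MESSAGES not in results and not results.get(DRY_RUN_MESSAGE, False)

# a results file produced by a dry run is no longer counted as completed,
# so a subsequent real run will not be skipped
with open("results.json", "w") as f:
    json.dump({DRY_RUN_MESSAGE: True}, f)
print(is_completed("results.json"))  # prints: False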
31 changes: 21 additions & 10 deletions scripts/benchmarks/compare_with_reference.py
@@ -98,22 +98,33 @@ def main(
     result_dir, reference_benchmark_filepath, plot_columns, threshold_ratio, indices
 ):
     ref, args_ref = read_df(reference_benchmark_filepath, indices, plot_columns)
+    new_benchmark_filepath = os.path.join(result_dir, BENCHMARK_FILENAME)
     df, args_df = read_df(
-        os.path.join(result_dir, BENCHMARK_FILENAME), indices, plot_columns
+        new_benchmark_filepath, indices, plot_columns
     )
     # Analyse between both sets of results and retrieve outliers
     # - this has a side effect of plotting the charts
     outliers_df, outliers, charts = compare_results(
         df, ref, plot_columns, threshold_ratio=threshold_ratio
     )
-    # Find arguments that are different between ref and new
-    # to highlight as possible cause of anomaly
-    diff = args_df.compare(args_ref, align_axis=1).rename(
-        columns={"self": "new", "other": "ref"}, level=-1
-    )
-    diff = diff[diff.index.isin([outlier for outlier in outliers])]
-    if not diff.empty:
-        outliers_df = outliers_df.set_index(indices).merge(
-            diff, left_index=True, right_index=True
+    # this logic is brittle and will not hold if new benchmark is not
+    # of the exact same format as the reference benchmark,
+    # so put a try-catch.
+    try:
+        # Find arguments that are different between ref and new
+        # to highlight as possible cause of anomaly
+        diff = args_df.compare(args_ref, align_axis=1).rename(
+            columns={"self": "new", "other": "ref"}, level=-1
+        )
+        diff = diff[diff.index.isin([outlier for outlier in outliers])]
+        if not diff.empty:
+            outliers_df = outliers_df.set_index(indices).merge(
+                diff, left_index=True, right_index=True
+            )
+    except ValueError:
+        print (
+            f"New '{new_benchmark_filepath}' is probably a partial bench. So unable"
+            "to properly compare if the arguments are consistent with old bench."
+        )
     outliers_df.to_csv(os.path.join(result_dir, OUTLIERS_FILENAME))
     for chart, filename in charts:
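The try/except above guards pandas' DataFrame.compare, which requires identically labeled frames and raises ValueError otherwise (for example when a partial or dry-run benchmark has fewer rows than the reference). A small illustration of that failure mode, using made-up column and index values rather than the real benchmark fields:

import pandas as pd

ref = pd.DataFrame({"learning_rate": [1e-5, 2e-5]}, index=["expt-0", "expt-1"])
new = pd.DataFrame({"learning_rate": [1e-5]}, index=["expt-0"])  # partial bench

try:
    diff = new.compare(ref, align_axis=1)
except ValueError as err:
    # pandas refuses to compare frames whose labels do not match
    print(f"unable to compare arguments: {err}")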
12 changes: 8 additions & 4 deletions scripts/run_benchmarks.sh
@@ -44,7 +44,7 @@ MEMORY_LOGGING=${MEMORY_LOGGING:-"all"}
 NUM_GPUS_MATRIX=${1-"1 2"}
 RESULT_DIR=${2:-"benchmark_outputs"}
 SCENARIOS_CONFIG=${3:-$SCENARIOS_CONFIG}
-SCENARIOS_FILTER=${4-$SCNTAG_PEFT_AUTOGPTQ}
+SCENARIOS_FILTER=${4:-$SCNTAG_PEFT_AUTOGPTQ}

 echo "NUM_GPUS_MATRIX: $NUM_GPUS_MATRIX"
 echo "RESULT_DIR: $RESULT_DIR"
@@ -77,7 +77,7 @@ PIP_REQUIREMENTS_FILE=$RESULT_DIR/$PIP_REQUIREMENTS_FILE
 # preload models by default
 EXTRA_ARGS="--preload_models"

-if [ ! -z "$SCENARIOS_FILTER" ]; then
+if [ "$SCENARIOS_FILTER" != "none" ]; then
     EXTRA_ARGS="$EXTRA_ARGS --run_only_scenarios $SCENARIOS_FILTER"
 fi

@@ -137,5 +137,9 @@ PYTHONPATH=. \
     'error_messages' \
     'acceleration_framework_config_file'

-PYTHONPATH=. \
-python $WORKING_DIR/compare_with_reference.py --result_dir $RESULT_DIR
+if [ "$DRY_RUN" = "true" ]; then
+    echo "DRY_RUN=True, will skip compare with reference logic"
+else
+    PYTHONPATH=. \
+    python $WORKING_DIR/compare_with_reference.py --result_dir $RESULT_DIR
+fi
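Two notes on the shell changes, neither of which is spelled out in the commit itself. First, ${4-...} only falls back to the default when the fourth argument is unset, whereas ${4:-...} also falls back when it is set but empty; combined with the new != "none" test, passing the literal string none presumably disables the scenario filter, while an empty or missing argument keeps the $SCNTAG_PEFT_AUTOGPTQ default. A tiny bash illustration of the parameter-expansion difference:

#!/bin/bash
set -- 1 outputs scenarios.yaml ""   # fourth positional argument set, but empty

echo "${4-default}"     # prints an empty line: set-but-empty suppresses the fallback
echo "${4:-default}"    # prints "default": empty also triggers the fallback

Second, the final block assumes a DRY_RUN variable is provided elsewhere (for example exported by the caller); only when it equals "true" is the comparison against the reference benchmark skipped.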
