From 3cf8291f2fcc40e5c731c4fa4f96f7679d1d41ee Mon Sep 17 00:00:00 2001 From: Francis Lam Date: Thu, 2 May 2024 17:14:46 -0700 Subject: [PATCH] mlperf/resnet: update beam params to increase time and quality (#4396) * mlperf/resnet: update beam params to increase time and quality * revert upcast 8 in search space and add rocm setup function * refactor to independent setup.sh script --- .../resnet/implementations/tinybox_green/dev_beam.sh | 2 +- .../resnet/implementations/tinybox_green/dev_run.sh | 2 +- .../resnet/implementations/tinybox_green/run_and_time.sh | 2 +- .../resnet/implementations/tinybox_red/dev_beam.sh | 4 +++- .../benchmarks/resnet/implementations/tinybox_red/dev_run.sh | 4 +++- .../resnet/implementations/tinybox_red/run_and_time.sh | 2 +- .../benchmarks/resnet/implementations/tinybox_red/setup.sh | 5 +++++ 7 files changed, 15 insertions(+), 6 deletions(-) create mode 100755 examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh diff --git a/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh b/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh index ae12553204..48d148b13a 100755 --- a/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh +++ b/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh @@ -4,7 +4,7 @@ export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=48 LR=7 export SPLIT_REDUCEOP=1 LAZYCACHE=0 RESET_STEP=0 -export TRAIN_BEAM=3 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=128 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=50 +export TRAIN_BEAM=3 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0 export BENCHMARK=10 DEBUG=2 diff --git a/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh b/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh index 6ff6846e2e..3d0583fd4e 100755 --- a/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh +++ b/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh @@ -4,7 +4,7 @@ export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=48 LR=7 export SPLIT_REDUCEOP=1 LAZYCACHE=0 RESET_STEP=0 -export TRAIN_BEAM=3 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=128 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=50 +export TRAIN_BEAM=3 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0 export EVAL_START_EPOCH=3 EVAL_FREQ=4 diff --git a/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh b/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh index 2c60088e96..7c3b32cc2e 100755 --- a/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh +++ b/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh @@ -7,7 +7,7 @@ export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=48 LR=7 export SPLIT_REDUCEOP=1 LAZYCACHE=0 RESET_STEP=0 -export TRAIN_BEAM=3 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=128 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=50 +export TRAIN_BEAM=3 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0 # pip install -e ".[mlperf]" export LOGMLPERF=1 diff --git a/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh index 41197eb53e..e89f1f075b 100755 --- a/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh +++ b/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh @@ -1,10 +1,12 @@ +#!/bin/bash + export PYTHONPATH="." export MODEL="resnet" export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=48 LR=7 export SPLIT_REDUCEOP=1 LAZYCACHE=0 RESET_STEP=0 -export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=128 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=25 +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0 export BENCHMARK=10 DEBUG=2 diff --git a/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_run.sh b/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_run.sh index c4c2ec5907..db49b76f4a 100755 --- a/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_run.sh +++ b/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_run.sh @@ -1,10 +1,12 @@ +#!/bin/bash + export PYTHONPATH="." export MODEL="resnet" export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=48 LR=7 export SPLIT_REDUCEOP=1 LAZYCACHE=0 RESET_STEP=0 -export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=128 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=25 +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0 export EVAL_START_EPOCH=3 EVAL_FREQ=4 diff --git a/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh b/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh index 990f60496c..9165e01369 100755 --- a/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh +++ b/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh @@ -7,7 +7,7 @@ export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=48 LR=7 export SPLIT_REDUCEOP=1 LAZYCACHE=0 RESET_STEP=0 -export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=128 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=25 +export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0 # pip install -e ".[mlperf]" export LOGMLPERF=1 diff --git a/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh b/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh new file mode 100755 index 0000000000..eefe662457 --- /dev/null +++ b/examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +rocm-smi --setprofile compute +rocm-smi --setmclk 3 +rocm-smi --setperflevel high