From a3f938dbee2d1fd7d76eb851f37dec5833951619 Mon Sep 17 00:00:00 2001 From: chenyu Date: Wed, 23 Apr 2025 14:32:54 -0400 Subject: [PATCH] remove retinanet INITMLPERF from beam script (#10011) it only controls logging, loading real data or not is solely controlled by RUNMLPERF --- examples/mlperf/model_train.py | 10 +++++----- .../benchmarks/retinanet/tinybox_green/dev_beam.sh | 3 +-- .../benchmarks/retinanet/tinybox_red/dev_beam.sh | 3 +-- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/examples/mlperf/model_train.py b/examples/mlperf/model_train.py index 937a12d9a1..990f5bdbd7 100644 --- a/examples/mlperf/model_train.py +++ b/examples/mlperf/model_train.py @@ -361,7 +361,7 @@ def train_retinanet(): NUM_CLASSES = len(MLPERF_CLASSES) BASEDIR = getenv("BASEDIR", BASEDIR) BENCHMARK = getenv("BENCHMARK") - INITMLPERF = getenv("INITMLPERF") + # INITMLPERF = getenv("INITMLPERF") RUNMLPERF = getenv("RUNMLPERF") config["gpus"] = GPUS = [f"{Device.DEFAULT}:{i}" for i in range(getenv("GPUS", 6))] @@ -479,7 +479,7 @@ def train_retinanet(): # ** training loop ** BEAM.value = TRAIN_BEAM - if INITMLPERF: + if not RUNMLPERF: i, proc = 0, _fake_data_get(BS) else: train_dataloader = batch_load_retinanet(train_dataset, False, base_dir_path, batch_size=BS, seed=SEED) @@ -499,7 +499,7 @@ def train_retinanet(): if len(prev_cookies) == getenv("STORE_COOKIES", 1): prev_cookies = [] # free previous cookies after gpu work has been enqueued try: - if INITMLPERF: + if not RUNMLPERF: next_proc = _fake_data_get(BS) else: next_proc = _data_get(it) @@ -552,7 +552,7 @@ def train_retinanet(): if getenv("RESET_STEP", 1): _train_step.reset() with Tensor.train(mode=False), Tensor.test(): - if INITMLPERF: + if not RUNMLPERF: i, proc = 0, _fake_data_get(EVAL_BS, val=(val:=True)) else: val_dataloader = batch_load_retinanet(val_dataset, (val:=True), Path(BASEDIR), batch_size=EVAL_BS, shuffle=False, seed=SEED) @@ -583,7 +583,7 @@ def train_retinanet(): if len(prev_cookies) == getenv("STORE_COOKIES", 1): prev_cookies = [] # free previous cookies after gpu work has been enqueued try: - if INITMLPERF: + if not RUNMLPERF: next_proc = _fake_data_get(EVAL_BS, val=val) else: next_proc = _data_get(it, val=val) diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/tinybox_green/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/tinybox_green/dev_beam.sh index 18f95a5c5d..fc386ce312 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/tinybox_green/dev_beam.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/tinybox_green/dev_beam.sh @@ -9,7 +9,6 @@ export BASEDIR="/raid/datasets/openimages" export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 -export INITMLPERF=1 -export BENCHMARK=10 DEBUG=2 +export BENCHMARK=5 DEBUG=2 python examples/mlperf/model_train.py diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/tinybox_red/dev_beam.sh index 18f95a5c5d..fc386ce312 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/tinybox_red/dev_beam.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/tinybox_red/dev_beam.sh @@ -9,7 +9,6 @@ export BASEDIR="/raid/datasets/openimages" export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0 -export INITMLPERF=1 -export BENCHMARK=10 DEBUG=2 +export BENCHMARK=5 DEBUG=2 python examples/mlperf/model_train.py