From fd9f236a82a97301f6ad79aab225b79e09d2fbef Mon Sep 17 00:00:00 2001 From: geohotstan <135171913+geohotstan@users.noreply.github.com> Date: Mon, 26 May 2025 09:51:51 +0800 Subject: [PATCH] move test over (#10508) --- extra/onnx.py | 16 +++---- test/external/external_test_onnx_ops.py | 62 ++++++++++++++++++++++++- 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/extra/onnx.py b/extra/onnx.py index 6cdd4d6e03..04cad320c3 100644 --- a/extra/onnx.py +++ b/extra/onnx.py @@ -761,7 +761,6 @@ def get_onnx_ops(): return _op_integer(Tensor.matmul, [A,B], [a_zero_point,b_zero_point]) # ***** Training Ops ***** - # NOTE: onnx test coverage only covers `T==0` cases, so for all `T>0` this isn't tested # NOTE: onnx training ops actually don't need the state for optim, all the ops work in a functional way, but we still can reuse optim.py code @_onnx_training(3) def Adagrad(R:Tensor, T:int, *inputs:Tensor, decay_factor:float=0.0, epsilon:float=0.0, norm_coefficient:float=0.0): @@ -778,7 +777,7 @@ def get_onnx_ops(): norm_coefficient_post:float=0.0): from tinygrad.nn.optim import Adam as TinyAdam X, G, V, H = inputs - G, V, H = G.detach(), V.detach(), H.detach() # TODO we shouldn't need these detaches + G, V, H = G.detach(), V.detach(), H.detach() X.grad = norm_coefficient * X.detach() + G opt = TinyAdam([X], b1=alpha, b2=beta, eps=epsilon) opt.m, opt.v, opt.lr = [V], [H], R @@ -794,13 +793,12 @@ def get_onnx_ops(): @_onnx_training(3) def Momentum(R:Tensor, T:int, *inputs:Tensor, alpha:float, beta:float, mode:str, norm_coefficient:float): - from tinygrad.nn.optim import SGD - X, G, V = inputs - G, V = G.detach(), V.detach() - X.grad = (norm_coefficient * X.detach() + G) * (beta if T > 0 else 1) - opt = SGD([X], momentum=alpha, nesterov=(mode=="nesterov")) - opt.b, opt.lr = [V], R - opt.step() + X, G, V = (i.detach() for i in inputs) + grad = norm_coefficient * X + G + # NOTE: this beta_adjusted term makes it so we can't use SGD for nesterov + beta_adjusted = beta if T > 0 else 1 + V.assign(alpha * V + grad * beta_adjusted) + X.assign(X - R * (V if mode == "standard" else (grad + alpha * V))) return [X, V] def Gradient(*inputs:Tensor, y:str, intermediate_tensors:dict[str, Tensor], **_): diff --git a/test/external/external_test_onnx_ops.py b/test/external/external_test_onnx_ops.py index adc63edda0..8150995d50 100644 --- a/test/external/external_test_onnx_ops.py +++ b/test/external/external_test_onnx_ops.py @@ -8,6 +8,8 @@ from tinygrad import dtypes from tinygrad.frontend.onnx import OnnxRunner import numpy as np from extra.onnx_helpers import validate +from onnx.defs import ONNX_DOMAIN, AI_ONNX_PREVIEW_TRAINING_DOMAIN +MICROSOFT_CONTRIB_OPS_DOMAIN = "com.microsoft" class TestOnnxOps(unittest.TestCase): DOMAIN = None @@ -26,7 +28,7 @@ class TestOnnxOps(unittest.TestCase): validate(tmp.name, inps, rtol, atol) class TestMainOnnxOps(TestOnnxOps): - DOMAIN = "" + DOMAIN = ONNX_DOMAIN def test_reshape(self): inputs = {"in": np.arange(6, dtype=np.float32), "shape": np.array([2,3], dtype=np.int64)} attributes = {} @@ -195,8 +197,64 @@ class TestMainOnnxOps(TestOnnxOps): def test_qlinearmatmul_2D_int8_float32(self): self._run_qlinearmatmul_test(np.int8, np.float32, 2) def test_qlinearmatmul_3D_int8_float32(self): self._run_qlinearmatmul_test(np.int8, np.float32, 3) +class TestTrainingOnnxOps(TestOnnxOps): + # NOTE: ORT doesn't actually support training ops on cpu so we test using functions provided by onnx + DOMAIN = AI_ONNX_PREVIEW_TRAINING_DOMAIN + def _validate_training(self, op:str, onnx_fxn, inps:dict[str, np.ndarray], opts:dict[str, Any], outs:list[str]): + model = self.helper_build_model(op, inps, opts, outs) + if op == "Momentum": del opts['mode'] + runner = OnnxRunner(model) + tiny_out = runner(inps) + onnx_out = onnx_fxn(**inps, **opts) + for (nm, t_out), o_out in zip(tiny_out.items(), onnx_out): + np.testing.assert_allclose(t_out.numpy(), o_out, rtol=1e-3, atol=1e-6, err_msg=f"{nm} failed") + + def test_adagrad_t_greater_than_zero(self): + from onnx.backend.test.case.node.adagrad import apply_adagrad + for t in [1, 3, 100]: + inputs = { + "r": np.array(0.01, dtype=np.float32), + "t": np.array(t, dtype=np.int32), + "x": np.random.randn(3, 3).astype(np.float32), + "g": np.random.randn(3, 3).astype(np.float32), + "h": np.random.randn(3, 3).astype(np.float32), + } + attributes = {"decay_factor": 0.1, "epsilon": 1e-6, "norm_coefficient": 0.01} + outputs = ["X_out", "H_out"] + self._validate_training("Adagrad", apply_adagrad, inputs, attributes, outputs) + + def test_momentum_t_greater_than_zero(self): + from onnx.backend.test.case.node.momentum import apply_momentum, apply_nesterov + for onnx_fxn, mode in ((apply_momentum, "standard"), (apply_nesterov, "nesterov")): + for t in [1, 3, 100]: + inputs = { + "r": np.array(0.01, dtype=np.float32), + "t": np.array(t, dtype=np.int32), + "x": np.random.randn(3, 3).astype(np.float32), + "g": np.random.randn(3, 3).astype(np.float32), + "v": np.random.randn(3, 3).astype(np.float32), + } + attributes = {"alpha": 0.9, "beta": 0.1, "mode": mode, "norm_coefficient": 0.01} + outputs = ["X_out", "V_out"] + self._validate_training("Momentum", onnx_fxn, inputs, attributes, outputs) + + def test_adam_t_greater_than_zero(self): + from onnx.backend.test.case.node.adam import apply_adam + for t in [1, 3, 100]: + inputs = { + "r": np.array(0.01, dtype=np.float32), + "t": np.array(t, dtype=np.int32), + "x": np.random.randn(3, 3).astype(np.float32), + "g": np.random.randn(3, 3).astype(np.float32), + "v": np.random.randn(3, 3).astype(np.float32), + "h": np.random.randn(3, 3).astype(np.float32), + } + attributes = { "alpha": 0.9, "beta": 0.999, "epsilon": 1e-8, "norm_coefficient": 0.01, "norm_coefficient_post": 0.02 } + outputs = ["X_new", "V_new", "H_new"] + self._validate_training("Adam", apply_adam, inputs, attributes, outputs) + class TestContribOnnxOps(TestOnnxOps): - DOMAIN = "com.microsoft" + DOMAIN = MICROSOFT_CONTRIB_OPS_DOMAIN def test_attention(self): batch_size, seq_len, input_hidden_size = 2, 8, 256 num_heads, head_size = 4, 64