From fd9f236a82a97301f6ad79aab225b79e09d2fbef Mon Sep 17 00:00:00 2001
From: geohotstan <135171913+geohotstan@users.noreply.github.com>
Date: Mon, 26 May 2025 09:51:51 +0800
Subject: [PATCH] move test over (#10508)

---
 extra/onnx.py                           | 16 +++----
 test/external/external_test_onnx_ops.py | 62 ++++++++++++++++++++++++-
 2 files changed, 67 insertions(+), 11 deletions(-)

diff --git a/extra/onnx.py b/extra/onnx.py
index 6cdd4d6e03..04cad320c3 100644
--- a/extra/onnx.py
+++ b/extra/onnx.py
@@ -761,7 +761,6 @@ def get_onnx_ops():
     return _op_integer(Tensor.matmul, [A,B], [a_zero_point,b_zero_point])
 
   # ***** Training Ops *****
-  # NOTE: onnx test coverage only covers `T==0` cases, so for all `T>0` this isn't tested
   # NOTE: onnx training ops actually don't need the state for optim, all the ops work in a functional way, but we still can reuse optim.py code
   @_onnx_training(3)
   def Adagrad(R:Tensor, T:int, *inputs:Tensor, decay_factor:float=0.0, epsilon:float=0.0, norm_coefficient:float=0.0):
@@ -778,7 +777,7 @@ def get_onnx_ops():
           norm_coefficient_post:float=0.0):
     from tinygrad.nn.optim import Adam as TinyAdam
     X, G, V, H = inputs
-    G, V, H = G.detach(), V.detach(), H.detach()  # TODO we shouldn't need these detaches
+    G, V, H = G.detach(), V.detach(), H.detach()
     X.grad = norm_coefficient * X.detach() + G
     opt = TinyAdam([X], b1=alpha, b2=beta, eps=epsilon)
     opt.m, opt.v, opt.lr = [V], [H], R
@@ -794,13 +793,12 @@ def get_onnx_ops():
 
   @_onnx_training(3)
   def Momentum(R:Tensor, T:int, *inputs:Tensor, alpha:float, beta:float, mode:str, norm_coefficient:float):
-    from tinygrad.nn.optim import SGD
-    X, G, V = inputs
-    G, V = G.detach(), V.detach()
-    X.grad = (norm_coefficient * X.detach() + G) * (beta if T > 0 else 1)
-    opt = SGD([X], momentum=alpha, nesterov=(mode=="nesterov"))
-    opt.b, opt.lr = [V], R
-    opt.step()
+    X, G, V = (i.detach() for i in inputs)
+    grad = norm_coefficient * X + G
+    # NOTE: the gradient is scaled by beta only when T > 0; this beta_adjusted term is why SGD can't be reused for nesterov
+    beta_adjusted = beta if T > 0 else 1
+    V.assign(alpha * V + grad * beta_adjusted)
+    X.assign(X - R * (V if mode == "standard" else (grad + alpha * V)))
     return [X, V]
 
   def Gradient(*inputs:Tensor, y:str, intermediate_tensors:dict[str, Tensor], **_):
diff --git a/test/external/external_test_onnx_ops.py b/test/external/external_test_onnx_ops.py
index adc63edda0..8150995d50 100644
--- a/test/external/external_test_onnx_ops.py
+++ b/test/external/external_test_onnx_ops.py
@@ -8,6 +8,8 @@ from tinygrad import dtypes
 from tinygrad.frontend.onnx import OnnxRunner
 import numpy as np
 from extra.onnx_helpers import validate
+from onnx.defs import ONNX_DOMAIN, AI_ONNX_PREVIEW_TRAINING_DOMAIN
+MICROSOFT_CONTRIB_OPS_DOMAIN = "com.microsoft"
 
 class TestOnnxOps(unittest.TestCase):
   DOMAIN = None
@@ -26,7 +28,7 @@ class TestOnnxOps(unittest.TestCase):
       validate(tmp.name, inps, rtol, atol)
 
 class TestMainOnnxOps(TestOnnxOps):
-  DOMAIN = ""
+  DOMAIN = ONNX_DOMAIN
   def test_reshape(self):
     inputs = {"in": np.arange(6, dtype=np.float32), "shape": np.array([2,3], dtype=np.int64)}
     attributes = {}
@@ -195,8 +197,64 @@ class TestMainOnnxOps(TestOnnxOps):
   def test_qlinearmatmul_2D_int8_float32(self): self._run_qlinearmatmul_test(np.int8, np.float32, 2)
   def test_qlinearmatmul_3D_int8_float32(self): self._run_qlinearmatmul_test(np.int8, np.float32, 3)
 
+class TestTrainingOnnxOps(TestOnnxOps):
+  # NOTE: onnxruntime (ORT) doesn't support training ops on CPU, so outputs are checked against the reference functions shipped with onnx
+  DOMAIN = AI_ONNX_PREVIEW_TRAINING_DOMAIN
+  def _validate_training(self, op:str, onnx_fxn, inps:dict[str, np.ndarray], opts:dict[str, Any], outs:list[str]):
+    # runs `op` through OnnxRunner and checks every output against the onnx reference function `onnx_fxn`
+    tiny_out = OnnxRunner(self.helper_build_model(op, inps, opts, outs))(inps)
+    # `mode` is a Momentum node attribute that is not forwarded to the reference function; drop it from a copy, not the caller's dict
+    onnx_out = onnx_fxn(**inps, **{k:v for k,v in opts.items() if op != "Momentum" or k != "mode"})
+    self.assertEqual(len(tiny_out), len(onnx_out), f"{op} output count mismatch")  # zip would silently truncate
+    for (nm, t_out), o_out in zip(tiny_out.items(), onnx_out):
+      np.testing.assert_allclose(t_out.numpy(), o_out, rtol=1e-3, atol=1e-6, err_msg=f"{nm} failed")
+
+  def test_adagrad_t_greater_than_zero(self):
+    from onnx.backend.test.case.node.adagrad import apply_adagrad
+    for t in [1, 3, 100]:
+      inputs = {
+        "r": np.array(0.01, dtype=np.float32),
+        "t": np.array(t, dtype=np.int32),
+        "x": np.random.randn(3, 3).astype(np.float32),
+        "g": np.random.randn(3, 3).astype(np.float32),
+        # H accumulates squared gradients: keep it >= 0, otherwise sqrt(H_new) is NaN and assert_allclose treats NaN == NaN
+        "h": np.random.rand(3, 3).astype(np.float32),
+      }
+      attributes = {"decay_factor": 0.1, "epsilon": 1e-6, "norm_coefficient": 0.01}
+      self._validate_training("Adagrad", apply_adagrad, inputs, attributes, ["X_out", "H_out"])
+
+  def test_momentum_t_greater_than_zero(self):
+    # covers both Momentum modes at T > 0, each checked against its own onnx reference function
+    from onnx.backend.test.case.node.momentum import apply_momentum, apply_nesterov
+    for onnx_fxn, mode in ((apply_momentum, "standard"), (apply_nesterov, "nesterov")):
+      for t in [1, 3, 100]:
+        inputs = {
+          "r": np.array(0.01, dtype=np.float32),
+          "t": np.array(t, dtype=np.int32),
+          "x": np.random.randn(3, 3).astype(np.float32),
+          "g": np.random.randn(3, 3).astype(np.float32),
+          "v": np.random.randn(3, 3).astype(np.float32),
+        }
+        attributes = {"alpha": 0.9, "beta": 0.1, "mode": mode, "norm_coefficient": 0.01}
+        self._validate_training("Momentum", onnx_fxn, inputs, attributes, ["X_out", "V_out"])
+
+  def test_adam_t_greater_than_zero(self):
+    from onnx.backend.test.case.node.adam import apply_adam
+    for t in [1, 3, 100]:
+      inputs = {
+        "r": np.array(0.01, dtype=np.float32),
+        "t": np.array(t, dtype=np.int32),
+        "x": np.random.randn(3, 3).astype(np.float32),
+        "g": np.random.randn(3, 3).astype(np.float32),
+        "v": np.random.randn(3, 3).astype(np.float32),
+        # H is the running average of squared gradients: keep it >= 0 so sqrt(H_new) stays finite and X is really compared
+        "h": np.random.rand(3, 3).astype(np.float32),
+      }
+      attributes = { "alpha": 0.9, "beta": 0.999, "epsilon": 1e-8, "norm_coefficient": 0.01, "norm_coefficient_post": 0.02 }
+      self._validate_training("Adam", apply_adam, inputs, attributes, ["X_new", "V_new", "H_new"])
+
 class TestContribOnnxOps(TestOnnxOps):
-  DOMAIN = "com.microsoft"
+  DOMAIN = MICROSOFT_CONTRIB_OPS_DOMAIN
   def test_attention(self):
     batch_size, seq_len, input_hidden_size = 2, 8, 256
     num_heads, head_size = 4, 64