From 409bb0c9ada39dd00effaaee1887db828c9bfcf0 Mon Sep 17 00:00:00 2001
From: chenyu <chenyu@fastmail.com>
Date: Fri, 15 May 2026 02:01:04 -0400
Subject: [PATCH] requires_grad cannot be None (#16212)

The final goal is to remove requires_grad. As a first step, change the default to True and don't allow None.
---
 examples/gradaccum_mnist.py               |  3 +-
 examples/mlperf/models/flat_llama.py      |  2 +-
 extra/torch_backend/test_kernel_fusion.py |  2 +-
 test/backend/test_asm_gemm.py             |  4 +-
 test/backend/test_dtype.py                |  4 --
 test/backend/test_tensor.py               |  3 +-
 test/unit/test_disk_tensor.py             |  4 --
 test/unit/test_gradient.py                |  2 +-
 tinygrad/nn/optim.py                      |  4 --
 tinygrad/tensor.py                        | 49 +++++++++--------------
 10 files changed, 27 insertions(+), 50 deletions(-)

diff --git a/examples/gradaccum_mnist.py b/examples/gradaccum_mnist.py
index a660afddf4..2a0ac6f143 100644
--- a/examples/gradaccum_mnist.py
+++ b/examples/gradaccum_mnist.py
@@ -35,9 +35,8 @@ if __name__ == "__main__":
 
   params = nn.state.get_parameters(model)
 
-  # init params, set requires grad on the ones we need gradients of
+  # init params
   for x in params:
-    if x.requires_grad is None: x.requires_grad_()
     x.replace(x.contiguous())
   Tensor.realize(*params)
 
diff --git a/examples/mlperf/models/flat_llama.py b/examples/mlperf/models/flat_llama.py
index 50fa509ccb..018d61f7aa 100644
--- a/examples/mlperf/models/flat_llama.py
+++ b/examples/mlperf/models/flat_llama.py
@@ -307,7 +307,7 @@ if __name__ == "__main__":
 
   # preallocate all the grad buffers and zero them out
   grads = {x:Tensor.zeros(x.shape, dtype=x.dtype, device=x.device).contiguous()
-           for x in state.values() if x.requires_grad is None}
+           for x in state.values() if x.requires_grad}
 
   # print model size
   sz = 0
diff --git a/extra/torch_backend/test_kernel_fusion.py b/extra/torch_backend/test_kernel_fusion.py
index 0a748fe4ad..171872a68b 100644
--- a/extra/torch_backend/test_kernel_fusion.py
+++ b/extra/torch_backend/test_kernel_fusion.py
@@ -105,7 +105,7 @@ class TestKernelFusionRegression(unittest.TestCase):
       view = x[1:3]
       view += 1.0
       return x.sum()
-    self._check_kernel_count(fn, 8)
+    self._check_kernel_count(fn, 7)
 
   def test_batchnorm_running_stats_update(self):
     def fn():
diff --git a/test/backend/test_asm_gemm.py b/test/backend/test_asm_gemm.py
index 518aaeeb3e..6792040a6e 100644
--- a/test/backend/test_asm_gemm.py
+++ b/test/backend/test_asm_gemm.py
@@ -12,8 +12,8 @@ def is_cdna4(): return Device[Device.DEFAULT].renderer.target.arch.startswith("g
 
 def run_asm_gemm(a_shape, b_shape, dtype=dtypes.float16, a_shard=None, b_shard=None, gpus:int=1) -> None:
   Tensor.manual_seed(0)
-  a_rand = Tensor.randn(a_shape, dtype=dtypes.float).sub(0.5).cast(dtype)
-  b_rand = Tensor.randn(b_shape, dtype=dtypes.float).sub(0.5).cast(dtype)
+  a_rand = Tensor.randn(a_shape, dtype=dtypes.float, requires_grad=False).sub(0.5).cast(dtype)
+  b_rand = Tensor.randn(b_shape, dtype=dtypes.float, requires_grad=False).sub(0.5).cast(dtype)
   with Context(DEBUG=0):
     Tensor.realize(a_rand, b_rand)
 
diff --git a/test/backend/test_dtype.py b/test/backend/test_dtype.py
index 84bb16fe31..430bfa5db2 100644
--- a/test/backend/test_dtype.py
+++ b/test/backend/test_dtype.py
@@ -330,10 +330,6 @@ class TestBitCast(unittest.TestCase):
       # should fail because 3 int8 is 3 bytes but float16 is two and 3 isn't a multiple of 2
       Tensor.empty((3,), dtype=dtypes.int8).bitcast(dtypes.float16)
 
-    with self.assertRaises(RuntimeError):
-      # should fail because backprop through bitcast is undefined
-      Tensor.empty((4,), dtype=dtypes.int8, requires_grad=True).bitcast(dtypes.float16)
-
   def test_bitcast_float_to_int32(self):
     a = Tensor([1.,2,3])
     b = a.bitcast(dtypes.int32)
diff --git a/test/backend/test_tensor.py b/test/backend/test_tensor.py
index d699ed7bd9..4135217f58 100644
--- a/test/backend/test_tensor.py
+++ b/test/backend/test_tensor.py
@@ -179,8 +179,7 @@ class TestTinygrad(unittest.TestCase):
     def test_tinygrad():
       w1 = Tensor(init)
       w2 = Tensor(init)
-      assert w1.requires_grad is None and w2.requires_grad is None
-      # optimizer sets requires_grad=True for params with requires_grad=None
+      assert w1.requires_grad is True and w2.requires_grad is True
       nn.optim.SGD([w1, w2], lr=0.01)
       assert w1.requires_grad is True and w2.requires_grad is True
       out = w1.add(w2)
diff --git a/test/unit/test_disk_tensor.py b/test/unit/test_disk_tensor.py
index a851660bf4..3db019a138 100644
--- a/test/unit/test_disk_tensor.py
+++ b/test/unit/test_disk_tensor.py
@@ -92,10 +92,6 @@ class TestRawDiskBuffer(unittest.TestCase):
       # should fail because 3 int8 is 3 bytes but float16 is two and 3 isn't a multiple of 2
       Tensor.empty((3,), dtype=dtypes.int8, device=f"DISK:{tmp}").bitcast(dtypes.float16)
 
-    with self.assertRaises(RuntimeError):
-      # should fail because backprop through bitcast is undefined
-      Tensor.empty((4,), dtype=dtypes.int8, requires_grad=True, device=f"DISK:{tmp}").bitcast(dtypes.float16)
-
     pathlib.Path(tmp).unlink()
 
 @unittest.skipUnless(is_dtype_supported(dtypes.uint8), "need uint8")
diff --git a/test/unit/test_gradient.py b/test/unit/test_gradient.py
index 80e46cd859..a52af55ed7 100644
--- a/test/unit/test_gradient.py
+++ b/test/unit/test_gradient.py
@@ -70,7 +70,7 @@ class TestTensorGradient(unittest.TestCase):
     self.assertIs(x.grad, old_grad)
 
   def test_gradient_through_clone_from_non_grad_src(self):
-    src = Tensor([1.0, 2.0, 3.0, 4.0])
+    src = Tensor([1.0, 2.0, 3.0, 4.0], requires_grad=False)
     x = src.clone().requires_grad_(True)
     (x * 2.0).sum().backward()
     np.testing.assert_allclose(x.grad.numpy(), [2.0, 2.0, 2.0, 2.0])
diff --git a/tinygrad/nn/optim.py b/tinygrad/nn/optim.py
index a912bd6501..a256c90684 100644
--- a/tinygrad/nn/optim.py
+++ b/tinygrad/nn/optim.py
@@ -10,10 +10,6 @@ class Optimizer:
   """
   def __init__(self, params: list[Tensor], lr: float, device=None, fused=FUSE_OPTIM):
     if lr < 0: raise ValueError(f"Invalid learning rate: {lr}")
-    # if requires_grad is None, but being put into an optimizer, set it to True
-    for x in params:
-      if x.requires_grad is None: x.requires_grad_(True)
-
     self.params: list[Tensor] = dedup([x for x in params if x.requires_grad])
     assert len(self.params) != 0, "optimizer must have at least one param"
     self.buffers: list[Tensor] = dedup([x for x in params if not x.requires_grad])   # buffers are still realized
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 807e205ccf..2c1fb8adb3 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -92,7 +92,7 @@ class Tensor(OpMixin):
   training: ClassVar[bool] = False
 
   def __init__(self, data:ConstType|bytes|list|tuple|UOp|'numpy.ndarray'|pathlib.Path|None,
-               device:str|tuple|list|None=None, dtype:DTypeLike|None=None, requires_grad:bool|None=None, _force_unique:bool=False):
+               device:str|tuple|list|None=None, dtype:DTypeLike|None=None, requires_grad:bool=True, _force_unique:bool=False):
     if device is None:
       if isinstance(data, pathlib.Path): device = f"DISK:{data.resolve()}"  # keep it on the disk if device is None
       elif isinstance(data, UOp): device = data._device
@@ -103,9 +103,7 @@ class Tensor(OpMixin):
     # tensors can have gradients if you have called .backward
     self.grad:Tensor|None = None
 
-    # NOTE: this can be in three states. False and None: no gradient, True: gradient
-    # None (the default) will be updated to True if it's put in an optimizer
-    self.requires_grad:bool|None = requires_grad
+    self.requires_grad:bool = requires_grad
 
     # create a UOp from the different types of inputs
     if isinstance(data, UOp):
@@ -115,8 +113,8 @@ class Tensor(OpMixin):
     elif data is None:
       data = UOp.const(_dtype or dtypes.default_float, 0, _device)
     elif isinstance(data, get_args(ConstType)):
-      if _force_unique or requires_grad: data = UOp.unique_const(data, _dtype, _device)
-      else: data = UOp.const(_dtype or dtypes.from_py(data), data, _device)
+      dt = _dtype or dtypes.from_py(data)
+      data = UOp.unique_const(data, dt, _device) if _force_unique or (requires_grad and dtypes.is_float(dt)) else UOp.const(dt, data, _device)
     elif isinstance(data, bytes): data = _frompy(data, _dtype or dtypes.uint8, _device)
     elif isinstance(data, (list, tuple)):
       if _dtype is None:
@@ -151,11 +149,10 @@ class Tensor(OpMixin):
     srcs = (self,)+x
     new_uop: UOp = fxn(*[t.uop for t in srcs], *extra_args, **kwargs)
     if TRACEMETA >= 1 and (metadata:=_METADATA.get()) is not None: all_metadata[new_uop] = (metadata,)
-    needs_input_grad = [t.requires_grad for t in srcs]
     # directly create the Tensor
     ret = Tensor.__new__(Tensor)
     ret.uop, ret.grad = new_uop, None
-    ret.requires_grad = True if any(needs_input_grad) else None if None in needs_input_grad else False
+    ret.requires_grad = any(t.requires_grad for t in srcs)
     # add to all_tensors after construction succeeds
     all_tensors[weakref.ref(ret)] = None
     return ret
@@ -166,7 +163,7 @@ class Tensor(OpMixin):
   @staticmethod
   def unique_const(fill_value:ConstType|UOp, **kwargs) -> Tensor: return Tensor(fill_value, _force_unique=True, **kwargs)
 
-  def requires_grad_(self, requires_grad=True) -> Tensor:
+  def requires_grad_(self, requires_grad:bool=True) -> Tensor:
     # make the UOp unique if it's a CONST to prevent gradient accumulation bugs with cached const UOps
     if requires_grad and self.uop.op is Ops.CONST: self.replace(Tensor(self.uop.arg, device=self.device, dtype=self.dtype, requires_grad=True))
     self.requires_grad = requires_grad
@@ -566,7 +563,7 @@ class Tensor(OpMixin):
     return Tensor._device_seeds[device], low.cat(high)
 
   @staticmethod
-  def rand(*shape, device:str|None=None, dtype:DTypeLike|None=None, requires_grad:bool|None=None, contiguous:bool=True) -> Tensor:
+  def rand(*shape, device:str|None=None, dtype:DTypeLike|None=None, requires_grad:bool=True, contiguous:bool=True) -> Tensor:
     """
     Creates a tensor with the given shape, filled with random values from a uniform distribution over the interval `[0, 1)`.
 
@@ -595,7 +592,7 @@ class Tensor(OpMixin):
   # ***** creation helper functions *****
 
   @classmethod
-  def eye(cls, n:int, m:int|None=None, dtype=None, device=None, requires_grad:bool|None=None) -> Tensor:
+  def eye(cls, n:int, m:int|None=None, dtype=None, device=None, requires_grad:bool=True) -> Tensor:
     """
     Returns a 2-D tensor with `n` rows and `m` columns, with ones on the diagonal and zeros elsewhere.
 
@@ -617,9 +614,9 @@ class Tensor(OpMixin):
     if kwargs.get("device") is not None: raise RuntimeError("cannot specify `device` on `*_like` of a multi device tensor")
     if self.uop.axis is None: return fxn(self.shape, *args, dtype=dtype, **kwargs).shard(self.device)
     stacked = UOp.mstack(*[fxn(self.uop.shard_shape, *args, device=d, dtype=dtype, **kwargs).uop for d in self.device])
-    return Tensor(stacked.multi(self.uop.axis), requires_grad=kwargs.get("requires_grad"))
+    return Tensor(stacked.multi(self.uop.axis), requires_grad=kwargs.get("requires_grad", True))
 
-  def full_like(self, fill_value:ConstType, dtype=None, device=None, requires_grad=None) -> Tensor:
+  def full_like(self, fill_value:ConstType, dtype=None, device=None, requires_grad:bool=False) -> Tensor:
     """
     Creates a tensor with the same shape as `self`, filled with the given value.
     If `dtype` is not specified, the dtype of `self` is used.
@@ -631,12 +628,9 @@ class Tensor(OpMixin):
     print(Tensor.full_like(t, 42).numpy())
     ```
     """
-    if device is not None:
-      if isinstance(self.device, tuple): raise RuntimeError("cannot specify `device` on `full_like` of a multi device tensor")
-      return Tensor.full(self.shape, fill_value, dtype=dtype or self.dtype, device=device).requires_grad_(requires_grad)
-    if requires_grad:
-      return Tensor.full(self.shape, fill_value, dtype=dtype or self.dtype, device=self.device).requires_grad_(requires_grad)
-    return super().full_like(fill_value, dtype)
+    if device is None: return super().full_like(fill_value, dtype).requires_grad_(requires_grad)
+    if isinstance(self.device, tuple): raise RuntimeError("cannot specify `device` on `full_like` of a multi device tensor")
+    return Tensor.full(self.shape, fill_value, dtype=dtype or self.dtype, device=device).requires_grad_(requires_grad)
 
   def rand_like(self, **kwargs) -> Tensor:
     """
@@ -655,7 +649,7 @@ class Tensor(OpMixin):
 
   # ***** random functions *****
 
-  def randn_like(self, dtype:DTypeLike|None=None, requires_grad:bool|None=None, **kwargs) -> Tensor:
+  def randn_like(self, dtype:DTypeLike|None=None, requires_grad:bool=True, **kwargs) -> Tensor:
     """
     Creates a tensor with the same shape and sharding as `self`, filled with random values from a normal distribution with mean 0 and variance 1.
 
@@ -672,7 +666,7 @@ class Tensor(OpMixin):
     return (src[0].mul(2*math.pi).cos().mul((1 - src[1]).log().mul(-2).sqrt()).cast(dtype or self.dtype)).requires_grad_(requires_grad)
 
   @staticmethod
-  def randn(*shape, dtype:DTypeLike|None=None, requires_grad:bool|None=None, **kwargs) -> Tensor:
+  def randn(*shape, dtype:DTypeLike|None=None, requires_grad:bool=True, **kwargs) -> Tensor:
     """
     Creates a tensor with the given shape, filled with random values from a normal distribution with mean `0` and standard deviation `1`.
     If `dtype` is not specified, the default type is used.
@@ -707,7 +701,7 @@ class Tensor(OpMixin):
     return Tensor.uniform(*shape, low=low, high=high, dtype=dtype, **kwargs)
 
   @staticmethod
-  def normal(*shape, mean=0.0, std=1.0, requires_grad:bool|None=None, **kwargs) -> Tensor:
+  def normal(*shape, mean=0.0, std=1.0, requires_grad:bool=True, **kwargs) -> Tensor:
     """
     Creates a tensor with the given shape, filled with random values from a normal distribution with the given `mean` and standard deviation `std`.
     Requires `std >= 0`.
@@ -724,7 +718,7 @@ class Tensor(OpMixin):
     return (std * Tensor.randn(*shape, **kwargs) + mean).requires_grad_(requires_grad)
 
   @staticmethod
-  def uniform(*shape, low=0.0, high=1.0, dtype:DTypeLike|None=None, requires_grad:bool|None=None, **kwargs) -> Tensor:
+  def uniform(*shape, low=0.0, high=1.0, dtype:DTypeLike|None=None, requires_grad:bool=True, **kwargs) -> Tensor:
     """
     Creates a tensor with the given shape, filled with random values from a uniform distribution over the interval `[low, high)`.
     Requires `low < high`.
@@ -815,7 +809,7 @@ class Tensor(OpMixin):
     print(Tensor.randperm(6).numpy())
     ```
     """
-    return Tensor.rand(n, device=device, **kwargs).argsort().cast(dtype).requires_grad_(kwargs.get("requires_grad"))
+    return Tensor.rand(n, device=device, **kwargs).argsort().cast(dtype).requires_grad_(kwargs.get("requires_grad", True))
 
   def multinomial(self:Tensor, num_samples:int = 1, replacement:bool = False) -> Tensor:
     """
@@ -883,7 +877,7 @@ class Tensor(OpMixin):
     """
     all_uops = self.uop.toposort()
     tensors_need_grad: list[Tensor] = [t for tref in all_tensors if (t:=tref()) is not None and \
-                                       t.uop in all_uops and t.requires_grad]
+                                       t.uop in all_uops and t.requires_grad and t.is_floating_point()]
     # clear contexts
     for t,g in zip(tensors_need_grad, self.gradient(*tensors_need_grad, gradient=gradient)):
       assert g.shape == t.shape, f"grad shape must match tensor shape, {g.shape!r} != {t.shape!r}"
@@ -1033,7 +1027,7 @@ class Tensor(OpMixin):
     if any(self.uop in t.uop.backward_slice_with_self and t.uop.base is not shared for tref in all_tensors
            if (t:=tref()) is not None and t is not self and t.uop is not v_uop and t.uop not in v_bw):
       raise RuntimeError("can't setitem on a tensor with other uses")
-    if self.requires_grad or (isinstance(v, Tensor) and v.requires_grad):
+    if not self.uop.base.is_realized and self.is_floating_point() and (self.requires_grad or (isinstance(v, Tensor) and v.requires_grad)):
       if not isinstance(v, Tensor): v = Tensor(v, device=self.device, dtype=self.dtype)
       # __iadd__/__isub__ creates AFTER(view, STORE(view, computed)); unwrap to get the computed value
       if v.uop.op is Ops.AFTER and any(s.op is Ops.STORE for s in v.uop.src[1:]): v = v._apply_uop(lambda x: x.src[1].src[1])
@@ -1426,8 +1420,6 @@ class Tensor(OpMixin):
     """
     Bitcasts `self` to the given `dtype` of the same itemsize.
 
-    `self` must not require a gradient.
-
     ```python exec="true" source="above" session="tensor" result="python"
     t = Tensor([-1, 2, 3], dtype=dtypes.int32)
     print(t.dtype, t.numpy())
@@ -1437,7 +1429,6 @@ class Tensor(OpMixin):
     print(t.dtype, t.numpy())
     ```
     """
-    if self.requires_grad: raise RuntimeError("can't backprop through bitcast")
     dt = to_dtype(dtype)
     if (ns:=dt.itemsize) != (os:=self.dtype.itemsize) and (self.shape[-1]*os) % ns != 0: raise RuntimeError("unsupported size in bitcast")
     if (not isinstance(self.device, str) or not self.device.startswith("DISK")) and ns != os: