requires_grad cannot be None (#16212)

The final goal is to remove requires_grad entirely. As a first step, change its default to True and no longer allow None.
2026-06-13 00:15:35 +08:00 · 2026-05-15 02:01:04 -04:00
parent c7870f11ff
commit 409bb0c9ad
10 changed files with 27 additions and 50 deletions
--- a/examples/gradaccum_mnist.py
+++ b/examples/gradaccum_mnist.py
@@ -35,9 +35,8 @@ if __name__ == "__main__":

  params = nn.state.get_parameters(model)

-  # init params, set requires grad on the ones we need gradients of
+  # init params
  for x in params:
-    if x.requires_grad is None: x.requires_grad_()
    x.replace(x.contiguous())
  Tensor.realize(*params)

--- a/examples/mlperf/models/flat_llama.py
+++ b/examples/mlperf/models/flat_llama.py
@@ -307,7 +307,7 @@ if __name__ == "__main__":

  # preallocate all the grad buffers and zero them out
  grads = {x:Tensor.zeros(x.shape, dtype=x.dtype, device=x.device).contiguous()
-           for x in state.values() if x.requires_grad is None}
+           for x in state.values() if x.requires_grad}

  # print model size
  sz = 0
--- a/extra/torch_backend/test_kernel_fusion.py
+++ b/extra/torch_backend/test_kernel_fusion.py
@@ -105,7 +105,7 @@ class TestKernelFusionRegression(unittest.TestCase):
      view = x[1:3]
      view += 1.0
      return x.sum()
-    self._check_kernel_count(fn, 8)
+    self._check_kernel_count(fn, 7)

  def test_batchnorm_running_stats_update(self):
    def fn():
--- a/test/backend/test_asm_gemm.py
+++ b/test/backend/test_asm_gemm.py
@@ -12,8 +12,8 @@ def is_cdna4(): return Device[Device.DEFAULT].renderer.target.arch.startswith("g

 def run_asm_gemm(a_shape, b_shape, dtype=dtypes.float16, a_shard=None, b_shard=None, gpus:int=1) -> None:
  Tensor.manual_seed(0)
-  a_rand = Tensor.randn(a_shape, dtype=dtypes.float).sub(0.5).cast(dtype)
-  b_rand = Tensor.randn(b_shape, dtype=dtypes.float).sub(0.5).cast(dtype)
+  a_rand = Tensor.randn(a_shape, dtype=dtypes.float, requires_grad=False).sub(0.5).cast(dtype)
+  b_rand = Tensor.randn(b_shape, dtype=dtypes.float, requires_grad=False).sub(0.5).cast(dtype)
  with Context(DEBUG=0):
    Tensor.realize(a_rand, b_rand)

--- a/test/backend/test_dtype.py
+++ b/test/backend/test_dtype.py
@@ -330,10 +330,6 @@ class TestBitCast(unittest.TestCase):
      # should fail because 3 int8 is 3 bytes but float16 is two and 3 isn't a multiple of 2
      Tensor.empty((3,), dtype=dtypes.int8).bitcast(dtypes.float16)

-    with self.assertRaises(RuntimeError):
-      # should fail because backprop through bitcast is undefined
-      Tensor.empty((4,), dtype=dtypes.int8, requires_grad=True).bitcast(dtypes.float16)
-
  def test_bitcast_float_to_int32(self):
    a = Tensor([1.,2,3])
    b = a.bitcast(dtypes.int32)
--- a/test/backend/test_tensor.py
+++ b/test/backend/test_tensor.py
@@ -179,8 +179,7 @@ class TestTinygrad(unittest.TestCase):
    def test_tinygrad():
      w1 = Tensor(init)
      w2 = Tensor(init)
-      assert w1.requires_grad is None and w2.requires_grad is None
-      # optimizer sets requires_grad=True for params with requires_grad=None
+      assert w1.requires_grad is True and w2.requires_grad is True
      nn.optim.SGD([w1, w2], lr=0.01)
      assert w1.requires_grad is True and w2.requires_grad is True
      out = w1.add(w2)
--- a/test/unit/test_disk_tensor.py
+++ b/test/unit/test_disk_tensor.py
@@ -92,10 +92,6 @@ class TestRawDiskBuffer(unittest.TestCase):
      # should fail because 3 int8 is 3 bytes but float16 is two and 3 isn't a multiple of 2
      Tensor.empty((3,), dtype=dtypes.int8, device=f"DISK:{tmp}").bitcast(dtypes.float16)

-    with self.assertRaises(RuntimeError):
-      # should fail because backprop through bitcast is undefined
-      Tensor.empty((4,), dtype=dtypes.int8, requires_grad=True, device=f"DISK:{tmp}").bitcast(dtypes.float16)
-
    pathlib.Path(tmp).unlink()

@unittest.skipUnless(is_dtype_supported(dtypes.uint8), "need uint8")
--- a/test/unit/test_gradient.py
+++ b/test/unit/test_gradient.py
@@ -70,7 +70,7 @@ class TestTensorGradient(unittest.TestCase):
    self.assertIs(x.grad, old_grad)

  def test_gradient_through_clone_from_non_grad_src(self):
-    src = Tensor([1.0, 2.0, 3.0, 4.0])
+    src = Tensor([1.0, 2.0, 3.0, 4.0], requires_grad=False)
    x = src.clone().requires_grad_(True)
    (x * 2.0).sum().backward()
    np.testing.assert_allclose(x.grad.numpy(), [2.0, 2.0, 2.0, 2.0])
--- a/tinygrad/nn/optim.py
+++ b/tinygrad/nn/optim.py
@@ -10,10 +10,6 @@ class Optimizer:
  """
  def __init__(self, params: list[Tensor], lr: float, device=None, fused=FUSE_OPTIM):
    if lr < 0: raise ValueError(f"Invalid learning rate: {lr}")
-    # if requires_grad is None, but being put into an optimizer, set it to True
-    for x in params:
-      if x.requires_grad is None: x.requires_grad_(True)
-
    self.params: list[Tensor] = dedup([x for x in params if x.requires_grad])
    assert len(self.params) != 0, "optimizer must have at least one param"
    self.buffers: list[Tensor] = dedup([x for x in params if not x.requires_grad])   # buffers are still realized
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -92,7 +92,7 @@ class Tensor(OpMixin):
  training: ClassVar[bool] = False

  def __init__(self, data:ConstType|bytes|list|tuple|UOp|'numpy.ndarray'|pathlib.Path|None,
-               device:str|tuple|list|None=None, dtype:DTypeLike|None=None, requires_grad:bool|None=None, _force_unique:bool=False):
+               device:str|tuple|list|None=None, dtype:DTypeLike|None=None, requires_grad:bool=True, _force_unique:bool=False):
    if device is None:
      if isinstance(data, pathlib.Path): device = f"DISK:{data.resolve()}"  # keep it on the disk if device is None
      elif isinstance(data, UOp): device = data._device
@@ -103,9 +103,7 @@ class Tensor(OpMixin):
    # tensors can have gradients if you have called .backward
    self.grad:Tensor|None = None

-    # NOTE: this can be in three states. False and None: no gradient, True: gradient
-    # None (the default) will be updated to True if it's put in an optimizer
-    self.requires_grad:bool|None = requires_grad
+    self.requires_grad:bool = requires_grad

    # create a UOp from the different types of inputs
    if isinstance(data, UOp):
@@ -115,8 +113,8 @@ class Tensor(OpMixin):
    elif data is None:
      data = UOp.const(_dtype or dtypes.default_float, 0, _device)
    elif isinstance(data, get_args(ConstType)):
-      if _force_unique or requires_grad: data = UOp.unique_const(data, _dtype, _device)
-      else: data = UOp.const(_dtype or dtypes.from_py(data), data, _device)
+      dt = _dtype or dtypes.from_py(data)
+      data = UOp.unique_const(data, dt, _device) if _force_unique or (requires_grad and dtypes.is_float(dt)) else UOp.const(dt, data, _device)
    elif isinstance(data, bytes): data = _frompy(data, _dtype or dtypes.uint8, _device)
    elif isinstance(data, (list, tuple)):
      if _dtype is None:
@@ -151,11 +149,10 @@ class Tensor(OpMixin):
    srcs = (self,)+x
    new_uop: UOp = fxn(*[t.uop for t in srcs], *extra_args, **kwargs)
    if TRACEMETA >= 1 and (metadata:=_METADATA.get()) is not None: all_metadata[new_uop] = (metadata,)
-    needs_input_grad = [t.requires_grad for t in srcs]
    # directly create the Tensor
    ret = Tensor.__new__(Tensor)
    ret.uop, ret.grad = new_uop, None
-    ret.requires_grad = True if any(needs_input_grad) else None if None in needs_input_grad else False
+    ret.requires_grad = any(t.requires_grad for t in srcs)
    # add to all_tensors after construction succeeds
    all_tensors[weakref.ref(ret)] = None
    return ret
@@ -166,7 +163,7 @@ class Tensor(OpMixin):
  @staticmethod
  def unique_const(fill_value:ConstType|UOp, **kwargs) -> Tensor: return Tensor(fill_value, _force_unique=True, **kwargs)

-  def requires_grad_(self, requires_grad=True) -> Tensor:
+  def requires_grad_(self, requires_grad:bool=True) -> Tensor:
    # make the UOp unique if it's a CONST to prevent gradient accumulation bugs with cached const UOps
    if requires_grad and self.uop.op is Ops.CONST: self.replace(Tensor(self.uop.arg, device=self.device, dtype=self.dtype, requires_grad=True))
    self.requires_grad = requires_grad
@@ -566,7 +563,7 @@ class Tensor(OpMixin):
    return Tensor._device_seeds[device], low.cat(high)

  @staticmethod
-  def rand(*shape, device:str|None=None, dtype:DTypeLike|None=None, requires_grad:bool|None=None, contiguous:bool=True) -> Tensor:
+  def rand(*shape, device:str|None=None, dtype:DTypeLike|None=None, requires_grad:bool=True, contiguous:bool=True) -> Tensor:
    """
    Creates a tensor with the given shape, filled with random values from a uniform distribution over the interval `[0, 1)`.

@@ -595,7 +592,7 @@ class Tensor(OpMixin):
  # ***** creation helper functions *****

  @classmethod
-  def eye(cls, n:int, m:int|None=None, dtype=None, device=None, requires_grad:bool|None=None) -> Tensor:
+  def eye(cls, n:int, m:int|None=None, dtype=None, device=None, requires_grad:bool=True) -> Tensor:
    """
    Returns a 2-D tensor with `n` rows and `m` columns, with ones on the diagonal and zeros elsewhere.

@@ -617,9 +614,9 @@ class Tensor(OpMixin):
    if kwargs.get("device") is not None: raise RuntimeError("cannot specify `device` on `*_like` of a multi device tensor")
    if self.uop.axis is None: return fxn(self.shape, *args, dtype=dtype, **kwargs).shard(self.device)
    stacked = UOp.mstack(*[fxn(self.uop.shard_shape, *args, device=d, dtype=dtype, **kwargs).uop for d in self.device])
-    return Tensor(stacked.multi(self.uop.axis), requires_grad=kwargs.get("requires_grad"))
+    return Tensor(stacked.multi(self.uop.axis), requires_grad=kwargs.get("requires_grad", True))

-  def full_like(self, fill_value:ConstType, dtype=None, device=None, requires_grad=None) -> Tensor:
+  def full_like(self, fill_value:ConstType, dtype=None, device=None, requires_grad:bool=False) -> Tensor:
    """
    Creates a tensor with the same shape as `self`, filled with the given value.
    If `dtype` is not specified, the dtype of `self` is used.
@@ -631,12 +628,9 @@ class Tensor(OpMixin):
    print(Tensor.full_like(t, 42).numpy())
    ```
    """
-    if device is not None:
-      if isinstance(self.device, tuple): raise RuntimeError("cannot specify `device` on `full_like` of a multi device tensor")
-      return Tensor.full(self.shape, fill_value, dtype=dtype or self.dtype, device=device).requires_grad_(requires_grad)
-    if requires_grad:
-      return Tensor.full(self.shape, fill_value, dtype=dtype or self.dtype, device=self.device).requires_grad_(requires_grad)
-    return super().full_like(fill_value, dtype)
+    if device is None: return super().full_like(fill_value, dtype).requires_grad_(requires_grad)
+    if isinstance(self.device, tuple): raise RuntimeError("cannot specify `device` on `full_like` of a multi device tensor")
+    return Tensor.full(self.shape, fill_value, dtype=dtype or self.dtype, device=device).requires_grad_(requires_grad)

  def rand_like(self, **kwargs) -> Tensor:
    """
@@ -655,7 +649,7 @@ class Tensor(OpMixin):

  # ***** random functions *****

-  def randn_like(self, dtype:DTypeLike|None=None, requires_grad:bool|None=None, **kwargs) -> Tensor:
+  def randn_like(self, dtype:DTypeLike|None=None, requires_grad:bool=True, **kwargs) -> Tensor:
    """
    Creates a tensor with the same shape and sharding as `self`, filled with random values from a normal distribution with mean 0 and variance 1.

@@ -672,7 +666,7 @@ class Tensor(OpMixin):
    return (src[0].mul(2*math.pi).cos().mul((1 - src[1]).log().mul(-2).sqrt()).cast(dtype or self.dtype)).requires_grad_(requires_grad)

  @staticmethod
-  def randn(*shape, dtype:DTypeLike|None=None, requires_grad:bool|None=None, **kwargs) -> Tensor:
+  def randn(*shape, dtype:DTypeLike|None=None, requires_grad:bool=True, **kwargs) -> Tensor:
    """
    Creates a tensor with the given shape, filled with random values from a normal distribution with mean `0` and standard deviation `1`.
    If `dtype` is not specified, the default type is used.
@@ -707,7 +701,7 @@ class Tensor(OpMixin):
    return Tensor.uniform(*shape, low=low, high=high, dtype=dtype, **kwargs)

  @staticmethod
-  def normal(*shape, mean=0.0, std=1.0, requires_grad:bool|None=None, **kwargs) -> Tensor:
+  def normal(*shape, mean=0.0, std=1.0, requires_grad:bool=True, **kwargs) -> Tensor:
    """
    Creates a tensor with the given shape, filled with random values from a normal distribution with the given `mean` and standard deviation `std`.
    Requires `std >= 0`.
@@ -724,7 +718,7 @@ class Tensor(OpMixin):
    return (std * Tensor.randn(*shape, **kwargs) + mean).requires_grad_(requires_grad)

  @staticmethod
-  def uniform(*shape, low=0.0, high=1.0, dtype:DTypeLike|None=None, requires_grad:bool|None=None, **kwargs) -> Tensor:
+  def uniform(*shape, low=0.0, high=1.0, dtype:DTypeLike|None=None, requires_grad:bool=True, **kwargs) -> Tensor:
    """
    Creates a tensor with the given shape, filled with random values from a uniform distribution over the interval `[low, high)`.
    Requires `low < high`.
@@ -815,7 +809,7 @@ class Tensor(OpMixin):
    print(Tensor.randperm(6).numpy())
    ```
    """
-    return Tensor.rand(n, device=device, **kwargs).argsort().cast(dtype).requires_grad_(kwargs.get("requires_grad"))
+    return Tensor.rand(n, device=device, **kwargs).argsort().cast(dtype).requires_grad_(kwargs.get("requires_grad", True))

  def multinomial(self:Tensor, num_samples:int = 1, replacement:bool = False) -> Tensor:
    """
@@ -883,7 +877,7 @@ class Tensor(OpMixin):
    """
    all_uops = self.uop.toposort()
    tensors_need_grad: list[Tensor] = [t for tref in all_tensors if (t:=tref()) is not None and \
-                                       t.uop in all_uops and t.requires_grad]
+                                       t.uop in all_uops and t.requires_grad and t.is_floating_point()]
    # clear contexts
    for t,g in zip(tensors_need_grad, self.gradient(*tensors_need_grad, gradient=gradient)):
      assert g.shape == t.shape, f"grad shape must match tensor shape, {g.shape!r} != {t.shape!r}"
@@ -1033,7 +1027,7 @@ class Tensor(OpMixin):
    if any(self.uop in t.uop.backward_slice_with_self and t.uop.base is not shared for tref in all_tensors
           if (t:=tref()) is not None and t is not self and t.uop is not v_uop and t.uop not in v_bw):
      raise RuntimeError("can't setitem on a tensor with other uses")
-    if self.requires_grad or (isinstance(v, Tensor) and v.requires_grad):
+    if not self.uop.base.is_realized and self.is_floating_point() and (self.requires_grad or (isinstance(v, Tensor) and v.requires_grad)):
      if not isinstance(v, Tensor): v = Tensor(v, device=self.device, dtype=self.dtype)
      # __iadd__/__isub__ creates AFTER(view, STORE(view, computed)); unwrap to get the computed value
      if v.uop.op is Ops.AFTER and any(s.op is Ops.STORE for s in v.uop.src[1:]): v = v._apply_uop(lambda x: x.src[1].src[1])
@@ -1426,8 +1420,6 @@ class Tensor(OpMixin):
    """
    Bitcasts `self` to the given `dtype` of the same itemsize.

-    `self` must not require a gradient.
-
    ```python exec="true" source="above" session="tensor" result="python"
    t = Tensor([-1, 2, 3], dtype=dtypes.int32)
    print(t.dtype, t.numpy())
@@ -1437,7 +1429,6 @@ class Tensor(OpMixin):
    print(t.dtype, t.numpy())
    ```
    """
-    if self.requires_grad: raise RuntimeError("can't backprop through bitcast")
    dt = to_dtype(dtype)
    if (ns:=dt.itemsize) != (os:=self.dtype.itemsize) and (self.shape[-1]*os) % ns != 0: raise RuntimeError("unsupported size in bitcast")
    if (not isinstance(self.device, str) or not self.device.startswith("DISK")) and ns != os: