From 409bb0c9ada39dd00effaaee1887db828c9bfcf0 Mon Sep 17 00:00:00 2001 From: chenyu Date: Fri, 15 May 2026 02:01:04 -0400 Subject: [PATCH] requires_grad cannot be None (#16212) final goal is to remove requires_grad, first change the default to True, and don't allow None --- examples/gradaccum_mnist.py | 3 +- examples/mlperf/models/flat_llama.py | 2 +- extra/torch_backend/test_kernel_fusion.py | 2 +- test/backend/test_asm_gemm.py | 4 +- test/backend/test_dtype.py | 4 -- test/backend/test_tensor.py | 3 +- test/unit/test_disk_tensor.py | 4 -- test/unit/test_gradient.py | 2 +- tinygrad/nn/optim.py | 4 -- tinygrad/tensor.py | 49 +++++++++-------------- 10 files changed, 27 insertions(+), 50 deletions(-) diff --git a/examples/gradaccum_mnist.py b/examples/gradaccum_mnist.py index a660afddf4..2a0ac6f143 100644 --- a/examples/gradaccum_mnist.py +++ b/examples/gradaccum_mnist.py @@ -35,9 +35,8 @@ if __name__ == "__main__": params = nn.state.get_parameters(model) - # init params, set requires grad on the ones we need gradients of + # init params for x in params: - if x.requires_grad is None: x.requires_grad_() x.replace(x.contiguous()) Tensor.realize(*params) diff --git a/examples/mlperf/models/flat_llama.py b/examples/mlperf/models/flat_llama.py index 50fa509ccb..018d61f7aa 100644 --- a/examples/mlperf/models/flat_llama.py +++ b/examples/mlperf/models/flat_llama.py @@ -307,7 +307,7 @@ if __name__ == "__main__": # preallocate all the grad buffers and zero them out grads = {x:Tensor.zeros(x.shape, dtype=x.dtype, device=x.device).contiguous() - for x in state.values() if x.requires_grad is None} + for x in state.values() if x.requires_grad} # print model size sz = 0 diff --git a/extra/torch_backend/test_kernel_fusion.py b/extra/torch_backend/test_kernel_fusion.py index 0a748fe4ad..171872a68b 100644 --- a/extra/torch_backend/test_kernel_fusion.py +++ b/extra/torch_backend/test_kernel_fusion.py @@ -105,7 +105,7 @@ class TestKernelFusionRegression(unittest.TestCase): view = x[1:3] view += 1.0 return x.sum() - self._check_kernel_count(fn, 8) + self._check_kernel_count(fn, 7) def test_batchnorm_running_stats_update(self): def fn(): diff --git a/test/backend/test_asm_gemm.py b/test/backend/test_asm_gemm.py index 518aaeeb3e..6792040a6e 100644 --- a/test/backend/test_asm_gemm.py +++ b/test/backend/test_asm_gemm.py @@ -12,8 +12,8 @@ def is_cdna4(): return Device[Device.DEFAULT].renderer.target.arch.startswith("g def run_asm_gemm(a_shape, b_shape, dtype=dtypes.float16, a_shard=None, b_shard=None, gpus:int=1) -> None: Tensor.manual_seed(0) - a_rand = Tensor.randn(a_shape, dtype=dtypes.float).sub(0.5).cast(dtype) - b_rand = Tensor.randn(b_shape, dtype=dtypes.float).sub(0.5).cast(dtype) + a_rand = Tensor.randn(a_shape, dtype=dtypes.float, requires_grad=False).sub(0.5).cast(dtype) + b_rand = Tensor.randn(b_shape, dtype=dtypes.float, requires_grad=False).sub(0.5).cast(dtype) with Context(DEBUG=0): Tensor.realize(a_rand, b_rand) diff --git a/test/backend/test_dtype.py b/test/backend/test_dtype.py index 84bb16fe31..430bfa5db2 100644 --- a/test/backend/test_dtype.py +++ b/test/backend/test_dtype.py @@ -330,10 +330,6 @@ class TestBitCast(unittest.TestCase): # should fail because 3 int8 is 3 bytes but float16 is two and 3 isn't a multiple of 2 Tensor.empty((3,), dtype=dtypes.int8).bitcast(dtypes.float16) - with self.assertRaises(RuntimeError): - # should fail because backprop through bitcast is undefined - Tensor.empty((4,), dtype=dtypes.int8, requires_grad=True).bitcast(dtypes.float16) - def test_bitcast_float_to_int32(self): a = Tensor([1.,2,3]) b = a.bitcast(dtypes.int32) diff --git a/test/backend/test_tensor.py b/test/backend/test_tensor.py index d699ed7bd9..4135217f58 100644 --- a/test/backend/test_tensor.py +++ b/test/backend/test_tensor.py @@ -179,8 +179,7 @@ class TestTinygrad(unittest.TestCase): def test_tinygrad(): w1 = Tensor(init) w2 = Tensor(init) - assert w1.requires_grad is None and w2.requires_grad is None - # optimizer sets requires_grad=True for params with requires_grad=None + assert w1.requires_grad is True and w2.requires_grad is True nn.optim.SGD([w1, w2], lr=0.01) assert w1.requires_grad is True and w2.requires_grad is True out = w1.add(w2) diff --git a/test/unit/test_disk_tensor.py b/test/unit/test_disk_tensor.py index a851660bf4..3db019a138 100644 --- a/test/unit/test_disk_tensor.py +++ b/test/unit/test_disk_tensor.py @@ -92,10 +92,6 @@ class TestRawDiskBuffer(unittest.TestCase): # should fail because 3 int8 is 3 bytes but float16 is two and 3 isn't a multiple of 2 Tensor.empty((3,), dtype=dtypes.int8, device=f"DISK:{tmp}").bitcast(dtypes.float16) - with self.assertRaises(RuntimeError): - # should fail because backprop through bitcast is undefined - Tensor.empty((4,), dtype=dtypes.int8, requires_grad=True, device=f"DISK:{tmp}").bitcast(dtypes.float16) - pathlib.Path(tmp).unlink() @unittest.skipUnless(is_dtype_supported(dtypes.uint8), "need uint8") diff --git a/test/unit/test_gradient.py b/test/unit/test_gradient.py index 80e46cd859..a52af55ed7 100644 --- a/test/unit/test_gradient.py +++ b/test/unit/test_gradient.py @@ -70,7 +70,7 @@ class TestTensorGradient(unittest.TestCase): self.assertIs(x.grad, old_grad) def test_gradient_through_clone_from_non_grad_src(self): - src = Tensor([1.0, 2.0, 3.0, 4.0]) + src = Tensor([1.0, 2.0, 3.0, 4.0], requires_grad=False) x = src.clone().requires_grad_(True) (x * 2.0).sum().backward() np.testing.assert_allclose(x.grad.numpy(), [2.0, 2.0, 2.0, 2.0]) diff --git a/tinygrad/nn/optim.py b/tinygrad/nn/optim.py index a912bd6501..a256c90684 100644 --- a/tinygrad/nn/optim.py +++ b/tinygrad/nn/optim.py @@ -10,10 +10,6 @@ class Optimizer: """ def __init__(self, params: list[Tensor], lr: float, device=None, fused=FUSE_OPTIM): if lr < 0: raise ValueError(f"Invalid learning rate: {lr}") - # if requires_grad is None, but being put into an optimizer, set it to True - for x in params: - if x.requires_grad is None: x.requires_grad_(True) - self.params: list[Tensor] = dedup([x for x in params if x.requires_grad]) assert len(self.params) != 0, "optimizer must have at least one param" self.buffers: list[Tensor] = dedup([x for x in params if not x.requires_grad]) # buffers are still realized diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 807e205ccf..2c1fb8adb3 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -92,7 +92,7 @@ class Tensor(OpMixin): training: ClassVar[bool] = False def __init__(self, data:ConstType|bytes|list|tuple|UOp|'numpy.ndarray'|pathlib.Path|None, - device:str|tuple|list|None=None, dtype:DTypeLike|None=None, requires_grad:bool|None=None, _force_unique:bool=False): + device:str|tuple|list|None=None, dtype:DTypeLike|None=None, requires_grad:bool=True, _force_unique:bool=False): if device is None: if isinstance(data, pathlib.Path): device = f"DISK:{data.resolve()}" # keep it on the disk if device is None elif isinstance(data, UOp): device = data._device @@ -103,9 +103,7 @@ class Tensor(OpMixin): # tensors can have gradients if you have called .backward self.grad:Tensor|None = None - # NOTE: this can be in three states. False and None: no gradient, True: gradient - # None (the default) will be updated to True if it's put in an optimizer - self.requires_grad:bool|None = requires_grad + self.requires_grad:bool = requires_grad # create a UOp from the different types of inputs if isinstance(data, UOp): @@ -115,8 +113,8 @@ class Tensor(OpMixin): elif data is None: data = UOp.const(_dtype or dtypes.default_float, 0, _device) elif isinstance(data, get_args(ConstType)): - if _force_unique or requires_grad: data = UOp.unique_const(data, _dtype, _device) - else: data = UOp.const(_dtype or dtypes.from_py(data), data, _device) + dt = _dtype or dtypes.from_py(data) + data = UOp.unique_const(data, dt, _device) if _force_unique or (requires_grad and dtypes.is_float(dt)) else UOp.const(dt, data, _device) elif isinstance(data, bytes): data = _frompy(data, _dtype or dtypes.uint8, _device) elif isinstance(data, (list, tuple)): if _dtype is None: @@ -151,11 +149,10 @@ class Tensor(OpMixin): srcs = (self,)+x new_uop: UOp = fxn(*[t.uop for t in srcs], *extra_args, **kwargs) if TRACEMETA >= 1 and (metadata:=_METADATA.get()) is not None: all_metadata[new_uop] = (metadata,) - needs_input_grad = [t.requires_grad for t in srcs] # directly create the Tensor ret = Tensor.__new__(Tensor) ret.uop, ret.grad = new_uop, None - ret.requires_grad = True if any(needs_input_grad) else None if None in needs_input_grad else False + ret.requires_grad = any(t.requires_grad for t in srcs) # add to all_tensors after construction succeeds all_tensors[weakref.ref(ret)] = None return ret @@ -166,7 +163,7 @@ class Tensor(OpMixin): @staticmethod def unique_const(fill_value:ConstType|UOp, **kwargs) -> Tensor: return Tensor(fill_value, _force_unique=True, **kwargs) - def requires_grad_(self, requires_grad=True) -> Tensor: + def requires_grad_(self, requires_grad:bool=True) -> Tensor: # make the UOp unique if it's a CONST to prevent gradient accumulation bugs with cached const UOps if requires_grad and self.uop.op is Ops.CONST: self.replace(Tensor(self.uop.arg, device=self.device, dtype=self.dtype, requires_grad=True)) self.requires_grad = requires_grad @@ -566,7 +563,7 @@ class Tensor(OpMixin): return Tensor._device_seeds[device], low.cat(high) @staticmethod - def rand(*shape, device:str|None=None, dtype:DTypeLike|None=None, requires_grad:bool|None=None, contiguous:bool=True) -> Tensor: + def rand(*shape, device:str|None=None, dtype:DTypeLike|None=None, requires_grad:bool=True, contiguous:bool=True) -> Tensor: """ Creates a tensor with the given shape, filled with random values from a uniform distribution over the interval `[0, 1)`. @@ -595,7 +592,7 @@ class Tensor(OpMixin): # ***** creation helper functions ***** @classmethod - def eye(cls, n:int, m:int|None=None, dtype=None, device=None, requires_grad:bool|None=None) -> Tensor: + def eye(cls, n:int, m:int|None=None, dtype=None, device=None, requires_grad:bool=True) -> Tensor: """ Returns a 2-D tensor with `n` rows and `m` columns, with ones on the diagonal and zeros elsewhere. @@ -617,9 +614,9 @@ class Tensor(OpMixin): if kwargs.get("device") is not None: raise RuntimeError("cannot specify `device` on `*_like` of a multi device tensor") if self.uop.axis is None: return fxn(self.shape, *args, dtype=dtype, **kwargs).shard(self.device) stacked = UOp.mstack(*[fxn(self.uop.shard_shape, *args, device=d, dtype=dtype, **kwargs).uop for d in self.device]) - return Tensor(stacked.multi(self.uop.axis), requires_grad=kwargs.get("requires_grad")) + return Tensor(stacked.multi(self.uop.axis), requires_grad=kwargs.get("requires_grad", True)) - def full_like(self, fill_value:ConstType, dtype=None, device=None, requires_grad=None) -> Tensor: + def full_like(self, fill_value:ConstType, dtype=None, device=None, requires_grad:bool=False) -> Tensor: """ Creates a tensor with the same shape as `self`, filled with the given value. If `dtype` is not specified, the dtype of `self` is used. @@ -631,12 +628,9 @@ class Tensor(OpMixin): print(Tensor.full_like(t, 42).numpy()) ``` """ - if device is not None: - if isinstance(self.device, tuple): raise RuntimeError("cannot specify `device` on `full_like` of a multi device tensor") - return Tensor.full(self.shape, fill_value, dtype=dtype or self.dtype, device=device).requires_grad_(requires_grad) - if requires_grad: - return Tensor.full(self.shape, fill_value, dtype=dtype or self.dtype, device=self.device).requires_grad_(requires_grad) - return super().full_like(fill_value, dtype) + if device is None: return super().full_like(fill_value, dtype).requires_grad_(requires_grad) + if isinstance(self.device, tuple): raise RuntimeError("cannot specify `device` on `full_like` of a multi device tensor") + return Tensor.full(self.shape, fill_value, dtype=dtype or self.dtype, device=device).requires_grad_(requires_grad) def rand_like(self, **kwargs) -> Tensor: """ @@ -655,7 +649,7 @@ class Tensor(OpMixin): # ***** random functions ***** - def randn_like(self, dtype:DTypeLike|None=None, requires_grad:bool|None=None, **kwargs) -> Tensor: + def randn_like(self, dtype:DTypeLike|None=None, requires_grad:bool=True, **kwargs) -> Tensor: """ Creates a tensor with the same shape and sharding as `self`, filled with random values from a normal distribution with mean 0 and variance 1. @@ -672,7 +666,7 @@ class Tensor(OpMixin): return (src[0].mul(2*math.pi).cos().mul((1 - src[1]).log().mul(-2).sqrt()).cast(dtype or self.dtype)).requires_grad_(requires_grad) @staticmethod - def randn(*shape, dtype:DTypeLike|None=None, requires_grad:bool|None=None, **kwargs) -> Tensor: + def randn(*shape, dtype:DTypeLike|None=None, requires_grad:bool=True, **kwargs) -> Tensor: """ Creates a tensor with the given shape, filled with random values from a normal distribution with mean `0` and standard deviation `1`. If `dtype` is not specified, the default type is used. @@ -707,7 +701,7 @@ class Tensor(OpMixin): return Tensor.uniform(*shape, low=low, high=high, dtype=dtype, **kwargs) @staticmethod - def normal(*shape, mean=0.0, std=1.0, requires_grad:bool|None=None, **kwargs) -> Tensor: + def normal(*shape, mean=0.0, std=1.0, requires_grad:bool=True, **kwargs) -> Tensor: """ Creates a tensor with the given shape, filled with random values from a normal distribution with the given `mean` and standard deviation `std`. Requires `std >= 0`. @@ -724,7 +718,7 @@ class Tensor(OpMixin): return (std * Tensor.randn(*shape, **kwargs) + mean).requires_grad_(requires_grad) @staticmethod - def uniform(*shape, low=0.0, high=1.0, dtype:DTypeLike|None=None, requires_grad:bool|None=None, **kwargs) -> Tensor: + def uniform(*shape, low=0.0, high=1.0, dtype:DTypeLike|None=None, requires_grad:bool=True, **kwargs) -> Tensor: """ Creates a tensor with the given shape, filled with random values from a uniform distribution over the interval `[low, high)`. Requires `low < high`. @@ -815,7 +809,7 @@ class Tensor(OpMixin): print(Tensor.randperm(6).numpy()) ``` """ - return Tensor.rand(n, device=device, **kwargs).argsort().cast(dtype).requires_grad_(kwargs.get("requires_grad")) + return Tensor.rand(n, device=device, **kwargs).argsort().cast(dtype).requires_grad_(kwargs.get("requires_grad", True)) def multinomial(self:Tensor, num_samples:int = 1, replacement:bool = False) -> Tensor: """ @@ -883,7 +877,7 @@ class Tensor(OpMixin): """ all_uops = self.uop.toposort() tensors_need_grad: list[Tensor] = [t for tref in all_tensors if (t:=tref()) is not None and \ - t.uop in all_uops and t.requires_grad] + t.uop in all_uops and t.requires_grad and t.is_floating_point()] # clear contexts for t,g in zip(tensors_need_grad, self.gradient(*tensors_need_grad, gradient=gradient)): assert g.shape == t.shape, f"grad shape must match tensor shape, {g.shape!r} != {t.shape!r}" @@ -1033,7 +1027,7 @@ class Tensor(OpMixin): if any(self.uop in t.uop.backward_slice_with_self and t.uop.base is not shared for tref in all_tensors if (t:=tref()) is not None and t is not self and t.uop is not v_uop and t.uop not in v_bw): raise RuntimeError("can't setitem on a tensor with other uses") - if self.requires_grad or (isinstance(v, Tensor) and v.requires_grad): + if not self.uop.base.is_realized and self.is_floating_point() and (self.requires_grad or (isinstance(v, Tensor) and v.requires_grad)): if not isinstance(v, Tensor): v = Tensor(v, device=self.device, dtype=self.dtype) # __iadd__/__isub__ creates AFTER(view, STORE(view, computed)); unwrap to get the computed value if v.uop.op is Ops.AFTER and any(s.op is Ops.STORE for s in v.uop.src[1:]): v = v._apply_uop(lambda x: x.src[1].src[1]) @@ -1426,8 +1420,6 @@ class Tensor(OpMixin): """ Bitcasts `self` to the given `dtype` of the same itemsize. - `self` must not require a gradient. - ```python exec="true" source="above" session="tensor" result="python" t = Tensor([-1, 2, 3], dtype=dtypes.int32) print(t.dtype, t.numpy()) @@ -1437,7 +1429,6 @@ class Tensor(OpMixin): print(t.dtype, t.numpy()) ``` """ - if self.requires_grad: raise RuntimeError("can't backprop through bitcast") dt = to_dtype(dtype) if (ns:=dt.itemsize) != (os:=self.dtype.itemsize) and (self.shape[-1]*os) % ns != 0: raise RuntimeError("unsupported size in bitcast") if (not isinstance(self.device, str) or not self.device.startswith("DISK")) and ns != os: