requires_grad cannot be None (#16212)

The final goal is to remove requires_grad. As a first step, change the default to True and don't allow None.
This commit is contained in:
chenyu
2026-05-15 02:01:04 -04:00
committed by GitHub
parent c7870f11ff
commit 409bb0c9ad
10 changed files with 27 additions and 50 deletions

View File

@@ -35,9 +35,8 @@ if __name__ == "__main__":
params = nn.state.get_parameters(model)
# init params, set requires grad on the ones we need gradients of
# init params
for x in params:
if x.requires_grad is None: x.requires_grad_()
x.replace(x.contiguous())
Tensor.realize(*params)

View File

@@ -307,7 +307,7 @@ if __name__ == "__main__":
# preallocate all the grad buffers and zero them out
grads = {x:Tensor.zeros(x.shape, dtype=x.dtype, device=x.device).contiguous()
for x in state.values() if x.requires_grad is None}
for x in state.values() if x.requires_grad}
# print model size
sz = 0

View File

@@ -105,7 +105,7 @@ class TestKernelFusionRegression(unittest.TestCase):
view = x[1:3]
view += 1.0
return x.sum()
self._check_kernel_count(fn, 8)
self._check_kernel_count(fn, 7)
def test_batchnorm_running_stats_update(self):
def fn():

View File

@@ -12,8 +12,8 @@ def is_cdna4(): return Device[Device.DEFAULT].renderer.target.arch.startswith("g
def run_asm_gemm(a_shape, b_shape, dtype=dtypes.float16, a_shard=None, b_shard=None, gpus:int=1) -> None:
Tensor.manual_seed(0)
a_rand = Tensor.randn(a_shape, dtype=dtypes.float).sub(0.5).cast(dtype)
b_rand = Tensor.randn(b_shape, dtype=dtypes.float).sub(0.5).cast(dtype)
a_rand = Tensor.randn(a_shape, dtype=dtypes.float, requires_grad=False).sub(0.5).cast(dtype)
b_rand = Tensor.randn(b_shape, dtype=dtypes.float, requires_grad=False).sub(0.5).cast(dtype)
with Context(DEBUG=0):
Tensor.realize(a_rand, b_rand)

View File

@@ -330,10 +330,6 @@ class TestBitCast(unittest.TestCase):
# should fail because 3 int8 is 3 bytes but float16 is two and 3 isn't a multiple of 2
Tensor.empty((3,), dtype=dtypes.int8).bitcast(dtypes.float16)
with self.assertRaises(RuntimeError):
# should fail because backprop through bitcast is undefined
Tensor.empty((4,), dtype=dtypes.int8, requires_grad=True).bitcast(dtypes.float16)
def test_bitcast_float_to_int32(self):
a = Tensor([1.,2,3])
b = a.bitcast(dtypes.int32)

View File

@@ -179,8 +179,7 @@ class TestTinygrad(unittest.TestCase):
def test_tinygrad():
w1 = Tensor(init)
w2 = Tensor(init)
assert w1.requires_grad is None and w2.requires_grad is None
# optimizer sets requires_grad=True for params with requires_grad=None
assert w1.requires_grad is True and w2.requires_grad is True
nn.optim.SGD([w1, w2], lr=0.01)
assert w1.requires_grad is True and w2.requires_grad is True
out = w1.add(w2)

View File

@@ -92,10 +92,6 @@ class TestRawDiskBuffer(unittest.TestCase):
# should fail because 3 int8 is 3 bytes but float16 is two and 3 isn't a multiple of 2
Tensor.empty((3,), dtype=dtypes.int8, device=f"DISK:{tmp}").bitcast(dtypes.float16)
with self.assertRaises(RuntimeError):
# should fail because backprop through bitcast is undefined
Tensor.empty((4,), dtype=dtypes.int8, requires_grad=True, device=f"DISK:{tmp}").bitcast(dtypes.float16)
pathlib.Path(tmp).unlink()
@unittest.skipUnless(is_dtype_supported(dtypes.uint8), "need uint8")

View File

@@ -70,7 +70,7 @@ class TestTensorGradient(unittest.TestCase):
self.assertIs(x.grad, old_grad)
def test_gradient_through_clone_from_non_grad_src(self):
src = Tensor([1.0, 2.0, 3.0, 4.0])
src = Tensor([1.0, 2.0, 3.0, 4.0], requires_grad=False)
x = src.clone().requires_grad_(True)
(x * 2.0).sum().backward()
np.testing.assert_allclose(x.grad.numpy(), [2.0, 2.0, 2.0, 2.0])

View File

@@ -10,10 +10,6 @@ class Optimizer:
"""
def __init__(self, params: list[Tensor], lr: float, device=None, fused=FUSE_OPTIM):
if lr < 0: raise ValueError(f"Invalid learning rate: {lr}")
# if requires_grad is None, but being put into an optimizer, set it to True
for x in params:
if x.requires_grad is None: x.requires_grad_(True)
self.params: list[Tensor] = dedup([x for x in params if x.requires_grad])
assert len(self.params) != 0, "optimizer must have at least one param"
self.buffers: list[Tensor] = dedup([x for x in params if not x.requires_grad]) # buffers are still realized

View File

@@ -92,7 +92,7 @@ class Tensor(OpMixin):
training: ClassVar[bool] = False
def __init__(self, data:ConstType|bytes|list|tuple|UOp|'numpy.ndarray'|pathlib.Path|None,
device:str|tuple|list|None=None, dtype:DTypeLike|None=None, requires_grad:bool|None=None, _force_unique:bool=False):
device:str|tuple|list|None=None, dtype:DTypeLike|None=None, requires_grad:bool=True, _force_unique:bool=False):
if device is None:
if isinstance(data, pathlib.Path): device = f"DISK:{data.resolve()}" # keep it on the disk if device is None
elif isinstance(data, UOp): device = data._device
@@ -103,9 +103,7 @@ class Tensor(OpMixin):
# tensors can have gradients if you have called .backward
self.grad:Tensor|None = None
# NOTE: this can be in three states. False and None: no gradient, True: gradient
# None (the default) will be updated to True if it's put in an optimizer
self.requires_grad:bool|None = requires_grad
self.requires_grad:bool = requires_grad
# create a UOp from the different types of inputs
if isinstance(data, UOp):
@@ -115,8 +113,8 @@ class Tensor(OpMixin):
elif data is None:
data = UOp.const(_dtype or dtypes.default_float, 0, _device)
elif isinstance(data, get_args(ConstType)):
if _force_unique or requires_grad: data = UOp.unique_const(data, _dtype, _device)
else: data = UOp.const(_dtype or dtypes.from_py(data), data, _device)
dt = _dtype or dtypes.from_py(data)
data = UOp.unique_const(data, dt, _device) if _force_unique or (requires_grad and dtypes.is_float(dt)) else UOp.const(dt, data, _device)
elif isinstance(data, bytes): data = _frompy(data, _dtype or dtypes.uint8, _device)
elif isinstance(data, (list, tuple)):
if _dtype is None:
@@ -151,11 +149,10 @@ class Tensor(OpMixin):
srcs = (self,)+x
new_uop: UOp = fxn(*[t.uop for t in srcs], *extra_args, **kwargs)
if TRACEMETA >= 1 and (metadata:=_METADATA.get()) is not None: all_metadata[new_uop] = (metadata,)
needs_input_grad = [t.requires_grad for t in srcs]
# directly create the Tensor
ret = Tensor.__new__(Tensor)
ret.uop, ret.grad = new_uop, None
ret.requires_grad = True if any(needs_input_grad) else None if None in needs_input_grad else False
ret.requires_grad = any(t.requires_grad for t in srcs)
# add to all_tensors after construction succeeds
all_tensors[weakref.ref(ret)] = None
return ret
@@ -166,7 +163,7 @@ class Tensor(OpMixin):
@staticmethod
def unique_const(fill_value:ConstType|UOp, **kwargs) -> Tensor: return Tensor(fill_value, _force_unique=True, **kwargs)
def requires_grad_(self, requires_grad=True) -> Tensor:
def requires_grad_(self, requires_grad:bool=True) -> Tensor:
# make the UOp unique if it's a CONST to prevent gradient accumulation bugs with cached const UOps
if requires_grad and self.uop.op is Ops.CONST: self.replace(Tensor(self.uop.arg, device=self.device, dtype=self.dtype, requires_grad=True))
self.requires_grad = requires_grad
@@ -566,7 +563,7 @@ class Tensor(OpMixin):
return Tensor._device_seeds[device], low.cat(high)
@staticmethod
def rand(*shape, device:str|None=None, dtype:DTypeLike|None=None, requires_grad:bool|None=None, contiguous:bool=True) -> Tensor:
def rand(*shape, device:str|None=None, dtype:DTypeLike|None=None, requires_grad:bool=True, contiguous:bool=True) -> Tensor:
"""
Creates a tensor with the given shape, filled with random values from a uniform distribution over the interval `[0, 1)`.
@@ -595,7 +592,7 @@ class Tensor(OpMixin):
# ***** creation helper functions *****
@classmethod
def eye(cls, n:int, m:int|None=None, dtype=None, device=None, requires_grad:bool|None=None) -> Tensor:
def eye(cls, n:int, m:int|None=None, dtype=None, device=None, requires_grad:bool=True) -> Tensor:
"""
Returns a 2-D tensor with `n` rows and `m` columns, with ones on the diagonal and zeros elsewhere.
@@ -617,9 +614,9 @@ class Tensor(OpMixin):
if kwargs.get("device") is not None: raise RuntimeError("cannot specify `device` on `*_like` of a multi device tensor")
if self.uop.axis is None: return fxn(self.shape, *args, dtype=dtype, **kwargs).shard(self.device)
stacked = UOp.mstack(*[fxn(self.uop.shard_shape, *args, device=d, dtype=dtype, **kwargs).uop for d in self.device])
return Tensor(stacked.multi(self.uop.axis), requires_grad=kwargs.get("requires_grad"))
return Tensor(stacked.multi(self.uop.axis), requires_grad=kwargs.get("requires_grad", True))
def full_like(self, fill_value:ConstType, dtype=None, device=None, requires_grad=None) -> Tensor:
def full_like(self, fill_value:ConstType, dtype=None, device=None, requires_grad:bool=False) -> Tensor:
"""
Creates a tensor with the same shape as `self`, filled with the given value.
If `dtype` is not specified, the dtype of `self` is used.
@@ -631,12 +628,9 @@ class Tensor(OpMixin):
print(Tensor.full_like(t, 42).numpy())
```
"""
if device is not None:
if isinstance(self.device, tuple): raise RuntimeError("cannot specify `device` on `full_like` of a multi device tensor")
return Tensor.full(self.shape, fill_value, dtype=dtype or self.dtype, device=device).requires_grad_(requires_grad)
if requires_grad:
return Tensor.full(self.shape, fill_value, dtype=dtype or self.dtype, device=self.device).requires_grad_(requires_grad)
return super().full_like(fill_value, dtype)
if device is None: return super().full_like(fill_value, dtype).requires_grad_(requires_grad)
if isinstance(self.device, tuple): raise RuntimeError("cannot specify `device` on `full_like` of a multi device tensor")
return Tensor.full(self.shape, fill_value, dtype=dtype or self.dtype, device=device).requires_grad_(requires_grad)
def rand_like(self, **kwargs) -> Tensor:
"""
@@ -655,7 +649,7 @@ class Tensor(OpMixin):
# ***** random functions *****
def randn_like(self, dtype:DTypeLike|None=None, requires_grad:bool|None=None, **kwargs) -> Tensor:
def randn_like(self, dtype:DTypeLike|None=None, requires_grad:bool=True, **kwargs) -> Tensor:
"""
Creates a tensor with the same shape and sharding as `self`, filled with random values from a normal distribution with mean 0 and variance 1.
@@ -672,7 +666,7 @@ class Tensor(OpMixin):
return (src[0].mul(2*math.pi).cos().mul((1 - src[1]).log().mul(-2).sqrt()).cast(dtype or self.dtype)).requires_grad_(requires_grad)
@staticmethod
def randn(*shape, dtype:DTypeLike|None=None, requires_grad:bool|None=None, **kwargs) -> Tensor:
def randn(*shape, dtype:DTypeLike|None=None, requires_grad:bool=True, **kwargs) -> Tensor:
"""
Creates a tensor with the given shape, filled with random values from a normal distribution with mean `0` and standard deviation `1`.
If `dtype` is not specified, the default type is used.
@@ -707,7 +701,7 @@ class Tensor(OpMixin):
return Tensor.uniform(*shape, low=low, high=high, dtype=dtype, **kwargs)
@staticmethod
def normal(*shape, mean=0.0, std=1.0, requires_grad:bool|None=None, **kwargs) -> Tensor:
def normal(*shape, mean=0.0, std=1.0, requires_grad:bool=True, **kwargs) -> Tensor:
"""
Creates a tensor with the given shape, filled with random values from a normal distribution with the given `mean` and standard deviation `std`.
Requires `std >= 0`.
@@ -724,7 +718,7 @@ class Tensor(OpMixin):
return (std * Tensor.randn(*shape, **kwargs) + mean).requires_grad_(requires_grad)
@staticmethod
def uniform(*shape, low=0.0, high=1.0, dtype:DTypeLike|None=None, requires_grad:bool|None=None, **kwargs) -> Tensor:
def uniform(*shape, low=0.0, high=1.0, dtype:DTypeLike|None=None, requires_grad:bool=True, **kwargs) -> Tensor:
"""
Creates a tensor with the given shape, filled with random values from a uniform distribution over the interval `[low, high)`.
Requires `low < high`.
@@ -815,7 +809,7 @@ class Tensor(OpMixin):
print(Tensor.randperm(6).numpy())
```
"""
return Tensor.rand(n, device=device, **kwargs).argsort().cast(dtype).requires_grad_(kwargs.get("requires_grad"))
return Tensor.rand(n, device=device, **kwargs).argsort().cast(dtype).requires_grad_(kwargs.get("requires_grad", True))
def multinomial(self:Tensor, num_samples:int = 1, replacement:bool = False) -> Tensor:
"""
@@ -883,7 +877,7 @@ class Tensor(OpMixin):
"""
all_uops = self.uop.toposort()
tensors_need_grad: list[Tensor] = [t for tref in all_tensors if (t:=tref()) is not None and \
t.uop in all_uops and t.requires_grad]
t.uop in all_uops and t.requires_grad and t.is_floating_point()]
# clear contexts
for t,g in zip(tensors_need_grad, self.gradient(*tensors_need_grad, gradient=gradient)):
assert g.shape == t.shape, f"grad shape must match tensor shape, {g.shape!r} != {t.shape!r}"
@@ -1033,7 +1027,7 @@ class Tensor(OpMixin):
if any(self.uop in t.uop.backward_slice_with_self and t.uop.base is not shared for tref in all_tensors
if (t:=tref()) is not None and t is not self and t.uop is not v_uop and t.uop not in v_bw):
raise RuntimeError("can't setitem on a tensor with other uses")
if self.requires_grad or (isinstance(v, Tensor) and v.requires_grad):
if not self.uop.base.is_realized and self.is_floating_point() and (self.requires_grad or (isinstance(v, Tensor) and v.requires_grad)):
if not isinstance(v, Tensor): v = Tensor(v, device=self.device, dtype=self.dtype)
# __iadd__/__isub__ creates AFTER(view, STORE(view, computed)); unwrap to get the computed value
if v.uop.op is Ops.AFTER and any(s.op is Ops.STORE for s in v.uop.src[1:]): v = v._apply_uop(lambda x: x.src[1].src[1])
@@ -1426,8 +1420,6 @@ class Tensor(OpMixin):
"""
Bitcasts `self` to the given `dtype` of the same itemsize.
`self` must not require a gradient.
```python exec="true" source="above" session="tensor" result="python"
t = Tensor([-1, 2, 3], dtype=dtypes.int32)
print(t.dtype, t.numpy())
@@ -1437,7 +1429,6 @@ class Tensor(OpMixin):
print(t.dtype, t.numpy())
```
"""
if self.requires_grad: raise RuntimeError("can't backprop through bitcast")
dt = to_dtype(dtype)
if (ns:=dt.itemsize) != (os:=self.dtype.itemsize) and (self.shape[-1]*os) % ns != 0: raise RuntimeError("unsupported size in bitcast")
if (not isinstance(self.device, str) or not self.device.startswith("DISK")) and ns != os: