|
|
|
|
@@ -92,7 +92,7 @@ class Tensor(OpMixin):
|
|
|
|
|
training: ClassVar[bool] = False
|
|
|
|
|
|
|
|
|
|
def __init__(self, data:ConstType|bytes|list|tuple|UOp|'numpy.ndarray'|pathlib.Path|None,
|
|
|
|
|
device:str|tuple|list|None=None, dtype:DTypeLike|None=None, requires_grad:bool|None=None, _force_unique:bool=False):
|
|
|
|
|
device:str|tuple|list|None=None, dtype:DTypeLike|None=None, requires_grad:bool=True, _force_unique:bool=False):
|
|
|
|
|
if device is None:
|
|
|
|
|
if isinstance(data, pathlib.Path): device = f"DISK:{data.resolve()}" # keep it on the disk if device is None
|
|
|
|
|
elif isinstance(data, UOp): device = data._device
|
|
|
|
|
@@ -103,9 +103,7 @@ class Tensor(OpMixin):
|
|
|
|
|
# tensors can have gradients if you have called .backward
|
|
|
|
|
self.grad:Tensor|None = None
|
|
|
|
|
|
|
|
|
|
# NOTE: this can be in three states. False and None: no gradient, True: gradient
|
|
|
|
|
# None (the default) will be updated to True if it's put in an optimizer
|
|
|
|
|
self.requires_grad:bool|None = requires_grad
|
|
|
|
|
self.requires_grad:bool = requires_grad
|
|
|
|
|
|
|
|
|
|
# create a UOp from the different types of inputs
|
|
|
|
|
if isinstance(data, UOp):
|
|
|
|
|
@@ -115,8 +113,8 @@ class Tensor(OpMixin):
|
|
|
|
|
elif data is None:
|
|
|
|
|
data = UOp.const(_dtype or dtypes.default_float, 0, _device)
|
|
|
|
|
elif isinstance(data, get_args(ConstType)):
|
|
|
|
|
if _force_unique or requires_grad: data = UOp.unique_const(data, _dtype, _device)
|
|
|
|
|
else: data = UOp.const(_dtype or dtypes.from_py(data), data, _device)
|
|
|
|
|
dt = _dtype or dtypes.from_py(data)
|
|
|
|
|
data = UOp.unique_const(data, dt, _device) if _force_unique or (requires_grad and dtypes.is_float(dt)) else UOp.const(dt, data, _device)
|
|
|
|
|
elif isinstance(data, bytes): data = _frompy(data, _dtype or dtypes.uint8, _device)
|
|
|
|
|
elif isinstance(data, (list, tuple)):
|
|
|
|
|
if _dtype is None:
|
|
|
|
|
@@ -151,11 +149,10 @@ class Tensor(OpMixin):
|
|
|
|
|
srcs = (self,)+x
|
|
|
|
|
new_uop: UOp = fxn(*[t.uop for t in srcs], *extra_args, **kwargs)
|
|
|
|
|
if TRACEMETA >= 1 and (metadata:=_METADATA.get()) is not None: all_metadata[new_uop] = (metadata,)
|
|
|
|
|
needs_input_grad = [t.requires_grad for t in srcs]
|
|
|
|
|
# directly create the Tensor
|
|
|
|
|
ret = Tensor.__new__(Tensor)
|
|
|
|
|
ret.uop, ret.grad = new_uop, None
|
|
|
|
|
ret.requires_grad = True if any(needs_input_grad) else None if None in needs_input_grad else False
|
|
|
|
|
ret.requires_grad = any(t.requires_grad for t in srcs)
|
|
|
|
|
# add to all_tensors after construction succeeds
|
|
|
|
|
all_tensors[weakref.ref(ret)] = None
|
|
|
|
|
return ret
|
|
|
|
|
@@ -166,7 +163,7 @@ class Tensor(OpMixin):
|
|
|
|
|
@staticmethod
|
|
|
|
|
def unique_const(fill_value:ConstType|UOp, **kwargs) -> Tensor: return Tensor(fill_value, _force_unique=True, **kwargs)
|
|
|
|
|
|
|
|
|
|
def requires_grad_(self, requires_grad=True) -> Tensor:
|
|
|
|
|
def requires_grad_(self, requires_grad:bool=True) -> Tensor:
|
|
|
|
|
# make the UOp unique if it's a CONST to prevent gradient accumulation bugs with cached const UOps
|
|
|
|
|
if requires_grad and self.uop.op is Ops.CONST: self.replace(Tensor(self.uop.arg, device=self.device, dtype=self.dtype, requires_grad=True))
|
|
|
|
|
self.requires_grad = requires_grad
|
|
|
|
|
@@ -566,7 +563,7 @@ class Tensor(OpMixin):
|
|
|
|
|
return Tensor._device_seeds[device], low.cat(high)
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def rand(*shape, device:str|None=None, dtype:DTypeLike|None=None, requires_grad:bool|None=None, contiguous:bool=True) -> Tensor:
|
|
|
|
|
def rand(*shape, device:str|None=None, dtype:DTypeLike|None=None, requires_grad:bool=True, contiguous:bool=True) -> Tensor:
|
|
|
|
|
"""
|
|
|
|
|
Creates a tensor with the given shape, filled with random values from a uniform distribution over the interval `[0, 1)`.
|
|
|
|
|
|
|
|
|
|
@@ -595,7 +592,7 @@ class Tensor(OpMixin):
|
|
|
|
|
# ***** creation helper functions *****
|
|
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
|
def eye(cls, n:int, m:int|None=None, dtype=None, device=None, requires_grad:bool|None=None) -> Tensor:
|
|
|
|
|
def eye(cls, n:int, m:int|None=None, dtype=None, device=None, requires_grad:bool=True) -> Tensor:
|
|
|
|
|
"""
|
|
|
|
|
Returns a 2-D tensor with `n` rows and `m` columns, with ones on the diagonal and zeros elsewhere.
|
|
|
|
|
|
|
|
|
|
@@ -617,9 +614,9 @@ class Tensor(OpMixin):
|
|
|
|
|
if kwargs.get("device") is not None: raise RuntimeError("cannot specify `device` on `*_like` of a multi device tensor")
|
|
|
|
|
if self.uop.axis is None: return fxn(self.shape, *args, dtype=dtype, **kwargs).shard(self.device)
|
|
|
|
|
stacked = UOp.mstack(*[fxn(self.uop.shard_shape, *args, device=d, dtype=dtype, **kwargs).uop for d in self.device])
|
|
|
|
|
return Tensor(stacked.multi(self.uop.axis), requires_grad=kwargs.get("requires_grad"))
|
|
|
|
|
return Tensor(stacked.multi(self.uop.axis), requires_grad=kwargs.get("requires_grad", True))
|
|
|
|
|
|
|
|
|
|
def full_like(self, fill_value:ConstType, dtype=None, device=None, requires_grad=None) -> Tensor:
|
|
|
|
|
def full_like(self, fill_value:ConstType, dtype=None, device=None, requires_grad:bool=False) -> Tensor:
|
|
|
|
|
"""
|
|
|
|
|
Creates a tensor with the same shape as `self`, filled with the given value.
|
|
|
|
|
If `dtype` is not specified, the dtype of `self` is used.
|
|
|
|
|
@@ -631,12 +628,9 @@ class Tensor(OpMixin):
|
|
|
|
|
print(Tensor.full_like(t, 42).numpy())
|
|
|
|
|
```
|
|
|
|
|
"""
|
|
|
|
|
if device is not None:
|
|
|
|
|
if isinstance(self.device, tuple): raise RuntimeError("cannot specify `device` on `full_like` of a multi device tensor")
|
|
|
|
|
return Tensor.full(self.shape, fill_value, dtype=dtype or self.dtype, device=device).requires_grad_(requires_grad)
|
|
|
|
|
if requires_grad:
|
|
|
|
|
return Tensor.full(self.shape, fill_value, dtype=dtype or self.dtype, device=self.device).requires_grad_(requires_grad)
|
|
|
|
|
return super().full_like(fill_value, dtype)
|
|
|
|
|
if device is None: return super().full_like(fill_value, dtype).requires_grad_(requires_grad)
|
|
|
|
|
if isinstance(self.device, tuple): raise RuntimeError("cannot specify `device` on `full_like` of a multi device tensor")
|
|
|
|
|
return Tensor.full(self.shape, fill_value, dtype=dtype or self.dtype, device=device).requires_grad_(requires_grad)
|
|
|
|
|
|
|
|
|
|
def rand_like(self, **kwargs) -> Tensor:
|
|
|
|
|
"""
|
|
|
|
|
@@ -655,7 +649,7 @@ class Tensor(OpMixin):
|
|
|
|
|
|
|
|
|
|
# ***** random functions *****
|
|
|
|
|
|
|
|
|
|
def randn_like(self, dtype:DTypeLike|None=None, requires_grad:bool|None=None, **kwargs) -> Tensor:
|
|
|
|
|
def randn_like(self, dtype:DTypeLike|None=None, requires_grad:bool=True, **kwargs) -> Tensor:
|
|
|
|
|
"""
|
|
|
|
|
Creates a tensor with the same shape and sharding as `self`, filled with random values from a normal distribution with mean 0 and variance 1.
|
|
|
|
|
|
|
|
|
|
@@ -672,7 +666,7 @@ class Tensor(OpMixin):
|
|
|
|
|
return (src[0].mul(2*math.pi).cos().mul((1 - src[1]).log().mul(-2).sqrt()).cast(dtype or self.dtype)).requires_grad_(requires_grad)
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def randn(*shape, dtype:DTypeLike|None=None, requires_grad:bool|None=None, **kwargs) -> Tensor:
|
|
|
|
|
def randn(*shape, dtype:DTypeLike|None=None, requires_grad:bool=True, **kwargs) -> Tensor:
|
|
|
|
|
"""
|
|
|
|
|
Creates a tensor with the given shape, filled with random values from a normal distribution with mean `0` and standard deviation `1`.
|
|
|
|
|
If `dtype` is not specified, the default type is used.
|
|
|
|
|
@@ -707,7 +701,7 @@ class Tensor(OpMixin):
|
|
|
|
|
return Tensor.uniform(*shape, low=low, high=high, dtype=dtype, **kwargs)
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def normal(*shape, mean=0.0, std=1.0, requires_grad:bool|None=None, **kwargs) -> Tensor:
|
|
|
|
|
def normal(*shape, mean=0.0, std=1.0, requires_grad:bool=True, **kwargs) -> Tensor:
|
|
|
|
|
"""
|
|
|
|
|
Creates a tensor with the given shape, filled with random values from a normal distribution with the given `mean` and standard deviation `std`.
|
|
|
|
|
Requires `std >= 0`.
|
|
|
|
|
@@ -724,7 +718,7 @@ class Tensor(OpMixin):
|
|
|
|
|
return (std * Tensor.randn(*shape, **kwargs) + mean).requires_grad_(requires_grad)
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def uniform(*shape, low=0.0, high=1.0, dtype:DTypeLike|None=None, requires_grad:bool|None=None, **kwargs) -> Tensor:
|
|
|
|
|
def uniform(*shape, low=0.0, high=1.0, dtype:DTypeLike|None=None, requires_grad:bool=True, **kwargs) -> Tensor:
|
|
|
|
|
"""
|
|
|
|
|
Creates a tensor with the given shape, filled with random values from a uniform distribution over the interval `[low, high)`.
|
|
|
|
|
Requires `low < high`.
|
|
|
|
|
@@ -815,7 +809,7 @@ class Tensor(OpMixin):
|
|
|
|
|
print(Tensor.randperm(6).numpy())
|
|
|
|
|
```
|
|
|
|
|
"""
|
|
|
|
|
return Tensor.rand(n, device=device, **kwargs).argsort().cast(dtype).requires_grad_(kwargs.get("requires_grad"))
|
|
|
|
|
return Tensor.rand(n, device=device, **kwargs).argsort().cast(dtype).requires_grad_(kwargs.get("requires_grad", True))
|
|
|
|
|
|
|
|
|
|
def multinomial(self:Tensor, num_samples:int = 1, replacement:bool = False) -> Tensor:
|
|
|
|
|
"""
|
|
|
|
|
@@ -883,7 +877,7 @@ class Tensor(OpMixin):
|
|
|
|
|
"""
|
|
|
|
|
all_uops = self.uop.toposort()
|
|
|
|
|
tensors_need_grad: list[Tensor] = [t for tref in all_tensors if (t:=tref()) is not None and \
|
|
|
|
|
t.uop in all_uops and t.requires_grad]
|
|
|
|
|
t.uop in all_uops and t.requires_grad and t.is_floating_point()]
|
|
|
|
|
# clear contexts
|
|
|
|
|
for t,g in zip(tensors_need_grad, self.gradient(*tensors_need_grad, gradient=gradient)):
|
|
|
|
|
assert g.shape == t.shape, f"grad shape must match tensor shape, {g.shape!r} != {t.shape!r}"
|
|
|
|
|
@@ -1033,7 +1027,7 @@ class Tensor(OpMixin):
|
|
|
|
|
if any(self.uop in t.uop.backward_slice_with_self and t.uop.base is not shared for tref in all_tensors
|
|
|
|
|
if (t:=tref()) is not None and t is not self and t.uop is not v_uop and t.uop not in v_bw):
|
|
|
|
|
raise RuntimeError("can't setitem on a tensor with other uses")
|
|
|
|
|
if self.requires_grad or (isinstance(v, Tensor) and v.requires_grad):
|
|
|
|
|
if not self.uop.base.is_realized and self.is_floating_point() and (self.requires_grad or (isinstance(v, Tensor) and v.requires_grad)):
|
|
|
|
|
if not isinstance(v, Tensor): v = Tensor(v, device=self.device, dtype=self.dtype)
|
|
|
|
|
# __iadd__/__isub__ creates AFTER(view, STORE(view, computed)); unwrap to get the computed value
|
|
|
|
|
if v.uop.op is Ops.AFTER and any(s.op is Ops.STORE for s in v.uop.src[1:]): v = v._apply_uop(lambda x: x.src[1].src[1])
|
|
|
|
|
@@ -1426,8 +1420,6 @@ class Tensor(OpMixin):
|
|
|
|
|
"""
|
|
|
|
|
Bitcasts `self` to the given `dtype` of the same itemsize.
|
|
|
|
|
|
|
|
|
|
`self` must not require a gradient.
|
|
|
|
|
|
|
|
|
|
```python exec="true" source="above" session="tensor" result="python"
|
|
|
|
|
t = Tensor([-1, 2, 3], dtype=dtypes.int32)
|
|
|
|
|
print(t.dtype, t.numpy())
|
|
|
|
|
@@ -1437,7 +1429,6 @@ class Tensor(OpMixin):
|
|
|
|
|
print(t.dtype, t.numpy())
|
|
|
|
|
```
|
|
|
|
|
"""
|
|
|
|
|
if self.requires_grad: raise RuntimeError("can't backprop through bitcast")
|
|
|
|
|
dt = to_dtype(dtype)
|
|
|
|
|
if (ns:=dt.itemsize) != (os:=self.dtype.itemsize) and (self.shape[-1]*os) % ns != 0: raise RuntimeError("unsupported size in bitcast")
|
|
|
|
|
if (not isinstance(self.device, str) or not self.device.startswith("DISK")) and ns != os:
|
|
|
|
|
|