From 1cfccc34c196a5c3625754e90cf92542cc937f08 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sun, 29 Dec 2024 10:49:49 -0500 Subject: [PATCH 01/16] remove uop mutability [pr] --- tinygrad/ops.py | 9 ++++----- tinygrad/tensor.py | 21 +++++++++++++++++++-- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/tinygrad/ops.py b/tinygrad/ops.py index cfd3344073..69f4d39af7 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -230,6 +230,8 @@ buffers:weakref.WeakKeyDictionary[UOp, Buffer] = weakref.WeakKeyDictionary() # t all_metadata:weakref.WeakKeyDictionary[UOp, Metadata] = weakref.WeakKeyDictionary() forced_realize:weakref.WeakSet[UOp] = weakref.WeakSet() +becomes_map: weakref.WeakKeyDictionary[UOp, UOp] = weakref.WeakKeyDictionary() + # NOTE: this should be frozen, but frozen is slower @dataclass(eq=False, slots=True) class UOp(MathTrait, metaclass=UOpMetaClass): @@ -468,12 +470,9 @@ class UOp(MathTrait, metaclass=UOpMetaClass): @property def forced_realize(self): return self in forced_realize - # *** danger zone *** + # *** less danger zone *** - # CAUTION: MUTABILITY! - def become(self, u:UOp): - del UOpMetaClass.ucache[(self.op, self.dtype, self.src, self.arg)] - self.op, self.dtype, self.src, self.arg = u.op, u.dtype, u.src, u.arg + def become(self, u:UOp): becomes_map[self] = u # *** uop movement ops *** diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index b1c24ac326..ad95e6b229 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -1,6 +1,6 @@ # inspired by https://github.com/karpathy/micrograd/blob/master/micrograd/engine.py from __future__ import annotations -import time, math, itertools, functools, struct, sys, inspect, pathlib, string, dataclasses, hashlib +import time, math, itertools, functools, struct, sys, inspect, pathlib, string, dataclasses, hashlib, weakref from contextlib import ContextDecorator from typing import List, Tuple, Callable, Optional, ClassVar, Type, Union, Sequence, cast, get_args, Literal, TYPE_CHECKING, SupportsIndex from tinygrad.dtype import DType, DTypeLike, dtypes, ImageDType, ConstType, least_upper_float, least_upper_dtype, sum_acc_dtype, to_dtype, truncate @@ -8,12 +8,17 @@ from tinygrad.helpers import argfix, make_tuple, flatten, prod, all_int, round_u from tinygrad.helpers import IMAGE, DEBUG, WINO, _METADATA, Metadata, TRACEMETA, ceildiv, fetch, polyN, unwrap from tinygrad.multi import MultiLazyBuffer from tinygrad.gradient import compute_gradient -from tinygrad.ops import smax, smin, resolve, UOp, Ops, sint, Variable, SimpleMathTrait, identity_element +from tinygrad.ops import smax, smin, resolve, UOp, Ops, sint, Variable, SimpleMathTrait, identity_element, becomes_map from tinygrad.device import Device, Buffer, BufferSpec from tinygrad.engine.realize import run_schedule from tinygrad.engine.memory import memory_planner from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars +# *** all in scope Tensors are here. this is the only way to get children *** +# TODO: different "universes" for disconnected Tensors + +all_tensors: weakref.WeakSet[Tensor] = weakref.WeakSet() + # **** start with two base classes, Tensor and Function **** class Function: @@ -33,6 +38,7 @@ class Function: ret = Tensor.__new__(Tensor) ret.lazydata, ret.requires_grad, ret.grad = ctx.forward(*[t.lazydata for t in x], **kwargs), ctx.requires_grad, None ret._ctx = ctx if ctx.requires_grad and not Tensor.no_grad else None # used by autograd engine + all_tensors.add(ret) return ret import tinygrad.function as F @@ -170,6 +176,7 @@ class Tensor(SimpleMathTrait): else: assert data.device == device, f"MultiLazyBuffer device mismatch, {data.device} != {device}" self.lazydata = data + all_tensors.add(self) def requires_grad_(self, requires_grad=True) -> Tensor: self.requires_grad = requires_grad @@ -217,6 +224,16 @@ class Tensor(SimpleMathTrait): NOTE: A Tensor can only be scheduled once. """ schedule, var_vals = create_schedule_with_vars(flatten([x.lazydata.lbs for x in (self,)+lst])) + # TODO: becomes_map should be returned from create_schedule_with_vars + + fixed_tensors: list[Tensor] = list(all_tensors) + sink = UOp.sink(*[x.lazydata for x in fixed_tensors]) + new_sink = sink.substitute(becomes_map) + becomes_map.clear() + for t,s,ns in zip(fixed_tensors, sink.src, new_sink.src): + if s is ns: continue + t.lazydata = ns + return memory_planner(schedule), var_vals def schedule(self, *lst:Tensor) -> list[ScheduleItem]: From 44ff1b7424502ca87f8182e702f970e471d7b02b Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sun, 29 Dec 2024 10:58:35 -0500 Subject: [PATCH 02/16] test fixups --- test/test_schedule.py | 4 +-- test/unit/test_tensor_uop_representation.py | 37 +++++++++++++++++++-- tinygrad/tensor.py | 4 ++- 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/test/test_schedule.py b/test/test_schedule.py index fbe92bedae..7e6adc9039 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -22,11 +22,11 @@ from extra.models.llama import precompute_freqs_cis class KernelCountException(Exception): pass def check_schedule(t:Union[Tensor, List[Tensor], UOp], allowed:int, to_prerealize:Optional[List[Tensor]]=None, filter_sink=True): + if to_prerealize: + for pre in to_prerealize: pre.schedule() if isinstance(t, Tensor): outs = t.lazydata.lbs elif isinstance(t, List): outs = flatten([r.lazydata.lbs for r in t]) else: outs = [t] - if to_prerealize: - for pre in to_prerealize: pre.schedule() sched = create_schedule(outs) if filter_sink: sched = [s for s in sched if s.ast.op is Ops.SINK] if len(sched) != allowed: diff --git a/test/unit/test_tensor_uop_representation.py b/test/unit/test_tensor_uop_representation.py index d9d2b48a22..f0792acbeb 100644 --- a/test/unit/test_tensor_uop_representation.py +++ b/test/unit/test_tensor_uop_representation.py @@ -1,10 +1,43 @@ import unittest from tinygrad import Tensor -from tinygrad.ops import UPat, Ops +from tinygrad.ops import UPat, Ops, UOp realized_pattern = UPat(Ops.VIEW, src=(UPat(Ops.BUFFER),)) const_pattern = UPat(Ops.CONST, src=(UPat(Ops.VIEW, src=(UPat(Ops.DEVICE),),))) -def is_pattern(ten:Tensor, pat:UPat): assert pat.match(ten.lazydata, {}) +def is_pattern_uop(u:UOp, pat:UPat): assert pat.match(u, {}), f"{u}\nis not\n{pat}" +def is_pattern(ten:Tensor, pat:UPat): is_pattern_uop(ten.lazydata, pat) + +class TestTensorMutates(unittest.TestCase): + def test_mutate_add(self): + a = Tensor([1,2,3]) + b = Tensor([4,5,6]) + ret = a+b + pa = a.lazydata + pb = b.lazydata + pr = ret.lazydata + ret.schedule() + self.assertIsNot(pa, a.lazydata) + self.assertIsNot(pb, b.lazydata) + self.assertIsNot(pr, ret.lazydata) + for t in [a,b,ret]: is_pattern(t, realized_pattern) + + def test_reshape_is_same_parent(self): + a = Tensor([1,2,3]) + b = Tensor([4,5,6]) + c = a+b + d = (a+b).reshape(3,1) + d.realize() + is_pattern_uop(d.lazydata.base, realized_pattern) + is_pattern_uop(c.lazydata.base, realized_pattern) + + def test_reshape_is_same_child(self): + a = Tensor([1,2,3]) + b = Tensor([4,5,6]) + c = a+b + d = (a+b).reshape(3,1) + c.realize() + is_pattern_uop(c.lazydata.base, realized_pattern) + is_pattern_uop(d.lazydata.base, realized_pattern) class TestTensorUopRepresentation(unittest.TestCase): def test_realized(self): diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index ad95e6b229..464c0ab8dd 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -226,8 +226,10 @@ class Tensor(SimpleMathTrait): schedule, var_vals = create_schedule_with_vars(flatten([x.lazydata.lbs for x in (self,)+lst])) # TODO: becomes_map should be returned from create_schedule_with_vars + # NOTE: this is potentially a lot of Tensors. see above about the universes fixed_tensors: list[Tensor] = list(all_tensors) - sink = UOp.sink(*[x.lazydata for x in fixed_tensors]) + sink = UOp.sink(*flatten([x.lazydata.lbs for x in fixed_tensors])) + # TODO: multi is wrong new_sink = sink.substitute(becomes_map) becomes_map.clear() for t,s,ns in zip(fixed_tensors, sink.src, new_sink.src): From 933227199ed96db08581f677048b0522c34c37ac Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sun, 29 Dec 2024 11:42:14 -0500 Subject: [PATCH 03/16] most tests pass --- test/test_conv_shapetracker.py | 2 +- test/test_profiler.py | 3 +-- tinygrad/tensor.py | 6 +++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/test/test_conv_shapetracker.py b/test/test_conv_shapetracker.py index 36e9956d1e..3c14545d2f 100644 --- a/test/test_conv_shapetracker.py +++ b/test/test_conv_shapetracker.py @@ -13,7 +13,7 @@ class TestConvShapetracker(unittest.TestCase): conv = Conv2d(16, 32, (3, 3)) # first run to init the weights, they are scheduled. - create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata]) + conv(Tensor.empty(1, 16, 10, 10)).schedule() # run it again to get the kernels sched = [si for si in create_schedule([conv(Tensor.empty(1, 16, 10, 10)).lazydata]) if si.ast.op is Ops.SINK] assert len(sched) == 1, f"conv should only have one kernel, getting {len(sched)}" diff --git a/test/test_profiler.py b/test/test_profiler.py index b307c45ac4..9836259ef6 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -3,7 +3,6 @@ from tinygrad import Device, Tensor, dtypes, TinyJit from tinygrad.helpers import CI, getenv, Context from tinygrad.device import Buffer, BufferSpec, Compiled, ProfileRangeEvent, ProfileDeviceEvent, ProfileGraphEvent from tinygrad.runtime.support.hcq import HCQCompiled -from tinygrad.engine.schedule import create_schedule from tinygrad.engine.realize import get_runner MOCKGPU = getenv("MOCKGPU") @@ -34,7 +33,7 @@ class TestProfiler(unittest.TestCase): TestProfiler.a = Tensor([0.,1.], device=Device.DEFAULT).realize() TestProfiler.b = self.a + 1 - si = create_schedule([self.b.lazydata])[-1] + si = self.b.schedule()[-1] TestProfiler.runner = get_runner(TestProfiler.d0.device, si.ast) TestProfiler.b.lazydata.buffer.allocate() diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 464c0ab8dd..3a5d07540e 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -228,13 +228,13 @@ class Tensor(SimpleMathTrait): # NOTE: this is potentially a lot of Tensors. see above about the universes fixed_tensors: list[Tensor] = list(all_tensors) - sink = UOp.sink(*flatten([x.lazydata.lbs for x in fixed_tensors])) - # TODO: multi is wrong + sink = UOp.sink(*[UOp.sink(*t.lazydata.lbs) if isinstance(t.lazydata, MultiLazyBuffer) else t.lazydata for t in fixed_tensors]) new_sink = sink.substitute(becomes_map) becomes_map.clear() for t,s,ns in zip(fixed_tensors, sink.src, new_sink.src): if s is ns: continue - t.lazydata = ns + if isinstance(t.lazydata, MultiLazyBuffer): t.lazydata.lbs = list(ns.src) + else: t.lazydata = ns return memory_planner(schedule), var_vals From 2a741b61feb2ad4186b96997786ee5bd2f4cd301 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sun, 29 Dec 2024 11:45:10 -0500 Subject: [PATCH 04/16] more tests pass --- test/test_const_folding.py | 3 +-- test/test_nn.py | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/test/test_const_folding.py b/test/test_const_folding.py index 495f8db305..2a3215935f 100644 --- a/test/test_const_folding.py +++ b/test/test_const_folding.py @@ -1,14 +1,13 @@ import unittest, math from tinygrad import Tensor, Device, dtypes from tinygrad.ops import Ops -from tinygrad.engine.schedule import create_schedule from tinygrad.helpers import CI import numpy as np from tinygrad.device import is_dtype_supported def _check_ast_count(desired_count:int, t:Tensor): # NOTE: this has side effect because everything can be scheduled only once - schedule = create_schedule(t.lazydata.lbs) + schedule = t.schedule() asts = [s for s in schedule if s.ast.op is Ops.SINK] assert len(asts) == desired_count, f"{len(asts)} != {desired_count}" diff --git a/test/test_nn.py b/test/test_nn.py index 3e45d5c6f9..e36b805c48 100755 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -8,7 +8,6 @@ from tinygrad.helpers import CI, Context from tinygrad.nn import Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear, Embedding from tinygrad.nn import BatchNorm, LayerNorm, LayerNorm2d, GroupNorm, InstanceNorm, RMSNorm, LSTMCell from tinygrad.nn.state import load_state_dict -from tinygrad.engine.schedule import create_schedule from tinygrad.engine.realize import run_schedule from tinygrad.device import is_dtype_supported @@ -517,7 +516,7 @@ class TestNN(unittest.TestCase): a = Tensor([[1, 5, 9, 11], [12, 19, 8, 1]]) result = layer(a) - schedule = create_schedule([result.lazydata]) + schedule = result.schedule() self.assertEqual(3, len([item for item in schedule if item.ast.op is Ops.SINK]), "first run realizes arange, weight, and embedding") run_schedule(schedule) @@ -525,7 +524,7 @@ class TestNN(unittest.TestCase): [4, 5, 6], [7, 8, 9]]) result = layer(b) - schedule = create_schedule([result.lazydata]) + schedule = result.schedule() self.assertEqual(1, len([item for item in schedule if item.ast.op is Ops.SINK]), "second run realizes embedding only") run_schedule(schedule) From 3bfa2e195c412e25693ff2fe8805062d18a2b77e Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sun, 29 Dec 2024 11:55:24 -0500 Subject: [PATCH 05/16] lil test fixups --- test/test_schedule.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/test_schedule.py b/test/test_schedule.py index 7e6adc9039..fa2f12af2e 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -235,7 +235,8 @@ class TestSchedule(unittest.TestCase): src = Tensor.ones(4).contiguous().realize() a = src.clone() b = src.clone() - sched = check_schedule([a, b], 2, filter_sink=False) + sched = Tensor.schedule(a, b) + self.assertEqual(len(sched), 2) run_schedule(sched) # a and b are assigned to different device Buffers self.assertIsNot(a.lazydata.realized, b.lazydata.realized) @@ -243,7 +244,8 @@ class TestSchedule(unittest.TestCase): def test_no_dedup_empty(self): a = Tensor.empty((4,)) b = Tensor.empty((4,)) - sched = check_schedule([a, b], 2, filter_sink=False) + sched = Tensor.schedule(a, b) + self.assertEqual(len(sched), 2) run_schedule(sched) self.assertIsNot(a.lazydata.realized, b.lazydata.realized) @@ -2007,6 +2009,8 @@ class TestView(unittest.TestCase): late_mul = a*bv other_child = b+2 s = check_schedule([late_mul, other_child], 2) + # this has to be here now + Tensor.schedule(late_mul, other_child) # the arange realizes self.assertIsNotNone(b.lazydata.base.realized) # mul still collapses From ff7f15efce90410bbaec276c76270f1d7fea8c0f Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sun, 29 Dec 2024 11:56:21 -0500 Subject: [PATCH 06/16] them too --- test/test_schedule.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_schedule.py b/test/test_schedule.py index fa2f12af2e..2ea76996f7 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -1059,8 +1059,10 @@ class TestSchedule(unittest.TestCase): # should not create extra kernel if output will be realized anyways dummy = x.sum().half().float() check_schedule(dummy, 1) + dummy.schedule() dummy = x.sum().half().float().contiguous() + 1 check_schedule(dummy, 2) + dummy.schedule() # shared between two outputs shared = x.sum().half().float() From 530343a13f70ccf5263b1dab946d07f0d1ee8cee Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sun, 29 Dec 2024 12:10:33 -0500 Subject: [PATCH 07/16] fix test --- tinygrad/tensor.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 3a5d07540e..23f3314fa5 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -38,7 +38,6 @@ class Function: ret = Tensor.__new__(Tensor) ret.lazydata, ret.requires_grad, ret.grad = ctx.forward(*[t.lazydata for t in x], **kwargs), ctx.requires_grad, None ret._ctx = ctx if ctx.requires_grad and not Tensor.no_grad else None # used by autograd engine - all_tensors.add(ret) return ret import tinygrad.function as F @@ -127,6 +126,11 @@ class Tensor(SimpleMathTrait): training: ClassVar[bool] = False no_grad: ClassVar[bool] = False + def __new__(cls, *args, **kwargs): + instance = super().__new__(cls) + all_tensors.add(instance) + return instance + def __init__(self, data:Union[None, ConstType, bytes, List, Tuple, UOp, MultiLazyBuffer, 'np.ndarray', pathlib.Path], # type: ignore [name-defined] # noqa: F821 device:Optional[Union[str, tuple, list]]=None, dtype:Optional[DTypeLike]=None, requires_grad:Optional[bool]=None): if dtype is not None: dtype = to_dtype(dtype) @@ -176,7 +180,6 @@ class Tensor(SimpleMathTrait): else: assert data.device == device, f"MultiLazyBuffer device mismatch, {data.device} != {device}" self.lazydata = data - all_tensors.add(self) def requires_grad_(self, requires_grad=True) -> Tensor: self.requires_grad = requires_grad From de5c5e8108710519a12f17c59d905dc898cad840 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sun, 29 Dec 2024 12:15:55 -0500 Subject: [PATCH 08/16] unneeded --- test/test_schedule.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/test/test_schedule.py b/test/test_schedule.py index af951db0ff..fbe92bedae 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -14,7 +14,7 @@ from tinygrad.dtype import DType, ImageDType from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, view_supported_devices, symbolic -from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, GlobalCounters, getenv, SPLIT_REDUCEOP, unwrap, prod, Context +from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, GlobalCounters, flatten, getenv, SPLIT_REDUCEOP, unwrap, prod, Context from tinygrad.codegen.kernel import Kernel, verify_ast from tinygrad.engine.schedule import BUF_LIMIT, ScheduleContext, ScheduleItem, create_schedule, view_right, view_left, remove_movement_ops, to_uop from tinygrad.engine.realize import CompiledRunner, get_runner, run_schedule @@ -22,13 +22,12 @@ from extra.models.llama import precompute_freqs_cis class KernelCountException(Exception): pass def check_schedule(t:Union[Tensor, List[Tensor], UOp], allowed:int, to_prerealize:Optional[List[Tensor]]=None, filter_sink=True): + if isinstance(t, Tensor): outs = t.lazydata.lbs + elif isinstance(t, List): outs = flatten([r.lazydata.lbs for r in t]) + else: outs = [t] if to_prerealize: for pre in to_prerealize: pre.schedule() - if isinstance(t, Tensor): sched = t.schedule() - elif isinstance(t, List) and isinstance(t[0], Tensor): sched = Tensor.schedule(*t) - else: - assert isinstance(t, UOp), f"can't schedule {t}" - sched = create_schedule([t]) + sched = create_schedule(outs) if filter_sink: sched = [s for s in sched if s.ast.op is Ops.SINK] if len(sched) != allowed: print(f"SCHEDULE ISSUE, expecting {allowed} got {len(sched)}") @@ -236,8 +235,7 @@ class TestSchedule(unittest.TestCase): src = Tensor.ones(4).contiguous().realize() a = src.clone() b = src.clone() - sched = Tensor.schedule(a, b) - self.assertEqual(len(sched), 2) + sched = check_schedule([a, b], 2, filter_sink=False) run_schedule(sched) # a and b are assigned to different device Buffers self.assertIsNot(a.lazydata.realized, b.lazydata.realized) @@ -245,8 +243,7 @@ class TestSchedule(unittest.TestCase): def test_no_dedup_empty(self): a = Tensor.empty((4,)) b = Tensor.empty((4,)) - sched = Tensor.schedule(a, b) - self.assertEqual(len(sched), 2) + sched = check_schedule([a, b], 2, filter_sink=False) run_schedule(sched) self.assertIsNot(a.lazydata.realized, b.lazydata.realized) @@ -1060,10 +1057,8 @@ class TestSchedule(unittest.TestCase): # should not create extra kernel if output will be realized anyways dummy = x.sum().half().float() check_schedule(dummy, 1) - dummy.schedule() dummy = x.sum().half().float().contiguous() + 1 check_schedule(dummy, 2) - dummy.schedule() # shared between two outputs shared = x.sum().half().float() @@ -2012,8 +2007,6 @@ class TestView(unittest.TestCase): late_mul = a*bv other_child = b+2 s = check_schedule([late_mul, other_child], 2) - # this has to be here now - Tensor.schedule(late_mul, other_child) # the arange realizes self.assertIsNotNone(b.lazydata.base.realized) # mul still collapses From 300542aef192eaaace6bc16c283611f720f4c988 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sun, 29 Dec 2024 12:16:29 -0500 Subject: [PATCH 09/16] err, that --- test/test_schedule.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/test/test_schedule.py b/test/test_schedule.py index fbe92bedae..4269abb18e 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -14,7 +14,7 @@ from tinygrad.dtype import DType, ImageDType from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, view_supported_devices, symbolic -from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, GlobalCounters, flatten, getenv, SPLIT_REDUCEOP, unwrap, prod, Context +from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, GlobalCounters, getenv, SPLIT_REDUCEOP, unwrap, prod, Context from tinygrad.codegen.kernel import Kernel, verify_ast from tinygrad.engine.schedule import BUF_LIMIT, ScheduleContext, ScheduleItem, create_schedule, view_right, view_left, remove_movement_ops, to_uop from tinygrad.engine.realize import CompiledRunner, get_runner, run_schedule @@ -22,12 +22,13 @@ from extra.models.llama import precompute_freqs_cis class KernelCountException(Exception): pass def check_schedule(t:Union[Tensor, List[Tensor], UOp], allowed:int, to_prerealize:Optional[List[Tensor]]=None, filter_sink=True): - if isinstance(t, Tensor): outs = t.lazydata.lbs - elif isinstance(t, List): outs = flatten([r.lazydata.lbs for r in t]) - else: outs = [t] if to_prerealize: for pre in to_prerealize: pre.schedule() - sched = create_schedule(outs) + if isinstance(t, Tensor): sched = t.schedule() + elif isinstance(t, List) and isinstance(t[0], Tensor): sched = Tensor.schedule(*t) + else: + assert isinstance(t, UOp), f"can't schedule {t}" + sched = create_schedule([t]) if filter_sink: sched = [s for s in sched if s.ast.op is Ops.SINK] if len(sched) != allowed: print(f"SCHEDULE ISSUE, expecting {allowed} got {len(sched)}") From ea7f7bf812aea9d9eebebc37299ed794f1c2a8fe Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sun, 29 Dec 2024 12:45:06 -0500 Subject: [PATCH 10/16] fix test_hcq --- test/test_hcq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_hcq.py b/test/test_hcq.py index 38108b8b30..626fd72e8e 100644 --- a/test/test_hcq.py +++ b/test/test_hcq.py @@ -17,7 +17,7 @@ class TestHCQ(unittest.TestCase): TestHCQ.d0 = Device[Device.DEFAULT] TestHCQ.a = Tensor([0.,1.], device=Device.DEFAULT).realize() TestHCQ.b = self.a + 1 - si = create_schedule([self.b.lazydata])[-1] + si = self.b.schedule()[-1] TestHCQ.runner = get_runner(TestHCQ.d0.device, si.ast) TestHCQ.b.lazydata.buffer.allocate() From 040177fac112f0e1c521499d0ad57e32104a566d Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sun, 29 Dec 2024 12:55:02 -0500 Subject: [PATCH 11/16] fix test failures --- docs/abstractions2.py | 4 ++++ tinygrad/ops.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/abstractions2.py b/docs/abstractions2.py index 07e2670b85..6dcdeeb5df 100644 --- a/docs/abstractions2.py +++ b/docs/abstractions2.py @@ -101,6 +101,10 @@ print(sched[-1].ast) # run that schedule run_schedule(sched) +# NOTE: UOps are no longer mutable, you have to fetch this from the becomes_map +from tinygrad.ops import becomes_map +out = becomes_map[out] + # check the data out assert out.realized is not None and out.realized.as_buffer().cast('I')[0] == 5 diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 69f4d39af7..63b7c507aa 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -1197,7 +1197,9 @@ symbolic_simple = PatternMatcher([ # NOTE: this can be wrong for loaded NaN (UPat.var("x") * 0, lambda x: x.const_like(float("nan") if isinstance(x.arg, float) and (math.isnan(x.arg) or math.isinf(x.arg)) else 0)), # ** constant folding ** - (UPat(GroupOp.ALU, name="a", src=UPat((Ops.VCONST, Ops.CONST))), lambda a: a.const_like(exec_alu(a.op, a.dtype, [x.arg for x in a.src], False))), + # TODO: add const folding for Ops.THREEFRY + (UPat(GroupOp.ALU, name="a", src=UPat((Ops.VCONST, Ops.CONST))), + lambda a: a.const_like(exec_alu(a.op, a.dtype, [x.arg for x in a.src], False)) if a.op is not Ops.THREEFRY else None), # bool MUL is AND, ADD/MAX is OR. prevents other rules to rewrite bool ADD/MUL incorrectly (UPat.var('x', dtype=dtypes.bool) * UPat.var('y', dtype=dtypes.bool), lambda x,y: x&y), (UPat.var('x', dtype=dtypes.bool) + UPat.var('y', dtype=dtypes.bool), lambda x,y: x|y), From 6d034c71ef288be7e8943751803f7c814a8168f6 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sun, 29 Dec 2024 13:12:08 -0500 Subject: [PATCH 12/16] fix that test --- test/test_linearizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_linearizer.py b/test/test_linearizer.py index 0774f11018..10a01a1483 100644 --- a/test/test_linearizer.py +++ b/test/test_linearizer.py @@ -2060,7 +2060,7 @@ class TestKernelOpts(unittest.TestCase): def test_padto_sum_ok(self): N = 18 * 18 # NOTE: this setup prevents 17 * 17 contiguous merged into one dimension - a = Tensor.rand(N, N).shrink(((0, 17), (0, 17))) * 100 + a = Tensor.rand(N, N).realize().shrink(((0, 17), (0, 17))) * 100 b = (Tensor.rand(N, N) < 0.5).realize().shrink(((0, 17), (0, 17))) helper_linearizer_opt(a.sum(0), [ From c21301852a8365e99d8bd0414e1676a7792f4fe9 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sun, 29 Dec 2024 19:23:43 -0500 Subject: [PATCH 13/16] tensor universe --- tinygrad/tensor.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 8392c15cd8..cb965940a8 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -1,6 +1,6 @@ # inspired by https://github.com/karpathy/micrograd/blob/master/micrograd/engine.py from __future__ import annotations -import time, math, itertools, functools, struct, sys, inspect, pathlib, string, dataclasses, hashlib, weakref +import time, math, itertools, functools, struct, sys, inspect, pathlib, string, dataclasses, hashlib, weakref, contextlib from contextlib import ContextDecorator from typing import List, Tuple, Callable, Optional, ClassVar, Type, Union, Sequence, cast, get_args, Literal, TYPE_CHECKING, SupportsIndex from tinygrad.dtype import DType, DTypeLike, dtypes, ImageDType, ConstType, least_upper_float, least_upper_dtype, sum_acc_dtype, to_dtype, truncate @@ -14,11 +14,6 @@ from tinygrad.engine.realize import run_schedule from tinygrad.engine.memory import memory_planner from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars -# *** all in scope Tensors are here. this is the only way to get children *** -# TODO: different "universes" for disconnected Tensors - -all_tensors: weakref.WeakSet[Tensor] = weakref.WeakSet() - # **** start with two base classes, Tensor and Function **** class Function: @@ -38,6 +33,16 @@ class Function: ret = Tensor.__new__(Tensor) ret.lazydata, ret.requires_grad, ret.grad = ctx.forward(*[t.lazydata for t in x], **kwargs), ctx.requires_grad, None ret._ctx = ctx if ctx.requires_grad and not Tensor.no_grad else None # used by autograd engine + # merge the Tensor universe of all in x + unis:list[Tensor] = sorted(dedup(x), key=lambda x: -len(x.universe)) + # choose the biggest universe to merge into + merged_universe = unis[0].universe + merged_universe[ret.ref] = None + for t in unis[1:]: + merged_universe.update(t.universe) + for s in t.universe: + if (tt:=s()) is not None: tt.universe = merged_universe + ret.universe = merged_universe return ret import tinygrad.function as F @@ -126,11 +131,10 @@ class Tensor(SimpleMathTrait): training: ClassVar[bool] = False no_grad: ClassVar[bool] = False - def __new__(cls, *args, **kwargs): - instance = super().__new__(cls) - all_tensors.add(instance) - return instance - + @functools.cached_property + def ref(self) -> weakref.ref[Tensor]: return weakref.ref(self) + def __del__(self): + with contextlib.suppress(AttributeError): del self.universe[self.ref] def __init__(self, data:Union[None, ConstType, bytes, List, Tuple, UOp, MultiLazyBuffer, 'np.ndarray', pathlib.Path], # type: ignore [name-defined] # noqa: F821 device:Optional[Union[str, tuple, list]]=None, dtype:Optional[DTypeLike]=None, requires_grad:Optional[bool]=None): if dtype is not None: dtype = to_dtype(dtype) @@ -181,6 +185,9 @@ class Tensor(SimpleMathTrait): assert data.device == device, f"MultiLazyBuffer device mismatch, {data.device} != {device}" self.lazydata = data + # all Tensors in the same universe as this one. if this is a realized Tensor it doesn't have to be in own universe + self.universe = {self.ref:None} + def requires_grad_(self, requires_grad=True) -> Tensor: self.requires_grad = requires_grad return self @@ -230,7 +237,7 @@ class Tensor(SimpleMathTrait): # TODO: becomes_map should be returned from create_schedule_with_vars # NOTE: this is potentially a lot of Tensors. see above about the universes - fixed_tensors: list[Tensor] = list(all_tensors) + fixed_tensors: list[Tensor] = dedup(flatten([[x for xref in t.universe if (x:=xref()) is not None] for t in (self,)+lst])) sink = UOp.sink(*[UOp.sink(*t.lazydata.lbs) if isinstance(t.lazydata, MultiLazyBuffer) else t.lazydata for t in fixed_tensors]) new_sink = sink.substitute(becomes_map) becomes_map.clear() @@ -238,7 +245,7 @@ class Tensor(SimpleMathTrait): if s is ns: continue if isinstance(t.lazydata, MultiLazyBuffer): t.lazydata.lbs = list(ns.src) else: t.lazydata = ns - + # TODO: we can update the universe here to reflect the realization return memory_planner(schedule), var_vals def schedule(self, *lst:Tensor) -> list[ScheduleItem]: @@ -260,6 +267,7 @@ class Tensor(SimpleMathTrait): assert getattr(self, '_ctx', None) is None assert self.shape == x.shape, f"replace shape mismatch {self.shape} != {x.shape}" self.lazydata = x.lazydata + self.universe.update(x.universe) return self def assign(self, x) -> Tensor: @@ -279,6 +287,7 @@ class Tensor(SimpleMathTrait): assert not x.requires_grad # self requires_grad is okay? if not self.lazydata.is_realized: return self.replace(x) self.lazydata = self.lazydata.assign(x.lazydata) + self.universe.update(x.universe) return self def detach(self) -> Tensor: From ed516b3169f0c48dc48e08af6909c20c72a237f2 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sun, 29 Dec 2024 19:31:50 -0500 Subject: [PATCH 14/16] does this pass test --- tinygrad/tensor.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index cb965940a8..e0db4705da 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -14,6 +14,8 @@ from tinygrad.engine.realize import run_schedule from tinygrad.engine.memory import memory_planner from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars +all_tensors: dict[weakref.ref[Tensor], None] = {} + # **** start with two base classes, Tensor and Function **** class Function: @@ -131,10 +133,15 @@ class Tensor(SimpleMathTrait): training: ClassVar[bool] = False no_grad: ClassVar[bool] = False + def __new__(cls, *args, **kwargs): + instance = super().__new__(cls) + all_tensors[instance.ref] = None + return instance + @functools.cached_property def ref(self) -> weakref.ref[Tensor]: return weakref.ref(self) def __del__(self): - with contextlib.suppress(AttributeError): del self.universe[self.ref] + with contextlib.suppress(AttributeError, KeyError): del self.universe[self.ref] def __init__(self, data:Union[None, ConstType, bytes, List, Tuple, UOp, MultiLazyBuffer, 'np.ndarray', pathlib.Path], # type: ignore [name-defined] # noqa: F821 device:Optional[Union[str, tuple, list]]=None, dtype:Optional[DTypeLike]=None, requires_grad:Optional[bool]=None): if dtype is not None: dtype = to_dtype(dtype) @@ -237,7 +244,8 @@ class Tensor(SimpleMathTrait): # TODO: becomes_map should be returned from create_schedule_with_vars # NOTE: this is potentially a lot of Tensors. see above about the universes - fixed_tensors: list[Tensor] = dedup(flatten([[x for xref in t.universe if (x:=xref()) is not None] for t in (self,)+lst])) + #fixed_tensors: list[Tensor] = dedup(flatten([[x for xref in t.universe if (x:=xref()) is not None] for t in (self,)+lst])) + fixed_tensors: list[Tensor] = [x for xref in all_tensors if (x:=xref()) is not None] sink = UOp.sink(*[UOp.sink(*t.lazydata.lbs) if isinstance(t.lazydata, MultiLazyBuffer) else t.lazydata for t in fixed_tensors]) new_sink = sink.substitute(becomes_map) becomes_map.clear() @@ -245,7 +253,8 @@ class Tensor(SimpleMathTrait): if s is ns: continue if isinstance(t.lazydata, MultiLazyBuffer): t.lazydata.lbs = list(ns.src) else: t.lazydata = ns - # TODO: we can update the universe here to reflect the realization + # update the universe to reflect the realization + #if t.lazydata.is_realized: del t.universe[t.ref] return memory_planner(schedule), var_vals def schedule(self, *lst:Tensor) -> list[ScheduleItem]: From 0419963880c6f243a8870dc104883ea0e5459a40 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Mon, 30 Dec 2024 10:03:18 -0500 Subject: [PATCH 15/16] Revert "does this pass test" This reverts commit ed516b3169f0c48dc48e08af6909c20c72a237f2. --- tinygrad/tensor.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index e0db4705da..cb965940a8 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -14,8 +14,6 @@ from tinygrad.engine.realize import run_schedule from tinygrad.engine.memory import memory_planner from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars -all_tensors: dict[weakref.ref[Tensor], None] = {} - # **** start with two base classes, Tensor and Function **** class Function: @@ -133,15 +131,10 @@ class Tensor(SimpleMathTrait): training: ClassVar[bool] = False no_grad: ClassVar[bool] = False - def __new__(cls, *args, **kwargs): - instance = super().__new__(cls) - all_tensors[instance.ref] = None - return instance - @functools.cached_property def ref(self) -> weakref.ref[Tensor]: return weakref.ref(self) def __del__(self): - with contextlib.suppress(AttributeError, KeyError): del self.universe[self.ref] + with contextlib.suppress(AttributeError): del self.universe[self.ref] def __init__(self, data:Union[None, ConstType, bytes, List, Tuple, UOp, MultiLazyBuffer, 'np.ndarray', pathlib.Path], # type: ignore [name-defined] # noqa: F821 device:Optional[Union[str, tuple, list]]=None, dtype:Optional[DTypeLike]=None, requires_grad:Optional[bool]=None): if dtype is not None: dtype = to_dtype(dtype) @@ -244,8 +237,7 @@ class Tensor(SimpleMathTrait): # TODO: becomes_map should be returned from create_schedule_with_vars # NOTE: this is potentially a lot of Tensors. see above about the universes - #fixed_tensors: list[Tensor] = dedup(flatten([[x for xref in t.universe if (x:=xref()) is not None] for t in (self,)+lst])) - fixed_tensors: list[Tensor] = [x for xref in all_tensors if (x:=xref()) is not None] + fixed_tensors: list[Tensor] = dedup(flatten([[x for xref in t.universe if (x:=xref()) is not None] for t in (self,)+lst])) sink = UOp.sink(*[UOp.sink(*t.lazydata.lbs) if isinstance(t.lazydata, MultiLazyBuffer) else t.lazydata for t in fixed_tensors]) new_sink = sink.substitute(becomes_map) becomes_map.clear() @@ -253,8 +245,7 @@ class Tensor(SimpleMathTrait): if s is ns: continue if isinstance(t.lazydata, MultiLazyBuffer): t.lazydata.lbs = list(ns.src) else: t.lazydata = ns - # update the universe to reflect the realization - #if t.lazydata.is_realized: del t.universe[t.ref] + # TODO: we can update the universe here to reflect the realization return memory_planner(schedule), var_vals def schedule(self, *lst:Tensor) -> list[ScheduleItem]: From cd9bab6bca3af3d23c048606b24cefe239838949 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Mon, 30 Dec 2024 10:03:25 -0500 Subject: [PATCH 16/16] Revert "tensor universe" This reverts commit c21301852a8365e99d8bd0414e1676a7792f4fe9. --- tinygrad/tensor.py | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index cb965940a8..8392c15cd8 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -1,6 +1,6 @@ # inspired by https://github.com/karpathy/micrograd/blob/master/micrograd/engine.py from __future__ import annotations -import time, math, itertools, functools, struct, sys, inspect, pathlib, string, dataclasses, hashlib, weakref, contextlib +import time, math, itertools, functools, struct, sys, inspect, pathlib, string, dataclasses, hashlib, weakref from contextlib import ContextDecorator from typing import List, Tuple, Callable, Optional, ClassVar, Type, Union, Sequence, cast, get_args, Literal, TYPE_CHECKING, SupportsIndex from tinygrad.dtype import DType, DTypeLike, dtypes, ImageDType, ConstType, least_upper_float, least_upper_dtype, sum_acc_dtype, to_dtype, truncate @@ -14,6 +14,11 @@ from tinygrad.engine.realize import run_schedule from tinygrad.engine.memory import memory_planner from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars +# *** all in scope Tensors are here. this is the only way to get children *** +# TODO: different "universes" for disconnected Tensors + +all_tensors: weakref.WeakSet[Tensor] = weakref.WeakSet() + # **** start with two base classes, Tensor and Function **** class Function: @@ -33,16 +38,6 @@ class Function: ret = Tensor.__new__(Tensor) ret.lazydata, ret.requires_grad, ret.grad = ctx.forward(*[t.lazydata for t in x], **kwargs), ctx.requires_grad, None ret._ctx = ctx if ctx.requires_grad and not Tensor.no_grad else None # used by autograd engine - # merge the Tensor universe of all in x - unis:list[Tensor] = sorted(dedup(x), key=lambda x: -len(x.universe)) - # choose the biggest universe to merge into - merged_universe = unis[0].universe - merged_universe[ret.ref] = None - for t in unis[1:]: - merged_universe.update(t.universe) - for s in t.universe: - if (tt:=s()) is not None: tt.universe = merged_universe - ret.universe = merged_universe return ret import tinygrad.function as F @@ -131,10 +126,11 @@ class Tensor(SimpleMathTrait): training: ClassVar[bool] = False no_grad: ClassVar[bool] = False - @functools.cached_property - def ref(self) -> weakref.ref[Tensor]: return weakref.ref(self) - def __del__(self): - with contextlib.suppress(AttributeError): del self.universe[self.ref] + def __new__(cls, *args, **kwargs): + instance = super().__new__(cls) + all_tensors.add(instance) + return instance + def __init__(self, data:Union[None, ConstType, bytes, List, Tuple, UOp, MultiLazyBuffer, 'np.ndarray', pathlib.Path], # type: ignore [name-defined] # noqa: F821 device:Optional[Union[str, tuple, list]]=None, dtype:Optional[DTypeLike]=None, requires_grad:Optional[bool]=None): if dtype is not None: dtype = to_dtype(dtype) @@ -185,9 +181,6 @@ class Tensor(SimpleMathTrait): assert data.device == device, f"MultiLazyBuffer device mismatch, {data.device} != {device}" self.lazydata = data - # all Tensors in the same universe as this one. if this is a realized Tensor it doesn't have to be in own universe - self.universe = {self.ref:None} - def requires_grad_(self, requires_grad=True) -> Tensor: self.requires_grad = requires_grad return self @@ -237,7 +230,7 @@ class Tensor(SimpleMathTrait): # TODO: becomes_map should be returned from create_schedule_with_vars # NOTE: this is potentially a lot of Tensors. see above about the universes - fixed_tensors: list[Tensor] = dedup(flatten([[x for xref in t.universe if (x:=xref()) is not None] for t in (self,)+lst])) + fixed_tensors: list[Tensor] = list(all_tensors) sink = UOp.sink(*[UOp.sink(*t.lazydata.lbs) if isinstance(t.lazydata, MultiLazyBuffer) else t.lazydata for t in fixed_tensors]) new_sink = sink.substitute(becomes_map) becomes_map.clear() @@ -245,7 +238,7 @@ class Tensor(SimpleMathTrait): if s is ns: continue if isinstance(t.lazydata, MultiLazyBuffer): t.lazydata.lbs = list(ns.src) else: t.lazydata = ns - # TODO: we can update the universe here to reflect the realization + return memory_planner(schedule), var_vals def schedule(self, *lst:Tensor) -> list[ScheduleItem]: @@ -267,7 +260,6 @@ class Tensor(SimpleMathTrait): assert getattr(self, '_ctx', None) is None assert self.shape == x.shape, f"replace shape mismatch {self.shape} != {x.shape}" self.lazydata = x.lazydata - self.universe.update(x.universe) return self def assign(self, x) -> Tensor: @@ -287,7 +279,6 @@ class Tensor(SimpleMathTrait): assert not x.requires_grad # self requires_grad is okay? if not self.lazydata.is_realized: return self.replace(x) self.lazydata = self.lazydata.assign(x.lazydata) - self.universe.update(x.universe) return self def detach(self) -> Tensor: