From a5e9ea7a60d11a21fdffac85ecd0313bbba722de Mon Sep 17 00:00:00 2001
From: nimlgen <138685161+nimlgen@users.noreply.github.com>
Date: Sat, 25 Apr 2026 12:36:55 +0300
Subject: [PATCH] remove schedule batch 4 (#15927)

* remove schedule batch 4

* fini
---
 test/backend/test_schedule.py                 | 21 ++++-----
 test/external/external_benchmark_schedule.py  |  4 +-
 test/external/external_test_hcq.py            |  4 +-
 test/external/external_test_onnx_runner.py    |  4 +-
 .../external_test_schedule_scaling.py         |  4 +-
 test/external/external_uop_gc.py              | 14 +++---
 test/external/fuzz_graph.py                   |  4 +-
 test/null/test_schedule.py                    | 47 ++++++++++---------
 8 files changed, 50 insertions(+), 52 deletions(-)

diff --git a/test/backend/test_schedule.py b/test/backend/test_schedule.py
index cf41a94776..22e5f19e29 100644
--- a/test/backend/test_schedule.py
+++ b/test/backend/test_schedule.py
@@ -12,7 +12,7 @@ from tinygrad.device import is_dtype_supported
 from tinygrad.dtype import DType
 from tinygrad.uop.ops import UOp, Ops, UPat
 from tinygrad.helpers import CI, DEBUG, OSX, GlobalCounters, Context, getenv, all_same, temp
-from tinygrad.engine.realize import CompiledRunner, compile_linear, run_linear
+from tinygrad.engine.realize import compile_linear, run_linear
 
 class KernelCountException(Exception): pass
 def check_schedule(t:Tensor|list[Tensor]|UOp, allowed:int, to_prerealize:list[Tensor]|None=None, filter_sink=True):
@@ -787,7 +787,7 @@ class TestSchedule(unittest.TestCase):
     gc.collect()
     base = GlobalCounters.mem_used
     x = Tensor.ones(256).contiguous().realize()
-    (x+Tensor.ones(256).contiguous()).schedule()
+    (x+Tensor.ones(256).contiguous()).schedule_linear()
     gc.collect()
     self.assertEqual(GlobalCounters.mem_used-base, 1024)
 
@@ -797,9 +797,8 @@ class TestSchedule(unittest.TestCase):
       def cnt():
         x, y, z = Tensor.empty((64, 64), dtype='float'), Tensor.empty((64, 64), dtype='float'), Tensor.empty((64, 64), dtype='float')
         a = (x @ y).relu()
-        sched = ((a @ z).relu() + a).schedule()
-        for si in sched: si.lower()
-        return len([si for si in sched if isinstance(si.prg, CompiledRunner)])
+        linear = compile_linear(((a @ z).relu() + a).schedule_linear())
+        return len([call for call in linear.src if call.src[0].op is Ops.PROGRAM])
 
       with Context(IMAGE=1):
         self.assertEqual(cnt(), 5)
@@ -814,9 +813,8 @@ class TestSchedule(unittest.TestCase):
         rb = (((((inp @ b1) + c1).relu() @ b2) + c2).relu() + inp).relu()
         b16, c16 = Tensor.empty((512, 16), dtype='float'), Tensor.empty((16,), dtype='float')
         b32, c32 = Tensor.empty((512, 32), dtype='float'), Tensor.empty((32,), dtype='float')
-        sched = Tensor.schedule((rb @ b16 + c16).relu(), (rb @ b32 + c32).relu())
-        for si in sched: si.lower()
-        return len([si for si in sched if isinstance(si.prg, CompiledRunner)])
+        linear = compile_linear(Tensor.schedule_linear((rb @ b16 + c16).relu(), (rb @ b32 + c32).relu()))
+        return len([call for call in linear.src if call.src[0].op is Ops.PROGRAM])
 
       with Context(IMAGE=1):
         self.assertEqual(cnt(), 9)
@@ -828,9 +826,8 @@ class TestSchedule(unittest.TestCase):
         x, y, z = Tensor.empty((1, 4, 3, 3)), Tensor.empty((4, 1, 3, 3)), Tensor.empty((4, 1, 7, 7))
         a = x.conv2d(y, Tensor.empty(4), groups=4, padding=1)
         b = a.conv2d(z, groups=4, padding=3)
-        sched = (a + b).schedule()
-        for si in sched: si.lower()
-        return len([si for si in sched if isinstance(si.prg, CompiledRunner)])
+        linear = compile_linear((a + b).schedule_linear())
+        return len([call for call in linear.src if call.src[0].op is Ops.PROGRAM])
 
       with Context(IMAGE=1):
         self.assertEqual(cnt(), 5)
@@ -1332,7 +1329,7 @@ class TestCopyFolding(unittest.TestCase):
     b = Tensor.empty(4, device="CPU")
     add = a+b
     assert all_same([x.device for x in add.uop.src]), f"ALU has different devices! {[x.device for x in add.src]}"
-    add.schedule()
+    add.schedule_linear()
 
   def test_alu_before_copy(self):
     buf = Tensor.ones(1).contiguous().realize()
diff --git a/test/external/external_benchmark_schedule.py b/test/external/external_benchmark_schedule.py
index 86879e7489..d7efc7be87 100644
--- a/test/external/external_benchmark_schedule.py
+++ b/test/external/external_benchmark_schedule.py
@@ -23,10 +23,10 @@ if __name__ == "__main__":
     if not FORWARD_ONLY:
       with Timing("***** model schedule in  "):
         with Profiling(PROFILE >= 3):
-          sched = out.schedule()
+          linear = out.schedule_linear()
 
       if not SCHEDULE_ONLY:
-        asts = list({x.ast.key:x.ast for x in sched if x.ast.op is Ops.SINK}.values())
+        asts = list({call.src[0].key:call.src[0] for call in linear.src if call.src[0].op is Ops.SINK}.values())
         if (restrict_kernel := getenv("RESTRICT_KERNEL", -1)) != -1: asts = asts[restrict_kernel:restrict_kernel+1]
 
         with Profiling(PROFILE, fn="/tmp/rewrite.prof"):
diff --git a/test/external/external_test_hcq.py b/test/external/external_test_hcq.py
index 6e1da82703..ad2a5f0cbe 100644
--- a/test/external/external_test_hcq.py
+++ b/test/external/external_test_hcq.py
@@ -20,8 +20,8 @@ class TestHCQ(unittest.TestCase):
     #TestHCQ.d1: AMDDevice = Device["AMD:1"]
     TestHCQ.a = Tensor([0.,1.], device=Device.DEFAULT).realize()
     TestHCQ.b = self.a + 1
-    si = self.b.schedule()[-1]
-    TestHCQ.runner = get_runner(TestHCQ.d0.device, si.ast)
+    linear = self.b.schedule_linear()
+    TestHCQ.runner = get_runner(TestHCQ.d0.device, linear.src[-1].src[0])
     TestHCQ.b.uop.buffer.allocate()
     # wow that's a lot of abstraction layers
     TestHCQ.addr = struct.pack("QQ", TestHCQ.b.uop.buffer._buf, TestHCQ.a.uop.buffer._buf)
diff --git a/test/external/external_test_onnx_runner.py b/test/external/external_test_onnx_runner.py
index 3d58f9c323..d91a2aec59 100644
--- a/test/external/external_test_onnx_runner.py
+++ b/test/external/external_test_onnx_runner.py
@@ -10,8 +10,8 @@ from hypothesis import given, strategies as st
 # copied from test_const_folding.py
 def _check_ast_count(desired_count:int, t:Tensor):
   # NOTE: this has side effect because everything can be scheduled only once
-  schedule = t.schedule()
-  asts = [s for s in schedule if s.ast.op is Ops.SINK]
+  linear = t.schedule_linear()
+  asts = [call for call in linear.src if call.src[0].op is Ops.SINK]
   assert len(asts) == desired_count, f"{len(asts)} != {desired_count}"
 
 def build_onnx(nodes, from_disk:bool=True, **kwargs):
diff --git a/test/external/external_test_schedule_scaling.py b/test/external/external_test_schedule_scaling.py
index 2e07ea4e25..e7026508ee 100644
--- a/test/external/external_test_schedule_scaling.py
+++ b/test/external/external_test_schedule_scaling.py
@@ -6,7 +6,7 @@ class TestScheduleScaling(unittest.TestCase):
 
   def _assert_linear(self, fn, n_small=200, n_large=1000):
     """Assert schedule time scales at most ~linearly: time(n_large)/time(n_small) should be close to n_large/n_small."""
-    fn(n_small).schedule()  # warmup
+    fn(n_small).schedule_linear()  # warmup
     t_small = min(self._time_schedule(fn, n) for n in [n_small]*3)
     t_large = min(self._time_schedule(fn, n) for n in [n_large]*3)
     size_ratio = n_large / n_small  # 5.0
@@ -19,7 +19,7 @@ class TestScheduleScaling(unittest.TestCase):
   @staticmethod
   def _time_schedule(fn, n) -> float:
     st = time.perf_counter()
-    fn(n).schedule()
+    fn(n).schedule_linear()
     return time.perf_counter() - st
 
   # *** rangeify: ending_ranges accumulation and consumer merge ***
diff --git a/test/external/external_uop_gc.py b/test/external/external_uop_gc.py
index 74712a09e3..0590111774 100644
--- a/test/external/external_uop_gc.py
+++ b/test/external/external_uop_gc.py
@@ -14,13 +14,13 @@ def print_uops():
 def start(): pass
 def single_tensor(): Tensor([2])
 def two_plus_two(): Tensor([2])+Tensor([2])
-def two_plus_two_schedule(): (Tensor([2])+Tensor([2])).schedule()
+def two_plus_two_schedule(): (Tensor([2])+Tensor([2])).schedule_linear()
 def two_plus_two_kernel():
-  si = (Tensor([2])+Tensor([2])).schedule()[-1]
-  get_program(si.ast, Device.default.renderer)
+  linear = (Tensor([2])+Tensor([2])).schedule_linear()
+  get_program(linear.src[-1].src[0], Device.default.renderer)
 def two_plus_two_linearize():
-  si = (Tensor([2])+Tensor([2])).schedule()[-1]
-  get_program(si.ast, Device.default.renderer)
+  linear = (Tensor([2])+Tensor([2])).schedule_linear()
+  get_program(linear.src[-1].src[0], Device.default.renderer)
 def two_plus_two_realize(): (Tensor([2])+Tensor([2])).realize()
 def two_plus_two_item(): (Tensor([2])+Tensor([2])).item()
 def gradient_test():
@@ -36,8 +36,8 @@ def kernel_matmul():
   x = Tensor.eye(3, requires_grad=True)
   y = Tensor([[2.0,0,-2.0]], requires_grad=True)
   z = y.matmul(x)
-  si = z.schedule()[-1]
-  get_program(si.ast, Device.default.renderer)
+  linear = z.schedule_linear()
+  get_program(linear.src[-1].src[0], Device.default.renderer)
 def realized_matmul():
   x = Tensor.eye(3, requires_grad=True)
   y = Tensor([[2.0,0,-2.0]], requires_grad=True)
diff --git a/test/external/fuzz_graph.py b/test/external/fuzz_graph.py
index c168e63b51..fd6ec5f18f 100644
--- a/test/external/fuzz_graph.py
+++ b/test/external/fuzz_graph.py
@@ -20,8 +20,8 @@ def gen_prg(device, inputs_cnt):
     s = fst[0]
     for i in range(1, inputs_cnt): s = s.bitwise_xor(fst[i])
 
-    si = s.schedule()[-1]
-    prg = get_runner(device, si.ast)
+    linear = s.schedule_linear()
+    prg = get_runner(device, linear.src[-1].src[0])
   cached_prgs[(device, inputs_cnt)] = prg
   return prg
 
diff --git a/test/null/test_schedule.py b/test/null/test_schedule.py
index 5be6d5d6dc..c58c0adb55 100644
--- a/test/null/test_schedule.py
+++ b/test/null/test_schedule.py
@@ -4,6 +4,7 @@ from tinygrad import nn, dtypes, Device, Tensor
 from tinygrad.uop.ops import UOp, Ops, GroupOp, UPat, KernelInfo
 from tinygrad.helpers import DEBUG, GlobalCounters, Context
 from tinygrad.engine.realize import compile_linear, run_linear
+from tinygrad.codegen import get_program
 
 class KernelCountException(Exception): pass
 def check_schedule(t:Tensor|list[Tensor]|UOp, allowed:int, to_prerealize:list[Tensor]|None=None, filter_sink=True):
@@ -141,7 +142,7 @@ class TestSimpleSchedule(unittest.TestCase):
     a = Tensor.empty(16,16).sum(axis=1)
     a1 = a.reshape(4,4)
     a2 = a.reshape(16,1,1)
-    self.assertEqual(len(Tensor.schedule(a1, a2)), 1)
+    self.assertEqual(len(Tensor.schedule_linear(a1, a2).src), 1)
 
 class TestSchedule(unittest.TestCase):
   def test_create_schedule_handles_multi_kernel_after_and_after_deps(self):
@@ -166,8 +167,8 @@ class TestSchedule(unittest.TestCase):
     kc = Tensor.custom_kernel(out, src_after, fxn=named_copy("kc"))[0]
     out_after = Tensor(kc.uop.src[0].after(*kc.uop.src[1:], kd.uop))
 
-    schedule = out_after.schedule()
-    names = [si.ast.arg.name for si in schedule]
+    linear = out_after.schedule_linear()
+    names = [call.src[0].arg.name for call in linear.src]
     self.assertEqual(set(names), {"ka", "kb", "kc", "kd"})
     self.assertEqual(names[-1], "kc")
     self.assertLess(names.index("ka"), names.index("kc"))
@@ -667,9 +668,9 @@ class TestSchedule(unittest.TestCase):
     check_schedule(c, 2)
 
   def _alu_from_tensor(self, t:Tensor):
-    s = [s for s in t.schedule() if s.ast.op is Ops.SINK]
+    s = [s for s in t.schedule_linear().src if s.src[0].op is Ops.SINK]
     self.assertEqual(len(s), 1)
-    return [u.op for u in s[0].ast.toposort() if u.op in GroupOp.ALU]
+    return [u.op for u in s[0].src[0].toposort() if u.op in GroupOp.ALU]
 
   def test_2_pow_is_exp2(self):
     t = 2.0 ** Tensor([1.0, 2.0, 3.0])
@@ -798,12 +799,12 @@ class TestSchedule(unittest.TestCase):
     Tensor.manual_seed(0)
     x = Tensor.randn(4, 12, 64, 64, dtype=dtypes.half).realize()
     out = x.softmax(dtype=dtypes.float)
-    sched = out.schedule()
-    self.assertEqual(len(sched), 3)
+    linear = out.schedule_linear()
+    self.assertEqual(len(linear.src), 3)
     # max reduction stays in input dtype (no numerical loss), upcast happens after subtracting max
-    self.assertEqual(sched[0].bufs[0].dtype, dtypes.half)
-    self.assertEqual(sched[1].bufs[0].dtype, dtypes.float)
-    self.assertEqual(sched[2].bufs[0].dtype, dtypes.float)
+    self.assertEqual(linear.src[0].src[1].dtype, dtypes.half)
+    self.assertEqual(linear.src[1].src[1].dtype, dtypes.float)
+    self.assertEqual(linear.src[2].src[1].dtype, dtypes.float)
 
   def test_softmax_backward(self):
     Tensor.manual_seed(0)
@@ -960,7 +961,7 @@ class TestSchedule(unittest.TestCase):
     gc.collect()
     base = GlobalCounters.mem_used
     Tensor.ones(256).contiguous().realize()
-    Tensor.ones(5, 5).contiguous().schedule()
+    Tensor.ones(5, 5).contiguous().schedule_linear()
     gc.collect()
     self.assertEqual(GlobalCounters.mem_used-base, 0)
 
@@ -1173,24 +1174,24 @@ class TestFusionOp(unittest.TestCase):
     st = time.perf_counter()
     a = Tensor([1,2,3,4])
     for _ in range(24): a = a + a
-    sched = a.schedule()
-    sched[-1].lower()
+    linear = a.schedule_linear()
+    prg = get_program(linear.src[-1].src[0], renderer=Device[Device.DEFAULT].renderer)
     self.assertLess(time.perf_counter()-st, 2.0)
-    assert len(sched[-1].prg.p.src.splitlines()) < 250
+    assert len(prg.src.splitlines()) < 250
 
   def test_recursive_add_cmp(self):
     st = time.perf_counter()
     a = Tensor([1,2,3,4])
     for _ in range(24): a = a + a
-    sched1 = a.schedule()
+    linear1 = a.schedule_linear()
     b = Tensor([1,2,3,4])
     for _ in range(24): b = b + b
-    sched2 = b.schedule()
+    linear2 = b.schedule_linear()
     c = Tensor([1,2,3,4])
     for _ in range(23): c = c + c
-    sched3 = c.schedule()
-    self.assertEqual(sched1[-1].ast, sched2[-1].ast)
-    with self.assertRaises(AssertionError): self.assertEqual(sched1[-1].ast, sched3[-1].ast)
+    linear3 = c.schedule_linear()
+    self.assertEqual(linear1.src[-1].src[0], linear2.src[-1].src[0])
+    with self.assertRaises(AssertionError): self.assertEqual(linear1.src[-1].src[0], linear3.src[-1].src[0])
     self.assertLess(time.perf_counter()-st, 2.0)
 
   def test_recursive_pad(self):
@@ -1198,8 +1199,8 @@ class TestFusionOp(unittest.TestCase):
     val = 1.0
     a = Tensor(val)
     for _ in range(24): a = Tensor.stack(a, a)[0]
-    sched = a.schedule()
-    self.assertLessEqual(len(sched), 1)
+    linear = a.schedule_linear()
+    self.assertLessEqual(len(linear.src), 1)
     self.assertLess(time.perf_counter()-st, 2.0)
 
   def test_recursive_reshape(self):
@@ -1208,8 +1209,8 @@ class TestFusionOp(unittest.TestCase):
     b = Tensor.empty(16, 2).realize()
     r = a.sum(1)
     for _ in range(24): r = r.reshape(16, 2) + b
-    sched = r.schedule()
-    self.assertEqual(len(sched), 1)
+    linear = r.schedule_linear()
+    self.assertEqual(len(linear.src), 1)
     self.assertLess(time.perf_counter()-st, 2.0)
 
 # NOTE: the NULL backend supports BUFFER_VIEW